In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, split, explode, count as spark_count
import pickle
from google.cloud import storage
import math

# GCS bucket that is read from below (parquet input) and interpolated into the data path.
YOUR_BUCKET_NAME = 'final_project_ir_stav_hen_bucket'

print("Starting Title Index Build")

# getOrCreate() hands back an already-running session when one exists,
# so the app name only takes effect if a new session is actually created.
spark = (
    SparkSession.builder
    .appName("Title_Index_Build")
    .getOrCreate()
)

# Wildcard path: every object in the bucket matching multistream*_preprocessed.parquet.
path_to_data = f"gs://{YOUR_BUCKET_NAME}/multistream*_preprocessed.parquet"

# Only the two columns the title index needs are kept.
df_data = (
    spark.read
    .parquet(path_to_data)
    .select("id", "title")
)

print("Data loaded. Processing Titles")


def process_titles(row):
    """Turn one document row into title postings.

    Args:
        row: Mapping-like record exposing ``row['id']`` and ``row['title']``.

    Returns:
        A list of ``(term, (doc_id, tf))`` tuples, one per distinct term,
        in order of first appearance in the title. ``tf`` is the number of
        times the term occurs in the title. Returns an empty list when the
        title is ``None``.

    The title is lower-cased and split on whitespace. A token is kept only if
    it is purely alphanumeric, so a token carrying any punctuation
    (for example ``"hello,"``) is dropped as a whole rather than cleaned.
    """
    doc_id = row['id']
    title = row['title']
    if title is None:
        return []

    # Term -> occurrence count; dict insertion order keeps first-seen order.
    counts = {}
    for word in title.lower().split():
        if not word.isalnum():
            continue
        counts[word] = counts.get(word, 0) + 1

    return [(word, (doc_id, freq)) for word, freq in counts.items()]

# One (term, (doc_id, tf)) pair per distinct term of every title.
rdd_titles = df_data.rdd.flatMap(process_titles)

print("Grouping by term")
# Gather every posting of a term and materialise the group as a plain list.
title_index_rdd = (
    rdd_titles
    .groupByKey()
    .mapValues(list)
)

print("Collecting Title Index")
# Bring the whole index back to the driver as a dict: term -> [(doc_id, tf), ...].
title_index_map = title_index_rdd.collectAsMap()

# Serialise the dict to a local file first; that file is what gets uploaded.
local_filename = 'index_title.pkl'
with open(local_filename, 'wb') as f:
    pickle.dump(title_index_map, f)

# Upload under the same name, directly at the bucket root.
client = storage.Client()
blob = (
    client
    .bucket(YOUR_BUCKET_NAME)
    .blob(local_filename)
)
blob.upload_from_filename(local_filename)

print(f"Title Index Done. Saved to gs://{YOUR_BUCKET_NAME}/{local_filename}")
print(f"Total unique terms in titles: {len(title_index_map)}")

Starting Title Index Build


26/01/04 11:58:06 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

Data loaded. Processing Titles
Grouping by term
Collecting Title Index


                                                                                

Title Index Done. Saved to gs://final_project_ir_stav_hen_bucket/index_title.pkl
Total unique terms in titles: 1545872


In [1]:
import pyspark
from pyspark.sql import SparkSession
import pickle
from google.cloud import storage
import os

# GCS bucket used both as the parquet source and as the upload destination.
YOUR_BUCKET_NAME = 'final_project_ir_stav_hen_bucket'

# getOrCreate() reuses an already-running session when one exists.
spark = (
    SparkSession.builder
    .appName("Create_ID_to_Title_Map")
    .getOrCreate()
)

# Wildcard path: every object in the bucket matching multistream*_preprocessed.parquet.
path_to_data = f"gs://{YOUR_BUCKET_NAME}/multistream*_preprocessed.parquet"
df_data = (
    spark.read
    .parquet(path_to_data)
    .select("id", "title")
)

print("Data loaded. Creating ID to Title Dictionary")

# Collect (id, title) pairs to the driver as a single dict: id -> title.
id_to_title_map = (
    df_data.rdd
    .map(lambda x: (x.id, x.title))
    .collectAsMap()
)

# Serialise the dict to a local file first; that file is what gets uploaded.
local_filename = 'id_to_title.pkl'
with open(local_filename, 'wb') as f:
    pickle.dump(id_to_title_map, f)

# The object is stored under the postings_gcp/ prefix of the bucket.
client = storage.Client()
blob = (
    client
    .bucket(YOUR_BUCKET_NAME)
    .blob(f'postings_gcp/{local_filename}')
)
blob.upload_from_filename(local_filename)

print(f"Success! Saved id_to_title.pkl to gs://{YOUR_BUCKET_NAME}/postings_gcp/{local_filename}")
print(f"Total documents mapped: {len(id_to_title_map)}")

26/01/04 14:15:15 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
                                                                                

Data loaded. Creating ID to Title Dictionary


                                                                                

Success! Saved id_to_title.pkl to gs://final_project_ir_stav_hen_bucket/postings_gcp/id_to_title.pkl
Total documents mapped: 6348910
