# Imports

In [1]:
# Sanity check: list the Dataproc clusters in the us-central1 region.
!gcloud dataproc clusters list --region us-central1

NAME            PLATFORM  WORKER_COUNT  PREEMPTIBLE_WORKER_COUNT  STATUS   ZONE           SCHEDULED_DELETE
sean-cluster-7  GCE       4                                       RUNNING  us-central1-a


In [2]:
# Install the GCS client (pinned) and graphframes into the notebook environment.
# NOTE(review): graphframes is unpinned here and is installed again in a later cell.
!pip install -q google-cloud-storage==1.43.0
!pip install -q graphframes

[0m

In [3]:
# Switch to the directory holding inverted_index_gcp.py, confirm the file is
# present, then import the InvertedIndex class from it.
%cd -q /home/dataproc
!ls inverted_index_gcp.py
from inverted_index_gcp import InvertedIndex

inverted_index_gcp.py


In [4]:
# These will already be installed in the testing environment, so disregard the
# amount of time (~1 minute) it takes to install.
# NOTE(review): the recorded output of this cell shows the apt line failing with
# "Package 'openjdk-8-jdk-headless' has no installation candidate"; the rest of
# the notebook still ran, so a JDK was presumably already present — confirm.
!pip install -q pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
!pip install -q graphframes



import pyspark
import sys
from collections import Counter, OrderedDict, defaultdict
import itertools
from itertools import islice, count, groupby
import pandas as pd
import os
import re
from operator import itemgetter
import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords
from time import time
from pathlib import Path
import pickle
import numpy as np
import pandas as pd
import math
from functools import reduce
from google.cloud import storage
from inverted_index_gcp import *


import hashlib
def _hash(s):
    return hashlib.blake2b(bytes(s, encoding='utf8'), digest_size=5).hexdigest()

# ''' code addition'''
# from flask import Flask, request, jsonify
# ''' code addition'''

# Fetch the NLTK stopword corpus so stopwords.words('english') can be used later.
nltk.download('stopwords')



[0mPackage openjdk-8-jdk-headless is not available, but is referred to by another package.
This may mean that the package is missing, has been obsoleted, or
is only available from another source

E: Package 'openjdk-8-jdk-headless' has no installation candidate
[0m

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# *PySpark*

In [5]:
# Check which graphframes jar(s) are present in Spark's jars directory.
!ls -l /usr/lib/spark/jars/graph*

-rw-r--r-- 1 root root 247882 Jan  8 00:27 /usr/lib/spark/jars/graphframes-0.8.2-spark3.1-s_2.12.jar


In [6]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf, SparkFiles
from pyspark.sql import SQLContext
from graphframes import *

In [7]:
# Display the current SparkSession.
# NOTE(review): `spark` is only assigned in a later cell, so on a fresh kernel
# this relies on the session being pre-created by the environment — confirm.
spark

In [8]:
# Distribute inverted_index_gcp.py with the Spark job and put the SparkFiles
# root directory first on sys.path so the module can be imported from there.
# NOTE(review): `sc` is not defined anywhere in this notebook — presumably it is
# pre-created by the Dataproc PySpark kernel; confirm.
sc.addFile("/home/dataproc/inverted_index_gcp.py")
sys.path.insert(0,SparkFiles.getRootDirectory())
# Reuse the active SparkSession if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()

# Index Functions

In [9]:
# Calculating tf
def word_count_b(text, id):
    """Return ``(term, (id, count))`` pairs for the non-stopword terms of ``text``.

    ``text`` is lower-cased and tokenized with ``RE_WORD``; ``count`` is the
    number of times the term was matched. Terms found in ``all_stopwords`` are
    dropped. Pairs come out in order of each term's first appearance.
    """
    term_freqs = Counter(match.group() for match in RE_WORD.finditer(text.lower()))
    pairs = []
    for term, freq in term_freqs.items():
        if term in all_stopwords:
            continue
        pairs.append((term, (id, freq)))
    return pairs

# get list of (id, terms)
def doc_count_b(text, id):
    """Return a one-element list ``[(id, terms)]``.

    ``terms`` is every ``RE_WORD`` match in the lower-cased ``text``, in order
    of appearance; duplicates and stopwords are kept.
    """
    lowered = text.lower()
    terms = []
    for match in RE_WORD.finditer(lowered):
        terms.append(match.group())
    return [(id, terms)]
    
# Calculating tf for title
def word_count_t(text, id):
    """Return ``(term, (id, 1))`` for each distinct non-stopword term of ``text``.

    ``text`` is lower-cased and tokenized with ``RE_WORD``. Every surviving
    term gets a fixed count of 1, however many times it occurs. Pairs come out
    in order of each term's first appearance.
    """
    matches = RE_WORD.finditer(text.lower())
    # dict.fromkeys de-duplicates while keeping first-seen order.
    distinct_terms = dict.fromkeys(match.group() for match in matches)
    pairs = []
    for term in distinct_terms:
        if term not in all_stopwords:
            pairs.append((term, (id, 1)))
    return pairs

# Sort posting list by wiki_id
def reduce_word_counts(unsorted_pl):
    """Return a new list of the postings in ``unsorted_pl`` ordered by their first element.

    The sort is stable, so postings sharing the same first element keep their
    relative input order. The input iterable is not modified.
    """
    ordered = list(unsorted_pl)
    ordered.sort(key=lambda posting: posting[0])
    return ordered

# Calculate df for each token in a posting list
def calculate_df(postings):
    """Map each ``(term, posting_list)`` entry to ``(term, len(posting_list))``.

    ``postings`` only needs to provide a ``map`` method (e.g. an RDD); whatever
    that ``map`` returns is returned unchanged.
    """
    def _term_with_df(entry):
        return (entry[0], len(entry[1]))

    return postings.map(_term_with_df)

#Write to the disk all posting lists locations
# Number of buckets that tokens are hashed into.
NUM_BUCKETS = 124


def token2bucket_id(token):
    """Return the bucket id (0 .. NUM_BUCKETS - 1) for ``token``.

    The id is the token's hex digest from ``_hash`` read as a base-16 integer,
    modulo ``NUM_BUCKETS``.
    """
    digest_value = int(_hash(token), 16)
    return digest_value % NUM_BUCKETS
def partition_postings_and_write(postings, bucket_name):
    """Group postings by bucket id and write each group with ``InvertedIndex``.

    Each ``(term, posting_list)`` entry is keyed by ``token2bucket_id(term)``,
    the entries are grouped by that key, and every ``(bucket_id, entries)``
    group is passed to ``InvertedIndex.write_a_posting_list`` together with
    ``bucket_name``. Returns the mapped collection of that call's results.
    """
    def _tag_with_bucket(entry):
        term, posting_list = entry[0], entry[1]
        return (token2bucket_id(term), (term, posting_list))

    def _write_bucket(bucket_group):
        return InvertedIndex.write_a_posting_list(bucket_group, bucket_name)

    return postings.map(_tag_with_bucket).groupByKey().map(_write_bucket)




# Support cluster

In [10]:
# Stopwords: NLTK's English list plus an extra hand-picked list of corpus terms.
english_stopwords = frozenset(stopwords.words('english'))
corpus_stopwords = [
    "category", "references", "also", "external", "links",
    "may", "first", "see", "history", "people", "one", "two",
    "part", "thumb", "including", "second", "following",
    "many", "however", "would", "became",
]
all_stopwords = english_stopwords | frozenset(corpus_stopwords)

# Token pattern: a leading '#', '@' or word character, followed by 2 to 24 more
# word characters, each optionally preceded by an apostrophe or a hyphen.
RE_WORD = re.compile(r"""[\#\@\w](['\-]?\w){2,24}""", re.UNICODE)

# Creating Index

In [11]:
def createIndex(bucket_name, index_name):
    paths=[]
    client = storage.Client()
    bucket_name_title = bucket_name
    full_path = f"gs://{bucket_name}/"

    blobs = client.list_blobs(bucket_name)
    for b in blobs:
        if b.name.endswith("parquet"):
            paths.append(full_path+b.name)

    # Wikipidia
    parquetFile = spark.read.parquet(*paths)
    dict_title_id= parquetFile.select("id", "title").rdd
    if index_name == "anchor":
      doc_pairs = parquetFile.select("id",f"{index_name}_text").rdd 
    else:
      doc_pairs = parquetFile.select(f"{index_name}", "id").rdd


    word_counts = doc_pairs.flatMap(lambda x: word_count_t(x[0], x[1]))

    postings = word_counts.groupByKey().mapValues(reduce_word_counts)

    w2df = calculate_df(postings)

    w2df_dict = w2df.collectAsMap()

    posting_locs_list = partition_postings_and_write(postings, bucket_name).collect()

    super_posting_locs = defaultdict(list)
    for blob in client.list_blobs(bucket_name, prefix='postings_gcp'):
        if not blob.name.endswith("pickle"):
            continue
        with blob.open("rb") as f:
            posting_locs = pickle.load(f)
            for k, v in posting_locs.items():
                super_posting_locs[k].extend(v)

    inverted = InvertedIndex()
    inverted.posting_locs=super_posting_locs
    inverted.df=w2df_dict
    inverted.title_dict = dict_title_id.collectAsMap()

    inverted.write_index('.',  f'index_{index_name}')
    index_src =  f"index_{index_name}.pkl"
    index_dst = f'gs://{bucket_name}/postings_gcp/{index_src}'
    !gsutil cp $index_src $index_dst


In [12]:
# Build the "title" index from the parquet files in the sean_bucket_title bucket.
createIndex("sean_bucket_title","title")

                                                                                

Copying file://index_title.pkl [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run          
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

/ [1 files][236.5 MiB/236.5 MiB]                                                
Operation completed over 1 objects/236.5 MiB.                                    


In [None]:
# Build the "text" (body) index from the parquet files in the sean_bucket_body bucket.
createIndex("sean_bucket_body","text")

                                                                                