# Imports

In [1]:
!gcloud dataproc clusters list --region us-central1

NAME            PLATFORM  WORKER_COUNT  PREEMPTIBLE_WORKER_COUNT  STATUS   ZONE           SCHEDULED_DELETE
sean-cluster-7  GCE       4                                       RUNNING  us-central1-a


In [2]:
!pip install -q google-cloud-storage==1.43.0
!pip install -q graphframes

[0m

In [3]:
%cd -q /home/dataproc
!ls inverted_index_gcp.py
from inverted_index_gcp import InvertedIndex

inverted_index_gcp.py


In [4]:
# These will already be installed in the testing environment so disregard the 
# amount of time (~1 minute) it takes to install. 
!pip install -q pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
!pip install -q graphframes



import pyspark
import sys
from collections import Counter, OrderedDict, defaultdict
import itertools
from itertools import islice, count, groupby
import pandas as pd
import os
import re
from operator import itemgetter
import nltk
from nltk.stem.porter import *
from nltk.corpus import stopwords
from time import time
from pathlib import Path
import pickle
import numpy as np
import pandas as pd
import math
from functools import reduce
from google.cloud import storage
from inverted_index_gcp import *

import hashlib
def _hash(s):
    return hashlib.blake2b(bytes(s, encoding='utf8'), digest_size=5).hexdigest()

# ''' code addition'''
# from flask import Flask, request, jsonify
# ''' code addition'''

nltk.download('stopwords')

[0mPackage openjdk-8-jdk-headless is not available, but is referred to by another package.
This may mean that the package is missing, has been obsoleted, or
is only available from another source

E: Package 'openjdk-8-jdk-headless' has no installation candidate
[0m

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Stopwords
english_stopwords = frozenset(stopwords.words('english'))
corpus_stopwords = ["category", "references", "also", "external", "links", 
                    "may", "first", "see", "history", "people", "one", "two", 
                    "part", "thumb", "including", "second", "following", 
                    "many", "however", "would", "became"]
RE_WORD = re.compile(r"""[\#\@\w](['\-]?\w){2,24}""", re.UNICODE)
all_stopwords = english_stopwords.union(corpus_stopwords)

# Query Preparation

In [6]:
# tokenize & remove stop words.
def tokenize_and_remove_sw(text):
  tokens = [token.group() for token in RE_WORD.finditer(text.lower())]
  return [x for x in tokens if x not in all_stopwords]

# stemming  
def stemming(tokens):
  stemmer = PorterStemmer()
  return [stemmer.stem(x) for x in tokens]

# reading postinglist

# Index Preparation

In [7]:
TUPLE_SIZE = 6       
TF_MASK = 2 ** 16 - 1 # Masking the 16 low bits of an integer
from contextlib import closing

def read_posting_list(inverted, w, bucket_name):
  with closing(MultiFileReader()) as reader:
    locs = inverted.posting_locs[w]
    b = reader.read(locs, inverted.df[w] * TUPLE_SIZE, bucket_name)
    posting_list = []
    for i in range(inverted.df[w]):
      doc_id = int.from_bytes(b[i*TUPLE_SIZE:i*TUPLE_SIZE+4], 'big')
      tf = int.from_bytes(b[i*TUPLE_SIZE+4:(i+1)*TUPLE_SIZE], 'big')
      posting_list.append((doc_id, tf))
    return posting_list


# Search

In [None]:
# def search(query):
#     res = []
#     # query = request.args.get('query', '')
#     # if len(query) == 0:
#     #   return jsonify(res)
#     # BEGIN SOLUTION
#     tokens = tokenize_and_remove_sw(query)
#     stemmed_tokens = stemming(tokens)
#     title_results = ''' we call the search title function and give it our query & title index'''
#     body_results = ''' we call the search body function and give it our query & body index'''
#     anchor_results = ''' we call the search anchor function and give it our query & anchor index'''

#     ''' we normalize the 3 ranks & using getTopN (N=100)'''

#     '''we merge all the results into 1 list of tuples of (doc_id, title)'''

#     # END SOLUTION
#     return jsonify(res)

## Search Title 

In [9]:
# loading inverted index from title bucket
client = storage.Client()
my_bucket = client.bucket('sean_bucket_title')
idx_title = pickle.loads(my_bucket.get_blob('postings_gcp/index_title.pkl').download_as_string())

In [10]:
def search_title(query):
    res = []
    # if len(query) == 0:
    #   return jsonify(res)
    filtered_query = tokenize_and_remove_sw(query)
    all_posting_lists = {}
    for term in np.unique(filtered_query):
      if term in idx_title.df:
        try:
            res = read_posting_list(idx_title, term, 'sean_bucket_title')
            for doc_id, amount in res:
                try:
                    all_posting_lists[doc_id] += 1
                except:
                    all_posting_lists[doc_id] = 1
        except Exception as e:
            print('error in title index occured - ', e)
    res= sorted(all_posting_lists, key=all_posting_lists.get, reverse=True)
    res = [(id, idx_title.title_dict[id]) for id in res]
    return(res)
    # return jsonify(res)

In [None]:
# print(read_posting_list(idx_title, "alabama",bucket_name_title))
print(search_title("linkedin"))

## Search Body

In [None]:
# loading inverted index from body bucket
client = storage.Client()
my_bucket = client.bucket('sean_bucket_body')
idx_body = pickle.loads(my_bucket.get_blob('postings_gcp/index_b.pkl').download_as_string())

In [None]:
# Cosine_Similarity {id:cosine score}
def cosine_similarity(search_query, index):
    """ Returns: {id:cosine score} """
    dict_cosine_sim = {}
    # idx_body.nf = math.sqrt(sum([index.tf[x]**2 for x in search_query]))
    for term in search_query:
      if term in index.df.keys():
        pos_lst = read_posting_list(index,term)
        for d, t in pos_lst:
          if d in dict_cosine_sim.keys():
            dict_cosine_sim[d] += index.df[term]*t
          else:
            dict_cosine_sim[d] = index.df[term]*t
    for d in dict_cosine_sim.keys():
      dict_cosine_sim[d] *= (1/len(search_query) * index.nf[d])
    return dict_cosine_sim
 

In [None]:
# get list of top N ranked pairs (doc_id, score)
def get_top_n(sim_dict,N=3):
  return sorted([(doc_id,score) for doc_id, score in sim_dict.items()], key = lambda x: x[1],reverse=True)[:N]

In [None]:
def search_body(query):
    filtered_query = tokenize_and_remove_sw(query)
    cos_dct = cosine_similarity(filtered_query,idx_body)
    res = get_top_n(cos_dct,100)
    return [(id[0], idx_body.title_dict[id[0]]) for id in res]
  
    # res = []
    # query = request.args.get('query', '')
    # if len(query) == 0:
    #   return jsonify(res)
    # return jsonify(res)

## Search Anchor

In [16]:
client = storage.Client()
my_bucket = client.bucket('sean_bucket_anchor')
idx_anchor = pickle.loads(my_bucket.get_blob('postings_gcp/index_anchor.pkl').download_as_string())

In [17]:
def search_anchor(query):
  res = []

  # if len(query) == 0:
  #   return jsonify(res)
  filtered_query = tokenize_and_remove_sw(query)
  all_posting_lists = {}
  for term in np.unique(filtered_query):
    if term in idx_anchor.df:
      try:
          res = read_posting_list(idx_anchor, term, "sean_bucket_anchor")
          for doc_id, amount in res:
              try:
                  all_posting_lists[doc_id] += 1
              except:
                  all_posting_lists[doc_id] = 1
      except Exception as e:
          print('error in anchor index occured - ', e)
  res= sorted(all_posting_lists, key=all_posting_lists.get, reverse=True)
  
  res = [(id, idx_anchor.title_dict[id]) for id in res if id in idx_anchor.title_dict]
  return res
  # return jsonify(res)

In [19]:
search_anchor("political")

[(12, 'Anarchism'),
 (307, 'Abraham Lincoln'),
 (324, 'Academy Awards'),
 (580, 'Astronomer'),
 (594, 'Apollo'),
 (600, 'Andorra'),
 (624, 'Alaska'),
 (627, 'Agriculture'),
 (633, 'Algae'),
 (657, 'Asphalt'),
 (679, 'Animal (disambiguation)'),
 (689, 'Asia'),
 (717, 'Alberta'),
 (734, 'Actinopterygii'),
 (737, 'Afghanistan'),
 (738, 'Albania'),
 (746, 'Azerbaijan'),
 (771, 'American Revolutionary War'),
 (775, 'Algorithm'),
 (777, 'Annual plant'),
 (783, 'Alexander the Great'),
 (789, 'Asterales'),
 (824, 'Altaic languages'),
 (844, 'Amsterdam'),
 (854, 'Anatolia'),
 (856, 'Apple Inc.'),
 (857, 'Aberdeenshire'),
 (863, 'American Civil War'),
 (864, 'Andy Warhol'),
 (914, 'Author'),
 (951, 'Antigua and Barbuda'),
 (956, 'Asteraceae'),
 (1027, 'August 9'),
 (1078, 'Antisemitism'),
 (1093, 'Politics of Armenia'),
 (1111, 'Politics of American Samoa'),
 (1193, 'Agrarianism'),
 (1348, 'AK-47'),
 (1354, 'Andes'),
 (1367, 'Albertosaurus'),
 (1491, 'August 12'),
 (1495, 'Australian Labor Party