In [None]:
# Download the NLTK stopword corpus; `tokenize` later reads it through
# nltk.corpus.stopwords.
# NOTE(review): `storage` is imported here, before a later cell pins
# google-cloud-storage with pip - confirm the already-imported version is fine.
import nltk
from google.cloud import storage
nltk.download('stopwords')

In [None]:
# Install a particular version of `google-cloud-storage` because (oddly enough)
# the version on Colab and GCP is old. A dependency error below is okay.
!pip install -q google-cloud-storage==1.43.0
# NOTE(review): pyspark is installed without a version pin - consider pinning it.
!pip install pyspark

In [3]:
# Authenticate this Colab session for Google Storage access as needed; the
# gsutil and storage.Client calls in later cells read from the bucket.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Copy the folders listed below from the GCS bucket into the current working
# directory; each one lands in a local folder of the same name.
import os
from pathlib import Path

project_id = 'assignment3-370517'

!gcloud config set project {project_id}

bucket_name = 'amit-chen-bucket-1'

# Bucket folder names, interpolated into the gsutil commands below.
postings_gcp_text = 'postings_gcp_text'
postings_gcp_anchor = 'postings_gcp_anchor'
postings_gcp_title = 'postings_gcp_title'
postings_gcp_text_stemmed = 'postings_gcp_text_stemmed'
postings_gcp_anchor_stemmed = 'postings_gcp_anchor_stemmed'
postings_gcp_title_stemmed = 'postings_gcp_title_stemmed'
doc_len = 'dl'
doc_title = 'dt'
nf_body = 'nf'

# Every copy below is enabled; comment out the folders you do not need.

!gsutil -m cp -r gs://{bucket_name}/{postings_gcp_text}/ . 
!gsutil -m cp -r gs://{bucket_name}/{postings_gcp_anchor}/ . 
!gsutil -m cp -r gs://{bucket_name}/{postings_gcp_title}/ .  
!gsutil -m cp -r gs://{bucket_name}/{postings_gcp_text_stemmed}/ . 
!gsutil -m cp -r gs://{bucket_name}/{postings_gcp_anchor_stemmed}/ . 
!gsutil -m cp -r gs://{bucket_name}/{postings_gcp_title_stemmed}/ .  
!gsutil -m cp -r gs://{bucket_name}/{nf_body}/ . 
!gsutil -m cp -r gs://{bucket_name}/{doc_len}/ . 
!gsutil -m cp -r gs://{bucket_name}/{doc_title}/ . 

In [5]:
!mkdir pr/
!mkdir pv/
client = storage.Client()
blobs = client.list_blobs(f"{bucket_name}")
for b in blobs:
    if "part-00000-8b293cd5-fd79-47e7-a641-3d067da0c2b0-c000.csv.gz" in b.name:
      b.download_to_filename("pr/pr.csv.gz")
    if "pageview_pageviews-202108-user.pkl" in b.name:
      b.download_to_filename("pv/pv.pkl")

In [None]:
# Install ngrok to emulate a public IP / address for the local server.
# NOTE(review): wget warns that -N (timestamping) does nothing when combined
# with -O - confirm which of the two flags is intended.
!wget -N https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip -O ngrok-stable-linux-amd64.zip
!unzip -u ngrok-stable-linux-amd64.zip

In [None]:
# TODO: sign up for an ngrok account
# then put your ngrok token below, uncomment, and execute
!./ngrok authtoken 2JsDuuUTH9LwAeBbnQqZXkbbdod_5NLnteyuKr4NCqq1Knyok

In [None]:
# Install the flask-ngrok Python package and a version of Flask that works
# with it in Colab.
!pip -q install flask-ngrok
!pip -q install flask==0.12.2
# !pip -q install flask_restful

# Unzip training files
Upload `all_files_for_frontend.zip`, then run the code below:

In [None]:
# Extract the uploaded archive into the working directory.
# NOTE(review): presumably this provides inverted_index_gcp.py and
# search_frontend.py, which later cells import - confirm.
!unzip all_files_for_frontend.zip

# Loading data:

In [1]:
from flask import Flask, request, jsonify, render_template
import gzip
import pickle
import pandas as pd
from inverted_index_gcp import *

# Locations of the index folders and lookup files in the working directory.
INDEX_FILE = "index"
POSTINGS_GCP_TEXT_INDEX_FOLDER_URL = "postings_gcp_text"
POSTINGS_GCP_ANCHOR_INDEX_FOLDER_URL = "postings_gcp_anchor"
POSTINGS_GCP_TITLE_INDEX_FOLDER_URL = "postings_gcp_title"
POSTINGS_GCP_TEXT_STEMMED_INDEX_FOLDER_URL = "postings_gcp_text_stemmed"
POSTINGS_GCP_ANCHOR_STEMMED_INDEX_FOLDER_URL = "postings_gcp_anchor_stemmed"
POSTINGS_GCP_TITLE_STEMMED_INDEX_FOLDER_URL = "postings_gcp_title_stemmed"
PAGE_RANK_URL = "pr/pr.csv.gz"
PAGE_VIEW_URL = "pv/pv.pkl"
DT_PATH = "dt/dt.pkl"
DL_PATH = "dl/dl.pkl"
NF_PATH = "nf/nf.pkl"


def _load_pickle(path):
    """Return the object unpickled from the file at `path`."""
    with open(path, 'rb') as pickle_file:
        return pickle.load(pickle_file)


# One inverted index per field, each in a plain and a stemmed variant.
inverted_index_body = InvertedIndex.read_index(POSTINGS_GCP_TEXT_INDEX_FOLDER_URL, INDEX_FILE)
inverted_index_anchor = InvertedIndex.read_index(POSTINGS_GCP_ANCHOR_INDEX_FOLDER_URL, INDEX_FILE)
inverted_index_title = InvertedIndex.read_index(POSTINGS_GCP_TITLE_INDEX_FOLDER_URL, INDEX_FILE)
inverted_index_body_stemmed = InvertedIndex.read_index(POSTINGS_GCP_TEXT_STEMMED_INDEX_FOLDER_URL, INDEX_FILE)
inverted_index_anchor_stemmed = InvertedIndex.read_index(POSTINGS_GCP_ANCHOR_STEMMED_INDEX_FOLDER_URL, INDEX_FILE)
inverted_index_title_stemmed = InvertedIndex.read_index(POSTINGS_GCP_TITLE_STEMMED_INDEX_FOLDER_URL, INDEX_FILE)

# Lookup tables used by the ranking functions: DL is indexed by doc id when
# normalising term frequencies, DT maps a doc id to its title, NF is the
# per-document divisor in cossim, and DL_LEN is the N of the idf formula.
DL = _load_pickle(DL_PATH)
DL_LEN = len(DL)
DT = _load_pickle(DT_PATH)
NF = _load_pickle(NF_PATH)
page_view = _load_pickle(PAGE_VIEW_URL)

# PageRank: first CSV column is the key, the remaining column the value.
with gzip.open(PAGE_RANK_URL) as page_rank_file:
    page_rank = pd.read_csv(page_rank_file, header=None, index_col=0).squeeze("columns").to_dict()

# Scale every rank by the largest one so the stored values are at most 1.
max_pr_value = max(page_rank.values())
page_rank = {doc_id: rank/max_pr_value for doc_id, rank in page_rank.items()}
    

# Utils:

In [2]:
import re
import math
from collections import Counter
from nltk.stem.porter import *
from nltk.corpus import stopwords


# Compiled once at definition time instead of on every tokenize() call.
_RE_WORD = re.compile(r"""[\#\@\w](['\-]?[\w,]?[\w.]?(?:['\-]?[\w,]?[\w])){0,24}""", re.UNICODE)

# Corpus-specific words removed in addition to the NLTK English stopwords.
_CORPUS_STOPWORDS = ("category", "references", "also", "external", "links",
                     "may", "first", "see", "history", "people", "one", "two",
                     "part", "thumb", "including", "second", "following",
                     "many", "however", "would", "became")

# Filled on first use so that merely defining tokenize() never touches the
# NLTK corpus; afterwards the stopword set and the stemmer are reused.
_TOKENIZE_CACHE = {}


def _get_all_stopwords():
    """Return the cached union of English and corpus-specific stopwords."""
    if "stopwords" not in _TOKENIZE_CACHE:
        english_stopwords = frozenset(stopwords.words('english'))
        _TOKENIZE_CACHE["stopwords"] = english_stopwords.union(_CORPUS_STOPWORDS)
    return _TOKENIZE_CACHE["stopwords"]


def _get_stemmer():
    """Return the cached PorterStemmer instance."""
    if "stemmer" not in _TOKENIZE_CACHE:
        _TOKENIZE_CACHE["stemmer"] = PorterStemmer()
    return _TOKENIZE_CACHE["stemmer"]


def tokenize(text, STEMMING=False):
    """Split `text` into lower-cased tokens with stopwords removed.

    Args:
        text: the string to tokenize.
        STEMMING: when True, every kept token is passed through the Porter
            stemmer. Stopwords are filtered before stemming.

    Returns:
        A list of token strings in their order of appearance.
    """
    all_stopwords = _get_all_stopwords()
    matches = _RE_WORD.finditer(text.lower())
    kept_tokens = [match.group() for match in matches if match.group() not in all_stopwords]

    if STEMMING:
        stemmer = _get_stemmer()
        return [stemmer.stem(token) for token in kept_tokens]
    return kept_tokens


def BM25(tokens, K, B, AVGDL, inverted_index, index_folder_url, DL, DL_LEN):
    """Score documents against the query tokens with a BM25-style formula.

    Each posting (doc_id, tf) of a query token adds
    idf * tf*(K+1) / (tf + K*(1 - B + B*DL[doc_id]/AVGDL)) to the document's
    score, where idf = log10(DL_LEN / df). A token that appears several times
    in `tokens` contributes once per appearance.

    Args:
        tokens: list of query tokens.
        K, B: BM25 tuning parameters.
        AVGDL: average document length used for length normalisation.
        inverted_index: object exposing a `df` mapping and
            `read_posting_list(token, index_folder_url)`.
        index_folder_url: folder passed through to `read_posting_list`.
        DL: mapping doc_id -> document length.
        DL_LEN: corpus size used in the idf.

    Returns:
        A list of (doc_id, score) pairs sorted by descending score. Tokens
        missing from the index and documents missing from DL are skipped.
    """
    doc_BM25_value = Counter()
    # Posting lists already read for this query, so a repeated token does not
    # read its posting list again.
    postings_by_token = {}

    for token in tokens:
        try:
            token_df = inverted_index.df[token]
        except KeyError:
            continue
        # A Counter-like df yields 0 for unknown tokens instead of raising;
        # dividing by it would crash the whole query.
        if not token_df:
            continue
        token_idf = math.log(DL_LEN / token_df, 10)

        if token not in postings_by_token:
            postings_by_token[token] = list(
                inverted_index.read_posting_list(token, index_folder_url))

        for page_id, word_freq in postings_by_token[token]:
            try:
                numerator = word_freq * (K + 1)
                denominator = word_freq + K * (1 - B + (B * DL[page_id]) / AVGDL)
                doc_BM25_value[page_id] += token_idf * (numerator / denominator)
            except (KeyError, ZeroDivisionError):
                # No recorded length for this document, or a degenerate
                # normalisation term: skip the posting.
                continue

    return doc_BM25_value.most_common()


def cossim(tokens, inverted_index, index_folder_url, DL, DL_LEN, NF):
    """Rank documents by a tf-idf cosine-style similarity to the query tokens.

    The query weight of a term is (count in query / query length) * idf and
    the document weight is (tf / DL[doc_id]) * idf, with
    idf = log10(DL_LEN / df). Each distinct query term is processed once: its
    repetition is already reflected in the query term frequency, so looping
    over repeated tokens would count it twice in both the dot product and the
    query norm.

    Args:
        tokens: list of query tokens.
        inverted_index: object exposing a `df` mapping and
            `read_posting_list(token, index_folder_url)`.
        index_folder_url: folder passed through to `read_posting_list`.
        DL: mapping doc_id -> document length.
        DL_LEN: corpus size used in the idf.
        NF: mapping doc_id -> the document's normalisation factor (used as
            the document side of the denominator).

    Returns:
        A list of (doc_id, similarity) pairs sorted by descending similarity.
        Terms missing from the index and documents missing from DL or NF (or
        with a zero divisor) are skipped.
    """
    query_freq = Counter(tokens)
    query_len = len(tokens)

    numerator = Counter()
    query_denominator = 0

    for token, count_in_query in query_freq.items():
        try:
            token_df = inverted_index.df[token]
        except KeyError:
            continue
        # A Counter-like df yields 0 for unknown tokens instead of raising.
        if not token_df:
            continue
        token_idf = math.log(DL_LEN / token_df, 10)

        weight_token_query = (count_in_query / query_len) * token_idf
        query_denominator += weight_token_query ** 2

        for page_id, word_freq in inverted_index.read_posting_list(token, index_folder_url):
            try:
                # tf normalised by the document length
                weight_word_page = (word_freq / DL[page_id]) * token_idf
            except (KeyError, ZeroDivisionError):
                continue
            numerator[page_id] += weight_word_page * weight_token_query

    query_norm = math.sqrt(query_denominator)

    cosim = Counter()
    for page_id, dot_product in numerator.items():
        try:
            cosim[page_id] = dot_product / (query_norm * NF[page_id])
        except (KeyError, ZeroDivisionError):
            # No norm stored for the document, or a zero divisor.
            continue

    return cosim.most_common()


def get_binary_score(tokens, inverted_index, index_folder_url):
    """Count, for every document, how many postings of the query tokens hit it.

    Args:
        tokens: list of query tokens.
        inverted_index: object exposing
            `get_posting_lists(tokens, index_folder_url)`, which yields one
            posting list of (doc_id, value) pairs per entry.
        index_folder_url: folder passed through to `get_posting_lists`.

    Returns:
        A list of (doc_id, count) pairs sorted by descending count; documents
        with equal counts keep the order in which they were first seen.
    """
    matches_per_doc = Counter()
    for posting_list in inverted_index.get_posting_lists(tokens, index_folder_url):
        # Only the presence of the document in a posting list matters; the
        # second element of each pair is ignored.
        matches_per_doc.update(doc_id for doc_id, _ in posting_list)

    return matches_per_doc.most_common()

# Functions:

In [8]:
def search(query):
    """Return up to 100 (doc_id, title) results for `query` over the body index.

    The query is tokenized (stemmed when STEMMING is on) and scored with BM25,
    or with cosine similarity when COSSIM is on. The scoring mode and the top
    (doc_id, score) pairs are printed before returning.

    Args:
        query: the query string.

    Returns:
        A list of (doc_id, title) pairs ordered from best to worst; empty for
        an empty query. Documents that have no entry in DT are left out, the
        same way search_title and search_anchor treat them.
    """
    res = []
    if len(query) == 0:
        return res

    # Switches for the index variant and the scoring function.
    STEMMING = True
    COSSIM = False

    # BM25 parameters; AVGDL is the average document length used by BM25.
    K = 1.2
    B = 0.75
    AVGDL = 341.0890174848911

    # tokenizing the query
    tokens = tokenize(query, STEMMING)

    if STEMMING:
        inverted_index = inverted_index_body_stemmed
        inverted_index_folder_url = POSTINGS_GCP_TEXT_STEMMED_INDEX_FOLDER_URL
    else:
        inverted_index = inverted_index_body
        inverted_index_folder_url = POSTINGS_GCP_TEXT_INDEX_FOLDER_URL

    if COSSIM:
        print(f"COSSIM {'STEMMED' if STEMMING else ''}:")
        sorted_doc_score_pairs = cossim(tokens, inverted_index, inverted_index_folder_url, DL, DL_LEN, NF)
    else:
        print(f"BM25 {'STEMMED' if STEMMING else ''}:")
        sorted_doc_score_pairs = BM25(tokens, K, B, AVGDL, inverted_index, inverted_index_folder_url, DL, DL_LEN)

    # take first 100
    best = sorted_doc_score_pairs[:100]
    print(best)

    # Attach titles; an id without a title is skipped instead of raising KeyError.
    res = [(doc_id, DT[doc_id]) for doc_id, _ in best if doc_id in DT]

    return res

def search_body(query):
    """Return up to 100 (doc_id, title) results ranked by cosine similarity on the body index.

    The query is tokenized without stemming and scored with cossim against
    the unstemmed body index. The top (doc_id, score) pairs are printed
    before returning.

    Args:
        query: the query string.

    Returns:
        A list of (doc_id, title) pairs ordered from best to worst; empty for
        an empty query. Documents that have no entry in DT are left out, the
        same way search_title and search_anchor treat them.
    """
    res = []
    if len(query) == 0:
        return res

    # tokenizing the query
    tokens = tokenize(query)

    # cossim
    sorted_doc_score_pairs = cossim(tokens, inverted_index_body, POSTINGS_GCP_TEXT_INDEX_FOLDER_URL, DL, DL_LEN, NF)

    # take first 100
    best = sorted_doc_score_pairs[:100]
    print(best)

    # Attach titles; an id without a title is skipped instead of raising KeyError.
    res = [(doc_id, DT[doc_id]) for doc_id, _ in best if doc_id in DT]

    return res


def search_title(query):
    """Return every document whose title index matches a query token.

    Documents are ordered by the count produced by get_binary_score over the
    title index, highest first. The scored list is printed before returning.

    Args:
        query: the query string.

    Returns:
        A list of (doc_id, title) pairs; empty for an empty query. Documents
        that have no entry in DT are left out.
    """
    res = []
    if len(query) == 0:
        return res

    # tokenizing the query
    tokens = tokenize(query)

    # get number of query tokens in doc_title
    list_of_docs = get_binary_score(tokens, inverted_index_title, POSTINGS_GCP_TITLE_INDEX_FOLDER_URL)
    print(list_of_docs)

    # Attach titles; an explicit membership check replaces the bare `except`
    # that used to hide every kind of error here.
    for doc_id, _ in list_of_docs:
        if doc_id in DT:
            res.append((doc_id, DT[doc_id]))

    return res


def search_anchor(query):
    """Return every document whose anchor-text index matches a query token.

    Documents are ordered by the count produced by get_binary_score over the
    anchor index, highest first. The scored list is printed before returning.

    Args:
        query: the query string.

    Returns:
        A list of (doc_id, title) pairs; empty for an empty query. Documents
        that have no entry in DT are left out.
    """
    res = []
    if len(query) == 0:
        return res

    # tokenizing the query
    tokens = tokenize(query)

    # get number of query tokens in doc_anchor_text
    list_of_docs = get_binary_score(tokens, inverted_index_anchor, POSTINGS_GCP_ANCHOR_INDEX_FOLDER_URL)
    print(list_of_docs)

    # Attach titles; an explicit membership check replaces the bare `except`
    # that used to hide every kind of error here.
    for doc_id, _ in list_of_docs:
        if doc_id in DT:
            res.append((doc_id, DT[doc_id]))

    return res


def get_pagerank(wiki_ids):
    """Look up the PageRank value of each requested article id.

    Values come from the module-level `page_rank` dict, which the loading
    cell scales by the maximum rank.

    Args:
        wiki_ids: list of article ids.

    Returns:
        A list aligned with `wiki_ids`; an id with no PageRank entry (or an
        unhashable id) yields None at its position.
    """
    res = []
    if len(wiki_ids) == 0:
        return res

    for wiki_id in wiki_ids:
        try:
            res.append(page_rank[wiki_id])
        except (KeyError, TypeError):
            # Unknown or unhashable id: keep the position, report no value.
            res.append(None)

    return res


def get_pageview(wiki_ids):
    """Look up the page-view value of each requested article id.

    Values come from the module-level `page_view` object loaded from the
    page-view pickle. The previous version read `page_rank` here by mistake
    and therefore returned PageRank values.

    Args:
        wiki_ids: list of article ids.

    Returns:
        A list aligned with `wiki_ids`; an id whose lookup fails (missing or
        unhashable) yields None at its position.
    """
    res = []
    if len(wiki_ids) == 0:
        return res

    for wiki_id in wiki_ids:
        try:
            res.append(page_view[wiki_id])
        except (KeyError, TypeError):
            # Unknown or unhashable id: keep the position, report no value.
            res.append(None)

    return res


# Run Queries


In [9]:
# Smoke-test search(): it prints the scoring mode and the top (doc_id, score)
# pairs, then the returned (doc_id, title) list is printed here.
print(search("best marvel movies"))

BM25 STEMMED:
[(7722858, 7.953164309759178), (1499483, 7.8098590120984195), (1275470, 7.624724424499659), (1074657, 7.608064268579172), (6253998, 7.604818351446154), (910407, 7.54868017026951), (5676692, 7.483434395880227), (5454213, 7.466972675054993), (9110929, 7.421795855981193), (5720905, 7.365575933198417), (619048, 7.2581462252127515), (14577142, 7.234447168626588), (3483096, 7.230687714373347), (622415, 7.207202433747371), (4275087, 7.194794574753801), (743391, 7.193303949404867), (2117716, 7.190201294822571), (1535704, 7.188355178204184), (4667205, 7.151594010068294), (817441, 7.112124903095004), (18674251, 7.105145229391836), (1484143, 7.0890454783456125), (1305399, 7.061958687698432), (17101700, 7.049844398575196), (3146871, 7.043594702293403), (8807756, 7.019045221612669), (2173540, 7.0153309915519895), (2982611, 6.990844507520782), (1756788, 6.9622371255804065), (618979, 6.951650097802821), (1916589, 6.935051562757767), (2493991, 6.933253063110356), (2742150, 6.932574036421

# Run App

In [12]:
# You need to upload your implementation of search_frontend.py before running
# this cell.
import search_frontend as se

In [None]:
# Execute this cell to reload the module after you make changes to
# search_frontend.py (and upload it again).
import importlib
importlib.reload(se)

In [15]:
# Register the Flask app with flask-ngrok, then start the server; the output
# below shows the public ngrok address next to the local one.
from flask_ngrok import run_with_ngrok
run_with_ngrok(se.app) 
se.app.run()

INFO:werkzeug: * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://ee02-35-186-191-230.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


[2023-01-13 13:05:16,638] ERROR in app: Exception on / [GET]
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/flask/app.py", line 1982, in wsgi_app
    response = self.full_dispatch_request()
  File "/usr/local/lib/python3.8/dist-packages/flask/app.py", line 1614, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "/usr/local/lib/python3.8/dist-packages/flask/app.py", line 1517, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "/usr/local/lib/python3.8/dist-packages/flask/_compat.py", line 33, in reraise
    raise value
  File "/usr/local/lib/python3.8/dist-packages/flask/app.py", line 1612, in full_dispatch_request
    rv = self.dispatch_request()
  File "/usr/local/lib/python3.8/dist-packages/flask/app.py", line 1598, in dispatch_request
    return self.view_functions[rule.endpoint](**req.view_args)
  File "/content/search_frontend.py", line 60, in show_shmoogle
    return render_template('shmoogle.html')
  File

# Testing your app

Once your app is running you can query it. You can simply do that by navigating to the URL that ngrok gave you above or through code in a different python session. For example, once the frontend app is running, you can navigate to:
http://YOUR_SERVER_DOMAIN/search?query=hello+world where YOUR_SERVER_DOMAIN is something like XXXX-XX-XX-XX-XX.ngrok.io, which is printed above in Colab or that is your external IP on GCP.

The code below shows how to issue a query from python. This is also how our testing code will issue queries to your search engine, so make sure to test your search engine this way after you deploy it to GCP and before submission. Command line instructions for deploying your search engine to GCP are available at `run_frontend_in_gcp.sh`. Note that we will not only issue training queries to your search engine, but also test queries, i.e. queries that you've never seen before.

In [None]:
# Load the training queries. The evaluation loop iterates the result as a
# mapping of query string -> list of relevant wiki ids.
import json

with open('queries_train.json', 'rt') as f:
  queries = json.load(f)

In [None]:
def average_precision(true_list, predicted_list, k=40):
    """Compute average precision over the first `k` predictions.

    Precision is sampled at every rank where a relevant document appears and
    the samples are averaged.

    Args:
        true_list: iterable of relevant document ids.
        predicted_list: ranked sequence of predicted document ids.
        k: number of leading predictions to evaluate.

    Returns:
        The mean of the sampled precisions rounded to 3 decimals, or 0.0 when
        none of the first `k` predictions is relevant.
    """
    relevant_ids = frozenset(true_list)
    hit_precisions = []

    for rank, doc_id in enumerate(predicted_list[:k], start=1):
        if doc_id not in relevant_ids:
            continue
        # Hits so far (including this one) divided by the current rank.
        hit_precisions.append((len(hit_precisions) + 1) / rank)

    if not hit_precisions:
        return 0.0
    return round(sum(hit_precisions) / len(hit_precisions), 3)

In [None]:
import requests
from time import time
# url = 'http://35.232.59.3:8080'
# Place the domain you got from ngrok or your GCP IP below.
url = 'http://cded-35-192-160-143.ngrok.io'

# One (query, duration in seconds, average precision) tuple per training query.
# duration stays None when the request itself fails; ap stays None unless a
# 200 response was parsed successfully.
qs_res = []
for q, true_wids in queries.items():
    duration, ap = None, None
    t_start = time()
    try:
        res = requests.get(url + '/search_body', {'query': q}, timeout=35)
        duration = time() - t_start
        if res.status_code == 200:
            # Each result is a (wiki_id, title) pair. An empty result list is a
            # valid answer and scores 0.0 rather than being treated as an error.
            pred_wids = [wid for wid, _ in res.json()]
            ap = average_precision(true_wids, pred_wids)
    except (requests.RequestException, ValueError, TypeError) as err:
        # Network failure, timeout, or a malformed response body: report it
        # and keep going, instead of silently swallowing every exception.
        print(f"query {q!r} failed: {err}")

    qs_res.append((q, duration, ap))
print(qs_res)