In [None]:
# Download the NLTK stopword corpus (read later through `stopwords.words('english')`).
# `storage` is imported here for the Cloud Storage client created in a later cell.
import nltk
from google.cloud import storage
nltk.download('stopwords')

In [2]:
# Install a particular version of `google-cloud-storage` because (oddly enough)
# the version on Colab and GCP is old. A dependency error below is okay.
!pip install -q google-cloud-storage==1.43.0
# NOTE(review): pyspark is installed unpinned and is not imported by any cell
# visible in this notebook - confirm it is still needed, and pin a version if so.
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Authenticate this Colab session for Google Storage access as needed
# (the gcloud/gsutil commands and `storage.Client()` further down rely on it).
from google.colab import auth
auth.authenticate_user()

In [None]:
# Copy one wikidumps files 
import os
from pathlib import Path

project_id = 'assignment3-370517'

!gcloud config set project {project_id}

bucket_name = 'amit-chen-bucket-1'

postings_gcp_text = 'postings_gcp_text'
postings_gcp_anchor = 'postings_gcp_anchor'
postings_gcp_title = 'postings_gcp_title'
postings_gcp_text = 'postings_gcp_text_stemmed'
postings_gcp_anchor = 'postings_gcp_anchor_stemmed'
postings_gcp_title = 'postings_gcp_title_stemmed'
doc_len = 'dl'
doc_title = 'dt'
nf_body = 'nf'

# uncoment data you want to upload from bucket 

!gsutil -m cp -r gs://{bucket_name}/{postings_gcp_text}/ . 
!gsutil -m cp -r gs://{bucket_name}/{postings_gcp_anchor}/ . 
!gsutil -m cp -r gs://{bucket_name}/{postings_gcp_title}/ .  
!gsutil -m cp -r gs://{bucket_name}/{postings_gcp_text_stemmed}/ . 
!gsutil -m cp -r gs://{bucket_name}/{postings_gcp_anchor_stemmed}/ . 
!gsutil -m cp -r gs://{bucket_name}/{postings_gcp_title_stemmed}/ .  
!gsutil -m cp -r gs://{bucket_name}/{nf_body}/ . 
!gsutil -m cp -r gs://{bucket_name}/{doc_len}/ . 
!gsutil -m cp -r gs://{bucket_name}/{doc_title}/ . 

In [5]:
!mkdir pr/
!mkdir pv/
client = storage.Client()
blobs = client.list_blobs(f"{bucket_name}")
for b in blobs:
    if "part-00000-8b293cd5-fd79-47e7-a641-3d067da0c2b0-c000.csv.gz" in b.name:
      b.download_to_filename("pr/pr.csv.gz")
    if "pageview_pageviews-202108-user.pkl" in b.name:
      b.download_to_filename("pv/pv.pkl")

In [None]:
# install ngrok to emulate public IP / address
# NOTE(review): `-N` (timestamping) is combined with `-O` here; wget is expected to
# warn that timestamping has no effect together with -O - confirm and drop one of them.
!wget -N https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip -O ngrok-stable-linux-amd64.zip
# Unpack the ngrok binary into the working directory (it is invoked as ./ngrok below).
!unzip -u ngrok-stable-linux-amd64.zip

In [None]:
# TODO: sign up for an ngrok account
# then put your ngrok token below, uncomment, and execute
!./ngrok authtoken 2JsDuuUTH9LwAeBbnQqZXkbbdod_5NLnteyuKr4NCqq1Knyok

In [None]:
# Install a ngrok python package and a version of flask that works with it in
# colab.
# NOTE(review): flask is pinned but flask-ngrok is not - consider pinning it too
# so that the pair keeps working together.
!pip -q install flask-ngrok
!pip -q install flask==0.12.2
# !pip -q install flask_restful

# Unzip training files
Upload `all_files_for_frontend.zip` to the working directory, then run the code below:

In [None]:
# Extract the uploaded archive into the current working directory.
!unzip all_files_for_frontend.zip

# Messing Around 🦖

In [64]:
from flask import Flask, request, jsonify, render_template
import gzip
import pickle
import pandas as pd
from inverted_index_gcp import *

# Name handed to InvertedIndex.read_index together with each postings folder.
INDEX_FILE = "index"

# Local folders of the un-stemmed indexes.
POSTINGS_GCP_TEXT_INDEX_FOLDER_URL = "postings_gcp_text"
POSTINGS_GCP_ANCHOR_INDEX_FOLDER_URL = "postings_gcp_anchor"
POSTINGS_GCP_TITLE_INDEX_FOLDER_URL = "postings_gcp_title"

# Local folders of the stemmed indexes.
POSTINGS_GCP_TEXT_STEMMED_INDEX_FOLDER_URL = "postings_gcp_text_stemmed"
POSTINGS_GCP_ANCHOR_STEMMED_INDEX_FOLDER_URL = "postings_gcp_anchor_stemmed"
POSTINGS_GCP_TITLE_STEMMED_INDEX_FOLDER_URL = "postings_gcp_title_stemmed"

# Ranking signals and per-document lookup tables.
PAGE_RANK_URL = "pr/pr.csv.gz"
PAGE_VIEW_URL = "pv/pv.pkl"
DT_PATH = "dt/dt.pkl"
DL_PATH = "dl/dl.pkl"
NF_PATH = "nf/nf.pkl"


def _load_pickle(path):
    """Unpickle and return the object stored at `path`.

    NOTE: pickle executes code on load; only use it on files you produced yourself.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _load_normalized_page_rank(path):
    """Read the gzipped two-column CSV at `path`.

    Returns a pair: the {first column: second column / maximum} dict and that maximum.
    """
    with gzip.open(path) as handle:
        raw_ranks = pd.read_csv(handle, header=None, index_col=0).squeeze("columns").to_dict()
    top_rank = max(raw_ranks.values())
    normalized = {doc_id: rank/top_rank for doc_id, rank in raw_ranks.items()}
    return normalized, top_rank


# Open the six inverted indexes (body / anchor / title, plain and stemmed).
inverted_index_body = InvertedIndex.read_index(POSTINGS_GCP_TEXT_INDEX_FOLDER_URL, INDEX_FILE)
inverted_index_anchor = InvertedIndex.read_index(POSTINGS_GCP_ANCHOR_INDEX_FOLDER_URL, INDEX_FILE)
inverted_index_title = InvertedIndex.read_index(POSTINGS_GCP_TITLE_INDEX_FOLDER_URL, INDEX_FILE)
inverted_index_body_stemmed = InvertedIndex.read_index(POSTINGS_GCP_TEXT_STEMMED_INDEX_FOLDER_URL, INDEX_FILE)
inverted_index_anchor_stemmed = InvertedIndex.read_index(POSTINGS_GCP_ANCHOR_STEMMED_INDEX_FOLDER_URL, INDEX_FILE)
inverted_index_title_stemmed = InvertedIndex.read_index(POSTINGS_GCP_TITLE_STEMMED_INDEX_FOLDER_URL, INDEX_FILE)

# DL is indexed by doc id to get a document length; DL_LEN is its number of entries.
DL = _load_pickle(DL_PATH)
DL_LEN = len(DL)

# DT is indexed by doc id to get the page title.
DT = _load_pickle(DT_PATH)

# NF is indexed by doc id and used as the document-side factor of the cosine denominator.
NF = _load_pickle(NF_PATH)

page_view = _load_pickle(PAGE_VIEW_URL)

# page_rank values are divided by the largest rank, so they lie relative to max_pr_value.
page_rank, max_pr_value = _load_normalized_page_rank(PAGE_RANK_URL)

# Utils


In [65]:
import re
import math
from collections import Counter
from nltk.corpus import stopwords

# Token pattern, compiled once instead of on every call to tokenize().
_RE_WORD = re.compile(r"""[\#\@\w](['\-]?[\w,]?[\w.]?(?:['\-]?[\w,]?[\w])){0,24}""", re.UNICODE)

# Corpus-specific words removed in addition to NLTK's English stopwords.
_CORPUS_STOPWORDS = frozenset(["category", "references", "also", "external", "links",
                               "may", "first", "see", "history", "people", "one", "two",
                               "part", "thumb", "including", "second", "following",
                               "many", "however", "would", "became"])

# Combined stopword set; filled in by _all_stopwords() on first use.
_ALL_STOPWORDS = None


def _all_stopwords():
    """Return the English + corpus stopword set, building and caching it on the first call."""
    global _ALL_STOPWORDS
    if _ALL_STOPWORDS is None:
        # Built lazily so that defining this cell does not require the NLTK
        # corpus yet; only the first tokenize() call reads it.
        _ALL_STOPWORDS = frozenset(stopwords.words('english')).union(_CORPUS_STOPWORDS)
    return _ALL_STOPWORDS


def tokenize(text):
    """
    Tokenize a text into a list of lower-cased tokens and filter out stopwords.
    Parameters:
    -----------
    text: string, the text to tokenize.
    Returns:
    -----------
    list of tokens (strings), in order of appearance, without English or corpus stopwords.
    """
    excluded = _all_stopwords()
    list_of_tokens = [match.group() for match in _RE_WORD.finditer(text.lower())
                      if match.group() not in excluded]
    return list_of_tokens

def BM25(tokens, K, B, AVGDL, inverted_index, index_folder_url, DL, DL_LEN):
    """
    Score documents against a tokenized query with the BM25 formula.
    Parameters:
    -----------
    tokens: list of query tokens; a token that appears several times is scored once per occurrence.
    K, B: BM25 free parameters (K scales the tf term, B weights the length normalization).
    AVGDL: average document length used in the length normalization.
    inverted_index: object exposing `df` (mapping token -> document frequency) and
                    `read_posting_list(token, index_folder_url)` yielding (doc_id, tf) pairs.
    index_folder_url: folder passed through to `read_posting_list`.
    DL: mapping doc_id -> document length.
    DL_LEN: number of documents, used as N in idf = log10(N / df).
    Returns:
    -----------
    list of (doc_id, score) pairs sorted by descending score.
    """
    doc_BM25_value = Counter()

    for token in tokens:
        # Skip tokens the index does not know. `.get(..., 0)` covers both a plain
        # dict (missing key) and a Counter-like mapping (missing key reads as 0),
        # so the idf below can never divide by zero.
        token_df = inverted_index.df.get(token, 0)
        if not token_df:
            continue
        token_idf = math.log(DL_LEN/token_df, 10)

        posting_list = inverted_index.read_posting_list(token, index_folder_url)
        for page_id, word_freq in posting_list:
            # Documents without a recorded length cannot be length-normalized;
            # skip them, as cossim() does.
            try:
                doc_length = DL[page_id]
            except KeyError:
                continue
            numerator = word_freq*(K+1)
            denominator = word_freq + K*(1-B + (B*doc_length)/AVGDL)
            doc_BM25_value[page_id] += token_idf*(numerator/denominator)

    return doc_BM25_value.most_common()

def cossim(tokens, inverted_index, index_folder_url, DL, DL_LEN, NF):
    """
    Rank documents by tf-idf cosine similarity to a tokenized query.
    Parameters:
    -----------
    tokens: list of query tokens (may contain repeats).
    inverted_index: object exposing `df` (mapping token -> document frequency) and
                    `read_posting_list(token, index_folder_url)` yielding (doc_id, tf) pairs.
    index_folder_url: folder passed through to `read_posting_list`.
    DL: mapping doc_id -> document length, used to normalize the document tf.
    DL_LEN: number of documents, used as N in idf = log10(N / df).
    NF: mapping doc_id -> document-side factor of the cosine denominator.
    Returns:
    -----------
    list of (doc_id, similarity) pairs sorted by descending similarity.
    """
    query_len = len(tokens)
    # Iterate over the distinct query terms: looping over `tokens` directly would
    # add a repeated term's contribution once per occurrence and inflate the score.
    query_freq = Counter(tokens)

    numerator = Counter()
    query_denominator = 0

    for token, token_count in query_freq.items():
        # Skip tokens the index does not know (missing key, or a 0 from a
        # Counter-like mapping), which would otherwise break the idf division.
        token_df = inverted_index.df.get(token, 0)
        if not token_df:
            continue
        token_idf = math.log(DL_LEN/token_df, 10)

        # tf-idf weight of the token inside the query
        weight_token_query = (token_count/query_len)*token_idf
        query_denominator += math.pow(weight_token_query, 2)

        posting_list = inverted_index.read_posting_list(token, index_folder_url)
        for page_id, word_freq in posting_list:
            # tf normalized by document length; documents with no (or zero)
            # recorded length are skipped.
            try:
                tf = word_freq/DL[page_id]
            except (KeyError, ZeroDivisionError):
                continue
            weight_word_page = tf*token_idf
            numerator[page_id] += weight_word_page*weight_token_query

    query_norm = math.sqrt(query_denominator)
    cosim = Counter()
    for page_id, dot_product in numerator.items():
        # A document without a norm, or a zero norm on either side, has no
        # defined cosine; leave it out of the ranking instead of crashing.
        try:
            cosim[page_id] = dot_product/(query_norm*NF[page_id])
        except (KeyError, ZeroDivisionError):
            continue

    return cosim.most_common()


In [66]:
def search(query):
    """
    Return up to 100 (doc_id, title) pairs for `query`, ranked by cosine
    similarity over the un-stemmed body index.
    Parameters:
    -----------
    query: string, the raw query text.
    Returns:
    -----------
    list of (doc_id, title) pairs, best match first; an empty list for an empty query.
    """
    res = []
    if len(query) == 0:
        return res

    # tokenizing the query
    tokens = tokenize(query)

    # cossim without stemming
    sorted_doc_score_pairs = cossim(tokens, inverted_index_body, POSTINGS_GCP_TEXT_INDEX_FOLDER_URL, DL, DL_LEN, NF)

    # BM25 alternative (K and B still have to be chosen):
    # AVGDL = 341.0890174848911
    # sorted_doc_score_pairs = BM25(tokens, K, B, AVGDL, inverted_index_body, POSTINGS_GCP_TEXT_INDEX_FOLDER_URL, DL, DL_LEN)

    # take first 100
    best = sorted_doc_score_pairs[:100]
    print(best)

    # take page titles according to id
    res = [(x[0], DT[x[0]]) for x in best]

    # Without this return the function yields None for every non-empty query.
    return res


def search_body(query):
    """
    Return up to 100 (doc_id, title) pairs for `query`, ranked by cosine
    similarity over the un-stemmed body index.
    Parameters:
    -----------
    query: string, the raw query text.
    Returns:
    -----------
    list of (doc_id, title) pairs, best match first; an empty list for an empty query.
    """
    if len(query) == 0:
        return []

    query_tokens = tokenize(query)
    ranked_pairs = cossim(query_tokens, inverted_index_body, POSTINGS_GCP_TEXT_INDEX_FOLDER_URL, DL, DL_LEN, NF)

    # keep the 100 highest scoring documents
    top_pairs = ranked_pairs[:100]
    print(top_pairs)

    # attach the page title to every document id
    titled = []
    for pair in top_pairs:
        doc_id = pair[0]
        titled.append((doc_id, DT[doc_id]))
    return titled


def search_title(query):
    """
    Return every document whose title posting lists match the query, ordered by
    how many posting-list entries it has across the query tokens.
    Parameters:
    -----------
    query: string, the raw query text.
    Returns:
    -----------
    list of (doc_id, title) pairs, most matches first; an empty list for an empty query.
    """
    if len(query) == 0:
        return []

    query_tokens = tokenize(query)

    # one posting list of (doc_id, tf) pairs per query token
    posting_lists = inverted_index_title.get_posting_lists(query_tokens, POSTINGS_GCP_TITLE_INDEX_FOLDER_URL)

    # count posting entries per document; the stored tf value itself is ignored
    matches_per_doc = Counter()
    for posting in posting_lists:
        matches_per_doc.update(doc_id for doc_id, _tf in posting)

    # most_common() sorts by count, descending, keeping first-seen order on ties
    list_of_docs = matches_per_doc.most_common()
    print(list_of_docs)
    return [(doc_id, DT[doc_id]) for doc_id, _count in list_of_docs]


def search_anchor(query):
    """
    Return up to 100 documents whose anchor-text posting lists match the query,
    ordered by how many posting-list entries each has across the query tokens.
    Parameters:
    -----------
    query: string, the raw query text.
    Returns:
    -----------
    list of (doc_id, title) pairs, most matches first. Documents among the top
    100 that have no title in DT are dropped, so fewer than 100 pairs may be
    returned. An empty query returns an empty list.
    """
    res = []
    if len(query) == 0:
        return res
    # tokenizing the query
    tokens = tokenize(query)

    # one posting list of (doc_id, tf) pairs per query token
    posting_lists = inverted_index_anchor.get_posting_lists(tokens, POSTINGS_GCP_ANCHOR_INDEX_FOLDER_URL)

    # count posting entries per document; the stored tf value itself is ignored
    tf_dict = {}
    for posting in posting_lists:
        for doc_id, tf in posting:
            tf_dict[doc_id] = tf_dict.get(doc_id, 0) + 1

    list_of_docs = sorted(tf_dict.items(), key=lambda x: x[1], reverse=True)[:100]
    print(list_of_docs)
    for doc_id, score in list_of_docs:
        # Only a missing title is an expected condition here; anything else
        # (including KeyboardInterrupt) should surface instead of being swallowed.
        try:
            res.append((doc_id, DT[doc_id]))
        except KeyError:
            continue
    return res


def get_pagerank(wiki_ids):
    """
    Look up the PageRank value of each article id in the module-level `page_rank` mapping.
    Parameters:
    -----------
    wiki_ids: iterable of article ids (a list or any other iterable, including a generator).
    Returns:
    -----------
    list with one entry per id, in input order: the stored value, or None when
    the id is unknown (or cannot be used as a key).
    """
    res = []
    for wiki_id in wiki_ids:
        # Only lookup failures map to None; other errors are allowed to surface.
        try:
            res.append(page_rank[wiki_id])
        except (KeyError, TypeError):
            res.append(None)
    return res


def get_pageview(wiki_ids):
    """
    Look up the page-view count of each article id in the module-level `page_view` mapping.
    Parameters:
    -----------
    wiki_ids: iterable of article ids (a list or any other iterable, including a generator).
    Returns:
    -----------
    list with one entry per id, in input order: the stored value, or None when
    the lookup fails (unknown id, or an id that cannot be used as a key).
    """
    res = []
    for wiki_id in wiki_ids:
        # Read from page_view: this function previously returned PageRank
        # values because it looked ids up in page_rank.
        try:
            res.append(page_view[wiki_id])
        except (KeyError, TypeError):
            res.append(None)
    return res


# Run Queries


In [None]:
# Smoke-test the helpers from the cells above; uncomment a line to try another one.
print(search_anchor("information retrieval"))
# print(search_title("information retrieval"))
# print(get_pagerank([123456, 679125, 3978521]))
# print(get_pageview([45687, 778985, 887795]))

# Run App

In [6]:
# You need to upload your implementation of search_frontend.py before running
# this cell; it is imported as `se`.
import search_frontend as se

In [None]:
# Execute this cell to reload the module when you make changes to
# search_frontend.py (after you upload it again).
import importlib
importlib.reload(se)

In [None]:
# Wrap the app from search_frontend with flask_ngrok and start the server.
# NOTE: `se.app.run()` keeps this cell running until the server is interrupted.
from flask_ngrok import run_with_ngrok
run_with_ngrok(se.app) 
se.app.run()

INFO:werkzeug: * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://527f-34-171-74-87.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:35:44] "[37mGET /search?query=best+marvel+movie HTTP/1.1[0m" 200 -


[(5027882, 0.7418604775444455), (39632709, 0.7386383876549714), (306960, 0.727912433618154), (1074657, 0.6954088809011663), (5727147, 0.692150715416282), (403579, 0.6704012023885402), (66113901, 0.6702747131550605), (20966, 0.6647393424182793), (8188167, 0.6526015206684286), (17448401, 0.6407888813584854), (15311317, 0.6395489475492644), (35871824, 0.6317080109629694), (17960196, 0.6280988925505265), (26552494, 0.6249622890377099), (33308367, 0.6136749489600388), (55994110, 0.606806116171739), (894151, 0.6044946941877017), (2453882, 0.6021268827554088), (41008758, 0.6016420599892961), (842924, 0.5990730161478861), (1047719, 0.5964203850912859), (44611175, 0.5915988925865847), (49287801, 0.5900108756833728), (5020000, 0.5850338052498854), (60132806, 0.5849072011312106), (42721570, 0.5811697812701522), (2802921, 0.5799848765278884), (66069085, 0.5792783868089857), (58488308, 0.577647024263518), (9728333, 0.5774331653897649), (10181183, 0.5758524365825953), (17586817, 0.5689786462985459),

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:35:48] "[37mGET /search?query=How+do+kids+come+to+world%3F HTTP/1.1[0m" 200 -


[(67353110, 0.6327325007068665), (37443868, 0.614093813315551), (54876830, 0.6120893626303608), (64465609, 0.5896588155142352), (32395701, 0.5770661658006019), (11144433, 0.5546529436617497), (2975520, 0.5544220339607618), (41926149, 0.5466112887880099), (63685864, 0.540088457735357), (36478536, 0.5291481458028973), (16378615, 0.5237052590977344), (14662779, 0.5224544819019318), (28035406, 0.5224054696852408), (44042056, 0.5141905013328242), (53493472, 0.5133994865967695), (52386471, 0.5024463907172028), (20982288, 0.49841247452867876), (43512740, 0.4963222740643153), (60924049, 0.48054005145025264), (12192723, 0.4768768304859761), (49671399, 0.46891121798410945), (61005804, 0.46412581211610343), (20971583, 0.45950300323450133), (28831741, 0.4588922933742863), (49207084, 0.44967846194731087), (6036970, 0.44965111323991447), (12582498, 0.4455036832088262), (11258329, 0.4387369641199225), (18790188, 0.4358839197325953), (8978727, 0.4349660482793815), (809425, 0.4349083802562859), (436577

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:35:49] "[37mGET /search?query=Information+retrieval HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:35:49] "[37mGET /search?query=LinkedIn HTTP/1.1[0m" 200 -


[(4840292, 0.9120781902233024), (15271, 0.824446489006415), (296950, 0.7549076527343074), (751691, 0.745894629314485), (1514191, 0.7033165714851726), (1981660, 0.6980471561215286), (18550455, 0.6899249714135194), (731640, 0.675536436937295), (10218640, 0.6552475176168817), (762092, 0.6190255440851121), (66284747, 0.6001039773040694), (2816394, 0.5801074317251137), (509624, 0.5480192001634694), (47901077, 0.5430970740165433), (905659, 0.5288420463675932), (1897206, 0.5159902273180529), (3328506, 0.5032356041696785), (60384473, 0.498519583986595), (24963841, 0.49633318169260565), (21106742, 0.4884811775752547), (743971, 0.4794820312178609), (9376116, 0.4742198206309924), (442684, 0.47094510960362146), (35901496, 0.46804696236933446), (24963451, 0.4646245157377308), (59558412, 0.46410375643437396), (59280201, 0.460699238375462), (20632884, 0.45393103157360887), (36794719, 0.44030837004295653), (16635934, 0.4401910887106289), (33516090, 0.43773160345893053), (10328235, 0.43627686962793016)

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:35:50] "[37mGET /search?query=How+to+make+coffee%3F HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:35:50] "[37mGET /search?query=Ritalin HTTP/1.1[0m" 200 -


[(5612891, 0.8378245053464398), (604727, 0.8225849205868629), (13873200, 0.8049109552383734), (5612952, 0.792630505921637), (51613538, 0.7879825025259348), (23451648, 0.7872122130613156), (19399083, 0.7738771114743702), (27326872, 0.7719601584206971), (18931461, 0.7713503869684891), (667037, 0.7712887628559706), (42311274, 0.7596690203196034), (4506407, 0.7575697988089535), (482824, 0.7534951520550568), (7687063, 0.751477936345265), (62192965, 0.7503965795019629), (2165666, 0.7477231557273184), (38992003, 0.7453739388584312), (54428590, 0.7435907305280848), (2093706, 0.740090566087884), (33236555, 0.7338250275354503), (31605606, 0.7330777601088713), (27301828, 0.7316933315734168), (273707, 0.729957386451428), (32239401, 0.7271159122310752), (4604645, 0.7260806492026957), (17753292, 0.7248714283407236), (63520964, 0.722823586656893), (39365975, 0.7197791698031076), (52428480, 0.71680203166373), (34103233, 0.7146570145581055), (56923463, 0.7138989391163476), (33746534, 0.7138442675975553

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:35:53] "[37mGET /search?query=How+to+make+wine+at+home%3F HTTP/1.1[0m" 200 -


[(4435069, 0.7429935630387207), (17991512, 0.7407188946541282), (27215002, 0.7344838682034899), (10342514, 0.7325028745367013), (38409410, 0.7320420302886558), (28742611, 0.7228389370271483), (7199869, 0.7213941905352801), (3880987, 0.7166805087417486), (67886744, 0.7115837030022866), (4389159, 0.7097866472898814), (41406891, 0.7097715879237874), (5400822, 0.7079405057183944), (32961, 0.7073941439426429), (19246495, 0.7000083854720175), (28887305, 0.6891644773754341), (8177057, 0.6853222007934948), (934026, 0.6826529208929455), (22779492, 0.6814872811103334), (18475503, 0.6768382757542233), (19244911, 0.6742272989451549), (2061874, 0.6719741878287496), (62815430, 0.6714065082856981), (3432446, 0.6700504528854961), (5654394, 0.6646322862030457), (15035526, 0.6640699946585439), (619575, 0.6630696888445576), (1942258, 0.6612236359213507), (14001249, 0.6607183479233285), (25788047, 0.6591818723586375), (17503317, 0.6583754871377472), (8814501, 0.6583030144066973), (17236054, 0.655474633861

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:35:56] "[37mGET /search?query=Most+expensive+city+in+the+world HTTP/1.1[0m" 200 -


[(14900757, 0.7941962462806903), (1180765, 0.31930887447202966), (12028369, 0.3102117106787619), (5698231, 0.29937717361543037), (43613188, 0.2804103340482748), (5619296, 0.26182800146427265), (41050559, 0.2571711686037835), (48481721, 0.24386574473724143), (6137184, 0.2305063525796864), (11875674, 0.22252290230582042), (33467288, 0.2212853840615157), (37034773, 0.2116782827093187), (35133400, 0.21007636913951053), (2859572, 0.2100650021266578), (49543223, 0.20971813252205326), (41363048, 0.20489038708545174), (59509649, 0.20367735554837463), (261790, 0.20220864313291637), (38911950, 0.20053793558991978), (5924085, 0.19787935725158487), (3138655, 0.1932182275430871), (6801673, 0.19163249420613754), (898670, 0.1912438144536621), (66898061, 0.19063051576433243), (59115, 0.18928708053120294), (19385355, 0.18916656727118628), (14839052, 0.1869249064439023), (3718947, 0.1847892025687845), (48799578, 0.18342599206951002), (17522480, 0.18337517928270483), (68068565, 0.18271468878551003), (533

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:35:57] "[37mGET /search?query=India HTTP/1.1[0m" 200 -


[(16891417, 0.8768018933168197), (5864614, 0.8273291621474819), (4635441, 0.7944352094182655), (17774253, 0.7695645832436495), (31322278, 0.7452607396854641), (14604, 0.6940092265235832), (32567500, 0.6931012737121116), (4635597, 0.6877250707754151), (20522221, 0.6810644886463572), (9264722, 0.6806960680776916), (2307026, 0.6761747180225219), (19242991, 0.6731443194111776), (26408492, 0.6711152177216567), (32114973, 0.6658123181449714), (4108998, 0.6592654150691949), (19394173, 0.6547700775220622), (4330279, 0.6475869433490851), (1928513, 0.6453576099384258), (19232015, 0.6293855365572889), (3776800, 0.6233238498155437), (1020923, 0.6160005288457078), (24106259, 0.6149534464692409), (3315459, 0.5993567060839773), (23635692, 0.5984748720666486), (1417832, 0.5978257011500094), (14533, 0.5952676848842745), (18328956, 0.5896322318356663), (71232, 0.5871069425472776), (1248983, 0.584060700004344), (30986954, 0.5796055154748847), (6331742, 0.5782669114539837), (283772, 0.5774693157282711), (

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:35:59] "[37mGET /search?query=how+to+make+money+fast%3F HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:35:59] "[37mGET /search?query=Netflix HTTP/1.1[0m" 200 -


[(1899768, 0.543646382608942), (739435, 0.5020327252609942), (2287447, 0.48640209355259584), (1276539, 0.46833510033018333), (928908, 0.4591557748177819), (1870228, 0.417809247512572), (1276547, 0.4149911191508804), (16085822, 0.41061055840109734), (19944291, 0.3923469837491508), (390000, 0.3913885045999164), (10191516, 0.37518086000659684), (2371905, 0.37440567905044314), (1516835, 0.36769730636304976), (5844373, 0.3630816396663571), (217752, 0.362709167850898), (360101, 0.36236316601958624), (8983183, 0.3615106403961316), (3230600, 0.36010344921424786), (54923, 0.35749425311967065), (8905576, 0.3536081460795147), (23830729, 0.34303928222039004), (1407354, 0.34183283168608125), (2697263, 0.3414904119800352), (1132487, 0.3341283880837566), (12542421, 0.332545340847135), (1928632, 0.3307799305463567), (5921826, 0.32929879313648114), (333695, 0.3270569071061732), (211982, 0.32551227776580594), (8898866, 0.32441135652290526), (2407356, 0.3221114964522178), (586034, 0.3209230496331396), (2

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:00] "[37mGET /search?query=Apple+computer HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:00] "[37mGET /search?query=The+Simpsons HTTP/1.1[0m" 200 -


[(5078775, 0.7878636877212544), (1344, 0.7754785194148389), (36071326, 0.7395569464601333), (17761044, 0.7258822178899054), (2117, 0.7208607399351628), (3896290, 0.705280870327566), (46378222, 0.702081029120931), (856, 0.6961358931153299), (2116, 0.6951136768930607), (6771637, 0.6926260348327609), (2593693, 0.6828068617900372), (38117537, 0.6766309487438485), (26328591, 0.6715659779016285), (646886, 0.6662221451511888), (3540442, 0.6641054307970343), (3043698, 0.6608593469834905), (54566027, 0.6561182421503527), (20116828, 0.6530242664477188), (60337415, 0.6468482918336086), (23818735, 0.6414635656572553), (37656556, 0.6398611846978234), (60328532, 0.6367930268262246), (46924608, 0.6337215778991295), (4399093, 0.6308551415882568), (43248659, 0.6150510354995944), (27099898, 0.6136444247752633), (13567431, 0.6117736108482763), (8953690, 0.6106534238006522), (6963544, 0.6099731217540388), (333284, 0.6063983958955618), (2843516, 0.6033598912124838), (65322537, 0.6015451193302547), (1137731

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:03] "[37mGET /search?query=World+cup HTTP/1.1[0m" 200 -


[(33727, 0.7383037596177142), (88178, 0.6388973638871162), (511825, 0.6025589435367817), (584860, 0.5521974567996458), (310124, 0.5382519895537976), (66968, 0.5316960393899222), (88184, 0.5268909200097175), (1154950, 0.5120406569659313), (701381, 0.5102926272765452), (1007536, 0.5098027739551978), (250687, 0.5032023349688715), (1108921, 0.4871543617683444), (11370, 0.4863589374632522), (617795, 0.4671250991157131), (592115, 0.45058469929581496), (655973, 0.4473342660719063), (865679, 0.44446729007608365), (748071, 0.43571729599057885), (377435, 0.4185034734700975), (266217, 0.4171639267638736), (463373, 0.40857278445166906), (827488, 0.40768194037212735), (860510, 0.40755994685001806), (52622, 0.40669219927377), (157233, 0.40223208849318315), (516542, 0.3960777128519465), (7239, 0.39491856964792577), (263861, 0.39009496073300176), (426859, 0.3898755977262871), (250204, 0.388841547369266), (20485, 0.3884777671770658), (509620, 0.3881530204633049), (40377, 0.3874730775883979), (60986, 0.

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:04] "[37mGET /search?query=How+to+lose+weight%3F HTTP/1.1[0m" 200 -


[(65484056, 0.687254953377173), (36661251, 0.6153332395632272), (42169986, 0.6060148588508968), (35375144, 0.5850703779370712), (3549164, 0.5787064481831276), (4763277, 0.5771880629518572), (62693167, 0.5706146887390631), (3674162, 0.5689773205472354), (5926921, 0.567355467966247), (49299122, 0.5621457555783571), (6182877, 0.56096968484036), (3008060, 0.5502728080121212), (43574875, 0.5483656030302346), (30176489, 0.5407192431964554), (7659797, 0.5369038491125943), (47427091, 0.5105034705449424), (68230723, 0.5056320539932604), (43158470, 0.4957551308857894), (26435514, 0.4932773095180763), (5024962, 0.47624665044227926), (50870843, 0.47339719555873666), (44132626, 0.4716459704780696), (27248907, 0.470991827606374), (48732546, 0.47032461615464227), (2114141, 0.4679859131291644), (23413960, 0.46058244016968297), (48934337, 0.45937478348404664), (4158437, 0.4555103565869669), (28498472, 0.45444019487498444), (8652125, 0.4519599237853307), (23414617, 0.44478910532600036), (60211493, 0.439

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:04] "[37mGET /search?query=Java HTTP/1.1[0m" 200 -


[(15628, 0.9419987335697285), (61252439, 0.93526528082216), (16528, 0.90275005889844), (15881, 0.8676548578788508), (4718446, 0.8577746604831041), (16812303, 0.8235090367291813), (69336, 0.8154459191566746), (57820409, 0.8104436312509697), (2257691, 0.8075996234263502), (54993399, 0.8004465249410151), (41893580, 0.7967822058397968), (38134332, 0.7871694495516345), (47555381, 0.7788901734337671), (10069674, 0.7767747282108759), (24070385, 0.7680913875242565), (16831098, 0.7620237977004082), (5516020, 0.752706096208621), (66978953, 0.7469317340014401), (8786357, 0.7420949172635593), (59426888, 0.7373810164269934), (8244918, 0.7347173675176059), (1763516, 0.7332697095677282), (16718543, 0.7289600163046359), (1688277, 0.7263813304309431), (5457138, 0.7187053695722354), (2070564, 0.7178706677559525), (35642213, 0.7174345446761347), (8422052, 0.7170938815355146), (1688253, 0.7139782038294576), (67302429, 0.7107753680640679), (2641984, 0.706786806038002), (20553764, 0.6993411773398021), (2400

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:06] "[37mGET /search?query=Air+Jordan HTTP/1.1[0m" 200 -


[(1394509, 0.8831845433291612), (18941209, 0.7880298792303365), (150109, 0.7787550103923073), (11681334, 0.767468771763938), (16278877, 0.7655926996767949), (57238888, 0.7330737583231786), (34915647, 0.7315104751393646), (2855993, 0.6995618259493835), (36415003, 0.6934836756681587), (25020485, 0.690845366955473), (6199097, 0.6838978961194547), (480455, 0.6678582418081651), (31583084, 0.6660174036997776), (59986033, 0.6594368563608828), (59805922, 0.6585734525940498), (59986028, 0.6585734525940498), (2012410, 0.652934479699707), (30974750, 0.6501817208019562), (15332689, 0.6463206581565017), (6032831, 0.6462193978186322), (26172998, 0.6461236635703446), (12323589, 0.6445600670862692), (15723, 0.6426163997334411), (57250904, 0.6422370694894437), (30974753, 0.640849132848664), (2437337, 0.6405806175854467), (30974744, 0.6400657989801051), (2308773, 0.6360446080872878), (1371219, 0.633833252643675), (26173045, 0.6287598329240056), (26173024, 0.6287367168597005), (26173029, 0.62864254690380

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:06] "[37mGET /search?query=how+to+deal+with+depression%3F HTTP/1.1[0m" 200 -


[(90460, 0.6802296843011926), (2663493, 0.6343098775930264), (27989318, 0.5580908124914724), (56519274, 0.5552444165097304), (13190302, 0.5507498636571341), (38289, 0.5417380853491812), (8389, 0.5377248747188815), (367358, 0.5174373247278633), (57325596, 0.5078760465830444), (42730418, 0.491527613453708), (20558148, 0.48791829524455915), (19283335, 0.4683189372690346), (7527514, 0.4683178248030964), (34161835, 0.4677906243639473), (51590116, 0.4651045213907333), (25258288, 0.45936124212948914), (39354276, 0.45585500083062275), (2367697, 0.45401167921709434), (22481627, 0.4529148638538715), (18550003, 0.45051517477418207), (38108482, 0.45012863517966173), (40819579, 0.44840520181841464), (11893034, 0.4479086041773392), (43600438, 0.44369951538271074), (21211994, 0.43907079915285674), (16280876, 0.43836634067636787), (39323552, 0.43808651870145454), (33585190, 0.43756273247891275), (2343883, 0.43557119162592106), (840273, 0.4317420283855649), (31841434, 0.42860504490109247), (35500653, 0

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:08] "[37mGET /search?query=How+do+you+make+gold HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:08] "[37mGET /search?query=Marijuana HTTP/1.1[0m" 200 -


[(680582, 0.6838722408372057), (12240, 0.6803141599116121), (1291393, 0.5507653828571742), (248084, 0.5322922769199611), (37412, 0.5210120214721966), (1551612, 0.519536448151967), (1020809, 0.49884802194484745), (140672, 0.4942407717096928), (1714688, 0.4860732280941407), (59248, 0.4715827592462173), (251087, 0.4672064773486843), (491320, 0.46697890404842834), (1056205, 0.4619085083743154), (1816430, 0.4445620560780353), (55719, 0.4372113076368793), (1333268, 0.4328049963767584), (1457194, 0.42330464535725454), (59251, 0.4177752840010157), (2104478, 0.41348849649460273), (58296, 0.412405978323274), (1073399, 0.4028208852866632), (1713508, 0.4019007889994232), (1109819, 0.40108246743731524), (250687, 0.40083534488102157), (2416673, 0.3964600198426959), (469488, 0.3941829111394496), (1532191, 0.38961484976938343), (1714658, 0.38655481078046683), (2043300, 0.38574604187191136), (156331, 0.3804918390458169), (1686492, 0.37863783365561837), (1919886, 0.3746454480967526), (1875159, 0.3731236

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:09] "[37mGET /search?query=How+to+make+hummus HTTP/1.1[0m" 200 -


[(75065, 0.8301056421815538), (48876576, 0.7394545022637522), (24230253, 0.6215703042235067), (56494240, 0.5814057566057937), (16162840, 0.5649728328268548), (52682605, 0.5471377650105922), (10122293, 0.4776531385688064), (37785018, 0.4711215411159379), (5764910, 0.3789920402359637), (67420826, 0.36256399162063796), (7329519, 0.33101163950932566), (47886970, 0.3006051899066869), (24480734, 0.22864581218082292), (67648017, 0.18264864495950417), (32282924, 0.18122522080222742), (10607956, 0.17610733071077625), (7897152, 0.17592444509592206), (11146241, 0.17199220966064904), (48044401, 0.16887988761575254), (62119914, 0.16188851040900817), (11287682, 0.16179957024817393), (33050336, 0.15768158227576692), (53830078, 0.1502375247939319), (35870536, 0.149896472499566), (45221944, 0.1463431234382978), (53903055, 0.14388940576589626), (49643204, 0.14254030735954884), (3508935, 0.14228551008368037), (11447140, 0.14210610741462332), (22736969, 0.13932148901422983), (5327757, 0.1387026408980645),

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:09] "[37mGET /search?query=Winter HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:09] "[37mGET /search?query=Rick+and+Morty HTTP/1.1[0m" 200 -


[(34061, 0.5996274497371072), (34069, 0.4323414892516225), (54012, 0.3959824265653829), (58564, 0.39005988315561707), (49369, 0.34614570069780914), (10990, 0.3363205733571281), (38950, 0.32816896531712375), (38862, 0.2981230373246185), (40381, 0.29150495632935614), (777, 0.27627333891043415), (72319, 0.270383888319352), (33055, 0.2654504890404091), (42195, 0.24355488026896055), (40385, 0.2420427638871932), (65561, 0.24150599971529196), (40629, 0.23371767102463567), (32336, 0.2334818181146282), (71557, 0.23328466485655433), (22171, 0.2271298173512515), (80508, 0.217841838792754), (38843, 0.21338214285875706), (49643, 0.2047856358963378), (49096, 0.19340101566816037), (38678, 0.1886550114151062), (36772, 0.18062192861587914), (39641, 0.18054453229586134), (42004, 0.17632126845172705), (40591, 0.1653928124139127), (39634, 0.1633564248743669), (52997, 0.16103717481256588), (40017, 0.1466424450127125), (40048, 0.14263798524388424), (39693, 0.14227027598032882), (31908, 0.13765528801102014),

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:11] "[37mGET /search?query=Natural+Language+processing HTTP/1.1[0m" 200 -


[(2891758, 0.7078977508425989), (486551, 0.6763833079007732), (4045546, 0.6302121911051711), (35857899, 0.5871247591218158), (546083, 0.5450194167350728), (9900070, 0.5437307176227674), (6277593, 0.5302697294812071), (1661566, 0.4921785190494869), (2853312, 0.48251732122542534), (2854975, 0.48251732122542534), (2854980, 0.48251732122542534), (2854984, 0.48251732122542534), (3632204, 0.48251732122542534), (27608493, 0.48251732122542534), (28285013, 0.48251732122542534), (31731899, 0.48251732122542534), (32235818, 0.48251732122542534), (38193383, 0.48251732122542534), (42848709, 0.48251732122542534), (42848744, 0.48251732122542534), (2853319, 0.4825173212254253), (13711549, 0.4825173212254253), (33214093, 0.4825173212254253), (35233764, 0.4825173212254253), (40484086, 0.4825173212254253), (45155414, 0.48228143750637564), (36323189, 0.47732178151459903), (30972465, 0.4748476809961815), (4151133, 0.46483032924382384), (20715140, 0.46445043942786274), (21173, 0.46419960731566806), (34741517

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:14] "[37mGET /search?query=World+Cup+2022 HTTP/1.1[0m" 200 -


[(62872154, 0.7959228764811981), (63163166, 0.7609282473373377), (40250945, 0.7212078743418093), (48768943, 0.6920044187073328), (68313940, 0.6868407397768517), (67445313, 0.6759250793039283), (67177420, 0.6756260899937299), (67569202, 0.6463716993278468), (60254944, 0.6440532969812657), (68265989, 0.6265928347419845), (68305631, 0.6154647262035434), (67397628, 0.6127330351261987), (64153522, 0.6028649388610073), (62490330, 0.6022802109986036), (65152088, 0.5979597961734713), (861667, 0.5849699572887361), (65197246, 0.5849371618139806), (60397115, 0.5802757589647567), (67241646, 0.5772478861513912), (65153103, 0.5767233863493876), (65138499, 0.5760613112187312), (68265274, 0.5719966393772091), (65189433, 0.56898080233078), (51453600, 0.5681247495572822), (65138585, 0.5676617091400751), (65104566, 0.5659442284660932), (67403091, 0.5636574111272), (65153029, 0.5625524720905528), (65197214, 0.5621140188361238), (65104861, 0.5568559551548079), (65105515, 0.5549930402688068), (65105782, 0.5

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:14] "[37mGET /search?query=Dolly+the+sheep HTTP/1.1[0m" 200 -
INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:14] "[37mGET /search?query=cigarettes HTTP/1.1[0m" 200 -


[(192685, 0.7337788681828463), (62605848, 0.7035571022033261), (9146, 0.6640014224874895), (15525547, 0.6255643174381109), (684894, 0.5953248851803702), (1462860, 0.5874458962846749), (7931740, 0.584676531859316), (17158563, 0.5744173780242439), (66021052, 0.5623944189380948), (3260611, 0.554330525273228), (51194108, 0.5468643618698482), (31315948, 0.5451572877553359), (53734671, 0.5236893059994859), (13676135, 0.5207233655302438), (1895130, 0.5069110093715692), (9649607, 0.5011349192407352), (1843778, 0.5010394325253376), (2968233, 0.49730320815442475), (38951684, 0.49713714860862424), (15632671, 0.4959278347385602), (20672385, 0.4954715997241534), (690469, 0.493229748574384), (20876683, 0.49121523170599973), (11073540, 0.4887606750152122), (43217052, 0.4873497358644372), (27484164, 0.4833721706966498), (22644717, 0.47973030006383494), (3512162, 0.4796590246581956), (20396076, 0.4783122111544194), (99032, 0.47530668079098104), (39792862, 0.4697938315725317), (37237191, 0.4694697720448

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:19] "[37mGET /search?query=What+is+the+best+place+to+live+in%3F HTTP/1.1[0m" 200 -


[(1861944, 0.645210979915685), (2347978, 0.5177365555704602), (6902155, 0.4930020242635897), (5617195, 0.4781133687572856), (4789487, 0.4658955156341635), (7081866, 0.46288553297333157), (538871, 0.4580292765779094), (10905655, 0.438307098866161), (2250249, 0.4315462618232814), (9860955, 0.42689773401540687), (6680309, 0.4244035513509471), (2403481, 0.4225335665490819), (810230, 0.4212820354967698), (6236356, 0.41844175689488977), (7317283, 0.4184001538012158), (5681949, 0.4095901656269586), (6873605, 0.4057795419505675), (4151001, 0.40217148397123054), (5221695, 0.40155257298160024), (172707, 0.39998708040251413), (4118257, 0.3956071093415681), (978461, 0.39447684613539613), (8010446, 0.3937978105125669), (7758365, 0.3921622778453916), (1668452, 0.3862698671730832), (2065009, 0.38316243434743635), (4634538, 0.38308209787824027), (2573205, 0.38306189181855554), (7528266, 0.3776267274576423), (5857965, 0.37255034748113824), (12259587, 0.371842324792009), (2493224, 0.36891711395327975), 

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:19] "[37mGET /search?query=Elon+musk HTTP/1.1[0m" 200 -


[(41309419, 0.7646797367763), (43407192, 0.7598646256681968), (66430882, 0.7193937299058891), (50399439, 0.7173041328786111), (803102, 0.6847561965686744), (65212863, 0.6812022910581487), (48778030, 0.6739679848726008), (65175052, 0.660447383994081), (4666466, 0.6412584201264464), (909036, 0.636831432373363), (33986468, 0.625045276232102), (22248960, 0.6182486150027602), (220806, 0.6166157276584527), (29414816, 0.6105698615512644), (58173986, 0.6082444976045213), (127628, 0.5958248428616006), (47190535, 0.5886617105267613), (21523886, 0.573477895137428), (68289769, 0.5734283470243533), (1001122, 0.570716826777269), (20339194, 0.5524507923660585), (41318207, 0.5463486470062628), (21450611, 0.5446081981485498), (45186981, 0.5409909509652617), (60955975, 0.539076591511531), (32151967, 0.5360295533520133), (6973013, 0.5354259821250418), (58212588, 0.534838738516725), (4555268, 0.5305579494112737), (61490363, 0.5290911181554261), (48703999, 0.5241650607796684), (5399134, 0.5202516165556192)

INFO:werkzeug:127.0.0.1 - - [13/Jan/2023 10:36:20] "[37mGET /search?query=How+do+you+breed+flowers%3F HTTP/1.1[0m" 200 -


[(9507921, 0.7072223773176642), (39676695, 0.6931306004992936), (801773, 0.6669144095164994), (895650, 0.658343329569042), (13780186, 0.6510321689725893), (20897928, 0.6446115467224508), (15446490, 0.6269068769762832), (44066718, 0.6259695015147796), (60649381, 0.6170155448043436), (51118248, 0.5818676638384633), (64576596, 0.5763163004891468), (3550726, 0.5685086220197231), (32908252, 0.5608336937392411), (24016757, 0.5605059318141773), (59673454, 0.5591305244944705), (17843999, 0.5539164484612574), (30510881, 0.5498781410675636), (37570236, 0.547173132660456), (5116464, 0.5420410538421726), (921491, 0.5103395854033316), (5861432, 0.5063389283043987), (6947021, 0.5027451246884785), (1810335, 0.5020757359856827), (37260437, 0.4970848739469803), (21957462, 0.4968248786070023), (62781095, 0.49620880078404084), (20041575, 0.4933006639945657), (7525682, 0.49024362518720915), (798084, 0.48302323111916756), (4914967, 0.4812593134370859), (43426910, 0.4779331376123281), (39588381, 0.475614715

# Testing your app

Once your app is running, you can query it either by navigating to the URL that ngrok gave you above or through code in a different Python session. For example, once the frontend app is running, you can navigate to:
http://YOUR_SERVER_DOMAIN/search?query=hello+world where YOUR_SERVER_DOMAIN is something like XXXX-XX-XX-XX-XX.ngrok.io, which is printed above in Colab or that is your external IP on GCP.

The code below shows how to issue a query from Python. This is also how our testing code will issue queries to your search engine, so make sure to test your search engine this way after you deploy it to GCP and before submission. Command-line instructions for deploying your search engine to GCP are available in `run_frontend_in_gcp.sh`. Note that we will issue not only training queries to your search engine, but also test queries, i.e., queries that you have never seen before.

In [None]:
import json

# Load the training queries from the JSON file. The loop further down iterates
# `queries.items()` as (query string, list of relevant ids) pairs.
# The encoding is set explicitly because JSON text is UTF-8, while open()
# otherwise falls back to the platform's locale encoding.
with open('queries_train.json', 'rt', encoding='utf-8') as f:
  queries = json.load(f)

In [None]:
def average_precision(true_list, predicted_list, k=40):
    """Average precision of the top-k predictions against a set of relevant ids.

    Only the first ``k`` entries of ``predicted_list`` are considered. For every
    position that holds a relevant id, the precision at that position (number of
    relevant ids seen so far divided by the 1-based position) is recorded; the
    result is the mean of those precisions, rounded to 3 decimals.

    Args:
        true_list: iterable of relevant document ids.
        predicted_list: sliceable sequence of predicted document ids, best first.
        k: number of leading predictions to evaluate.

    Returns:
        The rounded mean precision, or 0.0 when none of the top-k predictions
        is relevant.
    """
    relevant = frozenset(true_list)
    # 1-based positions (within the top k) at which a relevant id appears.
    hit_ranks = [
        rank
        for rank, doc_id in enumerate(predicted_list[:k], start=1)
        if doc_id in relevant
    ]
    if not hit_ranks:
        return 0.0
    # The n-th hit found at position `rank` contributes precision n / rank.
    precisions = [hit_no / rank for hit_no, rank in enumerate(hit_ranks, start=1)]
    return round(sum(precisions) / len(precisions), 3)

In [None]:
import requests
from time import time
# url = 'http://35.232.59.3:8080'
# place the domain you got from ngrok or GCP IP below. 
url = 'http://cded-35-192-160-143.ngrok.io'

# Send every training query to the running search engine and record a
# (query, request duration in seconds, average precision) tuple per query.
# duration stays None when the request itself fails; ap stays None when the
# request fails, the status is not 200, or the response cannot be parsed.
qs_res = []
for q, true_wids in queries.items():
  duration, ap = None, None
  t_start = time()
  try:
    res = requests.get(url + '/search_body', params={'query': q}, timeout=35)
    duration = time() - t_start
    if res.status_code == 200:
      # Each entry of the JSON response is unpacked as a two-item pair whose
      # first item is the predicted id. Unlike zip(*...), the comprehension
      # also accepts an empty result list, which then scores 0.0.
      pred_wids = [wid for wid, _ in res.json()]
      ap = average_precision(true_wids, pred_wids)
  except (requests.RequestException, ValueError, TypeError) as exc:
    # Best effort: keep evaluating the remaining queries, but report the
    # failure instead of swallowing it (a bare except would also swallow
    # KeyboardInterrupt, making the loop impossible to stop).
    print(f'query {q!r} failed: {exc!r}')

  qs_res.append((q, duration, ap))
print(qs_res)