In [1]:
# Fetch the NLTK stopwords corpus into the local nltk_data directory.
# nltk.download() returns True on success; as the last expression of the
# cell that value is what the notebook displays.
# NOTE(review): no visible cell uses the stopwords — confirm this cell is still needed.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ofekzini/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
import json

# Load the evaluation set. It is iterated below as
# {query text: collection of relevant document ids}.
# The encoding is given explicitly: the queries contain non-ASCII characters,
# and without it open() falls back to the platform's locale encoding, which
# can mis-decode or fail on such text.
with open('queries_train.json', 'rt', encoding='utf-8') as f:
  queries = json.load(f)

In [27]:
def average_precision(true_list, predicted_list, k=40):
    """Average precision over the first ``k`` predictions.

    Walks the top-``k`` predictions in rank order and, at every rank that
    holds a relevant document, records the precision up to that rank. The
    result is the mean of those recorded precisions, i.e. it is normalised
    by the number of relevant documents *found*, not by ``len(true_list)``.

    Args:
        true_list: iterable of relevant document ids.
        predicted_list: ranked, sliceable sequence of predicted document ids.
        k: cut-off rank (default 40).

    Returns:
        The score rounded to 3 decimals, or 0.0 when no relevant document
        appears in the top ``k``.
    """
    relevant = frozenset(true_list)
    hits = 0
    precision_sum = 0.0
    for rank, doc_id in enumerate(predicted_list[:k], start=1):
        if doc_id in relevant:
            hits += 1
            precision_sum += hits / rank
    if hits == 0:
        return 0.0
    return round(precision_sum / hits, 3)

In [29]:
def precision_at_k(true_list, predicted_list, k):
    """Precision of the first ``k`` predictions.

    The denominator is the number of predictions actually available after
    truncating to ``k`` (which may be fewer than ``k``), not ``k`` itself.

    Args:
        true_list: iterable of relevant document ids.
        predicted_list: ranked, sliceable sequence of predicted document ids.
        k: cut-off rank.

    Returns:
        Fraction of the top-``k`` predictions that are relevant, rounded to
        3 decimals; 0.0 when there are no predictions.
    """
    relevant = frozenset(true_list)
    top_k = predicted_list[:k]
    retrieved = len(top_k)
    if retrieved == 0:
        return 0.0
    hits = sum(1 for doc_id in top_k if doc_id in relevant)
    return round(hits / retrieved, 3)
def recall_at_k(true_list, predicted_list, k):
    """Recall of the first ``k`` predictions.

    Args:
        true_list: iterable of relevant document ids.
        predicted_list: ranked, sliceable sequence of predicted document ids.
        k: cut-off rank.

    Returns:
        Number of relevant hits in the top ``k`` divided by the number of
        distinct relevant ids, rounded to 3 decimals. When there are no
        relevant ids at all the recall is defined as 1.0.
    """
    relevant = frozenset(true_list)
    top_k = predicted_list[:k]
    total_relevant = len(relevant)
    if total_relevant < 1:
        return 1.0
    hits = sum(1 for doc_id in top_k if doc_id in relevant)
    return round(hits / total_relevant, 3)
def f1_at_k(true_list, predicted_list, k):
    """Harmonic mean of ``precision_at_k`` and ``recall_at_k``.

    Args:
        true_list: iterable of relevant document ids.
        predicted_list: ranked, sliceable sequence of predicted document ids.
        k: cut-off rank passed to both underlying metrics.

    Returns:
        F1 rounded to 3 decimals, or 0.0 when either precision or recall is
        zero (which also avoids dividing by zero).
    """
    precision = precision_at_k(true_list, predicted_list, k)
    recall = recall_at_k(true_list, predicted_list, k)
    if precision != 0.0 and recall != 0.0:
        return round(2.0 / (1.0/precision + 1.0/recall), 3)
    return 0.0
def results_quality(true_list, predicted_list):
    """Overall quality score: harmonic mean of P@5 and F1@30.

    Side effect: prints the P@5 value to stdout on every call.

    Args:
        true_list: iterable of relevant document ids.
        predicted_list: ranked, sliceable sequence of predicted document ids.

    Returns:
        The harmonic mean rounded to 3 decimals, or 0.0 when either
        component is zero.
    """
    precision_5 = precision_at_k(true_list, predicted_list, 5)
    print(precision_5)
    f1_30 = f1_at_k(true_list, predicted_list, 30)
    if precision_5 != 0.0 and f1_30 != 0.0:
        return round(2.0 / (1.0/precision_5 + 1.0/f1_30), 3)
    return 0.0

# Sanity checks for the metric helpers; the cell fails on the first mismatch.

# precision / recall: only the first k predictions are considered.
assert precision_at_k(range(10), [1,2,3] , 2) == 1.0
assert recall_at_k(   range(10), [10,5,3], 2) == 0.1
# Edge cases: no predictions -> precision 0; no relevant ids -> precision 0 but recall 1.
assert precision_at_k(range(10), []      , 2) == 0.0
assert precision_at_k([],        [1,2,3],  5) == 0.0
assert recall_at_k(   [],        [10,5,3], 2) == 1.0
assert recall_at_k(   range(10), [],       2) == 0.0
# F1 is 0 whenever precision or recall is 0, otherwise their harmonic mean.
assert f1_at_k(       [],        [1,2,3],  5) == 0.0
assert f1_at_k(       range(10), [],       2) == 0.0
assert f1_at_k(       range(10), [0,1,2],  2) == 0.333
assert f1_at_k(       range(50), range(5), 30) == 0.182
assert f1_at_k(       range(50), range(10), 30) == 0.333
assert f1_at_k(       range(50), range(30), 30) == 0.75
# results_quality is the harmonic mean of P@5 and F1@30; each call also prints P@5.
assert results_quality(range(50), range(5))  == 0.308
assert results_quality(range(50), range(10)) == 0.5
assert results_quality(range(50), range(30)) == 0.857
# Five misses at the top give P@5 == 0, which forces the overall score to 0.
assert results_quality(range(50), [-1]*5 + list(range(5,30))) == 0.0

1.0
1.0
1.0
0.0


In [None]:
import requests
from time import time

# Base URL of the search engine under evaluation.
# A GCP deployment is reached through its public address, in the form
# 'http://<external-ip>:8080'. When the server runs locally instead
# (e.g. in Colab), use localhost and the port it listens on, e.g.
# 'http://127.0.0.1:5000'.
url = 'http://34.59.160.208:8080'

# One (query, duration_seconds, results_quality) tuple per query.
# duration stays None when the request itself fails; the quality stays None
# when the query could not be scored (request error, non-200 status,
# malformed response).
qs_res = []
for q, true_wids in queries.items():
    # Reset both values for every query so a failure can never be recorded
    # with the previous query's score.
    duration, rq = None, None
    t_start = time()
    try:
        res = requests.get(url + '/search', params={'query': q}, timeout=35)
        duration = time() - t_start
        print(q)
        if res.status_code == 200:
            # The response is expected to be a list of pairs whose first
            # element is the predicted document id. An empty list is a valid
            # answer and is scored (as 0.0) rather than treated as an error.
            pred_wids = [wid for wid, _ in res.json()]
            rq = results_quality(true_wids, pred_wids)
        else:
            print(f'  HTTP {res.status_code} - query left unscored')
    except Exception as exc:
        # Best effort: one bad query must not abort the whole evaluation, but
        # the failure is reported instead of silently ignored. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        print(f'{q!r} failed ({type(exc).__name__}: {exc}) - query left unscored')

    qs_res.append((q, duration, rq))

Human Genome Project DNA mapping
0.8
Italian pasta varieties and recipes
0.2
Street food around the world
0.4
Surrealism art Salvador Dalí
0.8


In [None]:
# Dump the raw (query, duration, results quality) tuples collected by the evaluation loop.
print(qs_res)

In [71]:
import pandas as pd
from datetime import datetime
import os

# Weights describing the engine configuration that produced `qs_res`.
# In this cell they are only embedded in the run description below.
text_weight = 1.5
title_weight = 0.6
anchor_weight = 0.1
pr_weight = 0.4
pv_weight = 0.5

run_desc = f"CSanchor BMtitle, text_weight, c200 = {text_weight}, title_weight ={title_weight}, anchor_weight = {anchor_weight},pr_weight = {pr_weight},pv_weight = {pv_weight}"

# Take the timestamp once so every row of this run carries the same value
# (rows of one run can then be grouped by it).
run_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
timestamped_qs_res = [(q, duration, rq, run_timestamp, run_desc) for q, duration, rq in qs_res]

# One row per query, tagged with the run timestamp and description.
results_df = pd.DataFrame(timestamped_qs_res, columns=['Query', 'Duration', 'Results Quality', 'Timestamp','type'])

# Results of all runs accumulate in this CSV (relative to the working directory).
file_path = r'query_results.csv'

# Always append; mode='a' creates the file when it is missing. The header row
# is written only when the file is new or still empty, so it appears exactly once.
write_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
results_df.to_csv(file_path, mode='a', header=write_header, index=False)