Thomas Rhemrev (s1045660), Boudewijn van Gils (s1045276), Harm Jacobs (s1019253)

# 1 Loading the data

### 1.1 Imports

In [1]:
import os
import pandas as pd
import csv
import ast
import collections
from tqdm import tqdm
import json
import math
import numpy as np

### 1.2 Importing the files
This cell only needs to be run once.


In [2]:
# Presence of this file is used as the marker that the data was downloaded before.
file_path = '/content/collection.tsv'

# NOTE(review): the files are saved to /content/, while the loading cell reads from
# "collectionandqueries/..." - confirm that both refer to the same directory.
if not os.path.exists(file_path):
    # Download all the files, because collection.tsv is not in /content/ yet
    !wget https://transfer.sh/eyMUhyJHnd/queries.dev.small.tsv -P /content/
    !wget https://transfer.sh/MLJmo5hG1Q/collection.tsv -P /content/
    !wget https://transfer.sh/8J3xKmO7Qb/qrels.dev.small.tsv -P /content/

    !wget https://transfer.sh/P2pjXPrj1L/queries.dev.tsv -P /content/
    !wget https://transfer.sh/vF2elQ2avA/queries.eval.tsv -P /content/
    !wget https://transfer.sh/EFVqqUL32r/queries.train.tsv -P /content/
    !wget https://transfer.sh/54I2wsGupJ/qrels.train.tsv -P /content/
    !wget https://transfer.sh/yESAJ7R4wy/queries.eval.small.tsv -P /content/

else:
    print("collection.tsv is already loaded")


collection.tsv is already loaded


### 1.3 Creating the dictionaries

The collection, query and qrels dictionaries are created from the TSV files.

In [3]:
# Global lookup tables, filled by the process_*_row handlers while the TSV files are read.
queries_dict = dict()      # query id -> query text
collection_dict = dict()   # document id -> document text
qrels_dict = dict()        # query id -> list of relevant document ids

def load_data_tsv(filename, process_row):
    """Read a tab-separated file and hand every row to ``process_row``.

    Args:
        filename: Path of the UTF-8 encoded TSV file.
        process_row: Callable that is invoked once per line with the list of
            column strings of that line.

    Double quotes are treated as ordinary characters (``csv.QUOTE_NONE``).
    With the csv default, a column that starts with ``"`` is parsed as a
    quoted field: the quotes are dropped and an unbalanced quote swallows the
    following lines, which corrupts free text such as passages and queries.
    """
    # newline='' is what the csv module expects, so it can handle line endings itself.
    with open(filename, 'r', encoding='utf-8', newline='') as file:
        csv_reader = csv.reader(file, delimiter="\t", quoting=csv.QUOTE_NONE)
        for row in csv_reader:
            process_row(row)

def process_qrels_row(row):
    """Record one relevance judgement in ``qrels_dict``.

    Column 0 of ``row`` is the query id and column 2 the id of a relevant
    document; both are converted to ``int``.
    """
    query_id = int(row[0])
    document_id = int(row[2])
    # Start a new list for an unseen query, otherwise extend the existing one.
    qrels_dict.setdefault(query_id, []).append(document_id)

def process_queries_row(row):
    """Store one query in ``queries_dict``: column 0 is the id, column 1 the text."""
    query_text = row[1]
    queries_dict[int(row[0])] = query_text

def process_collection_row(row):
    """Store one document in ``collection_dict``: column 0 is the id, column 1 the text."""
    document_text = row[1]
    collection_dict[int(row[0])] = document_text

# Document ids that are always kept when only a subset of the collection is loaded.
keep = [
    1578304, 1578305, 1578306, 1578307, 1578311, 1578312, 1578313,
    739802, 739803, 739804, 739805, 739806, 739807, 739811,
    2754977, 2754978, 2754979, 2754980, 2754981, 2754983, 2754984,
    3008361, 3008362, 3008363, 3008364, 3008366, 3008367, 3008370,
    2180634, 3614173, 4152760, 2302160, 4152761, 2180637, 4152762,
]

def process_collection_row_smaller(row):
    """Store a document in ``collection_dict`` only if it belongs to the reduced collection.

    A row is kept when its id (column 0) ends with the digit 1 or is listed in ``keep``.
    """
    # Guard clause: skip every row that is neither in the "ends with 1" sample nor in keep.
    if not row[0].endswith('1') and int(row[0]) not in keep:
        return
    document_text = row[1]
    collection_dict[int(row[0])] = document_text


# Fill the three dictionaries: every file is paired with the handler that stores its rows.
for tsv_path, row_handler in (
    ("collectionandqueries/queries.dev.small.tsv", process_queries_row),
    ("collectionandqueries/collection.tsv", process_collection_row),
    ("collectionandqueries/qrels.dev.small.tsv", process_qrels_row),
):
    load_data_tsv(tsv_path, row_handler)

# Tiny hand-made example for debugging (uncomment to use):
# queries_dict[0] = "apple phone"
# collection_dict[0] = "phone samsung phone"
# collection_dict[1] = "Apple and a Pear"
# collection_dict[2] = "Apple phone"



We define the text processing: punctuation is replaced by spaces, terms are lowercased and all stopwords are removed.

In [4]:
# Terms that are dropped during processing.
STOPWORDS = {
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'but', 'by', 'for', 'if',
    'in', 'into', 'is', 'it', 'no', 'not', 'of', 'on', 'or', 'such',
    'that', 'the', 'their', 'then', 'there', 'these', 'they', 'this',
    'to', 'was', 'will', 'with',
}

# Translation table that turns each of the characters ' . : , ! ? ( ) into a space.
_PUNCTUATION_TO_SPACE = str.maketrans({mark: ' ' for mark in "'.:,!?()"})

def process(text):
    """Turn a text into a list of index terms.

    The listed punctuation marks become spaces (so they also separate
    tokens), the text is split on whitespace, every token is lowercased and
    tokens that are in ``STOPWORDS`` are left out. Order and duplicates of
    the remaining terms are preserved.
    """
    lowered_tokens = (token.lower() for token in text.translate(_PUNCTUATION_TO_SPACE).split())
    return [token for token in lowered_tokens if token not in STOPWORDS]

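We then build the inverted index, which maps every term to the documents that contain it and the term's frequency in each of them.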

In [5]:
# Inverted index: term -> {document id: number of occurrences of the term in that document}.
index = {}

for document_id, content in tqdm(collection_dict.items()):
    term_frequencies = collections.Counter(process(content))
    for term, frequency in term_frequencies.items():
        # Create the posting dict on first sight of the term, then record this document.
        index.setdefault(term, {})[document_id] = frequency

# Sanity check: show the first entries of the index (at most twelve).
counter = 0
for term, postings in index.items():
    print(term, postings)
    if counter > 10:
        break
    counter += 1

 87%|████████▋ | 7663562/8841823 [09:01<01:13, 16105.21it/s]

# 2 Exploring the data

### 2.1 Printing the heads of the dictionaries
Just to check whether they load correctly.

In [None]:
def print_dict_head(d, num_items=5):
    """Print the first ``num_items`` entries of ``d`` as "key : value", in iteration order."""
    for position, (key, value) in enumerate(d.items()):
        if position >= num_items:
            break
        print(key, ":", value)
# Number of loaded queries, followed by the first entries of each dictionary.
print(len(queries_dict))
for title, dictionary in (
    ("QUERIES:", queries_dict),
    ("COLLECTION:", collection_dict),
    ("QRELS:", qrels_dict),
):
    print(title)
    print_dict_head(dictionary)

### 2.2 Finding queries with multiple relevant documents.

In [None]:
# Find the largest number of relevant documents per query and all queries that reach it.
# The running maximum is deliberately not named `max`: a global with that name would
# shadow the built-in max() for every cell that runs afterwards.
max_relevant_count = 0
biggest = []
for query_id, relevant_docs in qrels_dict.items():
    n_relevant = len(relevant_docs)
    if n_relevant > max_relevant_count:
        # New maximum: restart the list of query ids.
        biggest = [query_id]
        max_relevant_count = n_relevant
    elif n_relevant == max_relevant_count:
        biggest.append(query_id)
print(max_relevant_count)
print(biggest)

# 3 Ranking

Defining the IDF

In [None]:
def idf(term, n_documents):
    """Inverse document frequency: ln((N - n + 0.5) / (n + 0.5) + 1).

    Args:
        term: A term that is present in the global ``index``.
        n_documents: Total number of documents N.

    Returns:
        The IDF weight, where n is the number of documents containing ``term``.

    Raises:
        KeyError: If ``term`` is not in ``index``.
    """
    postings = index[term]
    # The posting dict can also hold the cached weight under the key 'idf'. That entry
    # is not a document, so it must not be counted (otherwise re-running the caching
    # cell would inflate n by one).
    n = len(postings) - (1 if 'idf' in postings else 0)
    return math.log((n_documents-n+0.5)/(n+0.5)+1)

def idf2(term, n_documents):
    """Smoothed inverse document frequency: ln((N + 1) / (n + 1)).

    Args:
        term: A term that is present in the global ``index``.
        n_documents: Total number of documents N.

    Returns:
        The IDF weight, where n is the number of documents containing ``term``.

    Raises:
        KeyError: If ``term`` is not in ``index``.
    """
    postings = index[term]
    # Do not count the cached 'idf' entry that may be stored next to the document ids.
    n = len(postings) - (1 if 'idf' in postings else 0)
    return math.log((n_documents+1)/(n+1))

def idfProb(term, n_documents):
    """IDF weight ln((N - n + 0.5) / (n + 0.5) + 1), scaled by (1 - n / N).

    Args:
        term: A term that is present in the global ``index``.
        n_documents: Total number of documents N (must be non-zero).

    Returns:
        The scaled IDF weight, where n is the number of documents containing ``term``.

    Raises:
        KeyError: If ``term`` is not in ``index``.
        ZeroDivisionError: If ``n_documents`` is 0.
    """
    postings = index[term]
    # Do not count the cached 'idf' entry that may be stored next to the document ids.
    n = len(postings) - (1 if 'idf' in postings else 0)
    return math.log((n_documents-n+0.5)/(n+0.5)+1)*(1-(n/n_documents))

def idfProb2(term, n_documents):
    """Smoothed IDF weight ln((N + 1) / (n + 1)), scaled by (1 - n / N).

    Args:
        term: A term that is present in the global ``index``.
        n_documents: Total number of documents N (must be non-zero).

    Returns:
        The scaled IDF weight, where n is the number of documents containing ``term``.

    Raises:
        KeyError: If ``term`` is not in ``index``.
        ZeroDivisionError: If ``n_documents`` is 0.
    """
    postings = index[term]
    # Do not count the cached 'idf' entry that may be stored next to the document ids;
    # without this, re-running the cell that caches the weights would shift every value.
    n = len(postings) - (1 if 'idf' in postings else 0)
    return math.log((n_documents+1)/(n+1))*(1-(n/n_documents))

Defining bm25

In [None]:
def bm25(query_id, doc_id, k1, b, n_documents, avgdl):
    """BM25 score of one document for one query.

    Args:
        query_id: Key of the query in ``queries_dict``.
        doc_id: Key of the document in ``collection_dict``.
        k1: Term-frequency saturation parameter.
        b: Length-normalisation parameter.
        n_documents: Unused here; the IDF weights are read from ``index[term]['idf']``.
        avgdl: Average document length, in space-separated words.

    Returns:
        The sum over the query terms of idf * f*(k1+1) / (f + k1*(1 - b + b*|D|/avgdl)),
        where f is the frequency of the term in the document; 0 if no term matches.

    Raises:
        KeyError: If a matching term has no cached ``'idf'`` entry in ``index``.
    """
    score = 0
    length_norm = None
    # Tokenise the query with process(), the same function that produced the index
    # terms. A raw split(' ') misses every term that is capitalised or has
    # punctuation attached.
    for term in process(queries_dict[query_id]):
        postings = index.get(term)
        if postings is None or doc_id not in postings:
            continue
        if length_norm is None:
            # |D| is counted in space-separated words so that it is in the same unit
            # as avgdl; computed lazily, only when a query term occurs in the document.
            doc_length = len(collection_dict[doc_id].split(' '))
            length_norm = 1 - b + b * (doc_length / avgdl)
        f = postings[doc_id]
        score += postings['idf'] * ((f * (k1 + 1)) / (f + k1 * length_norm))
    return score

Adding IDF to Index

In [None]:
# Cache the IDF weight of every term inside its own posting dict, under the key 'idf'.
for term, postings in tqdm(index.items()):
    postings['idf'] = idfProb2(term, len(collection_dict))

Setting important parameters

In [None]:
# Collection statistics and BM25 parameters that are passed to bm25() and rank().
n_documents = len(collection_dict)
# Average document length, measured in space-separated tokens of the raw (unprocessed) text.
avgdl = np.mean([len(text.split(' ')) for text in collection_dict.values()])
# Free parameters of the BM25 formula (see bm25()).
k1 = 1.2
b = 0.75

print(avgdl)
print(n_documents)

Defining the ranking function

In [None]:
# qq is the ID of the query that is ranked and inspected in the cells below
qq = 504335
def rank(query_id, count, k1, b, n_documents, avgdl):
    """Score every document in ``collection_dict`` with BM25 and return the best ``count``.

    Args:
        query_id: Key of the query in ``queries_dict``.
        count: Number of results to return.
        k1, b, n_documents, avgdl: Passed on unchanged to ``bm25``.

    Returns:
        A list of ``(doc_id, score)`` tuples, highest score first; documents with
        equal scores keep the order in which they appear in ``collection_dict``.
    """
    scored_documents = [
        (doc_id, bm25(query_id, doc_id, k1, b, n_documents, avgdl))
        for doc_id in collection_dict
    ]
    # In-place stable sort on the score component, best first.
    scored_documents.sort(key=lambda pair: pair[1], reverse=True)
    return scored_documents[:count]



# count = n_documents: keep the complete ranking, so every relevant document can be located in it.
ranking = rank(qq, n_documents, k1, b, n_documents, avgdl)
    

### 3.1 Exploring the results

In [None]:
# Ids of the documents that are judged relevant for the example query qq.
relevant = qrels_dict[qq]
def findRelevant(relevant, ranking):
    """Locate the relevant documents inside a ranking.

    Args:
        relevant: Collection of relevant document ids.
        ranking: Sequence of ``(doc_id, score)`` tuples, best first.

    Returns:
        A list of ``(position, (doc_id, score))`` tuples with the 0-based
        position of every relevant document that occurs in ``ranking``. The
        list is returned even when some (or all) relevant documents are
        missing from the ranking; previously the function returned ``None``
        in that case.
    """
    found = []
    for position, doc in enumerate(ranking):
        if doc[0] in relevant:
            found.append((position, doc))
            # Stop scanning as soon as every relevant document has been located.
            if len(found) == len(relevant):
                break
    return found


# Positions (0-based) at which the relevant documents occur in the ranking.
print(findRelevant(relevant, ranking))


In [None]:
# The ten highest-scoring (doc_id, score) pairs.
print(ranking[:10])

In [None]:
# Show the query text, the text of its relevant documents and the text of the top-10 results.
print(queries_dict[qq])
print("relevant")
for relevant_id in relevant:
    print(collection_dict[relevant_id])

print("top 10")
for ranked_id, _score in ranking[:10]:
    print(collection_dict[ranked_id])
    



# 4 Evaluation Metrics

Define functions to calculate recall and reciprocal rank for each query

In [None]:

def get_relevancy_labels(query, top_results):
    """Label every result with 1 if it is relevant for ``query`` and 0 otherwise.

    ``top_results`` holds ``(doc_id, score)`` tuples; relevance is looked up in
    ``qrels_dict[query]``. The labels are returned in the order of ``top_results``.
    """
    return [
        1 if result[0] in qrels_dict[query] else 0
        for result in top_results
    ]

def recall(query, query_relevancy_labels):
    """Fraction of the relevant documents of ``query`` that appear in the results.

    ``query_relevancy_labels`` is the 0/1 list produced by
    ``get_relevancy_labels``; the denominator is ``len(qrels_dict[query])``.
    Returns 0 when no result is relevant.
    """
    n_hits = sum(query_relevancy_labels)
    if n_hits == 0:
        return 0
    return n_hits / len(qrels_dict[query])
  
def reciprocalRank(relevant, ranking, k):
    """Reciprocal rank of the first relevant document within the top ``k`` results.

    Args:
        relevant: Collection of relevant document ids.
        ranking: Sequence of ``(doc_id, score)`` tuples, best first.
        k: Cut-off; only the first ``k`` results are considered.

    Returns:
        ``1 / rank`` (1-based) of the first relevant document in the top ``k``,
        or ``0.0`` when none of the top ``k`` results is relevant. Previously
        the function returned ``1 / k`` in that case, which rewards queries
        without any hit, and ``None`` for rankings shorter than ``k``.
    """
    for position, doc in enumerate(ranking, start=1):
        if position > k:
            # Past the cut-off without a hit.
            break
        if doc[0] in relevant:
            return 1 / position
    return 0.0
        
  

Calculate the recall and reciprocal rank

In [None]:

# Single-query check: rank the top 1000 documents for one query id, then print
# its recall over those results and its reciprocal rank with cut-off k = 10.
query = 457622
ranking = rank(query, 1000, k1, b, n_documents, avgdl)
lab = get_relevancy_labels(query, ranking)
print(recall(query, lab))
print(reciprocalRank(qrels_dict[query], ranking, 10))


Calculating the average recall and MRR over n_queries

In [None]:
# Evaluate on the first n_queries queries of queries_dict (in insertion order).
n_queries = 10
queries = list(queries_dict.keys())[:n_queries]

recalls = []
rrs = []
for query in tqdm(queries):
    ranking = rank(query, 1000, k1, b, n_documents, avgdl)
    lab = get_relevancy_labels(query, ranking)
    recalls.append(recall(query, lab))
    rrs.append(reciprocalRank(qrels_dict[query], ranking, 10))

# Average the per-query values.
model_recall = np.mean(np.array(recalls))
mrr = np.mean(np.array(rrs))

print("recall@1000 = ", model_recall, "mrr@10 = ", mrr)