In [3]:
import csv
import pandas as pd
import numpy as np
import pylab
import matplotlib
import math
from statistics import mean

In [4]:
# function to read tsv file
def readfile(filename):
    tsv_file = open(filename)
    read_tsv = csv.reader(tsv_file, delimiter="\t")
    return list(read_tsv)

In [5]:
# SE - from list to df
se1 = readfile("part_1_2__Results_SE_1.tsv")
se_1 = pd.DataFrame(se1[1:],columns=['Query_ID','Doc_ID','Rank']) #convert the SE1 into a dataframe
se2 = readfile("part_1_2__Results_SE_2.tsv")
se_2 = pd.DataFrame(se2[1:],columns=['Query_ID','Doc_ID','Rank']) #convert the SE2 into a dataframe
se3 = readfile("part_1_2__Results_SE_3.tsv")
se_3 = pd.DataFrame(se3[1:],columns=['Query_ID','Doc_ID','Rank']) #convert the SE3 into a dataframe
# GT - from list to df
gt = readfile("part_1_2__Ground_Truth.tsv")
ground_truth = pd.DataFrame(gt[1:],columns=['Query_ID','Relevant_Doc_id']) #convert the gt list into dataframe in order to extract the items in the col 'Query_id'

In [8]:
#function used in P@k 
def relevant_docs(search_engine,ground_truth,k):
    rel_doc = 0
    for i in search_engine[:k]:
        for j in ground_truth:
            if i==j:
                rel_doc +=1
    return rel_doc

# P@k 
def p_at_k(se,gt,k):
    p_list = []
    Q=set(gt['Query_ID'].unique())
    for i in Q:
        seID = se['Doc_ID'].loc[se['Query_ID'] == i].tolist()
        gtID = gt['Relevant_Doc_id'].loc[gt['Query_ID'] == i].tolist()
        num = relevant_docs(seID,gtID,k) # numerator
        den = min(k,len(gtID)) # denominator
        p_list.append(num/den)
    return (mean(p_list))

In [9]:
p_at_k(se_1,ground_truth,9)

0.43659010742344073

In [10]:
p_at_k(se_2,ground_truth,9)

0.3374899791566458

In [11]:
p_at_k(se_3,ground_truth,9)

0.4212281545614879

In [12]:
# r precision 
def r_precision(se,gt):
    r_list = []
    k = len(gt)
    Q=set(gt['Query_ID'].unique())
    for i in Q:
        seID = se['Doc_ID'].loc[se['Query_ID'] == i].tolist()
        gtID = gt['Relevant_Doc_id'].loc[gt['Query_ID'] == i].tolist()
        num = relevant_docs(seID,gtID,k) # numerator
        den = len(gtID) # denominator
        r_list.append(num/den)
    return (mean(r_list))

In [13]:
r_precision(se_1, ground_truth)

0.8399094133158919

In [14]:
r_precision(se_2, ground_truth)

0.7871965871312788

In [15]:
r_precision(se_3, ground_truth)

0.8305983791354742

In [18]:
def n_dcg(se,gt,k):
    rel=0
    dcg=0
    idcg=0
    ndcg=[]
    Q=set(gt['Query_ID'].unique())
    for i in Q:
        seID = se['Doc_ID'].loc[se['Query_ID'] == i].tolist()
        gtID = gt['Relevant_Doc_id'].loc[gt['Query_ID'] == i].tolist()
        for el in gtID:
            for it in seID[:k]:
                rel=1 if it==el else 0
        for p in range(1,k+1):
            dcg+=rel/(math.log2(p+1))
            idcg+=1/(math.log2(p+1))
        ndcg.append(dcg/idcg)
    return (mean(ndcg))

In [19]:
n_dcg(se_1,ground_truth,9)

0.03104389744438862

In [20]:
n_dcg(se_2,ground_truth,9)

0.005391037561017673

In [25]:
n_dcg(se_3,ground_truth,9)

0.013898940881302583

In [148]:
def MRR(sr1,gt):
    mrr=0
    relevant_doc=[] #list to store the relevant doc_ids for every query id in Q
    dd_se= {} 
    dd_gt= {} 
    Q=set(gt['Query_ID'].unique()) #number of unique queries in the ground truth
    for i in Q:
        #key=Query_id,value=list of document ids from SE result
        dd_se[i]=sr1[sr1['Query_ID']==i]['Doc_ID'].tolist()
        #key=Query_id,value=list of relevant document ids from ground truth
        dd_gt[i]=gt[gt['Query_ID']==i]['Relevant_Doc_id'].tolist()
    
    for q in Q: 
        relevant_doc=dd_gt[q]
        #for each doc in the set of queries
        for i in range(len(dd_se[q])): 
            #if the doc_id is in the list of the relevant doc
            if dd_se[q][i] in relevant_doc: 
                mrr+=(1/(i+1)) #update the MRR value (+1 cause ranking starts with 1)
                break #once we get the first doc id from the relevant doc ids in the GT, we can stop
    mrr=mrr/(len(Q)) #compute the avg of the sum of reciprocal ranks
    return mrr

In [21]:
n_dcg(se_3,ground_truth,9)

0.013898940881302583

In [149]:
MRR(se_1, ground_truth)

0.4981941080106553

In [150]:
MRR(se_2, ground_truth)

0.4168479516002677

In [151]:
MRR(se_3, ground_truth)

0.5033451107520734