In [2]:
import csv
import pandas as pd
import numpy as np
import pylab
import matplotlib
import math

In [3]:
# function to read tsv file
def readfile(filename):
    tsv_file = open(filename)
    read_tsv = csv.reader(tsv_file, delimiter="\t")
    return list(read_tsv)

In [4]:
# SE - from list to df
se1 = readfile("part_1_2__Results_SE_1.tsv")
se_1 = pd.DataFrame(se1[1:],columns=['Query_ID','Doc_ID','Rank']) #convert the SE1 into a dataframe
se2 = readfile("part_1_2__Results_SE_2.tsv")
se_2 = pd.DataFrame(se2[1:],columns=['Query_ID','Doc_ID','Rank']) #convert the SE2 into a dataframe
se3 = readfile("part_1_2__Results_SE_3.tsv")
se_3 = pd.DataFrame(se3[1:],columns=['Query_ID','Doc_ID','Rank']) #convert the SE3 into a dataframe
# GT - from list to df
gt = readfile("part_1_2__Ground_Truth.tsv")
ground_truth = pd.DataFrame(gt[1:],columns=['Query_ID','Relevant_Doc_id']) #convert the gt list into dataframe in order to extract the items in the col 'Query_id'

In [22]:
# P@k 
def p_at_k(sr,gt,k):
    #n_query = sr1['Query_id'].max()
    Q=set(sr['Query_ID'])
    for i in Q:#range(1, n_query+1):
        # filter the rows of the df with QUERY_ID = i
        seID = sr['Doc_ID'].loc[sr['Query_ID'] == i].tolist()
        gtID = gt['Relevant_Doc_id'].loc[gt['Query_ID'] == i].tolist()
        #because not all queries are considered in the Ground Truth CSV
        if len(gtID) == 0:
            continue
        numerator = sum(el in seID for el in gtID) #do the summation of the relevant doc in the SE
        denominator = min(k,len(gtID))
    return (numerator/denominator)

In [23]:
p_at_k(se_1,ground_truth,9)

1.0

In [14]:
p_at_k(se_2,ground_truth,9)

1.0

In [15]:
p_at_k(se_3,ground_truth,9)

1.0

In [32]:
# R-precision
def r_precision(se, gt):
    #r_pre=[]
    #n_query = se['Query_id'].max()
    Q=set(gt['Query_ID'].unique())
    for i in Q:#range (1, n_query+1):
        seID = se['Doc_ID'].loc[se['Query_ID'] == i].tolist()
        gtID = gt['Relevant_Doc_id'].loc[gt['Query_ID'] == i].tolist()
        # because not all queries are considered in the Ground Truth CSV
        if len(gtID) == 0:
            continue
        num = sum(el in seID for el in gtID) #do the summation of the relevant doc in the SEengine
        r = num/(len(gtID))
        #r_pre.append(r)
    return (r)

In [33]:
r_precision(se_1, ground_truth)

1.0

In [34]:
r_precision(se_2, ground_truth)

1.0

In [35]:
r_precision(se_3, ground_truth)

1.0

In [30]:
# normalized Discounted Cumulative Gain (nDCG)
def n_dcg(se,gt,k):
    result = 0
    #n_query = se['Query_id'].max()
    #for i in range(1,n_query+1):
    Q=set(gt['Query_ID'].unique()) #number of unique queries in the ground truth
    for i in Q:
        top_result = pd.DataFrame()
        # get top k relevant docs from the SE1
        top_se = se[['Doc_ID', 'Rank']][:k]
        top_gt = gt[['Relevant_Doc_id']][:k]
        # store into top_result only relevant docs
        top_result = top_gt.merge(top_se, how='inner', left_on='Relevant_Doc_id', right_on='Doc_ID') 
        # (every element in the list has relevance = 1)
        rank = top_result['Rank'].tolist() # store into a list the ranks
        ranks = [int(i) for i in rank] #convert into int
        # compute the discounted cumulative gain
        dcg = 0
        for i in ranks: #if the query id is in the GT than relevance = 1
            dcg += 1/(math.log2(i+2))
        # compute the ideal discounted cumulative gain
        idcg = 0
        for j in range(1, k+1):
            idcg += 1/(math.log2(j+2))
        return (dcg/idcg)

In [31]:
n_dcg(se_1,ground_truth,9)

0.21560522020745643

In [36]:
n_dcg(se_2,ground_truth,9)

0.08157471026683315

In [37]:
n_dcg(se_3,ground_truth,9)

0.2670745823795745

In [43]:
# Mean Reciprocal Rank (MRR)
def MRR(sr1,gt):
    mrr=0
    rel_doc_ids=list() #list to store the relevant doc_ids for every query id in Q
    dd_se= {} 
    dd_gt= {} 
    Q=set(gt['Query_ID'].unique()) #number of unique queries in the ground truth
    for i in Q:
        #key=Query_id,value=list of document ids from SE result
        dd_se[i]=list(sr1[sr1['Query_ID']==i]['Doc_ID'])
    for i in Q:
        #key=Query_id,value=list of relevant document ids from ground truth
        dd_gt[i]=list(gt[gt['Query_ID']==i]['Relevant_Doc_id'])
    
    for q in Q: 
        rel_doc_ids=dd_gt[q]
        #for each doc_id in query_id q
        for i in range(len(dd_se[q])): 
            #if doc_id is in the list of the relevant doc_ids 
            if dd_se[q][i] in rel_doc_ids: #[i] is index of list
                mrr=mrr+(1/(i+1)) #MRR value is sum on Reciprocal Ranks (+1 cause ranking starts with 1)
                break #once we get the first doc id from the relevant doc ids in the GT, we can stop
    mrr=mrr/(len(Q)) #compute the avg of the sum of reciprocal ranks
    return mrr


In [44]:
MRR(se_1, ground_truth)

0.49819410801065533

In [45]:
MRR(se_2, ground_truth)

0.4168479516002677

In [46]:
r_precision(se_3, ground_truth)

1.0