# Evaluation

In this part, we use our "IRByBasicRetrieve.txt" as test datasets to evaluate diffients results files which got form different models.  
We used Precison,Recall,P@K, R_precision, MAP, nDCG to evaluate our results.  
(AP and MAP are a little time-consuming. If you run please waite for a while.)

In [88]:
import pandas as pd
import sys 
import numpy as np

In [89]:
#Golden Standard
test= pd.read_csv('IRByBasicRetrieve.txt', sep=' ',encoding='utf-8', engine='python',names = ["QUERY_ID", "DOC_ID", "RELEVANCE_LEVEL"])
test.head(3)

Unnamed: 0,QUERY_ID,DOC_ID,RELEVANCE_LEVEL
0,PLAIN-10,MED-2054,0.274678
1,PLAIN-10,MED-1985,0.271303
2,PLAIN-10,MED-2058,0.25834


In [90]:
# read results data 
results = pd.read_csv('IRByTieredIndex result2.txt', sep=' ',encoding='utf-8', engine='python',names = ["QUERY_ID", "DOC_ID", "sim__results"])
results.head(3)

Unnamed: 0,QUERY_ID,DOC_ID,sim__results
0,PLAIN-10,MED-2494,0.374199
1,PLAIN-10,MED-5101,0.278249
2,PLAIN-10,MED-3880,0.259749


In [91]:
#manully check whether there are duplicate query and doc
results_dup = results.loc[:,['QUERY_ID','DOC_ID']]
results_dup = results_dup.drop_duplicates()
results_dup.head(3)

Unnamed: 0,QUERY_ID,DOC_ID
0,PLAIN-10,MED-2494
1,PLAIN-10,MED-5101
2,PLAIN-10,MED-3880


## Precision (P) & Recall (R) 

Merge test and results datasets. One of parameters of pd.merge is "how='outer'" to get Union dateset.

In [92]:
# 1.Merge results data with test data 
# 2.Replace Nan with 0
merge = pd.merge(results, test, how='outer', on=['QUERY_ID','DOC_ID'])
merge.fillna(0, inplace=True)
merge.head(3)

Unnamed: 0,QUERY_ID,DOC_ID,sim__results,RELEVANCE_LEVEL
0,PLAIN-10,MED-2494,0.374199,0.237224
1,PLAIN-10,MED-5101,0.278249,0.049426
2,PLAIN-10,MED-3880,0.259749,0.040855


In [93]:
#manully check whether there are duplicate query and doc
merge_dup = merge.loc[:,['QUERY_ID','DOC_ID']]
merge_dup = merge_dup.drop_duplicates()
merge_dup.head(3)

Unnamed: 0,QUERY_ID,DOC_ID
0,PLAIN-10,MED-2494
1,PLAIN-10,MED-5101
2,PLAIN-10,MED-3880


tp: retrieved & relevant—— sim\__results !=0 & RELEVANCE\_LEVEL !=0  
fp: retrieved & not relevant—— sim\__results !=0 & RELEVANCE\_LEVEL==0  
fn: not retrieved & relevant—— sim\__results ==0 & RELEVANCE\_LEVEL !=0  
tn: not retrieved & not relevant—— sim\__results ==0 & RELEVANCE\_LEVEL ==0

In [94]:
tp = merge.loc[(merge["sim__results"] != 0)& (merge["RELEVANCE_LEVEL"] != 0)] 
tp=tp.shape[0] 
fp = merge.loc[(merge["sim__results"] != 0)& (merge["RELEVANCE_LEVEL"] == 0)]
fp=fp.shape[0] 
fn = merge.loc[(merge["sim__results"] == 0)& (merge["RELEVANCE_LEVEL"] != 0)]
fn=fn.shape[0] 
tn = merge.loc[(merge["sim__results"] == 0)& (merge["RELEVANCE_LEVEL"] == 0)]
tn=tn.shape[0] 

In [95]:
# precision =tp/(tp+fp)
tpfp = tp+fp
precision = tp/tpfp
print ("%.3f" % precision)

0.817


In [96]:
#recall = tp/tp+fn
tpfn = tp+fn
recall = tp/tpfn
print ("%.3f" % recall)

0.057


## P@K & R_precision

In [97]:
# 1.Merge results data with test data 
# 2.Replace Nan with 0
merge1 = pd.merge(results, test, how='right', on=['QUERY_ID','DOC_ID'])
merge1.sort_values(['QUERY_ID', 'RELEVANCE_LEVEL'], ascending=[True, False], inplace=True)#merge1.fillna(0, inplace=True)
merge1.fillna(0, inplace=True)
merge1.head(3)

Unnamed: 0,QUERY_ID,DOC_ID,sim__results,RELEVANCE_LEVEL
38196,PLAIN-10,MED-2054,0.0,0.274678
38197,PLAIN-10,MED-1985,0.0,0.271303
38198,PLAIN-10,MED-2058,0.0,0.25834


In [98]:
r = merge1['sim__results']
r =np.array(r)
r

array([0., 0., 0., ..., 0., 0., 0.])

In [99]:
def precision_at_k(r, k):
    assert k >= 1
    r = np.asarray(r)[:k] != 0
    if r.size != k:
        raise ValueError('Relevance score length < k')
    return np.mean(r)
c = precision_at_k(r,10)
print("P@K:", c )

P@K: 0.2


In [100]:
def r_precision(r):
    r = np.asarray(r) != 0
    z = r.nonzero()[0]
    if not z.size:
        return 0.
    return np.mean(r[:z[-1] + 1])
c = r_precision(r)
print("R_precision:",c)

R_precision: 0.05713379683367836


## AP & MAP

In [101]:
#join 'QUERY_ID' and 'DOC_ID' as 'QUERY_DOC', in order to cpmpare in the next steps
test['QUERY_DOC'] = test[['QUERY_ID', 'DOC_ID']].apply(lambda x: ''.join(x), axis=1)
QD_test = test
QD_test.head(3)

Unnamed: 0,QUERY_ID,DOC_ID,RELEVANCE_LEVEL,QUERY_DOC
0,PLAIN-10,MED-2054,0.274678,PLAIN-10MED-2054
1,PLAIN-10,MED-1985,0.271303,PLAIN-10MED-1985
2,PLAIN-10,MED-2058,0.25834,PLAIN-10MED-2058


In [102]:
#join 'QUERY_ID' and 'DOC_ID' as 'QUERY_DOC', in order to cpmpare in the next steps
results['QUERY_DOC'] = results[['QUERY_ID', 'DOC_ID']].apply(lambda x: ''.join(x), axis=1)
results1 = results
results1.head(3)

Unnamed: 0,QUERY_ID,DOC_ID,sim__results,QUERY_DOC
0,PLAIN-10,MED-2494,0.374199,PLAIN-10MED-2494
1,PLAIN-10,MED-5101,0.278249,PLAIN-10MED-5101
2,PLAIN-10,MED-3880,0.259749,PLAIN-10MED-3880


In [103]:
#Transform to list
QD_results = results1['QUERY_DOC']
QD_results = QD_results.tolist()
print(QD_results[:3])

['PLAIN-10MED-2494', 'PLAIN-10MED-5101', 'PLAIN-10MED-3880']


In [104]:
# Caluate average precison
def ap(QD_results1, QD_test1):
    for i,p in enumerate(QD_test1): 
        score = 0
        num_hits = 0
        if p in QD_results1 and p not in QD_test1[:i]:
            num_hits += 1  
            score += num_hits / (i+1) 
    if not QD_results1:
        return 0
    return score / len(QD_results1)
E = ap(QD_results, QD_test) 
print ("AP:","%.5f" % E)

AP: 0.00000


In [105]:
# Caluate mean average precison
def map(actual, predicted):
    return np.mean([ap(a,p) for a,p in zip(actual, predicted)])                    
D = map(QD_results, QD_test) 
print ("MAP:","%.5f" % D)

MAP 0.00195


## DCG & nDCG

In [106]:
#If we use test 3-2-1.qurl as our test dataset, we need to normalize results datasets.
#normalize similarity of results dataset, deveded them into 3 levels according to percentile(number)
'''
results_sim = results['sim__results']
results_sim = np.percentile(results_sim, [33,67])
results_sim #array([0.03233572, 0.08731153])
results['Nor__results']=results['sim__results']  
results.loc[ results['sim__results'] <= 0.03233572,'Nor__results']=1 
results.loc[ (results['sim__results'] > 0.03233572) & (results['sim__results'] <= 0.08731153),'Nor__results']=2
results.loc[ results['sim__results'] > 0.08731153,'Nor__results']=3 
results.head()
'''

"\nresults_sim = results['sim__results']\nresults_sim = np.percentile(results_sim, [33,67])\nresults_sim #array([0.03233572, 0.08731153])\nresults['Nor__results']=results['sim__results']  \nresults.loc[ results['sim__results'] <= 0.03233572,'Nor__results']=1 \nresults.loc[ (results['sim__results'] > 0.03233572) & (results['sim__results'] <= 0.08731153),'Nor__results']=2\nresults.loc[ results['sim__results'] > 0.08731153,'Nor__results']=3 \nresults.head()\n"

In [107]:
# Prepare input. It should be array
sim_results = results['sim__results']
#sim_results = merge1['sim__results']
sim_test = test['RELEVANCE_LEVEL']
sim_results = np.array(sim_results)
sim_test = np.array(sim_test)
print(sim_results[:3])

[0.3741988  0.27824872 0.25974862]


In [108]:
def dcg_score(a, b,k=10,gains="exponential"):
    order = np.argsort(-b)[::-1]
    a = np.take(a, order[:k])

    if gains == "exponential":
        gains = 2 ** a - 1 
    elif gains == "linear":
        gains = a
    else:
        raise ValueError("Invalid gains option.")

    # highest rank is 1 so +2 instead of +1
    discounts = np.log2(np.arange(len(a)) + 2)
    return np.sum(gains / discounts)

    best = dcg_score(a, a,gains)
    actual = dcg_score(a, b,gains)
    return actual / best

In [109]:
def ndcg_score(a, b, gains="exponential"):
    best = dcg_score(a, a,gains)
    actual = dcg_score(a, b,gains)
    return actual / best
c = dcg_score(sim_test, sim_results,gains="exponential")#(np.all
print("nDCG:",c)

nDCG: 0.05915260576948616
