In [1]:
import numpy as np
import pandas as pd

In [None]:
"""
Description:
the below code implements reranking and evaluates the model using MAP and NDCG@10
to do the final ranking it uses the reranked file with scores given by the model, 
then sorts them to give the final ranking 
original relevance labels are taken from the test.txt file used as an input to do reranking

"""

## New Ranking by RankNet Model: df_new_rank

Data Cleaning: test.txt file (df_test)

In [2]:
#original test.txt (Fold 1) with relevance values:
df_test = pd.read_csv('test.txt', sep=" ", header = None)
pd.set_option('display.max_columns', 500)

#data cleaning:
df_test[1] = df_test[1].map(lambda x: x.lstrip('qid:'))      
df_test.insert(0, 'quid', df_test[1])
df_test.insert(1, 'rel', df_test[0])
df_test = pd.concat([df_test['quid'], df_test['rel']], axis=1, keys=['quid', 'rel'])
df_test.head(10)

Unnamed: 0,quid,rel
0,13,2
1,13,1
2,13,3
3,13,1
4,13,0
5,13,0
6,13,1
7,13,0
8,13,0
9,13,2


Ranking Produced by RankNet Model (model) (on test.txt data):

In [3]:
model = pd.read_csv('ModelsRerankingFinal-rank/reranking_model_5_old.txt', sep="	", header = None)
model.columns = ['quid_model','count_model', 'rank_model']
model.head(10)

Unnamed: 0,quid_model,count_model,rank_model
0,13,0,0.473997
1,13,1,0.473997
2,13,2,0.473997
3,13,3,0.431526
4,13,4,0.442372
5,13,5,0.485951
6,13,6,0.473997
7,13,7,0.473997
8,13,8,0.489055
9,13,9,0.473997


Data Frame (df) with quids, original relevance labels and new ranking ('rank') produced by RankNet model:

In [4]:
df = pd.concat([df_test['quid'], df_test['rel'], model['rank_model']], axis=1)
df.head(10)

Unnamed: 0,quid,rel,rank_model
0,13,2,0.473997
1,13,1,0.473997
2,13,3,0.473997
3,13,1,0.431526
4,13,0,0.442372
5,13,0,0.485951
6,13,1,0.473997
7,13,0,0.473997
8,13,0,0.489055
9,13,2,0.473997


In [5]:
#Sort by the rank_model (score provided as output by RankNet model RankLib) and compare with orignal relevance labels:
df_new_rank = df.groupby('quid').apply(lambda x: x.sort_values(['rank_model'],ascending = False))
df_new_rank['rank'] = df_new_rank.groupby('quid').cumcount()+1
df_new_rank.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,quid,rel,rank_model,rank
quid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10003,79837,10003,2,0.511708,1
10003,79799,10003,1,0.511708,2
10003,79865,10003,0,0.511708,3
10003,79862,10003,0,0.511708,4
10003,79808,10003,0,0.511708,5
10003,79776,10003,1,0.511708,6
10003,79830,10003,0,0.511708,7
10003,79855,10003,0,0.511708,8
10003,79853,10003,1,0.511708,9
10003,79832,10003,3,0.511708,10


## Evaluation: MAP

In [12]:
df_MAP = pd.concat([df_new_rank['quid'], df_new_rank['rel'], df_new_rank['rank_model'],df_new_rank['rank']], axis=1)

#expressing relevance in binary format
df_MAP['rel'] = np.where(df_MAP['rel'] >=1, 1,0)
#inserting column with cumulative sum of relevance:
df_MAP['rel_cum_sum'] = df_MAP.groupby('quid')['rel'].cumsum()

# Precision:
df_MAP['precision'] = (df_MAP['rel_cum_sum']*df_MAP['rel'])/df_MAP['rank']

def MAP(df):
    # AP - average precision per query:
    precision_sum = df_MAP.groupby('quid')['precision'].sum()
    rel_sum = df_MAP.groupby('quid')['rel'].sum()
    AP = precision_sum / rel_sum
    
    # MAP - mean average precision of the system for all queries
    MAP = AP.sum() / len(AP)
    return MAP

MAP(df_MAP)

0.4651467459670899

## Evaluation: NDCG@10

Discounted Cumulative Gain (DCG):

In [20]:
#creating a new dataframe with quid, rel and rank for DCG Dicounted Cumulative Gain
df_NDCG = pd.concat([df_new_rank['quid'], df_new_rank['rel'], df_new_rank['rank']], axis=1)
df_NDCG = df_NDCG.groupby('quid').head(10)

#Gain:
df_NDCG['gain'] = (2**df_NDCG['rel'])-1

df_NDCG['rank+1'] = df_NDCG['rank']+1

#Discount by Rank:
df_NDCG['discount'] = np.log2(df_NDCG['rank+1'])

#Discounted Gain:
df_NDCG['discounted_gain'] = df_NDCG['gain']/df_NDCG['discount']

#DCG - Discounted Cumulative Gain per quid:
DCG = df_NDCG.groupby(by=['quid'])['discounted_gain'].sum()

Optimal Discounted Cumulitave Gain (optDCG):

In [21]:
#optDCG: optimal Dicounted Cumulative Gain
df_opt_NDCG = pd.concat([df_new_rank['quid'], df_new_rank['rel']], axis=1, keys=['quid', 'rel'])

opt_NDCG = df_opt_NDCG.groupby('quid')['rel'].apply(lambda x: x.order(ascending=False)).reset_index()
opt_NDCG['rank'] = opt_NDCG.groupby('quid').cumcount()+1
opt_NDCG = opt_NDCG.groupby('quid').head(10)




In [22]:
#opt DCG: Dicounted Cumulative Gain
#Gain:
opt_NDCG['gain'] = (2**opt_NDCG['rel'])-1

opt_NDCG['rank+1'] = opt_NDCG['rank']+1

#Discount by Rank:
opt_NDCG['discount'] = np.log2(opt_NDCG['rank+1'])

#Discounted Gain:
opt_NDCG['discounted_gain'] = opt_NDCG['gain']/opt_NDCG['discount']

#opt DCG - Discounted Cumulative Gain per quid:
optDCG = opt_NDCG.groupby(by=['quid'])['discounted_gain'].sum()


Normalized Discounted Cumulative Gain (NDCG):

In [26]:
#avg NDCG for the system:
NDCG = DCG/optDCG
NDCG = np.nan_to_num(NDCG)
avg_NDCG = sum(NDCG)/len(NDCG)
avg_NDCG

0.2058348591733164