## AdaRank Evaluation

In [11]:
%matplotlib inline
from os.path import join
from time import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import r2_score

from sklearn.linear_model import LinearRegression

These scores are obtained from the run of AdaRank on Fold5 with -noeq flag and -max number of consecutively sampled features set to 1. The analysis investigated why the scores have negative values and ranks them.

In [110]:
test_data = pd.read_csv('/Users/ralucageorgescu/Documents/MSLR-WEB10K/Fold5/test.txt', sep=" ", header = None)
test_data[1] = test_data[1].map(lambda x:x.lstrip('qid:'))
test_data_q_rel = pd.concat([test_data[1], test_data[0]], axis=1, keys=['qid', 'relevance_label'])
test_data_q_rel[:5]

Unnamed: 0,qid,relevance_label
0,10,0
1,10,0
2,10,1
3,10,0
4,10,1


In [111]:
data = pd.read_table('/Users/ralucageorgescu/Documents/scores.txt')
data['relevance_label'] = test_data[0]
data[:10]

Unnamed: 0,qid,docid,score,relevance_label
0,10,0,-5.294102,0
1,10,1,0.77786,0
2,10,2,2.40469,1
3,10,3,0.330672,0
4,10,4,2.337713,1
5,10,5,4367.257274,2
6,10,6,3.473539,1
7,10,7,1.213004,1
8,10,8,-5.941709,0
9,10,9,-5.002998,0


## Stats on test set

Number of queries that have at most 10 documents associated

In [101]:
number = 0
for count in doc_counts_per_query:
    if count <= 10:
        number += 1
print(number)

39


Number of queries in the test set

In [99]:
groupby_query = qs.groupby('qid')
groups = groupby_query.groups
len(groups)

2000

In [21]:
len(data)

235259

In [95]:
docs_per_query = data.groupby('qid').docid.count()
docs_per_query[:10]

qid
10      93
25      58
40      84
55      58
70     163
85     117
100     88
115    138
130     84
145     97
Name: docid, dtype: int64

## Sort rankings by query id

Order scores
- group by qid
- sort by score in descresing order

In [113]:
qs = pd.concat([data['qid'], data['score'], data['relevance_label']], axis=1, keys=['qid','score', 'relevance_label'])

In [127]:
rank_by_qid = qs.groupby('qid').apply(lambda x: x.sort_values(['score'],ascending = False))
rank_by_qid['position'] = rank_by_qid.groupby('qid').cumcount()
rank_by_qid[:10]

Unnamed: 0_level_0,Unnamed: 1_level_0,qid,score,relevance_label,position
qid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,73,10,4498.56591,4,0
10,5,10,4367.257274,2,1
10,61,10,35.755612,1,2
10,59,10,12.754565,3,3
10,77,10,3.772885,0,4
10,53,10,3.477178,2,5
10,6,10,3.473539,1,6
10,74,10,3.301133,1,7
10,68,10,3.292962,2,8
10,60,10,3.265408,2,9


# Evaluation Metrics

Use test data as ground truth

## MAP

In [133]:
MAP = pd.concat([rank_by_qid['qid'], rank_by_qid['relevance_label'], rank_by_qid['score'], rank_by_qid['position']], axis=1)
MAP['relevance_binary'] = np.where(MAP['relevance_label'] >= 1,1,0)
MAP['relevance_cumulative'] = MAP.groupby('qid')['relevance_label'].cumsum()
MAP['precision'] = (MAP['relevance_cumulative']*MAP['relevance_binary'])/(MAP['position']+1)

def mean_average_precision(data):
    precision_sum = MAP.groupby('qid')['precision'].sum()
    relevance_sum = MAP.groupby('qid')['relevance_binary'].sum()
    average_precision = precision_sum/relevance_sum
    mean_average_precision = average_precision.sum()/len(average_precision)
    return mean_average_precision

mean_average_precision(MAP)

0.90446700566880622

## NDCG@10

In [134]:
NDCG = pd.concat([rank_by_qid['qid'], rank_by_qid['relevance_label'], rank_by_qid['position']], axis=1)
NDCG = NDCG.groupby('qid').head(10)

NDCG['gain'] = (2**NDCG['relevance_label'])-1
NDCG['rank+1'] = NDCG['position']+2
NDCG['discount'] = np.log2(NDCG['rank+1'])
NDCG['discounted_gain'] = NDCG['gain']/NDCG['discount']

DCG = NDCG.groupby('qid')['discounted_gain'].sum()

In [136]:
NDCG_ideal = pd.concat([rank_by_qid['qid'], rank_by_qid['relevance_label']], axis=1, keys=['qid', 'relevance_label'])

NDCG_ideal = NDCG_ideal.groupby('qid')['relevance_label'].apply(lambda x: x.sort_values(ascending=False)).reset_index()
NDCG_ideal['position'] = NDCG_ideal.groupby('qid').cumcount()+1
NDCG_ideal = NDCG_ideal.groupby('qid').head(10)

In [138]:
NDCG_ideal['gain'] = (2**NDCG_ideal['relevance_label'])-1
NDCG_ideal['rank+1'] = NDCG_ideal['position']+1
NDCG_ideal['discount'] = np.log2(NDCG_ideal['rank+1'])
NDCG_ideal['discounted_gain'] = NDCG_ideal['gain']/NDCG_ideal['discount']

optDCG = NDCG_ideal.groupby('qid')['discounted_gain'].sum()

In [139]:
normalised_DCG = DCG/optDCG
normalised_DCG = np.nan_to_num(normalised_DCG)
avg_NDCG = sum(normalised_DCG)/len(normalised_DCG)
avg_NDCG

0.40594866798268447