This notebook computes the NDCG of the BM25 and BERT re-ranking results on the MS MARCO dataset, using the newly acquired graded relevance label dataset.

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
from tqdm.auto import tqdm 
from tqdm import tqdm_notebook
import math

In [2]:
# Directory the input files are read from
data_dir = "data/"

# Graded relevance judgments
new_qrels_filename = "thesis_dataset_graded_relevance.tsv"

# Original MS MARCO relevance judgments
og_qrels_filename = "qrels.dev.small.tsv"

# Top-100 ranking produced by BM25
bm25_top100_filename = "run_development_top100.tsv"

# Top-100 ranking produced by BERT
bert_top100_filename = "bert_thesis_dataset_top100.tsv"

In [3]:
def _read_tsv(filename, columns):
    """Read a header-less, tab-separated file from ``data_dir`` and name its columns.

    Parameters
    ----------
    filename : str
        File name relative to ``data_dir``.
    columns : list of str
        Column names to assign, in file order.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``columns`` as column labels.
    """
    # os.path.join keeps the path valid whether or not data_dir ends with a separator
    frame = pd.read_csv(join(data_dir, filename), sep='\t', encoding='utf-8', header=None)
    # Assigning the labels afterwards raises if the file's column count does not match
    frame.columns = columns
    return frame


bm25_df = _read_tsv(bm25_top100_filename, ['query_id', 'passage_id', 'bm25_rank'])

bert_df = _read_tsv(
    bert_top100_filename,
    ['query_id', 'passage_id', 'bm25_rank', 'query', 'passage', 'bert_score', 'bert_rank'],
)

og_qrels_df = _read_tsv(og_qrels_filename, ['query_id', 'label1', 'passage_id', 'label2'])

new_qrels_df = _read_tsv(new_qrels_filename, ['query_id', 'passage_id', 'graded_label'])

# Lookup used by the evaluation cells: model name -> ranking frame with a '<model>_rank' column
models_dict = {"bm25": bm25_df, "bert": bert_df}

## Helper Functions

In [4]:
def get_query_ids(dataframe):
    """Return the distinct ``query_id`` values of ``dataframe`` as a sorted list.

    ``np.unique`` both de-duplicates and sorts, so the result is in ascending order.
    """
    raw_ids = dataframe['query_id'].tolist()
    distinct_ids = np.unique(raw_ids)
    return list(distinct_ids)

In [5]:
def get_top_n_ranking(dataframe,rank_column,n):
    """Keep the rows whose rank is at most ``n``, ordered by ascending rank.

    dataframe:   ranking frame containing ``rank_column``.
    rank_column: name of the column holding the rank values.
    n:           inclusive rank cutoff.
    """
    within_cutoff = dataframe[rank_column] <= n
    return dataframe.loc[within_cutoff].sort_values(by=[rank_column])

In [6]:
def get_relevance_labels(qrels_query_subset,top_n_ranking,model,default_label=1):
    """Return the graded labels of a ranking, ordered by the model's rank.

    Parameters
    ----------
    qrels_query_subset : pandas.DataFrame
        Relevance judgments with columns ``query_id``, ``passage_id`` and ``graded_label``.
    top_n_ranking : pandas.DataFrame
        Ranked passages with columns ``query_id``, ``passage_id`` and ``'<model>_rank'``.
    model : str
        Model name; selects the ``'<model>_rank'`` column used for ordering.
    default_label : int, optional
        Label assigned to ranked passages that have no judgment (default 1).

    Returns
    -------
    list of int
        One label per ranked passage, from best rank to worst.
    """
    # Left join keeps every ranked passage; passages without a judgment get NaN
    labelled = top_n_ranking.merge(qrels_query_subset, how='left', on=['query_id', 'passage_id'])
    # Assign the result back instead of calling fillna(inplace=True) on a column selection:
    # with pandas Copy-on-Write the in-place form does not update the frame.
    labelled['graded_label'] = labelled['graded_label'].fillna(default_label).astype(int)
    ordered = labelled.sort_values(by=['%s_rank'%(model)])
    return ordered['graded_label'].values.tolist()

In [7]:
def compute_discounted_gain(index,rel):
    """Discount a relevance grade by the base-2 logarithm of its position.

    index: zero-based position in the ranking (0 is the top result).
    rel:   relevance grade found at that position.

    Returns ``rel / log2(index + 2)``, so the top result is not discounted.
    """
    discount = math.log(index + 2, 2)
    return rel / discount

In [8]:
def compute_discounted_cumulative_gain(relevance_labels):
    """Sum the discounted gains of a ranked list of relevance grades.

    relevance_labels: grades ordered from the top-ranked result downwards.

    Returns 0.0 for an empty list.
    """
    running_total = 0.0
    position = 0
    for grade in relevance_labels:
        running_total = running_total + compute_discounted_gain(position, grade)
        position += 1
    return running_total

In [9]:
def compute_ndcg(relevance_labels):
    """Compute the normalized DCG of a ranked list of relevance grades.

    The ideal DCG is obtained by sorting the *given* labels in descending order,
    so only the grades present in ``relevance_labels`` take part in the normalization.

    Parameters
    ----------
    relevance_labels : list of numbers
        Grades ordered from the top-ranked result downwards.

    Returns
    -------
    float
        DCG / ideal DCG rounded to three decimals, or 0.0 when the ideal DCG is zero
        (empty list or all grades zero), where the ratio is undefined.
    """
    dcg = compute_discounted_cumulative_gain(relevance_labels)
    idcg = compute_discounted_cumulative_gain(sorted(relevance_labels,reverse=True))
    if idcg == 0:
        # Nothing relevant to rank: avoid ZeroDivisionError and score the query as 0
        return 0.0
    return round(dcg/idcg,3)

# Test Case

This section helps to check if the NDCG is correctly computed. We follow the example found on this page: https://medium.com/swlh/rank-aware-recsys-evaluation-metrics-5191bba16832

In [10]:
# Cutoff and model name used by the worked example
n = 6
model = "test"

# Ground truth: seven passages of query 1 with their graded labels
ground_truth_df = pd.DataFrame({
    "query_id": [1] * 7,
    "passage_id": list(range(1, 8)),
    "graded_label": [3, 2, 3, 0, 1, 2, 3],
})

# Ranking under test: passage i is placed at rank i
test_rank_df = pd.DataFrame({
    "query_id": [1] * 7,
    "passage_id": list(range(1, 8)),
    "test_rank": list(range(1, 8)),
})

In [11]:
# Show the graded labels used as ground truth in the worked example
ground_truth_df

Unnamed: 0,query_id,passage_id,graded_label
0,1,1,3
1,1,2,2
2,1,3,3
3,1,4,0
4,1,5,1
5,1,6,2
6,1,7,3


In [12]:
# Show the ranking that is scored against the ground truth
test_rank_df

Unnamed: 0,query_id,passage_id,test_rank
0,1,1,1
1,1,2,2
2,1,3,3
3,1,4,4
4,1,5,5
5,1,6,6
6,1,7,7


In [13]:
ndcg = 0.0

query_ids = get_query_ids(ground_truth_df)

# tqdm is tqdm.auto.tqdm (imported at the top); tqdm_notebook is deprecated
for query_id in tqdm(query_ids):
    gt_subset = ground_truth_df[ground_truth_df['query_id'] == query_id].copy()
    query_subset = test_rank_df[test_rank_df['query_id'] == query_id].copy()

    # Cut the ranking at n, look up the labels in rank order, and score them
    top_n_ranking = get_top_n_ranking(query_subset,'%s_rank'%(model),n)
    relevance_labels = get_relevance_labels(gt_subset,top_n_ranking,model)
    ndcg += compute_ndcg(relevance_labels)

# Fail loudly if the implementation stops reproducing the reference example's value
assert math.isclose(ndcg, 0.961), "expected NDCG@6 of 0.961, got %s" % ndcg

ndcg

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




0.961

As on the webpage we consulted, we obtain an NDCG of 0.961, so the NDCG function is implemented correctly.

# Settings

Here you can choose the cutoff used to compute the NDCG. For example, a cutoff of 20 gives NDCG@20.

In [28]:
# Rank cutoff: the evaluation cells below score only results with rank <= N (NDCG@N)
N = 5

# Evaluate BM25

## Compute NDCG

In [29]:
model = "bm25"

model_df = models_dict[model]

ndcg = 0.0

# Only queries that have graded labels are evaluated
query_ids = get_query_ids(new_qrels_df)
model_subset_df = model_df[model_df['query_id'].isin(query_ids)]

# tqdm is tqdm.auto.tqdm (imported at the top); tqdm_notebook is deprecated
for query_id in tqdm(query_ids):
    # Boolean indexing already returns new frames and nothing below mutates them
    new_qrels_query_subset = new_qrels_df[new_qrels_df['query_id'] == query_id]
    query_subset = model_subset_df[model_subset_df['query_id'] == query_id]

    top_n_ranking = get_top_n_ranking(query_subset,'%s_rank'%(model),N)
    relevance_labels = get_relevance_labels(new_qrels_query_subset,top_n_ranking,model)
    ndcg += compute_ndcg(relevance_labels)

# Mean NDCG@N over all evaluated queries, as a percentage with one decimal
ndcg = round((ndcg/len(query_ids))*100,1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))




In [30]:
# Mean NDCG@N of BM25 as a percentage
print(ndcg)

93.8


In [31]:
# Number of queries with graded labels that were included in the average
len(query_ids)

43

# Evaluate BERT

## Compute NDCG

In [32]:
model = "bert"

model_df = models_dict[model]

ndcg = 0.0

# Only queries that have graded labels are evaluated
query_ids = get_query_ids(new_qrels_df)
model_subset_df = model_df[model_df['query_id'].isin(query_ids)]

# tqdm is tqdm.auto.tqdm (imported at the top); tqdm_notebook is deprecated
for query_id in tqdm(query_ids):
    # Boolean indexing already returns new frames and nothing below mutates them
    new_qrels_query_subset = new_qrels_df[new_qrels_df['query_id'] == query_id]
    query_subset = model_subset_df[model_subset_df['query_id'] == query_id]

    top_n_ranking = get_top_n_ranking(query_subset,'%s_rank'%(model),N)

    relevance_labels = get_relevance_labels(new_qrels_query_subset,top_n_ranking,model)
    ndcg += compute_ndcg(relevance_labels)

# Mean NDCG@N over all evaluated queries, as a percentage with one decimal
ndcg = round((ndcg/len(query_ids))*100,1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))




In [33]:
# Mean NDCG@N of BERT as a percentage
ndcg

93.3

In [34]:
# Number of queries with graded labels that were included in the average
len(query_ids)

43