This notebook computes NDCG@N for every combination of ranking model, relevance-judgement dataset, and rank cutoff configured in the Settings section below.

## Settings

In [1]:
# Metric name; used only to build the output file names and the summary lines.
metric = "ndcg"
# Ranking models to evaluate; each entry must be a key of `models_dict`.
models = ["bm25", "bert"]
# Relevance-judgement sets to evaluate against; each entry must be a key of `dataset_dict`.
datasets = ["graded"]
# Rank cutoffs N at which the metric is computed (NDCG@5, NDCG@10, NDCG@20).
cutoffs = [5, 10, 20]

## Import Libraries

In [2]:
import math
import os
from os import listdir
from os.path import isfile, join

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

## Load Data

In [3]:
# Directory holding all input files. It is concatenated directly with the file
# names below, so the trailing slash is required.
data_dir = 'data/'

# Graded relevance judgements (loaded as query_id, passage_id, graded_label).
new_qrels_filename = 'thesis_dataset_graded_relevance.tsv'

# MS MARCO relevance judgements (loaded as query_id, label1, passage_id, label2).
og_qrels_filename = 'qrels.dev.small.tsv'

# BM25 top-100 ranking (loaded as query_id, passage_id, bm25_rank).
bm25_top100_filename = 'run_development_top100.tsv'

# BERT top-100 ranking; also carries the BM25 rank, the query/passage text and the BERT score.
bert_top100_filename = 'bert_thesis_dataset_top100.tsv'

In [4]:
def _read_tsv(filename, columns):
    """Read a headerless, tab-separated UTF-8 file from `data_dir` and name its columns."""
    frame = pd.read_csv(data_dir + filename,delimiter='\t',encoding='utf-8',header=None)
    frame.columns = columns
    return frame


# Rankings produced by the two models.
bm25_df = _read_tsv(bm25_top100_filename, ['query_id', 'passage_id', 'bm25_rank'])
bert_df = _read_tsv(
    bert_top100_filename,
    ['query_id', 'passage_id', 'bm25_rank', 'query', 'passage', 'bert_score', 'bert_rank'],
)

# Relevance judgements: the original MS MARCO qrels and the graded qrels.
og_qrels_df = _read_tsv(og_qrels_filename, ['query_id','label1','passage_id','label2'])
new_qrels_df = _read_tsv(new_qrels_filename, ['query_id','passage_id','graded_label'])

# Lookup tables keyed by the names used in the Settings cell.
models_dict = dict(bm25=bm25_df, bert=bert_df)
dataset_dict = dict(ms_marco=og_qrels_df, graded=new_qrels_df)

## Helper Functions

In [32]:
def get_query_ids(dataframe):
    """Return the distinct values of the ``query_id`` column, sorted ascending.

    The values are de-duplicated with ``np.unique``, so the returned list holds
    NumPy scalars in sorted order.
    """
    all_ids = dataframe['query_id'].tolist()
    distinct_ids = np.unique(all_ids)
    return [query_id for query_id in distinct_ids]

def get_top_n_ranking(dataframe,rank_column,n):
    """Return the rows whose rank is at most ``n``, ordered by ascending rank.

    dataframe   -- ranking rows (one row per ranked passage)
    rank_column -- name of the column holding the rank
    n           -- inclusive rank cutoff
    """
    within_cutoff = dataframe[rank_column] <= n
    selected_rows = dataframe[within_cutoff]
    return selected_rows.sort_values(by=[rank_column])

def get_relevance_labels(qrels_query_subset,top_n_ranking,model):
    """Return the graded relevance label of every ranked passage, in rank order.

    Parameters
    ----------
    qrels_query_subset : pandas.DataFrame
        Judgements with ``query_id``, ``passage_id`` and ``graded_label`` columns.
    top_n_ranking : pandas.DataFrame
        Ranked passages with ``query_id``, ``passage_id`` and a ``<model>_rank`` column.
    model : str
        Model name; selects the ``<model>_rank`` column used for ordering.

    Returns
    -------
    list of int
        One label per row of ``top_n_ranking``, ordered by ascending rank.
    """
    # Left join keeps every ranked passage, judged or not.
    relevant_items_rank_df = top_n_ranking.merge(qrels_query_subset,how='left',on=['query_id','passage_id'])
    # Passages without a judgement receive grade 1.
    # NOTE(review): presumably 1 is the lowest grade of the scale -- confirm.
    # The filled column is assigned back instead of using a chained
    # `fillna(..., inplace=True)`, which is deprecated and has no effect under
    # pandas Copy-on-Write (the NaNs would then break the int cast below).
    relevant_items_rank_df['graded_label'] = relevant_items_rank_df['graded_label'].fillna(1.0).astype(int)
    relevance_labels = relevant_items_rank_df.sort_values(by=['%s_rank'%(model)])['graded_label'].values.tolist()
    return relevance_labels

def compute_discounted_gain(index,rel):
    """Return the gain ``rel`` discounted by its position in the ranking.

    ``index`` is the 0-based position, so the discount is log2(index + 2):
    the first item (index 0) is divided by log2(2) = 1 and is not discounted.
    """
    discount = math.log(index + 2, 2)
    return rel / discount

def compute_discounted_cumulative_gain(relevance_labels):
    """Return the DCG of ``relevance_labels``, given in ranking order.

    Each label is discounted by its 0-based position via
    ``compute_discounted_gain`` and the results are added up left to right.
    An empty sequence yields 0.0.
    """
    running_total = 0.0
    for position, grade in enumerate(relevance_labels):
        running_total = running_total + compute_discounted_gain(position, grade)
    return running_total

def compute_dcg(qrels_query_subset,top_n_ranking,model):
    """Return the DCG of ``top_n_ranking`` for one query.

    The labels of the ranked passages are looked up in ``qrels_query_subset``
    (ordered by the ``<model>_rank`` column) and then accumulated into a DCG.
    """
    labels_in_rank_order = get_relevance_labels(qrels_query_subset,top_n_ranking,model)
    return compute_discounted_cumulative_gain(labels_in_rank_order)

def compute_idcg(qrels_query_subset,N):
    """Return the ideal DCG at cutoff ``N`` for one query.

    The ideal ranking is formed from the ``graded_label`` values present in
    ``qrels_query_subset`` only, sorted from highest to lowest and truncated
    to the first ``N`` entries.
    """
    best_first = qrels_query_subset['graded_label'].values.tolist()
    best_first.sort(reverse=True)
    return compute_discounted_cumulative_gain(best_first[:N])

def compute_ndcg(relevance_labels):
    """Return the NDCG of a list of relevance labels, rounded to 3 decimals.

    Parameters
    ----------
    relevance_labels : list of numbers
        Labels in ranking order.

    Returns
    -------
    float
        DCG of the given order divided by the DCG of the same labels sorted
        from highest to lowest. Returns 0.0 when the ideal DCG is zero (empty
        list or no positive label), instead of raising ZeroDivisionError.
    """
    dcg = compute_discounted_cumulative_gain(relevance_labels)
    idcg = compute_discounted_cumulative_gain(sorted(relevance_labels,reverse=True))
    # No attainable gain: the ratio is undefined, so report 0.0.
    if idcg == 0:
        return 0.0
    return round(dcg/idcg,3)

## Compute NDCG

In [33]:
# Create the output directory up front so a fresh checkout does not fail on the first write.
os.makedirs("output", exist_ok=True)

scores = []

# The evaluated queries are always those of the graded qrels. The list does not
# depend on model, dataset or cutoff, so it is computed once.
query_ids = get_query_ids(new_qrels_df)

for model in models:
    model_df = models_dict[model]
    rank_column = '%s_rank'%(model)

    for dataset in datasets:
        dataset_df = dataset_dict[dataset]

        for cutoff in cutoffs:
            output_file = "output/%s_scores_%s_%s_N%s.txt"%(metric,model,dataset,cutoff)

            N = cutoff

            # (query_id, ndcg) pairs written to the per-setting output file.
            ndcg_scores = []
            summed_ndcg = 0.0

            # `tqdm` comes from `tqdm.auto`; the former `tqdm_notebook` alias is deprecated.
            for query_id in tqdm(query_ids):
                model_query_subset = model_df[model_df['query_id'] == query_id]
                qrels_query_subset = dataset_df[dataset_df['query_id'] == query_id]

                top_n_ranking = get_top_n_ranking(model_query_subset,rank_column,N)

                dcg = compute_dcg(qrels_query_subset,top_n_ranking,model)
                idcg = compute_idcg(qrels_query_subset,N)

                # A query without any attainable gain scores 0 instead of
                # raising ZeroDivisionError.
                # NOTE(review): unjudged passages count as grade 1 in the DCG
                # but do not contribute to the IDCG, so this ratio can exceed 1
                # when a query has fewer than N judged passages -- confirm this
                # is intended.
                ndcg = round((dcg/idcg),3) if idcg else 0.0

                summed_ndcg += ndcg
                ndcg_scores.append((query_id,ndcg))

            # Mean over all queries, reported as a percentage with one decimal.
            mean_ndcg = round((summed_ndcg/len(query_ids))*100,1) if query_ids else 0.0

            score_data = "model: %s; dataset: %s; cutoff: %s; %s_score: %s\n"%(model,dataset,cutoff,metric,mean_ndcg)
            scores.append(score_data)

            # One line per query: "<query_id>\t<ndcg>".
            with open(output_file,'w') as outfile:
                for (scored_query_id, query_ndcg) in ndcg_scores:
                    line = "%s\t%s\n"%(scored_query_id,query_ndcg)
                    outfile.write(line)

# Summary file with one line per (model, dataset, cutoff) setting.
with open("output/%s_scores.txt"%(metric),'w') as outfile:
    for score_data in scores:
        outfile.write(score_data)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




In [34]:
# Every entry already ends with a newline; the extra "\n" reproduces the blank
# line that a plain print() call would add after each entry.
print("".join(entry + "\n" for entry in scores), end="")

model: bm25; dataset: graded; cutoff: 5; ndcg_score: 78.7

model: bm25; dataset: graded; cutoff: 10; ndcg_score: 82.8

model: bm25; dataset: graded; cutoff: 20; ndcg_score: 91.5

model: bert; dataset: graded; cutoff: 5; ndcg_score: 78.0

model: bert; dataset: graded; cutoff: 10; ndcg_score: 78.4

model: bert; dataset: graded; cutoff: 20; ndcg_score: 82.4

