This notebook compares the NDCG and precision scores of the BERT re-ranking results on the MS MARCO dataset, using the newly acquired relevance label dataset.

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
from tqdm.auto import tqdm 
from tqdm import tqdm_notebook
import math

## Load Files

In [11]:
# Folder that holds every input file used in this notebook.
data_dir = "data/"

# Original MS MARCO relevance file.
og_qrels_filename = "qrels.dev.small.tsv"

# New relevance labels, graded.
new_qrels_filename = "thesis_dataset_graded_relevance.tsv"

# New relevance labels, binary with threshold 2 (irrelevant < 2; relevant >= 2).
thesis_qrels_threshold2_filename = "thesis_dataset_binary_threshold2.tsv"

# New relevance labels, binary with threshold 3 (irrelevant < 3; relevant >= 3).
thesis_qrels_threshold3_filename = "thesis_dataset_binary_threshold3.tsv"

# Top-100 ranking produced by BM25.
bm25_top100_filename = "run_development_top100.tsv"

# Top-100 ranking produced by BERT.
bert_top100_filename = "bert_thesis_dataset_top100.tsv"

In [13]:
def _load_tsv(filename, column_names):
    """Read a header-less, tab-separated UTF-8 file from ``data_dir`` and name its columns."""
    frame = pd.read_csv(data_dir + filename, delimiter='\t', encoding='utf-8', header=None)
    frame.columns = column_names
    return frame


# Column layout used by the MS MARCO qrels file and by both binary thesis qrels files.
_QRELS_COLUMNS = ['query_id', 'label1', 'passage_id', 'label2']

# Rankings.
bm25_df = _load_tsv(bm25_top100_filename, ['query_id', 'passage_id', 'bm25_rank'])
bert_df = _load_tsv(
    bert_top100_filename,
    ['query_id', 'passage_id', 'bm25_rank', 'query', 'passage', 'bert_score', 'bert_rank'],
)

# Relevance labels.
og_qrels_df = _load_tsv(og_qrels_filename, _QRELS_COLUMNS)
new_graded_qrels_df = _load_tsv(new_qrels_filename, ['query_id', 'passage_id', 'graded_label'])
new_qrels2_df = _load_tsv(thesis_qrels_threshold2_filename, _QRELS_COLUMNS)
new_qrels3_df = _load_tsv(thesis_qrels_threshold3_filename, _QRELS_COLUMNS)

# Lookup tables so later cells can select a ranking / label set by name.
models_dict = {"bm25": bm25_df, "bert": bert_df}
new_qrels_dict = {
    "graded": new_graded_qrels_df,
    "binary2": new_qrels2_df,
    "binary3": new_qrels3_df,
}

## Helper Functions

In [4]:
def get_query_ids(dataframe):
    """Return the distinct values of the ``query_id`` column as a sorted list."""
    query_id_values = dataframe['query_id'].tolist()
    # np.unique both de-duplicates and sorts the values.
    distinct_ids = np.unique(query_id_values)
    return list(distinct_ids)

In [9]:
def get_passage_ids(dataframe):
    """Return the ``passage_id`` column as a plain list (row order and duplicates kept)."""
    passage_column = dataframe['passage_id']
    return passage_column.values.tolist()

In [5]:
def get_top_n_ranking(dataframe,rank_column,n):
    """Return the rows whose ``rank_column`` value is at most ``n``, sorted ascending by that column."""
    within_cutoff = dataframe[rank_column] <= n
    selected_rows = dataframe[within_cutoff]
    return selected_rows.sort_values(by=[rank_column])

### NDCG

In [6]:
def compute_discounted_gain(index,rel):
    """Return the discounted gain of one ranked item.

    Parameters
    ----------
    index : int
        Zero-based position of the item in the ranking (0 is the top result).
    rel : number
        Relevance label of the item.

    Returns
    -------
    float
        ``rel / log2(index + 2)``; the item at index 0 is therefore not discounted.
    """
    # math.log2 is more accurate than math.log(x, 2), which divides two natural
    # logarithms and can be off by one ulp even for exact powers of two.
    return rel / math.log2(index + 2)

In [7]:
def compute_discounted_cumulative_gain(relevance_labels):
    """Sum the discounted gains of ``relevance_labels``, taken in ranking order.

    The first label is treated as position 0. An empty input yields 0.0.
    """
    total = 0.0
    position = 0
    for label in relevance_labels:
        total = total + compute_discounted_gain(position, label)
        position += 1
    return total

In [8]:
def compute_ndcg(relevance_labels):
    """Return the normalised DCG of a ranking, rounded to three decimals.

    Parameters
    ----------
    relevance_labels : iterable of numbers
        Relevance labels of the ranked items, in ranking order.
        Labels are presumably non-negative -- confirm against the label files.

    Returns
    -------
    float
        DCG of the given order divided by the DCG of the same labels sorted in
        descending order. Returns 0.0 when the input is empty or contains no
        non-zero label, because the ideal DCG is 0 and the ratio is undefined.

    Notes
    -----
    The ideal ordering is built only from the labels passed in, so relevant
    items that are missing from ``relevance_labels`` do not lower the score.
    """
    labels = list(relevance_labels)

    # No relevant item at all: avoid the 0/0 division and report the worst score.
    if not any(labels):
        return 0.0

    dcg = compute_discounted_cumulative_gain(labels)
    idcg = compute_discounted_cumulative_gain(sorted(labels, reverse=True))
    return round(dcg / idcg, 3)

### Precision

In [10]:
def compute_precision(gt,ranking,n):
    """Return the fraction of the cutoff ``n`` filled by relevant passages.

    Parameters
    ----------
    gt : iterable
        Passage ids that count as relevant (ground truth).
    ranking : pandas.DataFrame
        Ranked passages with a ``passage_id`` column. Every row is counted, so
        the frame is expected to be cut to the top ``n`` rows already.
    n : int
        Cutoff used as the denominator (e.g. 5 for P@5).

    Returns
    -------
    float
        Number of rows whose ``passage_id`` occurs in ``gt``, divided by ``n``.
    """
    relevant_ids = list(gt)
    # Vectorised membership test instead of a per-row iterrows() loop; int()
    # keeps the result a plain Python float.
    nr_relevant_items = int(ranking['passage_id'].isin(relevant_ids).sum())
    return nr_relevant_items / n

## Settings

Here you can decide the cutoff at which precision and NDCG are computed. For example, to compute P@5, set N to 5.

In [14]:
# Cutoff rank for the precision / NDCG computation (e.g. 5 for P@5).
N = 5

## Check query lists

The relevance label datasets contain different numbers of query ids. This is the result of certain decisions made while building them: for example, query ids were removed when the agreed binary assessor label was irrelevant while the original MS MARCO label was relevant.

Here we are going to check if the different label datasets actually contain different query ids.

In [20]:
# Work on independent copies of the three label sets.
graded_qrels_df, binary2_qrels_df, binary3_qrels_df = (
    new_qrels_dict[label_set].copy()
    for label_set in ("graded", "binary2", "binary3")
)

In [21]:
# Distinct query ids of each label set, in the order graded, binary2, binary3.
graded_query_ids, binary2_query_ids, binary3_query_ids = (
    get_query_ids(qrels_df)
    for qrels_df in (graded_qrels_df, binary2_qrels_df, binary3_qrels_df)
)

First check if all binary query ids are present in the graded dataset.

In [26]:
# Print every binary2 query id that is missing from the graded dataset.
for query_id in binary2_query_ids:
    if query_id in graded_query_ids:
        continue
    print(query_id)

In [27]:
# Print every binary3 query id that is missing from the graded dataset.
for query_id in binary3_query_ids:
    if query_id in graded_query_ids:
        continue
    print(query_id)

No query ids are printed, so the entire binary query id list is in the graded list. This probably also means that both binary datasets consist of the same query ids, but let's check this.

In [28]:
# Print every binary3 query id that is missing from the binary2 dataset.
for query_id in binary3_query_ids:
    if query_id in binary2_query_ids:
        continue
    print(query_id)

Again, no query ids are printed. So apart from a few query ids that belong either to both the binary2 and graded datasets or only to the graded dataset, all query ids are shared among the three datasets.

In other words, each smaller dataset is a subset of the larger datasets.
Exact numbers:

binary3_dataset nr queries = 42

binary2_dataset nr queries = 46

graded_dataset nr queries = 47

# Evaluate BM25 