This notebook computes the MRR@10 of the BM25 ranking and of the BERT re-ranking on the MS MARCO dataset, using both the original MS MARCO relevance labels and the newly acquired relevance labels.

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
from tqdm.auto import tqdm 
from tqdm import tqdm_notebook

## Load Data

In [2]:
# Directory prefix prepended to every filename below when the files are read.
data_dir = 'data/'

# New relevance labels, binary threshold 2 (irrelevant < 2; relevant >= 2)
thesis_qrels_threshold2_filename = 'thesis_dataset_binary_threshold2.tsv'

# New relevance labels, binary threshold 3 (irrelevant < 3; relevant >= 3)
thesis_qrels_threshold3_filename = 'thesis_dataset_binary_threshold3.tsv'

# Original MS MARCO relevance file
og_qrels_filename = 'qrels.dev.small.tsv'

# BM25 top 100 ranking
bm25_top100_filename = 'run_development_top100.tsv'

# BERT top 100 ranking
bert_top100_filename = 'bert_thesis_dataset_top100.tsv'

In [3]:
def _read_tsv(filename, column_names):
    """Read a header-less, tab-separated file from data_dir and name its columns."""
    frame = pd.read_csv(data_dir + filename,delimiter='\t',encoding='utf-8',header=None)
    frame.columns = column_names
    return frame

# Column layout shared by the three relevance (qrels) files.
qrels_columns = ['query_id','label1','passage_id','label2']

# Rankings produced by the two models.
bm25_df = _read_tsv(bm25_top100_filename, ['query_id', 'passage_id', 'bm25_rank'])
bert_df = _read_tsv(
    bert_top100_filename,
    ['query_id', 'passage_id', 'bm25_rank', 'query', 'passage', 'bert_score', 'bert_rank'],
)

# Relevance labels: original MS MARCO plus the two binarized versions of the new dataset.
og_qrels_df = _read_tsv(og_qrels_filename, qrels_columns)
new_qrels2_df = _read_tsv(thesis_qrels_threshold2_filename, qrels_columns)
new_qrels3_df = _read_tsv(thesis_qrels_threshold3_filename, qrels_columns)

# Lookup tables used by the evaluation cells to pick a model / threshold by name.
models_dict = {"bm25": bm25_df, "bert": bert_df}
new_qrels_dict = {"threshold=2": new_qrels2_df, "threshold=3": new_qrels3_df}

## Helper Functions

In [4]:
def get_query_ids(dataframe):
    """Return the distinct values of the 'query_id' column as a sorted list."""
    raw_ids = dataframe['query_id'].tolist()
    distinct_ids = np.unique(raw_ids)  # np.unique de-duplicates and sorts
    return list(distinct_ids)

In [5]:
def get_top_n_ranking(dataframe,rank_column,n):
    """Return the rows whose value in rank_column is at most n, sorted by that column."""
    within_cutoff = dataframe[rank_column] <= n
    return dataframe[within_cutoff].sort_values(by=[rank_column])

In [6]:
def get_passage_ids(dataframe):
    """Return every value of the 'passage_id' column as a plain Python list.

    No filtering on the label columns is applied here; every row contributes.
    NOTE(review): callers treat the result as the relevant passages, so the
    qrels frames presumably contain only relevant rows - confirm.
    """
    passage_column = dataframe['passage_id']
    return passage_column.values.tolist()

In [7]:
def compute_mrr(gt,ranking,model,n):
    """Return the reciprocal rank of the best-ranked relevant passage of one query.

    Parameters
    ----------
    gt : list
        Passage ids that count as relevant for the query.
    ranking : pandas.DataFrame
        Ranking of a single query; read through its 'passage_id' and
        '<model>_rank' columns.
    model : str
        Prefix of the rank column, e.g. "bm25" selects 'bm25_rank'.
    n : int
        Rank cutoff; only ranks strictly below n + 1 are considered.

    Returns
    -------
    float
        1 / (smallest rank of a relevant passage within the cutoff), or 0.0
        when no relevant passage is ranked within the cutoff.
    """
    # Nothing ranked for this query: no hit is possible.
    if ranking.empty:
        return 0.0

    rank_column = '%s_rank'%(model)
    # Vectorized replacement for a row-by-row scan: keep the ranks of the
    # relevant passages that fall inside the cutoff.
    is_hit = ranking['passage_id'].isin(gt) & (ranking[rank_column] < n+1)
    hit_ranks = ranking.loc[is_hit, rank_column]
    if hit_ranks.empty:
        return 0.0
    # The best (smallest) rank determines the reciprocal rank.
    return 1.0 / hit_ranks.min()

# Test Case

This section helps to check if the MRR is correctly computed.
We follow the example found on this page: https://medium.com/swlh/rank-aware-recsys-evaluation-metrics-5191bba16832

In [8]:
# Toy example: three queries with three ranked passages each.
n = 3
model = "test"

# Relevant passages per query: query 1 -> 3, query 2 -> 5 and 6, query 3 -> 7 and 8.
ground_truth_df = pd.DataFrame(
    {
        "query_id": [1, 2, 2, 3, 3],
        "passage_id": [3, 5, 6, 7, 8],
    },
    columns=["query_id", "passage_id"],
)

# Ranking to evaluate; 'test_rank' matches the '<model>_rank' column naming.
test_rank_df = pd.DataFrame(
    {
        "query_id": [1, 1, 1, 2, 2, 2, 3, 3, 3],
        "passage_id": [1, 2, 3, 4, 5, 6, 7, 8, 9],
        "test_rank": [1, 2, 3, 1, 2, 3, 1, 2, 3],
    },
    columns=["query_id", "passage_id", "test_rank"],
)

In [9]:
# Display the toy ground truth (relevant query/passage pairs).
ground_truth_df

Unnamed: 0,query_id,passage_id
0,1,3
1,2,5
2,2,6
3,3,7
4,3,8


In [10]:
# Display the toy ranking that is scored against the ground truth.
test_rank_df

Unnamed: 0,query_id,passage_id,test_rank
0,1,1,1
1,1,2,2
2,1,3,3
3,2,4,1
4,2,5,2
5,2,6,3
6,3,7,1
7,3,8,2
8,3,9,3


In [11]:
# Mean reciprocal rank of the toy example: average of the per-query reciprocal ranks.
mrr = 0.0
query_ids = get_query_ids(test_rank_df)
for query_id in query_ids:
    # Restrict the ground truth and the ranking to the current query.
    gt = ground_truth_df[ground_truth_df['query_id'] == query_id]
    test_ranking = test_rank_df[test_rank_df['query_id'] == query_id]

    gt_values = get_passage_ids(gt)
    mrr += compute_mrr(gt_values,test_ranking,model,n)
mrr = round(mrr/len(query_ids),2)

# Enforce the reference value so a regression in compute_mrr stops a full re-run
# instead of relying on a visual comparison of the output.
assert abs(mrr - 0.61) < 1e-9, "expected MRR of 0.61 for the toy example, got %s" % (mrr)

In [12]:
# MRR of the toy example, rounded to two decimals.
mrr

0.61

As on the webpage we consulted, we obtain an MRR of 0.61, so the MRR function reproduces the reference example.

# Settings

In [13]:
# Rank cutoff: the evaluation cells keep the top-N ranking and compute MRR@N.
N = 10

# Evaluate BM25

## Compute MRR@10 (threshold = 2)

In [14]:
# Irrelevant < 2; relevant >= 2
binary_threshold = 2

model = "bm25"

model_df = models_dict[model].copy()

# Get new relevance dataset
new_qrels_df = new_qrels_dict["threshold=%s"%(binary_threshold)].copy()

# Original MRR and new MRR set to zero
# Original == ms marco relevance dataset; new == new relevance dataset
og_mrr = 0.0
new_mrr = 0.0

# The new relevance dataset contains less query ids than the original msmarco dataset.
# So get these query ids and use them to create subsets of the original dataset and the bm25 output.
query_ids = get_query_ids(new_qrels_df)
og_qrels_subset_df = og_qrels_df[og_qrels_df['query_id'].isin(query_ids)].copy()
model_subset_df = model_df[model_df['query_id'].isin(query_ids)].copy()

for query_id in tqdm_notebook(query_ids):
    # Get query specific ranking and relevance datasets
    query_subset = model_subset_df[model_subset_df['query_id'] == query_id].copy()
    og_qrels_query_subset = og_qrels_subset_df[og_qrels_subset_df['query_id'] == query_id]
    new_qrels_query_subset = new_qrels_df[new_qrels_df['query_id'] == query_id]
    
    top_n_ranking = get_top_n_ranking(query_subset,'%s_rank'%(model),N)
    
    og_relevant_passages = get_passage_ids(og_qrels_query_subset)
    new_relevant_passages = get_passage_ids(new_qrels_query_subset)
    
    og_mrr += compute_mrr(og_relevant_passages,top_n_ranking,model,N)
    new_mrr += compute_mrr(new_relevant_passages,top_n_ranking,model,N)
    
og_mrr = round((og_mrr/len(query_ids))*100,1)
new_mrr = round((new_mrr/len(query_ids))*100,1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))




In [15]:
# BM25 MRR@N (in %) against the original MS MARCO labels, as computed by the preceding cell.
print(og_mrr)

45.6


In [16]:
# BM25 MRR@N (in %) against the new labels with binary threshold 2.
print(new_mrr)

97.7


In [17]:
# Number of queries the averages above were computed over.
print(len(query_ids))

43


## Compute MRR@10 (threshold = 3)

In [18]:
# Irrelevant < 3; relevant >= 3
binary_threshold = 3

model = "bm25"

model_df = models_dict[model].copy()

# Get new relevance dataset
new_qrels_df = new_qrels_dict["threshold=%s"%(binary_threshold)].copy()

# Original MRR and new MRR set to zero
# Original == ms marco relevance dataset; new == new relevance dataset
og_mrr = 0.0
new_mrr = 0.0

# The new relevance dataset contains less query ids than the original msmarco dataset.
# So get these query ids and use them to create subsets of the original dataset and the bm25 output.
query_ids = get_query_ids(new_qrels_df)
og_qrels_subset_df = og_qrels_df[og_qrels_df['query_id'].isin(query_ids)].copy()
model_subset_df = model_df[model_df['query_id'].isin(query_ids)].copy()

for query_id in tqdm_notebook(query_ids):
    # Get query specific ranking and relevance datasets
    query_subset = model_subset_df[model_subset_df['query_id'] == query_id].copy()
    og_qrels_query_subset = og_qrels_subset_df[og_qrels_subset_df['query_id'] == query_id]
    new_qrels_query_subset = new_qrels_df[new_qrels_df['query_id'] == query_id]
    
    top_n_ranking = get_top_n_ranking(query_subset,'%s_rank'%(model),N)
    
    og_relevant_passages = get_passage_ids(og_qrels_query_subset)
    new_relevant_passages = get_passage_ids(new_qrels_query_subset)
    
    og_mrr += compute_mrr(og_relevant_passages,top_n_ranking,model,N)
    new_mrr += compute_mrr(new_relevant_passages,top_n_ranking,model,N)
    
og_mrr = round((og_mrr/len(query_ids))*100,1)
new_mrr = round((new_mrr/len(query_ids))*100,1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))




In [19]:
# BM25 MRR@N (in %) against the original MS MARCO labels, as computed by the preceding cell.
print(og_mrr)

45.6


In [20]:
# BM25 MRR@N (in %) against the new labels with binary threshold 3.
print(new_mrr)

90.7


In [21]:
# Number of queries the averages above were computed over.
print(len(query_ids))

43


# Evaluate BERT

## Compute MRR@10 (threshold = 2)

In [23]:
# Irrelevant < 2; relevant >= 2
binary_threshold = 2

model = "bert"

model_df = models_dict[model].copy()

# Get new relevance dataset
new_qrels_df = new_qrels_dict["threshold=%s"%(binary_threshold)].copy()

# Original MRR and new MRR set to zero
# Original == ms marco relevance dataset; new == new relevance dataset
og_mrr = 0.0
new_mrr = 0.0

# The new relevance dataset contains less query ids than the original msmarco dataset.
# So get these query ids and use them to create subsets of the original dataset and the bm25 output.
query_ids = get_query_ids(new_qrels_df)
og_qrels_subset_df = og_qrels_df[og_qrels_df['query_id'].isin(query_ids)].copy()
model_subset_df = model_df[model_df['query_id'].isin(query_ids)].copy()

for query_id in tqdm_notebook(query_ids):
    # Get query specific ranking and relevance datasets
    query_subset = model_subset_df[model_subset_df['query_id'] == query_id].copy()
    og_qrels_query_subset = og_qrels_subset_df[og_qrels_subset_df['query_id'] == query_id]
    new_qrels_query_subset = new_qrels_df[new_qrels_df['query_id'] == query_id]
    
    top_n_ranking = get_top_n_ranking(query_subset,'%s_rank'%(model),N)
    
    og_relevant_passages = get_passage_ids(og_qrels_query_subset)
    new_relevant_passages = get_passage_ids(new_qrels_query_subset)
    
    og_mrr += compute_mrr(og_relevant_passages,top_n_ranking,model,N)
    new_mrr += compute_mrr(new_relevant_passages,top_n_ranking,model,N)
    
og_mrr = round((og_mrr/len(query_ids))*100,1)
new_mrr = round((new_mrr/len(query_ids))*100,1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))




In [24]:
# BERT MRR@N (in %) against the original MS MARCO labels, as computed by the preceding cell.
print(og_mrr)

68.6


In [25]:
# BERT MRR@N (in %) against the new labels with binary threshold 2.
print(new_mrr)

91.4


In [26]:
# Number of queries the averages above were computed over.
len(query_ids)

43

## Compute MRR@10 (threshold = 3)

In [27]:
# Irrelevant < 3; relevant >= 3
binary_threshold = 3

model = "bert"

model_df = models_dict[model].copy()

# Get new relevance dataset
new_qrels_df = new_qrels_dict["threshold=%s"%(binary_threshold)].copy()

# Original MRR and new MRR set to zero
# Original == ms marco relevance dataset; new == new relevance dataset
og_mrr = 0.0
new_mrr = 0.0

# The new relevance dataset contains less query ids than the original msmarco dataset.
# So get these query ids and use them to create subsets of the original dataset and the bm25 output.
query_ids = get_query_ids(new_qrels_df)
og_qrels_subset_df = og_qrels_df[og_qrels_df['query_id'].isin(query_ids)].copy()
model_subset_df = model_df[model_df['query_id'].isin(query_ids)].copy()

for query_id in tqdm_notebook(query_ids):
    # Get query specific ranking and relevance datasets
    query_subset = model_subset_df[model_subset_df['query_id'] == query_id].copy()
    og_qrels_query_subset = og_qrels_subset_df[og_qrels_subset_df['query_id'] == query_id]
    new_qrels_query_subset = new_qrels_df[new_qrels_df['query_id'] == query_id]
    
    top_n_ranking = get_top_n_ranking(query_subset,'%s_rank'%(model),N)
    
    og_relevant_passages = get_passage_ids(og_qrels_query_subset)
    new_relevant_passages = get_passage_ids(new_qrels_query_subset)
    
    og_mrr += compute_mrr(og_relevant_passages,top_n_ranking,model,N)
    new_mrr += compute_mrr(new_relevant_passages,top_n_ranking,model,N)
    
og_mrr = round((og_mrr/len(query_ids))*100,1)
new_mrr = round((new_mrr/len(query_ids))*100,1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))




In [28]:
# BERT MRR@N (in %) against the original MS MARCO labels, as computed by the preceding cell.
print(og_mrr)

68.6


In [29]:
# BERT MRR@N (in %) against the new labels with binary threshold 3.
print(new_mrr)

88.4


In [30]:
# Number of queries the averages above were computed over.
len(query_ids)

43