This notebook computes the mean average precision (MAP) for every experiment setting, i.e. for each combination of ranking model, relevance-judgement set and rank cutoff.

## Settings

In [1]:
# Name of the metric; it is only used to label the output files and the summary lines.
metric = "map"
# Rankers to evaluate. Each name must be a key of `models_dict` and the
# corresponding frame must have a "<name>_rank" column.
models = ["bm25", "bert"]
# Relevance-judgement sets to evaluate against. Each name must be a key of `dataset_dict`.
datasets = ["ms_marco", "threshold=2", "threshold=3"]
# Rank cutoffs N: only passages ranked within the top N are scored.
cutoffs = [5,10]

## Import Libraries

In [2]:
import os
from os import listdir
from os.path import isfile, join

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

## Load Data

In [3]:
# Directory holding all input files (path relative to the working directory)
data_dir = 'data/'

# Thesis qrels, binary threshold 2 (irrelevant < 2; relevant >= 2)
thesis_qrels_threshold2_filename = 'thesis_dataset_binary_threshold2.tsv'

# Thesis qrels, binary threshold 3 (irrelevant < 3; relevant >= 3)
thesis_qrels_threshold3_filename = 'thesis_dataset_binary_threshold3.tsv'

# MS MARCO relevance file (qrels)
og_qrels_filename = 'qrels.dev.small.tsv'

# BM25 top 100 ranking
bm25_top100_filename = 'run_development_top100.tsv'

# BERT top 100 ranking
bert_top100_filename = 'bert_thesis_dataset_top100.tsv'

In [4]:
def _load_tsv(filename, columns):
    """Read a headerless, tab-separated UTF-8 file from `data_dir` and name its columns."""
    frame = pd.read_csv(data_dir + filename,delimiter='\t',encoding='utf-8',header=None)
    frame.columns = columns
    return frame


# All three relevance files share the same four-column layout.
qrels_columns = ['query_id','label1','passage_id','label2']

# Rankings produced by the two models.
bm25_df = _load_tsv(bm25_top100_filename, ['query_id', 'passage_id', 'bm25_rank'])
bert_df = _load_tsv(
    bert_top100_filename,
    ['query_id', 'passage_id', 'bm25_rank', 'query', 'passage', 'bert_score', 'bert_rank'],
)

# Relevance judgements: original MS MARCO plus the two thresholded thesis sets.
og_qrels_df = _load_tsv(og_qrels_filename, list(qrels_columns))
new_qrels2_df = _load_tsv(thesis_qrels_threshold2_filename, list(qrels_columns))
new_qrels3_df = _load_tsv(thesis_qrels_threshold3_filename, list(qrels_columns))

# Lookup tables keyed by the names used in the settings cell.
models_dict = {"bm25": bm25_df, "bert": bert_df}
dataset_dict = {"ms_marco": og_qrels_df, "threshold=2": new_qrels2_df, "threshold=3": new_qrels3_df}

## Helper Functions

In [5]:
def get_query_ids(dataframe):
    """Return the distinct values of the ``query_id`` column as a sorted list.

    Args:
        dataframe: frame with a ``query_id`` column.

    Returns:
        list of the unique query ids in ascending order (NumPy scalars).
    """
    distinct_ids = np.unique(dataframe['query_id'].tolist())
    return [query_id for query_id in distinct_ids]

def get_top_n_ranking(dataframe,rank_column,n):
    """Keep the rows ranked within the top ``n`` and order them by rank.

    Args:
        dataframe: ranking frame containing ``rank_column``.
        rank_column: name of the column holding the rank of each row.
        n: cutoff; rows whose rank is greater than ``n`` are dropped.

    Returns:
        A new frame with the rows where ``rank_column <= n``, sorted ascending
        by ``rank_column``. The input frame is not modified.
    """
    within_cutoff = dataframe[rank_column].le(n)
    return dataframe.loc[within_cutoff].sort_values(by=rank_column)

def get_ranks_relevant_passages(qrels_query_subset,top_n_ranking,model):
    """Return the ranks at which the judged passages appear in a ranking.

    Every row of ``qrels_query_subset`` is treated as a relevant passage (no
    label column is inspected here). The rows are left-joined to
    ``top_n_ranking`` on ``query_id`` and ``passage_id``; passages that are
    missing from the ranking get no rank and are dropped.

    Args:
        qrels_query_subset: qrels rows of one query (needs ``query_id`` and
            ``passage_id`` columns).
        top_n_ranking: ranking rows of the same query, already cut off at N
            (needs ``query_id``, ``passage_id`` and ``"<model>_rank"``).
        model: model name; selects the ``"<model>_rank"`` column.

    Returns:
        Ascending list of Python ints: the ranks of the judged passages that
        occur in ``top_n_ranking``. Empty when none of them occur.
    """
    rank_column = '%s_rank'%(model)
    merged = qrels_query_subset.merge(top_n_ranking,how='left',on=['query_id','passage_id'])
    # Unmatched passages carry NaN after the left join. Drop them before
    # sorting (ordering a sequence that contains NaN is not well defined)
    # and before casting the float column back to integers.
    found_ranks = merged[rank_column].dropna().astype(int)
    return found_ranks.sort_values().tolist()

def compute_precision(index,rank):
    """Precision at one relevant hit.

    Args:
        index: 1-based count of relevant items seen up to and including this one.
        rank: rank position at which this relevant item was retrieved.

    Returns:
        ``index / rank`` as a float.
    """
    return index / rank

def compute_average_precision(sorted_ranks_relevant_items):
    """Average the precision values measured at each relevant hit.

    Note that the sum is divided by the number of ranks passed in (the
    relevant items that were actually found), not by the total number of
    relevant items of the query.

    Args:
        sorted_ranks_relevant_items: ascending ranks of the relevant items found.

    Returns:
        The average precision as a float; ``0.0`` when the list is empty.
    """
    hit_count = len(sorted_ranks_relevant_items)
    if hit_count == 0:
        return 0.0

    precision_total = 0.0
    # The k-th relevant item (1-based) found at rank r contributes k / r.
    for position, rank in enumerate(sorted_ranks_relevant_items, start=1):
        precision_total += compute_precision(position, rank)
    return precision_total / hit_count

## Compute MAP

In [6]:
# Evaluate every (model, dataset, cutoff) setting. For each setting the
# per-query average precision is written to its own file, and one summary line
# (MAP as a percentage, one decimal) is collected in `scores`.

# The result files are written below `output/`; create it when missing.
os.makedirs("output", exist_ok=True)

scores = []

# The evaluated queries are always those of the threshold=2 qrels, whatever
# dataset is being scored, so the list is built once up front.
# NOTE(review): presumably intentional, so that every setting is averaged over
# the same queries - confirm.
query_ids = get_query_ids(new_qrels2_df)

for model in models:
    # The frames are only filtered, never modified, so no defensive copies are needed.
    model_df = models_dict[model]
    rank_column = "%s_rank"%(model)

    for dataset in datasets:
        dataset_df = dataset_dict[dataset]

        for cutoff in cutoffs:
            output_file = "output/%s_scores_%s_%s_N%s.txt"%(metric,model,dataset,cutoff)

            map_scores = []
            summed_ap = 0.0

            progress_label = "%s | %s | N=%s"%(model,dataset,cutoff)
            for query_id in tqdm(query_ids, desc=progress_label):
                model_query_subset = model_df[model_df['query_id'] == query_id]
                qrels_query_subset = dataset_df[dataset_df['query_id'] == query_id]

                top_n_ranking = get_top_n_ranking(model_query_subset,rank_column,cutoff)
                relevant_items = get_ranks_relevant_passages(qrels_query_subset,top_n_ranking,model)

                # Queries without a relevant passage in the top N contribute an AP of 0.
                ap = compute_average_precision(relevant_items)
                summed_ap += ap
                map_scores.append((query_id,ap))

            mean_ap = round((summed_ap/len(query_ids))*100,1)
            score_data = "model: %s; dataset: %s; cutoff: %s; %s_score: %s\n"%(model,dataset,cutoff,metric,mean_ap)
            scores.append(score_data)

            # Per-query scores of this setting: "<query_id>\t<average precision>".
            with open(output_file,'w') as outfile:
                for (scored_query_id, query_ap) in map_scores:
                    outfile.write("%s\t%s\n"%(scored_query_id,query_ap))

# One summary line per setting.
with open("output/%s_scores.txt"%(metric),'w') as outfile:
    for score_data in scores:
        outfile.write(score_data)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=42.0), HTML(value='')))




In [7]:
# Show the summary line of every evaluated setting.
if scores:
    print(*scores, sep="\n")

model: bm25; dataset: ms_marco; cutoff: 5; map_score: 47.1

model: bm25; dataset: ms_marco; cutoff: 10; map_score: 49.1

model: bm25; dataset: threshold=2; cutoff: 5; map_score: 94.6

model: bm25; dataset: threshold=2; cutoff: 10; map_score: 92.1

model: bm25; dataset: threshold=3; cutoff: 5; map_score: 88.2

model: bm25; dataset: threshold=3; cutoff: 10; map_score: 81.1

model: bert; dataset: ms_marco; cutoff: 5; map_score: 71.8

model: bert; dataset: ms_marco; cutoff: 10; map_score: 72.8

model: bert; dataset: threshold=2; cutoff: 5; map_score: 88.1

model: bert; dataset: threshold=2; cutoff: 10; map_score: 83.4

model: bert; dataset: threshold=3; cutoff: 5; map_score: 85.9

model: bert; dataset: threshold=3; cutoff: 10; map_score: 81.8

