## Import Libraries

In [1]:
import pandas as pd
import json
import numpy as np
from collections import Counter

## Load Data

In [2]:
# Folder that holds every input file referenced below
data_dir = "data/"

# Query file
query_subset_filename = "queries.dev.small.tsv"

# Experiment query ids
thesis_query_subset = "experiment_query_subset.tsv"

# MS MARCO relevance file
og_qrels_filename = "qrels.dev.small.tsv"

# Thesis relevance file, binary threshold 2 (irrelevant < 2; relevant >= 2)
thesis_qrels_threshold2_filename = "thesis_dataset_binary_threshold2.tsv"

# Thesis relevance file, binary threshold 3 (irrelevant < 3; relevant >= 3)
thesis_qrels_threshold3_filename = "thesis_dataset_binary_threshold3.tsv"

# BM25 top 100 ranking
bm25_top100_filename = "run_development_top100.tsv"

# BERT top 100 ranking
bert_top100_filename = "bert_thesis_dataset_top100.tsv"

In [17]:
def _read_tsv(filename, column_names):
    """Read a header-less, tab-separated UTF-8 file from ``data_dir`` and name its columns."""
    frame = pd.read_csv(data_dir + filename, sep='\t', encoding='utf-8', header=None)
    frame.columns = column_names
    return frame


# The three relevance files share one four-column layout.
_qrels_columns = ['query_id', 'label1', 'passage_id', 'label2']

# Rankings produced by the two retrieval models
bm25_df = _read_tsv(bm25_top100_filename, ['query_id', 'passage_id', 'bm25_rank'])
bert_df = _read_tsv(
    bert_top100_filename,
    ['query_id', 'passage_id', 'bm25_rank', 'query', 'passage', 'bert_score', 'bert_rank'],
)

# Relevance judgements: original MS MARCO plus the two thresholded thesis sets
og_qrels_df = _read_tsv(og_qrels_filename, list(_qrels_columns))
new_qrels2_df = _read_tsv(thesis_qrels_threshold2_filename, list(_qrels_columns))
new_qrels3_df = _read_tsv(thesis_qrels_threshold3_filename, list(_qrels_columns))

# Queries used in the experiment
query_subset = _read_tsv(thesis_query_subset, ['query_id', 'query'])

# Lookup tables keyed by model name / threshold label
models_dict = {"bm25": bm25_df, "bert": bert_df}
new_qrels_dict = {"threshold=2": new_qrels2_df, "threshold=3": new_qrels3_df}

In [18]:
# Build a query-text -> answer-type lookup from the answer-type file.
# Lines of the form "<query>\t<answer type>" contribute the part of the answer
# type before the first ":"; non-blank lines without a tab are recorded with an
# empty answer type. The two counters track how many lines of each kind were seen.
firebase_answer_types = {}
counter_answer_types = 0
counter_no_answer_types = 0
# Decode explicitly as UTF-8, like every other input in this notebook: the query
# texts are later used as keys against query_subset['query'], so a
# locale-dependent default encoding could make the lookups miss.
with open(data_dir + "firebase_answer_types.txt", "r", encoding='utf-8') as infile:
    for line in infile:
        stripped_line = line.rstrip()
        if "\t" in stripped_line:
            counter_answer_types += 1
            [query_text,answer_type] = stripped_line.split("\t")
            firebase_answer_types[query_text] = answer_type.split(":")[0]
        elif stripped_line != "":
            counter_no_answer_types += 1
            firebase_answer_types[stripped_line] = ""

In [19]:
# Apply manually assigned answer types on top of the automatically derived ones.
# Each non-blank line must be "<query>\t<answer type>"; an entry here overrides
# any value already stored for the same query text.
with open(data_dir + "manual_answer_types.txt", "r", encoding='utf-8') as infile:
    for line in infile:
        entry = line.rstrip()
        if entry == "":
            # Tolerate blank lines (e.g. at the end of the file) instead of
            # failing on the unpacking below.
            continue
        [query_text,answer_type] = entry.split("\t")
        firebase_answer_types[query_text] = answer_type

In [20]:
# Distinct answer types in sorted order. Built with the standard library so the
# elements are plain `str` rather than numpy string scalars, which would
# otherwise end up as dictionary keys in the result cells below.
answer_types = sorted(set(firebase_answer_types.values()))

## Helper Functions

In [7]:
def get_query_ids(dataframe):
    """Return the distinct values of the ``query_id`` column as a sorted list."""
    all_ids = dataframe['query_id'].tolist()
    distinct_ids = np.unique(all_ids)
    return list(distinct_ids)

In [8]:
def get_top_n_ranking(dataframe,rank_column,n):
    """Return the rows whose ``rank_column`` value is at most ``n``, ordered by that column.

    The input frame is left untouched; a new, sorted frame is returned.
    """
    within_cutoff = dataframe[rank_column] <= n
    return dataframe.loc[within_cutoff].sort_values(by=[rank_column])

In [9]:
def get_ranks_relevant_passages(qrels_query_subset,top_n_ranking,model):
    """Return the sorted ranks at which the passages listed in the qrels appear in a ranking.

    Parameters
    ----------
    qrels_query_subset : DataFrame with at least ``query_id`` and ``passage_id`` columns.
    top_n_ranking : DataFrame with ``query_id``, ``passage_id`` and a ``<model>_rank`` column.
    model : str prefix of the rank column to read, e.g. ``"bm25"`` -> ``bm25_rank``.

    Returns
    -------
    list of int
        Ascending ranks of the qrels passages that are present in ``top_n_ranking``.
        Passages missing from the ranking are left out; the list may be empty.
    """
    rank_column = '%s_rank'%(model)
    # Left join keeps every qrels row; passages absent from the ranking get NaN.
    merged = qrels_query_subset.merge(top_n_ranking,how='left',on=['query_id','passage_id'])
    # dropna() replaces the former `~np.isnan(...)` filter, which relied on
    # bitwise-not happening to act as logical-not on numpy booleans.
    found_ranks = merged[rank_column].dropna()
    return sorted(int(rank) for rank in found_ranks)

In [10]:
def compute_precision(index,rank):
    """Return the ratio ``index / rank``.

    The caller in this notebook passes the 1-based position of a relevant
    passage among the relevant passages as ``index`` and its rank as ``rank``.
    """
    return index / rank

In [11]:
def compute_average_precision(sorted_ranks_relevant_items):
    """Average the precision values obtained at each rank in the given list.

    The i-th entry (1-based) contributes ``compute_precision(i, rank)``; the
    sum is divided by the number of entries. An empty list yields ``0.0``.
    """
    relevant_count = len(sorted_ranks_relevant_items)
    if relevant_count == 0:
        return 0.0

    precision_total = 0.0
    for position, rank in enumerate(sorted_ranks_relevant_items, start=1):
        precision_total += compute_precision(position, rank)
    return precision_total / relevant_count

## Compute Dict for answer types and query ids

In [22]:
# Sorted, de-duplicated query ids of the experiment query subset
query_ids = get_query_ids(query_subset)

In [23]:
# Group the experiment query ids by the answer type of their query text.
answer_type_query_id_dict = {}
for query_id in query_ids:
    # Text of the first row that carries this query id
    query_text = query_subset.loc[query_subset['query_id'] == query_id, 'query'].iloc[0]
    answer_type = firebase_answer_types[query_text]
    # setdefault creates the list the first time an answer type is seen
    answer_type_query_id_dict.setdefault(answer_type, []).append(query_id)

In [24]:
# Show each answer type followed by its number of queries, one value per line.
for answer_type in answer_types:
    print(answer_type, len(answer_type_query_id_dict[answer_type]), sep="\n")

DESC
26
ENTY
1
HUM
3
LOC
1
NUM
12


In [25]:
# Rank cutoff: only passages ranked within the top N of each model are
# considered when computing the MAP values in the sections below.
N = 10

## Compute MAP per answer type MS MARCO

In [29]:
# Mean average precision (in percent, one decimal) per answer type for BM25 and
# BERT, judged against the original MS MARCO relevance file.
# NOTE(review): every qrels row is treated as a relevant passage; `label2` is
# never inspected. Confirm the relevance file only lists relevant passages.
bm25_map_answer_types = {}
bert_map_answer_types = {}

for answer_type in answer_types:
    # Use a dedicated name so the notebook-level `query_ids` (all experiment
    # queries) is not overwritten by one answer type's ids.
    answer_type_query_ids = answer_type_query_id_dict[answer_type]
    qrels_subset_df = og_qrels_df[og_qrels_df['query_id'].isin(answer_type_query_ids)]

    # Running sums of the per-query average precision
    bm25_map = 0.0
    bert_map = 0.0

    for query_id in answer_type_query_ids:
        bm25_subset = bm25_df[bm25_df['query_id'] == query_id]
        bert_subset = bert_df[bert_df['query_id'] == query_id]
        qrels_subset = qrels_subset_df[qrels_subset_df['query_id'] == query_id]

        top_n_bm25_ranking = get_top_n_ranking(bm25_subset,'bm25_rank',N)
        top_n_bert_ranking = get_top_n_ranking(bert_subset,'bert_rank',N)

        bm25_relevant_items = get_ranks_relevant_passages(qrels_subset,top_n_bm25_ranking,"bm25")
        bert_relevant_items = get_ranks_relevant_passages(qrels_subset,top_n_bert_ranking,"bert")

        bm25_map += compute_average_precision(bm25_relevant_items)
        bert_map += compute_average_precision(bert_relevant_items)

    # Mean over the queries of this answer type, expressed as a percentage
    bm25_map_answer_types[answer_type] = round((bm25_map/len(answer_type_query_ids))*100,1)
    bert_map_answer_types[answer_type] = round((bert_map/len(answer_type_query_ids))*100,1)

In [30]:
bm25_map_answer_types

{'DESC': 45.2, 'ENTY': 100.0, 'HUM': 66.7, 'LOC': 50.0, 'NUM': 36.4}

In [31]:
bert_map_answer_types

{'DESC': 71.2, 'ENTY': 50.0, 'HUM': 77.8, 'LOC': 50.0, 'NUM': 63.7}

## Compute MAP per answer type T=2

In [32]:
# Mean average precision (in percent, one decimal) per answer type for BM25 and
# BERT, judged against the thesis relevance file with binary threshold 2.
# NOTE(review): every qrels row is treated as a relevant passage; `label2` is
# never inspected. Confirm the thresholded file only lists relevant passages.
bm25_map_answer_types = {}
bert_map_answer_types = {}

for answer_type in answer_types:
    # Use a dedicated name so the notebook-level `query_ids` (all experiment
    # queries) is not overwritten by one answer type's ids.
    answer_type_query_ids = answer_type_query_id_dict[answer_type]
    qrels_subset_df = new_qrels2_df[new_qrels2_df['query_id'].isin(answer_type_query_ids)]

    # Running sums of the per-query average precision
    bm25_map = 0.0
    bert_map = 0.0

    for query_id in answer_type_query_ids:
        bm25_subset = bm25_df[bm25_df['query_id'] == query_id]
        bert_subset = bert_df[bert_df['query_id'] == query_id]
        qrels_subset = qrels_subset_df[qrels_subset_df['query_id'] == query_id]

        top_n_bm25_ranking = get_top_n_ranking(bm25_subset,'bm25_rank',N)
        top_n_bert_ranking = get_top_n_ranking(bert_subset,'bert_rank',N)

        bm25_relevant_items = get_ranks_relevant_passages(qrels_subset,top_n_bm25_ranking,"bm25")
        bert_relevant_items = get_ranks_relevant_passages(qrels_subset,top_n_bert_ranking,"bert")

        bm25_map += compute_average_precision(bm25_relevant_items)
        bert_map += compute_average_precision(bert_relevant_items)

    # Mean over the queries of this answer type, expressed as a percentage
    bm25_map_answer_types[answer_type] = round((bm25_map/len(answer_type_query_ids))*100,1)
    bert_map_answer_types[answer_type] = round((bert_map/len(answer_type_query_ids))*100,1)

In [33]:
bm25_map_answer_types

{'DESC': 89.6, 'ENTY': 100.0, 'HUM': 100.0, 'LOC': 95.3, 'NUM': 94.7}

In [34]:
bert_map_answer_types

{'DESC': 82.2, 'ENTY': 93.3, 'HUM': 72.9, 'LOC': 85.5, 'NUM': 75.4}

## Compute MAP per answer type T=3

In [35]:
# Mean average precision (in percent, one decimal) per answer type for BM25 and
# BERT, judged against the thesis relevance file with binary threshold 3.
# NOTE(review): every qrels row is treated as a relevant passage; `label2` is
# never inspected. Confirm the thresholded file only lists relevant passages.
bm25_map_answer_types = {}
bert_map_answer_types = {}

for answer_type in answer_types:
    # Use a dedicated name so the notebook-level `query_ids` (all experiment
    # queries) is not overwritten by one answer type's ids.
    answer_type_query_ids = answer_type_query_id_dict[answer_type]
    qrels_subset_df = new_qrels3_df[new_qrels3_df['query_id'].isin(answer_type_query_ids)]

    # Running sums of the per-query average precision
    bm25_map = 0.0
    bert_map = 0.0

    for query_id in answer_type_query_ids:
        bm25_subset = bm25_df[bm25_df['query_id'] == query_id]
        bert_subset = bert_df[bert_df['query_id'] == query_id]
        qrels_subset = qrels_subset_df[qrels_subset_df['query_id'] == query_id]

        top_n_bm25_ranking = get_top_n_ranking(bm25_subset,'bm25_rank',N)
        top_n_bert_ranking = get_top_n_ranking(bert_subset,'bert_rank',N)

        bm25_relevant_items = get_ranks_relevant_passages(qrels_subset,top_n_bm25_ranking,"bm25")
        bert_relevant_items = get_ranks_relevant_passages(qrels_subset,top_n_bert_ranking,"bert")

        bm25_map += compute_average_precision(bm25_relevant_items)
        bert_map += compute_average_precision(bert_relevant_items)

    # Mean over the queries of this answer type, expressed as a percentage
    bm25_map_answer_types[answer_type] = round((bm25_map/len(answer_type_query_ids))*100,1)
    bert_map_answer_types[answer_type] = round((bert_map/len(answer_type_query_ids))*100,1)

In [36]:
bm25_map_answer_types

{'DESC': 81.2, 'ENTY': 100.0, 'HUM': 93.7, 'LOC': 91.1, 'NUM': 74.0}

In [37]:
bert_map_answer_types

{'DESC': 81.1, 'ENTY': 93.3, 'HUM': 72.4, 'LOC': 70.9, 'NUM': 71.0}