## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
from tqdm.auto import tqdm 
from tqdm import tqdm_notebook
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

## Load Data

In [2]:
# Relative path of the directory that holds every input TSV file used below.
data_dir = 'data/'

# Binary relevance labels, threshold 2 (irrelevant < 2; relevant >= 2)
thesis_qrels_threshold2_filename = 'thesis_dataset_binary_threshold2.tsv'

# Binary relevance labels, threshold 3 (irrelevant < 3; relevant >= 3)
thesis_qrels_threshold3_filename = 'thesis_dataset_binary_threshold3.tsv'

# MS MARCO relevance file (qrels)
og_qrels_filename = 'qrels.dev.small.tsv'

# BM25 top-100 ranking
bm25_top100_filename = 'run_development_top100.tsv'

# BERT top-100 ranking
bert_top100_filename = 'bert_thesis_dataset_top100.tsv'

# Query ids (and query text) used in the experiment
thesis_query_subset = 'experiment_query_subset.tsv'

In [3]:
# Every input is a header-less, tab-separated UTF-8 file, so column names are
# assigned by hand right after loading. Assigning `.columns` raises if the
# number of names does not match the number of columns in the file.

# BM25 ranking: query id, passage id and the passage's BM25 rank.
bm25_df = pd.read_csv(data_dir + bm25_top100_filename,delimiter='\t',encoding='utf-8',header=None)
bm25_df.columns = ['query_id', 'passage_id', 'bm25_rank']

# BERT ranking: carries the BM25 rank as well as the BERT score/rank, plus the
# query and passage text. This is the only frame here that holds the text.
bert_df = pd.read_csv(data_dir + bert_top100_filename,delimiter='\t',encoding='utf-8',header=None)
bert_df.columns = ['query_id', 'passage_id', 'bm25_rank', 'query', 'passage', 'bert_score', 'bert_rank']

# Relevance files. NOTE(review): 'label1'/'label2' are placeholder names;
# presumably these follow the TREC qrels layout — confirm against the files.
og_qrels_df = pd.read_csv(data_dir + og_qrels_filename,delimiter='\t',encoding='utf-8',header=None)
og_qrels_df.columns = ['query_id','label1','passage_id','label2']

new_qrels2_df = pd.read_csv(data_dir + thesis_qrels_threshold2_filename,delimiter='\t',encoding='utf-8',header=None)
new_qrels2_df.columns = ['query_id','label1','passage_id','label2']

new_qrels3_df = pd.read_csv(data_dir + thesis_qrels_threshold3_filename,delimiter='\t',encoding='utf-8',header=None)
new_qrels3_df.columns = ['query_id','label1','passage_id','label2']

# Queries selected for the experiment (id and text).
query_subset = pd.read_csv(data_dir + thesis_query_subset,delimiter='\t',encoding='utf-8',header=None)
query_subset.columns = ['query_id', 'query']

# Lookup tables from a display name to the corresponding frame.
models_dict = {"bm25": bm25_df, "bert": bert_df}
new_qrels_dict = {"threshold=2": new_qrels2_df, "threshold=3": new_qrels3_df}

In [4]:
# Distinct query ids present in the threshold-2 relevance file
# (np.unique returns them sorted in ascending order).
experiment_query_ids = list(np.unique(new_qrels2_df['query_id'].values.tolist()))

## Helper Functions

In [5]:
# English stop-word list from NLTK, kept as a set for fast membership tests.
# NOTE(review): needs the NLTK 'stopwords' corpus to be installed locally
# (typically via nltk.download('stopwords')) — confirm for fresh environments.
stop_words = set(stopwords.words('english'))

In [6]:
def get_query_ids(dataframe):
    """Return the distinct ``query_id`` values of ``dataframe`` as a list.

    The ids are de-duplicated with ``np.unique``, so the result is sorted in
    ascending order.
    """
    raw_ids = dataframe['query_id'].tolist()
    distinct_ids = np.unique(raw_ids)
    return list(distinct_ids)

In [7]:
def compute_query_token_occurrences(passage,query):
    """Return the fraction of non-stop-word passage tokens that occur in the query.

    Both texts are tokenized with ``word_tokenize``. Passage tokens found in
    ``stop_words`` are dropped, and the remaining tokens are counted as matches
    when they also appear among the query tokens. Repeated passage tokens are
    counted each time they occur.

    Parameters:
        passage: passage text.
        query: query text.

    Returns:
        A float in [0, 1]; ``0.0`` when the passage has no tokens left after
        stop-word removal (instead of dividing by zero).

    NOTE(review): tokens are compared exactly as tokenized (no lower-casing),
    so capitalised stop words such as "The" survive the filter if the
    stop-word list is lower-case — confirm whether that is intended.
    """
    passage_tokens = word_tokenize(passage)
    # A set gives constant-time lookups; membership results are identical to
    # probing the token list.
    query_tokens = set(word_tokenize(query))

    filtered_passage = [tok for tok in passage_tokens if tok not in stop_words]
    if not filtered_passage:
        # Nothing left to match against: define the ratio as zero.
        return 0.0

    match_count = sum(1 for tok in filtered_passage if tok in query_tokens)
    return match_count / len(filtered_passage)

In [8]:
def compute_fqt(query_id,model,threshold):
    """Average query-token ratio over the top-``threshold`` passages of one query.

    Rows are taken from ``bert_df`` where ``query_id`` matches and the
    ``<model>_rank`` column is at most ``threshold``. For each selected passage
    ``compute_query_token_occurrences`` is evaluated against the query text, and
    the sum is divided by ``threshold`` (not by the number of selected rows).

    Raises IndexError when no row matches, because the query text is read from
    the first selected row.
    """
    rank_column = '%s_rank' % model
    matches_query = bert_df['query_id'] == query_id
    within_cutoff = bert_df[rank_column] <= threshold
    selected_rows = bert_df[matches_query & within_cutoff]

    passage_texts = selected_rows['passage'].values.tolist()
    query_text = selected_rows['query'].values.tolist()[0]

    total = 0
    for passage_text in passage_texts:
        total += compute_query_token_occurrences(passage_text, query_text)
    return total / threshold

In [9]:
def get_top_n_ranking(dataframe,rank_column,n):
    """Return the rows whose ``rank_column`` value is at most ``n``, sorted by that column.

    The input frame is not modified; a new, ascending-sorted frame is returned.
    """
    within_cutoff = dataframe[rank_column] <= n
    return dataframe[within_cutoff].sort_values(by=[rank_column])

In [10]:
def get_passage_ids(dataframe):
    """Return the ``passage_id`` column of ``dataframe`` as a plain Python list.

    Order and duplicates are kept exactly as they appear in the frame.
    """
    passage_column = dataframe['passage_id']
    return passage_column.values.tolist()

In [12]:
def compute_mfr(gt,ranking,model,n):
    """Return the best (lowest) rank of a relevant passage, capped at ``n + 1``.

    Parameters:
        gt: collection of relevant passage ids.
        ranking: DataFrame with a ``passage_id`` column and a ``<model>_rank``
            column.
        model: model name used to build the rank column name (``'%s_rank'``).
        n: rank cut-off; ``n + 1`` is the score given when no relevant passage
            has a rank below ``n + 1``.

    Returns:
        The smallest rank among rows whose ``passage_id`` is in ``gt``, or
        ``n + 1`` when there is no such row (or the ranking is empty).
    """
    miss_score = n + 1
    if ranking.empty:
        return miss_score

    rank_column = '%s_rank' % model
    # Vectorised membership test on the whole column: avoids the per-row
    # dtype upcasting of iterrows() and a linear scan of `gt` for every row.
    is_relevant = ranking['passage_id'].isin(gt)
    relevant_ranks = ranking.loc[is_relevant, rank_column].tolist()

    # The sentinel goes first so that it wins ties and is returned unchanged
    # when no relevant rank is strictly better.
    return min([miss_score] + relevant_ranks)

## Compute FQT Top 10 ranking

In [23]:
# Rank cut-off used for the FQT computation of both models.
threshold = 10
query_ids = get_query_ids(query_subset)

# FQT per query id, one dictionary per ranking model.
bm25_fqt_dict, bert_fqt_dict = {}, {}

for query_id in query_ids:
    bert_fqt_dict[query_id] = compute_fqt(query_id, 'bert', threshold)
    bm25_fqt_dict[query_id] = compute_fqt(query_id, 'bm25', threshold)

## Compute Average MFR per FQT Range

In [13]:
# Rank cut-off for the MFR cells below: rankings are truncated to the top N
# passages, and N + 1 is the score when no relevant passage is found.
N = 10

In [14]:
# FQT buckets as (low, high) pairs; compute_group maps an FQT value to the
# index of the bucket it falls into.
intervals = [(0.0,0.1), (0.1,0.15), (0.15,0.2), (0.2, 0.25), (0.25,1.0)]

In [18]:
def compute_group(fqt, bins=None):
    """Return the index of the interval that contains ``fqt``.

    Every interval is half-open, ``low <= fqt < high``, except the last one,
    which also includes its upper bound so that the maximum value (for example
    an FQT of exactly 1.0) is still assigned to a group.

    Parameters:
        fqt: value to classify.
        bins: optional sequence of ``(low, high)`` pairs; defaults to the
            notebook-level ``intervals``.

    Returns:
        The zero-based interval index, or ``None`` when ``fqt`` lies outside
        every interval.
    """
    if bins is None:
        bins = intervals

    last_index = len(bins) - 1
    for index, (low, high) in enumerate(bins):
        if index == last_index:
            # Closed upper bound on the final interval only.
            if low <= fqt <= high:
                return index
        elif low <= fqt < high:
            return index
    return None

In [25]:
# One bucket of query ids per FQT interval, for each model. The number of
# buckets is derived from `intervals` so the two can never get out of sync.
bm25_groups = [[] for _ in intervals]
bert_groups = [[] for _ in intervals]

for query_id in query_ids:
    bm25_fqt = bm25_fqt_dict[query_id]
    bert_fqt = bert_fqt_dict[query_id]
    # compute_group returns the index of the interval containing the FQT value.
    bm25_groups[compute_group(bm25_fqt)].append(query_id)
    bert_groups[compute_group(bert_fqt)].append(query_id)

# Number of queries that landed in each bucket.
bm25_group_counts = [len(group) for group in bm25_groups]
bert_group_counts = [len(group) for group in bert_groups]

### Compute group MFRs for MS MARCO

In [28]:
# Mean first-relevant rank (MFR) per FQT group, judged against the MS MARCO
# relevance file (og_qrels_df).
bm25_ms_mfr_per_group = []
bert_ms_mfr_per_group = []

# Both models run through the same loop; only the ranking frame, the FQT groups
# and the result list differ. Loop-local names are deliberately distinct from
# the notebook-level `query_ids` and `query_subset`, so earlier cells can still
# be re-run after this one.
for model_name, model_ranking_df, model_groups, mfr_per_group in (
    ("bm25", bm25_df, bm25_groups, bm25_ms_mfr_per_group),
    ("bert", bert_df, bert_groups, bert_ms_mfr_per_group),
):
    rank_column = '%s_rank' % model_name

    for group_query_ids in model_groups:
        mfr = 0.0

        for query_id in group_query_ids:
            # Get query specific ranking and relevance datasets
            query_ranking = model_ranking_df[model_ranking_df['query_id'] == query_id]
            qrels_query_subset = og_qrels_df[og_qrels_df['query_id'] == query_id]

            top_n_ranking = get_top_n_ranking(query_ranking, rank_column, N)
            relevant_passages = get_passage_ids(qrels_query_subset)

            mfr += compute_mfr(relevant_passages, top_n_ranking, model_name, N)

        # An empty group has no mean: record NaN instead of dividing by zero.
        if group_query_ids:
            mfr_per_group.append(round(mfr / len(group_query_ids), 2))
        else:
            mfr_per_group.append(float('nan'))

In [30]:
bm25_ms_mfr_per_group

[7.33, 2.58, 4.14, 4.83, 6.6]

In [29]:
bert_ms_mfr_per_group

[2.56, 2.69, 2.2, 3.67, 4.33]

### Compute group MFRs for threshold 2

In [31]:
# Mean first-relevant rank (MFR) per FQT group, judged against the threshold-2
# relevance file (new_qrels2_df).
bm25_t2_mfr_per_group = []
bert_t2_mfr_per_group = []

# Both models run through the same loop; only the ranking frame, the FQT groups
# and the result list differ. Loop-local names are deliberately distinct from
# the notebook-level `query_ids` and `query_subset`, so earlier cells can still
# be re-run after this one.
for model_name, model_ranking_df, model_groups, mfr_per_group in (
    ("bm25", bm25_df, bm25_groups, bm25_t2_mfr_per_group),
    ("bert", bert_df, bert_groups, bert_t2_mfr_per_group),
):
    rank_column = '%s_rank' % model_name

    for group_query_ids in model_groups:
        mfr = 0.0

        for query_id in group_query_ids:
            # Get query specific ranking and relevance datasets
            query_ranking = model_ranking_df[model_ranking_df['query_id'] == query_id]
            qrels_query_subset = new_qrels2_df[new_qrels2_df['query_id'] == query_id]

            top_n_ranking = get_top_n_ranking(query_ranking, rank_column, N)
            relevant_passages = get_passage_ids(qrels_query_subset)

            mfr += compute_mfr(relevant_passages, top_n_ranking, model_name, N)

        # An empty group has no mean: record NaN instead of dividing by zero.
        if group_query_ids:
            mfr_per_group.append(round(mfr / len(group_query_ids), 2))
        else:
            mfr_per_group.append(float('nan'))

In [32]:
bm25_t2_mfr_per_group

[1.17, 1.08, 1.0, 1.0, 1.0]

In [33]:
bert_t2_mfr_per_group

[1.11, 1.46, 1.0, 2.67, 1.0]

### Compute group MFRs for threshold 3

In [34]:
# Mean first-relevant rank (MFR) per FQT group, judged against the threshold-3
# relevance file (new_qrels3_df).
bm25_t3_mfr_per_group = []
bert_t3_mfr_per_group = []

# Both models run through the same loop; only the ranking frame, the FQT groups
# and the result list differ. Loop-local names are deliberately distinct from
# the notebook-level `query_ids` and `query_subset`, so earlier cells can still
# be re-run after this one.
for model_name, model_ranking_df, model_groups, mfr_per_group in (
    ("bm25", bm25_df, bm25_groups, bm25_t3_mfr_per_group),
    ("bert", bert_df, bert_groups, bert_t3_mfr_per_group),
):
    rank_column = '%s_rank' % model_name

    for group_query_ids in model_groups:
        mfr = 0.0

        for query_id in group_query_ids:
            # Get query specific ranking and relevance datasets
            query_ranking = model_ranking_df[model_ranking_df['query_id'] == query_id]
            qrels_query_subset = new_qrels3_df[new_qrels3_df['query_id'] == query_id]

            top_n_ranking = get_top_n_ranking(query_ranking, rank_column, N)
            relevant_passages = get_passage_ids(qrels_query_subset)

            mfr += compute_mfr(relevant_passages, top_n_ranking, model_name, N)

        # An empty group has no mean: record NaN instead of dividing by zero.
        if group_query_ids:
            mfr_per_group.append(round(mfr / len(group_query_ids), 2))
        else:
            mfr_per_group.append(float('nan'))

In [35]:
bm25_t3_mfr_per_group

[1.5, 1.08, 1.79, 1.0, 1.2]

In [36]:
bert_t3_mfr_per_group

[1.11, 2.0, 1.07, 2.67, 1.33]