## Import Libraries

In [1]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
from tqdm.auto import tqdm 
from tqdm import tqdm_notebook
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

## Load Data

In [2]:
# Directory that contains the input files named below.
data_dir = "data/"

# Binary relevance labels at threshold 2 (irrelevant < 2; relevant >= 2).
thesis_qrels_threshold2_filename = "thesis_dataset_binary_threshold2.tsv"

# Binary relevance labels at threshold 3 (irrelevant < 3; relevant >= 3).
thesis_qrels_threshold3_filename = "thesis_dataset_binary_threshold3.tsv"

# MS MARCO relevance file.
og_qrels_filename = "qrels.dev.small.tsv"

# BM25 top-100 ranking.
bm25_top100_filename = "run_development_top100.tsv"

# BERT top-100 ranking.
bert_top100_filename = "bert_thesis_dataset_top100.tsv"

# Query ids (and query text) used in the experiment.
thesis_query_subset = "experiment_query_subset.tsv"

In [3]:
def _read_tsv(filename, columns):
    """Read a header-less, tab-separated UTF-8 file from ``data_dir`` and label its columns.

    The labels are assigned after reading, so a mismatch between ``columns``
    and the number of columns in the file raises instead of being silently
    absorbed.
    """
    frame = pd.read_csv(data_dir + filename, sep='\t', encoding='utf-8', header=None)
    frame.columns = columns
    return frame


# The three relevance files are all labelled with the same four columns.
_QRELS_COLUMNS = ['query_id', 'label1', 'passage_id', 'label2']

# Rankings.
bm25_df = _read_tsv(bm25_top100_filename, ['query_id', 'passage_id', 'bm25_rank'])
bert_df = _read_tsv(
    bert_top100_filename,
    ['query_id', 'passage_id', 'bm25_rank', 'query', 'passage', 'bert_score', 'bert_rank'],
)

# Relevance judgements: original MS MARCO plus the two thresholded thesis sets.
og_qrels_df = _read_tsv(og_qrels_filename, _QRELS_COLUMNS)
new_qrels2_df = _read_tsv(thesis_qrels_threshold2_filename, _QRELS_COLUMNS)
new_qrels3_df = _read_tsv(thesis_qrels_threshold3_filename, _QRELS_COLUMNS)

# Queries that take part in the experiment.
query_subset = _read_tsv(thesis_query_subset, ['query_id', 'query'])

# Lookup tables by model name / threshold label.
models_dict = {"bm25": bm25_df, "bert": bert_df}
new_qrels_dict = {"threshold=2": new_qrels2_df, "threshold=3": new_qrels3_df}

In [4]:
# Sorted, de-duplicated query ids found in the threshold-2 relevance file.
experiment_query_ids = list(np.unique(new_qrels2_df['query_id'].tolist()))

## Helper Functions

In [5]:
# English stop words used to filter passage tokens; a set keeps membership tests cheap.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    # The NLTK stop-word corpus is not installed (e.g. a fresh environment).
    # Fetch it once so the notebook survives Restart & Run All, then retry.
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))

In [6]:
def get_query_ids(dataframe):
    """Return the distinct ``query_id`` values of ``dataframe`` in sorted order.

    The result is a plain Python list whose elements are the scalars
    produced by ``np.unique``.
    """
    raw_ids = dataframe['query_id'].tolist()
    distinct_ids = np.unique(raw_ids)
    return list(distinct_ids)

In [10]:
def compute_query_token_occurrences(passage, query):
    """Return the fraction of a passage's distinct content tokens that are absent from the query.

    Despite the function name, the value counts tokens that do NOT occur in
    the query: 1.0 means no content token of the passage appears in the
    query, 0.0 means every one of them does.

    Args:
        passage: Passage text; tokenised with ``word_tokenize``.
        query: Query text; tokenised with ``word_tokenize``.

    Returns:
        ``len(novel) / len(content)`` where ``content`` is the set of distinct
        passage tokens not in ``stop_words`` and ``novel`` is the subset of
        ``content`` that is not a query token. Returns ``0.0`` when the
        passage has no content tokens (empty or stop-word-only passage)
        instead of raising ``ZeroDivisionError``.
    """
    query_tokens = set(word_tokenize(query))

    # NOTE(review): no case folding is applied, so "The" and "the" are distinct
    # tokens both for stop-word filtering and for query matching — confirm this
    # is intended before changing it, as it affects the reported numbers.
    content_tokens = {w for w in word_tokenize(passage) if w not in stop_words}

    # Nothing left to score: avoid dividing by zero.
    if not content_tokens:
        return 0.0

    novel_tokens = content_tokens - query_tokens
    return len(novel_tokens) / len(content_tokens)

In [11]:
def compute_fnt(query_id, model, threshold):
    """Average the per-passage novel-token fraction over one query's top-ranked passages.

    Rows are always read from the module-level ``bert_df``; ``model`` only
    selects which rank column (``'<model>_rank'``) is compared against
    ``threshold``.

    Args:
        query_id: Value matched against ``bert_df['query_id']``.
        model: Prefix of the rank column to use, e.g. ``'bm25'`` or ``'bert'``.
        threshold: Largest rank (inclusive) a passage may have to be included.

    Returns:
        The mean of ``compute_query_token_occurrences(passage, query)`` over
        the selected passages. The mean is taken over the number of passages
        actually selected, so it stays correct when the filter yields fewer
        or more than ``threshold`` rows.

    Raises:
        ValueError: If no row matches ``query_id`` within the rank cut-off.
    """
    rank_column = '%s_rank' % (model)
    in_top_n = (bert_df['query_id'] == query_id) & (bert_df[rank_column] <= threshold)
    top_n_ranking = bert_df[in_top_n]

    if top_n_ranking.empty:
        raise ValueError(
            'no passages for query_id=%r with %s <= %r' % (query_id, rank_column, threshold)
        )

    passages = top_n_ranking['passage'].tolist()
    # Every selected row belongs to the same query, so the first row's text is used.
    query = top_n_ranking['query'].iloc[0]

    total_fraction = sum(compute_query_token_occurrences(passage, query) for passage in passages)
    return total_fraction / len(passages)

## Compute FNT top 10 ranking

In [12]:
# Sorted, unique query ids of the experiment subset; the per-query results are keyed by these.
query_ids = get_query_ids(query_subset)

In [14]:
# Rank cut-off passed to compute_fnt: passages with rank <= 10 are scored.
threshold = 10

# Per-query FNT for each ranking, keyed by query id.
bm25_fnt_dict = {qid: compute_fnt(qid, 'bm25', threshold) for qid in query_ids}
bert_fnt_dict = {qid: compute_fnt(qid, 'bert', threshold) for qid in query_ids}

In [16]:
# Mean FNT over all experiment queries for the BM25 ranking.
np.mean(list(bm25_fnt_dict.values()))

0.8905316563629885

In [17]:
# Mean FNT over all experiment queries for the BERT ranking.
np.mean(list(bert_fnt_dict.values()))

0.8997616449286722

In [18]:
# Number of queries whose BERT FNT is strictly greater than their BM25 FNT.
bert_higher = sum(
    1
    for qid in query_ids
    if bert_fnt_dict[qid] > bm25_fnt_dict[qid]
)

In [20]:
# Share of queries for which BERT's FNT is strictly higher than BM25's.
bert_higher/len(bert_fnt_dict)

0.6511627906976745

In [21]:
from scipy.stats import wilcoxon

In [24]:
# Wilcoxon signed-rank test on the per-query FNT values of both rankings.
# The two samples are aligned explicitly by query id.
bm25_scores = [bm25_fnt_dict[qid] for qid in query_ids]
bert_scores = [bert_fnt_dict[qid] for qid in query_ids]
stat, p = wilcoxon(bm25_scores, bert_scores)

In [26]:
# Compare the p-value against the significance level and report the outcome.
alpha = 0.05
verdict = (
    'Same distribution (fail to reject H0)'
    if p > alpha
    else 'Different distribution (reject H0)'
)
print(verdict)

Different distribution (reject H0)


Dezelfde conclusies kunnen getrokken worden als in het paper. De verschillen zijn echter zo klein dat je ze ook weer in twijfel kunt trekken: de gemiddelden (0,891 voor BM25 tegenover 0,900 voor BERT) liggen namelijk zeer dicht bij elkaar.