## Import Libraries

In [1]:
import os
from os import listdir
from os.path import isfile, join

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

## Load Files

In [3]:
# Directory prefix that is prepended to each input file name below.
data_dir = 'data/'

# --- Rankings: top 100 passages per query ---
bm25_top100_filename = 'run_development_top100.tsv'      # BM25
bert_top100_filename = 'bert_thesis_dataset_top100.tsv'  # BERT

# --- Relevance judgements (qrels) ---
# Original MS MARCO relevance file.
og_qrels_filename = 'qrels.dev.small.tsv'
# Thesis labels with a binary threshold of 2: irrelevant < 2, relevant >= 2.
thesis_qrels_threshold2_filename = 'thesis_dataset_binary_threshold2.tsv'
# Thesis labels with a binary threshold of 3: irrelevant < 3, relevant >= 3.
thesis_qrels_threshold3_filename = 'thesis_dataset_binary_threshold3.tsv'

In [4]:
def _read_headerless_tsv(filename, column_names):
    """Read a header-less, tab-separated UTF-8 file from ``data_dir`` and label its columns.

    Parameters
    ----------
    filename : str
        File name; it is appended to ``data_dir`` to form the path.
    column_names : list of str
        Labels assigned to the columns, in file order. Their count must match
        the number of columns in the file, otherwise pandas raises ValueError.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``column_names`` as column labels.
    """
    frame = pd.read_csv(data_dir + filename, delimiter='\t', encoding='utf-8', header=None)
    frame.columns = list(column_names)
    return frame


# The original and both thesis relevance files are given the same four labels.
_qrels_columns = ['query_id', 'label1', 'passage_id', 'label2']

# Rankings.
bm25_df = _read_headerless_tsv(bm25_top100_filename, ['query_id', 'passage_id', 'bm25_rank'])
bert_df = _read_headerless_tsv(
    bert_top100_filename,
    ['query_id', 'passage_id', 'bm25_rank', 'query', 'passage', 'bert_score', 'bert_rank'],
)

# Relevance judgements.
og_qrels_df = _read_headerless_tsv(og_qrels_filename, _qrels_columns)
new_qrels2_df = _read_headerless_tsv(thesis_qrels_threshold2_filename, _qrels_columns)
new_qrels3_df = _read_headerless_tsv(thesis_qrels_threshold3_filename, _qrels_columns)

# Lookup tables keyed by model name / threshold label.
models_dict = {"bm25": bm25_df, "bert": bert_df}
new_qrels_dict = {"threshold=2": new_qrels2_df, "threshold=3": new_qrels3_df}

## Get experiment query ids

In [9]:
# Query ids present in the threshold-2 thesis qrels.
# np.unique removes duplicates and returns the ids in ascending order.
unique_query_ids = np.unique(new_qrels2_df['query_id'].values.tolist())
experiment_query_list = list(unique_query_ids)

## Create subset of original qrels file

In [11]:
# Keep only the original qrels rows whose query is part of the experiment.
og_in_experiment = og_qrels_df['query_id'].isin(experiment_query_list)
og_qrels_exp_subset = og_qrels_df.loc[og_in_experiment]

In [16]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# 'output/' exists first; otherwise a fresh checkout fails on this cell.
os.makedirs('output', exist_ok=True)

# Write the qrels subset as a tab-separated file without index or header row.
og_qrels_exp_subset.to_csv(
    'output/qrels.dev.small.thesis.subset.tsv',
    sep='\t',
    index=False,
    header=False,
)

## Create BM25 run file for trec_eval

In [31]:
# Restrict the BM25 ranking to the experiment queries; .copy() gives an
# independent frame rather than a selection tied to bm25_df.
bm25_in_experiment = bm25_df['query_id'].isin(experiment_query_list)
bm25_subset = bm25_df.loc[bm25_in_experiment].copy()

In [33]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# 'output/' exists first; otherwise a fresh checkout fails on this cell.
os.makedirs('output', exist_ok=True)

# Write the BM25 subset as a tab-separated file without index or header row.
# NOTE(review): the frame has three columns (query_id, passage_id, bm25_rank),
# while trec_eval's run format has six (qid, Q0, docid, rank, score, tag) —
# confirm a conversion step follows before this file is passed to trec_eval.
bm25_subset.to_csv(
    'output/bm25_run_thesis_subset_trec_file.tsv',
    sep='\t',
    index=False,
    header=False,
)

## Create BERT run file for trec_eval

In [17]:
# Show the first row of the BERT ranking to check the column layout.
bert_df.head(n=1)

Unnamed: 0,query_id,passage_id,bm25_rank,query,passage,bert_score,bert_rank
0,9083,7067274,1,is considered the father ...,true hippocrates is considered the father of ...,3.594209,1


In [18]:
# Restrict the BERT ranking to the experiment queries; .copy() gives an
# independent frame rather than a selection tied to bert_df.
bert_in_experiment = bert_df['query_id'].isin(experiment_query_list)
bert_subset = bert_df.loc[bert_in_experiment].copy()

In [20]:
# Keep only the columns written to the run file. Selecting them by name gives
# the same frame as dropping 'bm25_rank', 'query', 'passage' and 'bert_score',
# but the cell can be re-run safely: a second drop() would raise KeyError
# because those columns are already gone.
bert_subset = bert_subset[['query_id', 'passage_id', 'bert_rank']]

In [26]:
# Order rows by query, then by BERT rank within each query (both ascending).
bert_sort_keys = ['query_id', 'bert_rank']
bert_subset = bert_subset.sort_values(by=bert_sort_keys)

In [30]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# 'output/' exists first; otherwise a fresh checkout fails on this cell.
os.makedirs('output', exist_ok=True)

# Write the BERT subset as a tab-separated file without index or header row.
# NOTE(review): trec_eval's run format has six columns (qid, Q0, docid, rank,
# score, tag); confirm the columns left in bert_subset are converted to that
# layout before this file is passed to trec_eval.
bert_subset.to_csv(
    'output/bert_run_thesis_subset_trec_file.tsv',
    sep='\t',
    index=False,
    header=False,
)