In [2]:
# Experiment settings: the ranker whose "<model>_rank" column is evaluated,
# and the rank cutoff N applied to each query's ranking.
model, cutoff = "bm25", 10

# File that receives the FQT scores; its name encodes the model and cutoff.
output_file = "output/fqt_scores_%s_N%s.txt" % (model, cutoff)

## Import Libraries

In [10]:
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join
from tqdm.auto import tqdm 
from tqdm import tqdm_notebook
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

## Load Data

In [6]:
# Directory (with trailing slash) that every input filename is appended to.
data_dir = 'data/'

# Binarised thesis relevance judgements at two thresholds:
#   threshold 2 -> irrelevant < 2, relevant >= 2
#   threshold 3 -> irrelevant < 3, relevant >= 3
thesis_qrels_threshold2_filename = 'thesis_dataset_binary_threshold2.tsv'
thesis_qrels_threshold3_filename = 'thesis_dataset_binary_threshold3.tsv'

# Original MS MARCO relevance file.
og_qrels_filename = 'qrels.dev.small.tsv'

# Top-100 rankings: BM25 first, then BERT.
bm25_top100_filename = 'run_development_top100.tsv'
bert_top100_filename = 'bert_thesis_dataset_top100.tsv'

In [7]:
# Every input is read as a header-less, tab-separated, UTF-8 file, so the
# column names are attached right after each load.

# --- Rankings ---------------------------------------------------------------
bm25_df = pd.read_csv(
    data_dir + bm25_top100_filename,
    sep='\t',
    header=None,
    encoding='utf-8',
)
bm25_df.columns = ['query_id', 'passage_id', 'bm25_rank']

# The BERT file also carries the query/passage columns and both rank columns.
bert_df = pd.read_csv(
    data_dir + bert_top100_filename,
    sep='\t',
    header=None,
    encoding='utf-8',
)
bert_df.columns = [
    'query_id', 'passage_id', 'bm25_rank',
    'query', 'passage',
    'bert_score', 'bert_rank',
]

# --- Relevance judgements (all three share the same four columns) -----------
og_qrels_df = pd.read_csv(
    data_dir + og_qrels_filename,
    sep='\t',
    header=None,
    encoding='utf-8',
)
og_qrels_df.columns = ['query_id', 'label1', 'passage_id', 'label2']

new_qrels2_df = pd.read_csv(
    data_dir + thesis_qrels_threshold2_filename,
    sep='\t',
    header=None,
    encoding='utf-8',
)
new_qrels2_df.columns = ['query_id', 'label1', 'passage_id', 'label2']

new_qrels3_df = pd.read_csv(
    data_dir + thesis_qrels_threshold3_filename,
    sep='\t',
    header=None,
    encoding='utf-8',
)
new_qrels3_df.columns = ['query_id', 'label1', 'passage_id', 'label2']

# Name -> frame lookup tables.
models_dict = {
    "bm25": bm25_df,
    "bert": bert_df,
}
dataset_dict = {
    "ms_marco": og_qrels_df,
    "threshold=2": new_qrels2_df,
    "threshold=3": new_qrels3_df,
}

In [8]:
# Distinct query ids found in the threshold-2 qrels (np.unique returns them sorted).
experiment_query_ids = [*np.unique(new_qrels2_df['query_id'].values.tolist())]

## Helper Functions

In [11]:
# NLTK's English stopword list, held as a set for constant-time membership tests.
stop_words = {*stopwords.words('english')}

In [12]:
def get_query_ids(dataframe):
    """Return the distinct values of the ``query_id`` column as a sorted list."""
    id_values = dataframe['query_id'].tolist()
    distinct_ids = np.unique(id_values)
    return [qid for qid in distinct_ids]

In [13]:
def compute_query_token_occurrences(passage,query):
    """Return the fraction of non-stopword passage tokens that occur in the query.

    Both texts are tokenized with ``word_tokenize``. Passage tokens found in
    ``stop_words`` are dropped; the result is the number of remaining tokens
    that also appear among the query tokens, divided by the number of
    remaining tokens. Matching is exact, with no case folding.

    Args:
        passage: passage text.
        query: query text.

    Returns:
        A float in [0, 1]. 0.0 is returned when no passage token is left after
        stopword removal (e.g. an empty passage), where the ratio would
        otherwise divide by zero.
    """
    passage_tokens = word_tokenize(passage)
    # A set gives O(1) membership tests instead of scanning the token list
    # once per passage token.
    query_tokens = set(word_tokenize(query))
    filtered_passage = [w for w in passage_tokens if w not in stop_words]
    if not filtered_passage:
        return 0.0
    match_count = sum(1 for w in filtered_passage if w in query_tokens)
    return match_count/len(filtered_passage)

In [14]:
def compute_fqt(query_id,model,threshold):
    """Average query-token fraction over one query's top-ranked passages.

    Selects the rows of ``bert_df`` belonging to ``query_id`` whose
    ``<model>_rank`` is at most ``threshold``, scores every selected passage
    with ``compute_query_token_occurrences`` against the query text, and
    divides the summed scores by ``threshold`` (not by the number of rows
    actually selected).
    """
    is_query = bert_df['query_id'] == query_id
    within_cutoff = bert_df['%s_rank'%(model)] <= threshold
    top_n_ranking = bert_df[is_query & within_cutoff]

    passages = top_n_ranking['passage'].values.tolist()
    # All selected rows belong to the same query, so the first row's text is used.
    query = top_n_ranking['query'].values.tolist()[0]

    total = sum(compute_query_token_occurrences(text,query) for text in passages)
    return total/threshold

## Compute FQT

In [16]:
# Working copy of the selected model's ranking frame.
model_df = models_dict[model].copy()

# Rank cutoff N: compute_fqt keeps passages whose rank is <= N.
threshold = cutoff

# (query_id, per-query FQT) pairs, in query-id order.
fqt_scores = []

summed_fqt = 0.0

query_ids = get_query_ids(new_qrels2_df)

# `tqdm` here is the tqdm.auto progress bar imported at the top of the
# notebook; the `tqdm_notebook` alias is deprecated and emits a warning.
for query_id in tqdm(query_ids):

    # Named `query_fqt` so it cannot be confused with the aggregate `fqt` below.
    query_fqt = compute_fqt(query_id,model,threshold)

    summed_fqt += query_fqt

    fqt_scores.append((query_id,query_fqt))

# Mean FQT over all queries, as a percentage rounded to one decimal.
fqt = round((summed_fqt/len(query_ids))*100,1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # This is added back by InteractiveShellApp.init_path()


HBox(children=(FloatProgress(value=0.0, max=43.0), HTML(value='')))




In [17]:
# Display the mean FQT over all queries (a percentage rounded to one decimal).
fqt

16.5

In [1]:
# Write one "<query_id>\t<per-query FQT>" line per query.
# The score written is the per-query value stored in `fqt_scores`, not the
# aggregate `fqt`, which would repeat the same number on every line.
with open(output_file,'w') as outfile:
    for (query_id, query_fqt) in fqt_scores:
        line = "%s\t%s\n"%(query_id,query_fqt)
        outfile.write(line)

NameError: name 'output_file' is not defined