In [None]:
!pip install --upgrade git+https://github.com/terrierteam/pyterrier_t5.git
!pip install ir_datasets

In [None]:
from google.colab import drive
import os

# Attach Google Drive to the Colab VM; force_remount=True remounts even if a
# previous mount is still present.
drive.mount("/content/drive", force_remount=True)

# Keep the ir_datasets home directory on Drive.
# NOTE(review): this must run before `import ir_datasets` for the setting to
# be picked up — presumably the library reads it at import/first use; confirm.
os.environ.update(
    {'IR_DATASETS_HOME': "/content/drive/MyDrive/Colab Notebooks/Dissertation/ir_datasets"}
)

# Echo the value back as a quick sanity check.
print(os.environ.get('IR_DATASETS_HOME'))

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Dissertation/ir_datasets


In [None]:
# Libraries used throughout the notebook, plus one-off runtime initialisation.
import ir_datasets
import torch
# Ask torch to release cached CUDA memory.
# NOTE(review): nothing earlier in this notebook allocates GPU tensors, so on
# a fresh kernel this is presumably a no-op — confirm it is still needed.
torch.cuda.empty_cache()
import pandas as pd
import pyterrier as pt
# Initialise PyTerrier only if it has not been started yet, so the cell can be
# re-run without calling pt.init() twice.
if not pt.started():
  pt.init()

# pickle is used at the end of the notebook to save the train / eval frames.
import pickle

# Counter tallies how many relevant documents were seen at each rank.
from collections import Counter

terrier-assemblies 5.6 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.6 jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)



In [None]:
# Two handles on the same collection: the raw ir_datasets object (used for the
# document store) and the PyTerrier wrapper (used for DataFrame accessors).
dataset = ir_datasets.load("msmarco-qna/train")
docs = dataset.docs_store()
qna_dataset = pt.get_dataset('irds:msmarco-qna/train')

# Relevance judgements: drop the 'iteration' column and expose the label
# under the name 'relevance'.
qrels = (
    qna_dataset.get_qrels()
    .drop(columns='iteration')
    .rename(columns={'label': 'relevance'})
)

# Scored documents: drop the 'type' / 'answers' columns, and rename
# 'score' -> 'rank' and 'text' -> 'query'.
sdoc = (
    qna_dataset.get_results()
    .drop(columns=['type', 'answers'])
    .rename(columns={'score': 'rank', 'text': 'query'})
)

# Turn the stored score into a rank via rank = 1 - score.
# NOTE(review): presumably the scores are 0, -1, -2, ... so this yields
# 1-based ranks (the displayed frame starts at rank 1.0) — confirm.
sdoc['rank'] = 1 - sdoc['rank']

In [None]:
# Combine judgements and rankings: keep only (qid, docno) pairs present in
# both frames, sorted by the join keys, with a fixed column order.
all_info = qrels.merge(sdoc, on=['qid', 'docno'], how='inner', sort=True)[
    ['qid', 'query', 'docno', 'rank', 'relevance']
]
print("Total records: ", len(all_info))
all_info

Total records:  8070091


Unnamed: 0,qid,query,docno,rank,relevance
0,1,A potlatch is considered an example of,4063745-0,1.0,0
1,1,A potlatch is considered an example of,4063746-0,2.0,0
2,1,A potlatch is considered an example of,4063747-0,3.0,0
3,1,A potlatch is considered an example of,4063748-0,4.0,0
4,1,A potlatch is considered an example of,4063749-0,5.0,0
...,...,...,...,...,...
8070086,999999,where is westminster ca,4452277-0,5.0,0
8070087,999999,where is westminster ca,4452278-0,6.0,0
8070088,999999,where is westminster ca,4452279-0,8.0,1
8070089,999999,where is westminster ca,4452280-0,9.0,0


In [None]:
# Split the merged records into a training part (about the first 7M rows) and
# an evaluation part (everything after it).
#
# Two safeguards compared with a plain positional slice:
#   * the cut is moved back to a query boundary, so all rows of one qid end up
#     on the same side (a fixed row offset can land in the middle of a query's
#     candidate list and leak that query into both splits);
#   * the split only happens while `all_info` still holds more than TRAIN_ROWS
#     rows, so re-running this cell after `all_info` has been truncated does
#     not replace `eval_data` with an empty frame.
TRAIN_ROWS = 7000000

if len(all_info) > TRAIN_ROWS:
    qid_values = all_info['qid'].to_numpy()
    # Walk back from the nominal cut while the previous row still belongs to
    # the query that occupies row TRAIN_ROWS.
    split_at = TRAIN_ROWS
    while split_at > 0 and qid_values[split_at - 1] == qid_values[TRAIN_ROWS]:
        split_at -= 1
    eval_data = all_info.iloc[split_at:]
    all_info = all_info.iloc[:split_at]
elif 'eval_data' not in globals():
    # First run on a frame that fits entirely within the training budget:
    # everything stays in `all_info` and the evaluation split is empty.
    eval_data = all_info.iloc[TRAIN_ROWS:]

In [None]:
# Derive an inverse-propensity weight for every rank from the training rows.

# Ranks at which a relevant (relevance == 1) document appears.
pos = all_info.loc[all_info['relevance'].eq(1), 'rank']
position_freq_train = Counter(pos)

# One row per rank; the 'weight' column initially holds the raw count of
# relevant documents observed at that rank.
train_score = (
    pd.DataFrame.from_dict(position_freq_train, orient='index')
    .reset_index()
    .rename(columns={'index': 'rank', 0: 'weight'})
)

# propensity(rank) = count(rank) / total relevant, so the inverse-propensity
# weight is total relevant / count(rank).
train_score['weight'] = len(pos) / train_score['weight']

# Relevant documents at rank 11 or deeper all receive a fixed weight of 16.
deep_ranks = train_score['rank'] >= 11
train_score.loc[deep_ranks, 'weight'] = 16

# Tag every row as relevance == 1 and order the table by rank.
train_score = (
    train_score
    .assign(relevance=1)
    .sort_values(by=['rank'])
    .reset_index(drop=True)
)
train_score


Unnamed: 0,rank,weight,relevance
0,1.0,5.373771,1
1,2.0,7.212471,1
2,3.0,8.702793,1
3,4.0,10.024469,1
4,5.0,11.091878,1
5,6.0,12.129336,1
6,7.0,12.887313,1
7,8.0,13.670029,1
8,9.0,14.63395,1
9,10.0,14.547961,1


In [None]:
# Final training dataset: attach the per-rank weight to every training row.
# The left join matches on (relevance, rank); train_score only contains
# relevance == 1 rows, so non-relevant rows come back with a missing weight.
data = pd.merge(all_info, train_score, how='left', on=['relevance', 'rank'])

# Give unmatched rows a weight of 0. Only the 'weight' column is filled, so a
# genuinely missing value in any other column stays visible instead of being
# silently turned into 0.
data['weight'] = data['weight'].fillna(0)

# Convert the 0/1 labels to the strings 'false' / 'true' in a single pass.
# Building a new column avoids writing strings into an integer column through
# .loc, which relies on silent dtype upcasting. Any other label value is left
# unchanged.
data['relevance'] = data['relevance'].replace({0: 'false', 1: 'true'})
data

Unnamed: 0,qid,query,docno,rank,relevance,weight
0,1,A potlatch is considered an example of,4063745-0,1.0,0,0.000000
1,1,A potlatch is considered an example of,4063746-0,2.0,0,0.000000
2,1,A potlatch is considered an example of,4063747-0,3.0,0,0.000000
3,1,A potlatch is considered an example of,4063748-0,4.0,0,0.000000
4,1,A potlatch is considered an example of,4063749-0,5.0,0,0.000000
...,...,...,...,...,...,...
6999995,864260,what is yield to worst,4221105-0,4.0,0,0.000000
6999996,864260,what is yield to worst,4221106-0,5.0,0,0.000000
6999997,864260,what is yield to worst,4221107-0,6.0,0,0.000000
6999998,864260,what is yield to worst,4221108-0,7.0,0,0.000000


In [None]:
# Persist both splits to Google Drive as pickle files: the weighted training
# frame first, then the evaluation frame.
for frame, target in (
    (data, '/content/drive/MyDrive/Colab Notebooks/Dissertation/Training and Eval/train_data.pkl'),
    (eval_data, '/content/drive/MyDrive/Colab Notebooks/Dissertation/Training and Eval/eval_data.pkl'),
):
    with open(target, "wb") as handle:
        pickle.dump(frame, handle)