In [None]:
!pip install --upgrade git+https://github.com/terrierteam/pyterrier_t5.git
!pip install ir_datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/terrierteam/pyterrier_t5.git
  Cloning https://github.com/terrierteam/pyterrier_t5.git to /tmp/pip-req-build-b5j_bij6
  Running command git clone -q https://github.com/terrierteam/pyterrier_t5.git /tmp/pip-req-build-b5j_bij6
Collecting python-terrier>=0.5.0
  Downloading python-terrier-0.8.1.tar.gz (97 kB)
[K     |████████████████████████████████| 97 kB 6.5 MB/s 
Collecting transformers>=4.0.0
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 56.3 MB/s 
Collecting sentencepiece>=0.1.95
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 59.7 MB/s 
Collecting wget
  Downloading wget-3.2.zip (10 kB)
Collecting pyjnius~=1.3.0
  Downloading pyjnius-1.3.0-cp37-cp37m-manylinux2010_x86_64.whl (1.1 MB)
[

In [None]:
# Mount Google Drive and store the ir_datasets home directory on it.
# (`ir_datasets` itself is imported in a later cell.)
import os
from google.colab import drive

drive.mount("/content/drive", force_remount=True)

os.environ.update({'IR_DATASETS_HOME': "/content/drive/MyDrive/Colab Notebooks/Dissertation/ir_datasets"})

# Echo the configured location back as a quick sanity check.
print(os.environ.get('IR_DATASETS_HOME'))

Mounted at /content/drive
/content/drive/MyDrive/Colab Notebooks/Dissertation/ir_datasets


In [None]:
#Importing libraries
import itertools
import json
import pickle
from random import Random

import ir_datasets
import pandas as pd
import torch
from torch.nn import CrossEntropyLoss, Softmax
from transformers import AdamW
from transformers import T5ForConditionalGeneration, T5Tokenizer

torch.cuda.empty_cache()

import pyterrier as pt
if not pt.started():
  pt.init()
# Kept below pt.init(), matching the order of the original cell.
from pyterrier.measures import *
from pyterrier_t5 import MonoT5ReRanker

terrier-assemblies 5.6 jar-with-dependencies not found, downloading to /root/.pyterrier...
Done
terrier-python-helper 0.0.6 jar not found, downloading to /root/.pyterrier...
Done


PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)



## Load Dataset

In [None]:
# Read the pickled training frame from Drive.
# NOTE: unpickling can execute arbitrary code; only load files you produced yourself.
with open('/content/drive/MyDrive/Colab Notebooks/Dissertation/train_data.pkl', mode='rb') as pkl_file:
    data = pickle.load(pkl_file)
data

Unnamed: 0,qid,query,docno,rank,relevance,weight
0,1,A potlatch is considered an example of,4063745-0,1.0,false,0.000000
1,1,A potlatch is considered an example of,4063746-0,2.0,false,0.000000
2,1,A potlatch is considered an example of,4063747-0,3.0,false,0.000000
3,1,A potlatch is considered an example of,4063748-0,4.0,false,0.000000
4,1,A potlatch is considered an example of,4063749-0,5.0,false,0.000000
...,...,...,...,...,...,...
6999995,864260,what is yield to worst,4221105-0,4.0,false,0.000000
6999996,864260,what is yield to worst,4221106-0,5.0,false,0.000000
6999997,864260,what is yield to worst,4221107-0,6.0,false,0.000000
6999998,864260,what is yield to worst,4221108-0,7.0,false,0.000000


## Removing Class Imbalance

In [None]:
# Remove the class imbalance by undersampling the non-relevant rows.

# Rows labelled non-relevant (6,541,972 in the full training frame).
nr = data[data['relevance'] == 'false']

# Rows labelled relevant (458,028 in the full training frame).
r = data[data['relevance'] == 'true']

# Undersampling: draw as many non-relevant rows as there are relevant ones.
# The count is derived from the data instead of a hand-computed fraction, and is
# capped so that sampling without replacement cannot fail on a smaller frame.
# A fixed random_state makes the selection repeatable across kernel restarts.
n_rel = nr.sample(n=min(len(r), len(nr)), random_state=0)
# frac=1 keeps every relevant row but shuffles their order.
rel = r.sample(frac=1, random_state=0)

print(n_rel.shape)
rel.shape

(458028, 6)


(458028, 6)

# T5

### With Weights

In [None]:
# Number of samples per optimisation step; also passed to the reranker as its batch size.
BATCH_SIZE = 16

# Seed torch's random number generator.
torch.manual_seed(0)

# Logger from ir_datasets; used below for progress bars, timing blocks and info messages.
_logger = ir_datasets.log.easy()

# Target strings the model is trained to generate:
# index 0 is used for relevant pairs, index 1 for non-relevant pairs.
OUTPUTS = ['true', 'false']

# Generator that feeds query-document pairs, with their relevance label and weight, to the training loop
def iter_train_samples(relevant=None, non_relevant=None, doc_store=None, labels=None):
  """Endlessly yield ``(prompt, target, weight)`` training triples.

  Relevant and non-relevant rows are walked in lock-step and emitted
  alternately (relevant first). When the shorter frame is exhausted the
  iteration restarts from the top, so the generator never terminates.

  Args:
    relevant: DataFrame with ``query``, ``docno`` and ``weight`` columns.
      Defaults to the notebook-level ``rel``.
    non_relevant: DataFrame with ``query`` and ``docno`` columns.
      Defaults to the notebook-level ``n_rel``.
    doc_store: object whose ``get(docno)`` returns something with a ``.text``
      attribute. Defaults to the notebook-level ``docs``.
    labels: two-item sequence ``[relevant_label, non_relevant_label]``.
      Defaults to the notebook-level ``OUTPUTS``.

  Yields:
    ``(prompt, label, weight)`` where ``prompt`` is
    ``'Query: <query> Document: <text>'``. Relevant rows carry their own
    ``weight`` value; non-relevant rows always get weight ``1``.

  Raises:
    ValueError: if either frame is empty (the loop would otherwise spin
      forever without producing a sample).
  """
  # Globals are resolved at call time, so calling with no arguments behaves as before.
  relevant = rel if relevant is None else relevant
  non_relevant = n_rel if non_relevant is None else non_relevant
  # NOTE(review): no visible cell defines a notebook-level `docs`; it must be
  # created before this generator is consumed (or passed in as `doc_store`).
  doc_store = docs if doc_store is None else doc_store
  labels = OUTPUTS if labels is None else labels

  if len(relevant) == 0 or len(non_relevant) == 0:
    raise ValueError('iter_train_samples needs at least one relevant and one non-relevant row')

  # NOTE(review): confirm this prompt matches the format MonoT5ReRanker builds at inference time.
  while True:
    # Plain tuples of just the needed columns are much cheaper than iterrows().
    rel_rows = relevant[['query', 'docno', 'weight']].itertuples(index=False, name=None)
    nrel_rows = non_relevant[['query', 'docno']].itertuples(index=False, name=None)
    for (r_query, r_docno, r_weight), (nr_query, nr_docno) in zip(rel_rows, nrel_rows):
      yield 'Query: ' + str(r_query) + ' Document: ' + str(doc_store.get(r_docno).text), labels[0], r_weight
      # Non-relevant documents are taken into account by fixing their weight to 1
      yield 'Query: ' + str(nr_query) + ' Document: ' + str(doc_store.get(nr_docno).text), labels[1], 1


# Endless stream of training triples, wrapped in a progress bar.
train_iter = _logger.pbar(iter_train_samples(), desc='total train samples')


# Pretrained T5 model (moved to the GPU), its tokenizer, and the optimiser.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.cuda()
tokenizer = T5Tokenizer.from_pretrained("t5-base")
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-4)


# Reranker used for validation. REL / NREL are set to the first token id
# the tokenizer produces for the relevant / non-relevant label strings.
reranker = MonoT5ReRanker(batch_size=BATCH_SIZE, verbose=False)
for attr_name, label in zip(('REL', 'NREL'), OUTPUTS):
  setattr(reranker, attr_name, tokenizer.encode(label)[0])


                                               [A[INFO] [error] total train samples: [32:59] [27023it] [13.65it/s]


In [None]:
# Validation set: the judged TREC DL 2019 passage-ranking subset.
VALIDATION_DATASET_ID = 'msmarco-passage/trec-dl-2019/judged'

#Function to build validation dataset
def build_validation_data(dataset_id=VALIDATION_DATASET_ID):
  """Build the frame of (query, document) pairs that is reranked during validation.

  The dataset is loaded once; every scored document whose query id is among
  the dataset's queries becomes one row, with the query text and the
  document text looked up from the same dataset.

  Args:
    dataset_id: ir_datasets identifier to load. Defaults to the judged
      TREC DL 2019 passage set.

  Returns:
    DataFrame with columns ``qid``, ``query``, ``docno`` and ``text``.
  """
  dataset = ir_datasets.load(dataset_id)
  docs = dataset.docs_store()
  queries = {q.query_id: q.text for q in dataset.queries_iter()}
  rows = [
      [scored.query_id, queries[scored.query_id], scored.doc_id, docs.get(scored.doc_id).text]
      for scored in _logger.pbar(dataset.scoreddocs, desc='dev data')
      if scored.query_id in queries
  ]
  return pd.DataFrame(rows, columns=['qid', 'query', 'docno', 'text'])

valid_data = build_validation_data()
# Relevance judgements for the same dataset, used to score the reranked results.
valid_qrels = pt.get_dataset('irds:' + VALIDATION_DATASET_ID).get_qrels()

[INFO] [starting] dev data
[INFO] [finished] dev data: [02:21] [41042it] [289.75it/s]


In [None]:
# Training schedule: each "epoch" is a fixed number of samples drawn from the endless train_iter.
NUM_EPOCHS = 10
SAMPLES_PER_EPOCH = 16384
STEPS_PER_EPOCH = SAMPLES_PER_EPOCH // BATCH_SIZE


def _weighted_sequence_loss(logits, target_ids, sample_weights):
  """Per-sample weighted cross-entropy over the target sequence.

  The token-level cross-entropy is averaged over each sample's target
  positions, multiplied by that sample's weight, and then averaged over
  the batch.

  Args:
    logits: model output logits; flattened to (positions, vocab) below.
    target_ids: target token ids, one row per sample.
    sample_weights: 1-D tensor with one weight per sample.

  Returns:
    Scalar loss tensor.
  """
  # reduction='none' keeps one loss value per token so samples can be weighted individually.
  # NOTE(review): targets equal to -100 are ignored, but the tokenizer pads with its own
  # pad id; this is harmless as long as every target has the same token length - confirm.
  loss_fct = CrossEntropyLoss(ignore_index=-100, reduction='none')
  token_loss = loss_fct(logits.view(-1, logits.shape[-1]), target_ids.view(-1))
  # Reshape by the actual batch dimension instead of assuming BATCH_SIZE rows.
  per_sample_loss = token_loss.view(target_ids.shape[0], -1).mean(dim=-1)
  return (per_sample_loss * sample_weights).mean()


epoch = 0
max_ndcg = 0.

while epoch < NUM_EPOCHS:
  with _logger.pbar_raw(desc=f'train {epoch}', total=STEPS_PER_EPOCH) as pbar:
    model.train()
    total_loss = 0.
    count = 0
    for _ in range(STEPS_PER_EPOCH):
      # Pull one batch of (prompt, target, weight) triples from the endless stream.
      batch = [next(train_iter) for _ in range(BATCH_SIZE)]
      inp, out, weight = (list(column) for column in zip(*batch))
      inp_ids = tokenizer(inp, return_tensors='pt', padding=True).input_ids.cuda()
      out_ids = tokenizer(out, return_tensors='pt', padding=True).input_ids.cuda()

      # ------------------------------CUSTOM LOSS FUNCTION---------------------------------------------
      weight_tensor = torch.FloatTensor(weight).cuda()
      logits = model(input_ids=inp_ids, labels=out_ids).logits
      loss = _weighted_sequence_loss(logits, out_ids, weight_tensor)
      # -----------------------------------------------------------------------------------------------

      loss.backward()
      optimizer.step()
      optimizer.zero_grad()
      # Accumulate so that total_loss / count is the running mean over the epoch
      # (previously the value was overwritten, so only the last batch was reported).
      total_loss += loss.item()
      count += 1
      pbar.update(1)
      pbar.set_postfix({'loss': total_loss/count})

  with _logger.duration(f'valid {epoch}'):
    # Leave training mode (e.g. dropout) before scoring; model.train() is
    # called again at the start of the next epoch.
    model.eval()
    reranker.model = model
    reranker.verbose = True
    res = reranker(valid_data)
    reranker.verbose = False
    metrics = {'epoch': epoch, 'loss': total_loss / count}
    metrics.update(pt.Utils.evaluate(res, valid_qrels, metrics=[nDCG, RR(rel=2)]))
    _logger.info(metrics)
    # Append one JSON line per epoch so that runs can be compared afterwards.
    with open('log.jsonl', 'at') as f:
      f.write(json.dumps(metrics) + '\n')
    # A checkpoint is written for every epoch, not only for the best one.
    model.save_pretrained(f'/content/drive/MyDrive/Colab Notebooks/Dissertation/Models/indwgt-nrwgt1-{epoch}')
    if metrics['nDCG'] > max_ndcg:
      _logger.info('New Best nDCG')
      max_ndcg = metrics['nDCG']
  epoch += 1

[INFO] [starting] train 8
[INFO] [starting] total train samples
train 8:   0%|                                         | 0/1024 [00:00<?, ?it/s]
total train samples: 0it [00:00, ?it/s][A
train 8:   0%|                     | 1/1024 [00:01<18:40,  1.09s/it, loss=0.699]
total train samples: 16it [00:01, 14.87it/s][A
train 8:   0%|                     | 2/1024 [00:01<16:42,  1.02it/s, loss=0.228]
train 8:   0%|                     | 3/1024 [00:02<15:23,  1.11it/s, loss=0.188]
train 8:   0%|                     | 4/1024 [00:03<14:34,  1.17it/s, loss=0.116]
train 8:   0%|                     | 5/1024 [00:04<14:53,  1.14it/s, loss=0.141]
train 8:   1%|                     | 6/1024 [00:05<14:18,  1.19it/s, loss=0.136]
train 8:   1%|▏                   | 7/1024 [00:05<13:41,  1.24it/s, loss=0.0948]
train 8:   1%|▏                   | 8/1024 [00:06<13:36,  1.24it/s, loss=0.0805]
train 8:   1%|▏                   | 9/1024 [00:07<13:32,  1.25it/s, loss=0.0936]
train 8:   1%|▏                  | 

monoT5:   0%|          | 0/2566 [00:00<?, ?batches/s]


total train samples: 16368it [12:33, 21.72it/s][A
[A[INFO] {'epoch': 8, 'loss': 0.0005255199503153563, 'nDCG': 0.47436545012680953, 'RR(rel=2)': 0.5062421665246349}

total train samples: 16368it [21:30, 12.68it/s][A
[A[INFO] New Best nDCG

total train samples: 16368it [21:34, 12.65it/s][A
[A[INFO] [finished] valid 8 [09:13]

total train samples: 16368it [21:34, 12.65it/s][A
[A[INFO] [starting] train 9

train 9:   0%|                                         | 0/1024 [00:00<?, ?it/s]
train 9:   0%|                     | 1/1024 [00:00<12:20,  1.38it/s, loss=0.779]
train 9:   0%|                     | 2/1024 [00:01<12:12,  1.40it/s, loss=0.296]
train 9:   0%|                     | 3/1024 [00:01<11:14,  1.51it/s, loss=0.311]
train 9:   0%|                     | 4/1024 [00:02<10:54,  1.56it/s, loss=0.142]
train 9:   0%|                     | 5/1024 [00:03<11:16,  1.51it/s, loss=0.152]
train 9:   1%|                     | 6/1024 [00:04<11:29,  1.48it/s, loss=0.107]
train 9:   1%|▏   

monoT5:   0%|          | 0/2566 [00:00<?, ?batches/s]


total train samples: 32752it [34:03, 16.03it/s][A
[A[INFO] {'epoch': 9, 'loss': 0.0007515743491239846, 'nDCG': 0.492891062515586, 'RR(rel=2)': 0.5688899939860922}

total train samples: 32752it [43:00, 12.69it/s][A
[A[INFO] New Best nDCG

total train samples: 32752it [43:04, 12.67it/s][A
[A[INFO] [finished] valid 9 [09:13]

total train samples: 32752it [43:04, 12.67it/s][A