# 4.1 Retrieval BERT - reduced

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive. The data/model paths used later are
# relative ('./drive/My Drive/...'), so they presumably rely on the working
# directory being /content -- TODO confirm.
drive.mount('/content/drive')

Mounted at /content/drive


## Read in Data

In [None]:
import json
import numpy as np
import gc
import pickle

In [None]:
# Curated training claims, parsed from JSON into `train_claims`.
with open('./drive/My Drive/LAB/COMP90042 A3/data/curated/train_claims2.json') as train_file:
    train_claims = json.load(train_file)

In [None]:
# Raw development claims, parsed from JSON into `dev_claims`.
with open('./drive/My Drive/LAB/COMP90042 A3/data/raw/dev-claims.json') as dev_file:
    dev_claims = json.load(dev_file)

In [None]:
# Curated (labelled) test claims, parsed from JSON into `test_claims`.
with open('./drive/My Drive/LAB/COMP90042 A3/data/curated/test_claims2.json') as test_file:
    test_claims = json.load(test_file)

In [None]:
# Unlabelled claims (only used for retrieval at the end of the notebook),
# parsed from JSON into `future_claims`.
with open('./drive/My Drive/LAB/COMP90042 A3/data/raw/test-claims-unlabelled.json') as future_file:
    future_claims = json.load(future_file)

In [None]:
# Evidence corpus: `evidence[evidence_id]` is looked up later to get passage text.
with open('./drive/My Drive/LAB/COMP90042 A3/data/raw/evidence.json') as evidence_file:
    evidence = json.load(evidence_file)

In [None]:
import random
# Seed Python's `random` module; it drives the `random.choice` negative
# sampling in the dataset-building cells below.
random.seed(19260817)

## Reduce claims (keep only the evidence passages cited by a labelled claim)

In [None]:
# Collect the id of every evidence passage that is cited by at least one
# labelled claim (train, dev or test). This reduced pool is what negatives are
# sampled from and what the retriever later ranks.
scientific_claims_id = set()
for claims in (train_claims, dev_claims, test_claims):
  for claim in claims.values():
    scientific_claims_id.update(claim['evidences'])

# Sort instead of list(set): the iteration order of a set of strings changes
# between interpreter sessions (hash randomisation), which would make the
# seeded random.choice sampling below irreproducible.
scientific_claims_id = sorted(scientific_claims_id)

### Create dataset used to train retriever

In [None]:
# Negative pairs sampled per positive pair: each claim gets
# len(evidences) * NEG_SAMPLE_FACTOR negatives in the cells below.
NEG_SAMPLE_FACTOR = 1

In [None]:
# Build labelled ((claim_text, evidence_text), label) pairs for training:
# label 1 for each gold evidence of a claim, label 0 for randomly sampled
# passages from the reduced evidence pool that are not gold for that claim.
training_data = []

for claim in train_claims.values():

  claim_text = claim['claim_text']
  gold_evid_ids = set(claim['evidences'])  # set => O(1) rejection test

  # positive pairs
  for evid_id in claim['evidences']:
    training_data.append(((claim_text, evidence[evid_id]), 1))

  # Guard: with no non-gold candidate the rejection loop below would never
  # terminate (short-circuits on the first non-gold id, so it is cheap).
  if all(evid_id in gold_evid_ids for evid_id in scientific_claims_id):
    continue

  # negative sampling (with replacement, rejecting gold evidence)
  n_negatives = len(claim['evidences']) * NEG_SAMPLE_FACTOR
  neg_sampled = 0
  while neg_sampled < n_negatives:
    sampled_neg_evid_id = random.choice(scientific_claims_id)
    if sampled_neg_evid_id not in gold_evid_ids:
      neg_sampled += 1
      training_data.append(((claim_text, evidence[sampled_neg_evid_id]), 0))

In [None]:
# Build labelled ((claim_text, evidence_text), label) pairs for development:
# label 1 for each gold evidence of a claim, label 0 for randomly sampled
# passages from the reduced evidence pool that are not gold for that claim.
dev_data = []

for claim in dev_claims.values():

  claim_text = claim['claim_text']
  gold_evid_ids = set(claim['evidences'])  # set => O(1) rejection test

  # positive pairs
  for evid_id in claim['evidences']:
    dev_data.append(((claim_text, evidence[evid_id]), 1))

  # Guard: with no non-gold candidate the rejection loop below would never
  # terminate (short-circuits on the first non-gold id, so it is cheap).
  if all(evid_id in gold_evid_ids for evid_id in scientific_claims_id):
    continue

  # negative sampling (with replacement, rejecting gold evidence)
  n_negatives = len(claim['evidences']) * NEG_SAMPLE_FACTOR
  neg_sampled = 0
  while neg_sampled < n_negatives:
    sampled_neg_evid_id = random.choice(scientific_claims_id)
    if sampled_neg_evid_id not in gold_evid_ids:
      neg_sampled += 1
      dev_data.append(((claim_text, evidence[sampled_neg_evid_id]), 0))

In [None]:
# Build labelled ((claim_text, evidence_text), label) pairs for testing:
# label 1 for each gold evidence of a claim, label 0 for randomly sampled
# passages from the reduced evidence pool that are not gold for that claim.
test_data = []

for claim in test_claims.values():

  claim_text = claim['claim_text']
  gold_evid_ids = set(claim['evidences'])  # set => O(1) rejection test

  # positive pairs
  for evid_id in claim['evidences']:
    test_data.append(((claim_text, evidence[evid_id]), 1))

  # Guard: with no non-gold candidate the rejection loop below would never
  # terminate (short-circuits on the first non-gold id, so it is cheap).
  if all(evid_id in gold_evid_ids for evid_id in scientific_claims_id):
    continue

  # negative sampling (with replacement, rejecting gold evidence)
  n_negatives = len(claim['evidences']) * NEG_SAMPLE_FACTOR
  neg_sampled = 0
  while neg_sampled < n_negatives:
    sampled_neg_evid_id = random.choice(scientific_claims_id)
    if sampled_neg_evid_id not in gold_evid_ids:
      neg_sampled += 1
      test_data.append(((claim_text, evidence[sampled_neg_evid_id]), 0))

In [None]:
!pip install torch torchvision transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m103.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m106.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


## Build model, dataloaders, etc.

In [None]:
from transformers import BertModel

In [None]:
from transformers import BertTokenizer

# Load the pretrained 'bert-base-uncased' tokenizer into the module-level
# `tokenizer` variable.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
import pandas as pd

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset of labelled sentence pairs encoded as BERT inputs.

    Every element of ``data`` has the form ``((sentence1, sentence2), label)``.
    An item is encoded as ``[CLS] sentence1 [SEP] sentence2 [SEP]`` and padded
    or truncated to exactly ``maxlen`` tokens.
    """

    def __init__(self, data, maxlen, tokenizer=None):
        """
        Args:
            data: indexable collection of ``((sentence1, sentence2), label)``.
            maxlen: fixed length of every encoded sequence.
            tokenizer: optional object providing ``tokenize`` and
                ``convert_tokens_to_ids``; when omitted, the pretrained
                'bert-base-uncased' tokenizer is loaded.
        """
        self.data = data

        # Reuse a caller-supplied tokenizer when given, otherwise load BERT's.
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer = tokenizer

        self.maxlen = maxlen

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(token_ids, attention_mask, segment_ids, label)``.

        The three tensors are 1-D with ``maxlen`` entries; ``label`` is
        returned exactly as stored in ``data``.
        """
        pair = self.data[index][0]
        label = self.data[index][1]

        # Use the instance tokenizer (not a notebook global) so the dataset
        # does not depend on hidden kernel state.
        tokens1 = ['[CLS]'] + self.tokenizer.tokenize(pair[0]) + ['[SEP]']
        tokens2 = self.tokenizer.tokenize(pair[1]) + ['[SEP]']
        tokens = tokens1 + tokens2

        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]'] * (self.maxlen - len(tokens))
        else:
            # Truncate, but keep a closing [SEP] as the final token.
            tokens = tokens[:self.maxlen - 1] + ['[SEP]']

        tokens_ids_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))

        # 1 for real tokens, 0 for padding; relies on [PAD] having id 0.
        attn_mask = (tokens_ids_tensor != 0).long()

        # Segment 0 covers the first sentence, segment 1 everything after it
        # (padding included, as before). Clamp to maxlen so an over-long first
        # sentence cannot produce more segment ids than tokens.
        n_first = min(len(tokens1), self.maxlen)
        seg_ids_tensor = torch.tensor([0] * n_first + [1] * (self.maxlen - n_first))

        return tokens_ids_tensor, attn_mask, seg_ids_tensor, label

In [None]:
from torch.utils.data import DataLoader

#Creating instances of the training, development and test sets
#maxlen sets the maximum length an encoded sentence pair can have
#any pair longer than this length is truncated to the maxlen size
train_set = Dataset(training_data, maxlen = 512)
dev_set = Dataset(dev_data, maxlen = 512)
test_set = Dataset(test_data, maxlen = 512)

#Creating instances of the training, development and test dataloaders
#Only the training loader shuffles: evaluation loaders keep a fixed order so
#that batch composition (and the metrics averaged over batches) is the same
#on every evaluation run.
train_loader = DataLoader(train_set, batch_size = 16, shuffle = True, num_workers = 2)
dev_loader = DataLoader(dev_set, batch_size = 16, shuffle = False, num_workers = 2)
test_loader = DataLoader(test_set, batch_size = 16, shuffle = False, num_workers = 2)

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

class RelatednessClassifier(nn.Module):
    """Pretrained BERT encoder topped with one linear unit.

    The forward pass returns a single raw logit per input sequence, computed
    from the hidden state at position 0 (the [CLS] token).
    """

    def __init__(self):
        super().__init__()
        # Encoder initialised from the 'bert-base-uncased' checkpoint.
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        # Linear head: 768-dimensional [CLS] vector -> 1 logit (binary task).
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks, seg_ids):
        """Compute one logit per sequence.

        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks,
             used to avoid contribution of PAD tokens
            -seg_ids : Tensor passed to BERT as ``token_type_ids``
        Returns:
            Tensor of shape [B, 1] with the raw (pre-sigmoid) logits.
        """
        encoded = self.bert_layer(
            seq,
            attention_mask=attn_masks,
            token_type_ids=seg_ids,
            return_dict=True,
        )

        # Hidden state of the first token of every sequence.
        cls_rep = encoded.last_hidden_state[:, 0]

        return self.cls_layer(cls_rep)

In [None]:
# Index of the CUDA device used by every `.cuda(gpu)` call in this notebook.
gpu = 0 #gpu ID

# NOTE(review): the messages say "sentiment classifier", but the object built
# here is the RelatednessClassifier defined above.
print("Creating the sentiment classifier, initialised with pretrained BERT-BASE parameters...")
net = RelatednessClassifier()
net.cuda(gpu) #Enable gpu support for the model
print("Done creating the sentiment classifier.")

Creating the sentiment classifier, initialised with pretrained BERT-BASE parameters...


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done creating the sentiment classifier.


### Setup Training

In [None]:
import torch.nn as nn
import torch.optim as optim

# Loss applied to the raw logits returned by the classifier (sigmoid is part
# of the loss itself).
criterion = nn.BCEWithLogitsLoss()
# Adam over all parameters of `net` (encoder and linear head alike).
opti = optim.Adam(params=net.parameters(), lr=2e-5)

In [None]:
def get_accuracy_from_logits(logits, labels):
    """Fraction of examples whose thresholded prediction equals its label.

    Logits go through a sigmoid and count as class 1 when the probability is
    strictly greater than 0.5. Returns a 0-dim float tensor.
    """
    predicted = (torch.sigmoid(logits).reshape(-1) > 0.5).long()
    matches = predicted == labels
    return matches.float().mean()

def evaluate(net, criterion, dataloader, gpu):
    """Compute accuracy and loss of `net` over every batch of `dataloader`.

    Both metrics are weighted by batch size, so a smaller final batch does not
    distort the averages (a plain mean of per-batch means would).

    Args:
        net: model called as ``net(seq, attn_masks, seg_ids)``.
        criterion: loss called as ``criterion(logits.squeeze(-1), labels.float())``;
            assumed to return the mean loss of the batch (the default
            reduction of the loss used in this notebook).
        dataloader: iterable of ``(seq, attn_masks, seg_ids, labels)`` batches.
        gpu: CUDA device index the batches are moved to. When CUDA is not
            available the batches are left where they are.

    Returns:
        ``(accuracy, loss)``: accuracy as a 0-dim tensor, loss as a float.

    Raises:
        ValueError: if the dataloader yields no examples.
    """
    net.eval()

    total_correct, total_loss = 0.0, 0.0
    n_examples = 0
    use_cuda = torch.cuda.is_available()

    with torch.no_grad():
        for seq, attn_masks, seg_ids_tensor, labels in dataloader:
            if use_cuda:
                seq, attn_masks, seg_ids_tensor, labels = seq.cuda(gpu), attn_masks.cuda(gpu), seg_ids_tensor.cuda(gpu), labels.cuda(gpu)
            logits = net(seq, attn_masks, seg_ids_tensor)

            batch_size = labels.size(0)
            # Turn the per-batch means back into per-batch sums.
            total_loss += criterion(logits.squeeze(-1), labels.float()).item() * batch_size
            total_correct += get_accuracy_from_logits(logits, labels) * batch_size
            n_examples += batch_size

    if n_examples == 0:
        raise ValueError("evaluate() received a dataloader with no examples")

    return total_correct / n_examples, total_loss / n_examples

In [None]:
import time

def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):
    """Fine-tune `net` and checkpoint it whenever development accuracy improves.

    Each epoch runs one pass over `train_loader`, then calls `evaluate` on
    `dev_loader`. On a new best development accuracy the model is written
    twice: its state dict to the `.dat` file and the whole pickled module to
    the `.pt` file, matching the two loading cells under "Read In Model".

    Args:
        net: model called as ``net(seq, attn_masks, seg_ids)``.
        criterion: loss called as ``criterion(logits.squeeze(-1), labels.float())``.
        opti: optimiser over the parameters of `net`.
        train_loader: iterable of ``(seq, attn_masks, seg_ids, labels)`` batches.
        dev_loader: loader handed to `evaluate` after every epoch.
        max_eps: number of epochs to run.
        gpu: CUDA device index the batches are moved to. When CUDA is not
            available the batches are left where they are.
    """
    # Separate files: writing both checkpoints to one path would leave only
    # the second (full-module) pickle on disk.
    state_dict_path = './drive/My Drive/LAB/COMP90042 A3/models/Retrievers/Retriever_Bert_reduced.dat'
    full_model_path = './drive/My Drive/LAB/COMP90042 A3/models/Retrievers/Retriever_Bert_reduced.pt'

    use_cuda = torch.cuda.is_available()
    best_acc = 0
    st = time.time()
    for ep in range(max_eps):

        net.train()
        for it, (seq, attn_masks, seg_ids_tensor, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()
            #Converting these to cuda tensors
            if use_cuda:
                seq, attn_masks, seg_ids_tensor, labels = seq.cuda(gpu), attn_masks.cuda(gpu), seg_ids_tensor.cuda(gpu), labels.cuda(gpu)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks, seg_ids_tensor)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()

            if it % 100 == 0:
                #Accuracy of the current batch, from the logits computed before the step;
                #the reported time is the time elapsed since the previous log line
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep, loss.item(), acc, (time.time()-st)))
                st = time.time()

        dev_acc, dev_loss = evaluate(net, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(net.state_dict(), state_dict_path)
            torch.save(net, full_model_path)

In [None]:
# Number of passes over the training loader.
num_epoch = 5

# Fine-tune the model; the best checkpoint (by development accuracy) is saved
# from inside train().
train(net, criterion, opti, train_loader, dev_loader, max_eps=num_epoch, gpu=gpu)

## Read In Model

In [None]:
# Load a saved state dict into the existing `net`. The value displayed for
# this cell is load_state_dict's key-matching report.
net.load_state_dict(torch.load('./drive/My Drive/LAB/COMP90042 A3/models/Retrievers/Retriever_Bert_reduced.dat'))

<All keys matched successfully>

In [None]:
# Replace `net` with the fully pickled module; the RelatednessClassifier class
# must already be defined in the kernel for unpickling to work.
# SECURITY: this unpickles arbitrary Python objects -- only load checkpoint
# files you created yourself.
# weights_only=False is spelled out because recent PyTorch releases default to
# weights_only=True, which refuses to load a full-module pickle.
net = torch.load('./drive/My Drive/LAB/COMP90042 A3/models/Retrievers/Retriever_Bert_reduced.pt', weights_only=False)

In [None]:
# Put the model in evaluation mode (turns off the Dropout layers listed in the
# repr below) before it is used for prediction; the cell displays the module.
net.eval()

RelatednessClassifier(
  (bert_layer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

## Make Predictions (to get Retrievals)

In [None]:
class PredictDataset(torch.utils.data.Dataset):
    """Map-style dataset of unlabelled sentence pairs encoded as BERT inputs.

    Every element of ``data`` has the form ``(sentence1, sentence2)``. An item
    is encoded as ``[CLS] sentence1 [SEP] sentence2 [SEP]`` and padded or
    truncated to exactly ``maxlen`` tokens.
    """

    def __init__(self, data, maxlen, tokenizer=None):
        """
        Args:
            data: indexable collection of ``(sentence1, sentence2)`` pairs.
            maxlen: fixed length of every encoded sequence.
            tokenizer: optional object providing ``tokenize`` and
                ``convert_tokens_to_ids``; when omitted, the pretrained
                'bert-base-uncased' tokenizer is loaded. Passing one avoids
                reloading the tokenizer for every dataset instance.
        """
        self.data = data

        # Reuse a caller-supplied tokenizer when given, otherwise load BERT's.
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer = tokenizer

        self.maxlen = maxlen

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(token_ids, attention_mask, segment_ids)``.

        All three are 1-D tensors with ``maxlen`` entries.
        """
        sentence1 = self.data[index][0]
        sentence2 = self.data[index][1]

        # Use the instance tokenizer (not a notebook global) so the dataset
        # does not depend on hidden kernel state.
        tokens1 = ['[CLS]'] + self.tokenizer.tokenize(sentence1) + ['[SEP]']
        tokens2 = self.tokenizer.tokenize(sentence2) + ['[SEP]']
        tokens = tokens1 + tokens2

        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]'] * (self.maxlen - len(tokens))
        else:
            # Truncate, but keep a closing [SEP] as the final token.
            tokens = tokens[:self.maxlen - 1] + ['[SEP]']

        tokens_ids_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))

        # 1 for real tokens, 0 for padding; relies on [PAD] having id 0.
        attn_mask = (tokens_ids_tensor != 0).long()

        # Segment 0 covers the first sentence, segment 1 everything after it
        # (padding included, as before). Clamp to maxlen so an over-long first
        # sentence cannot produce more segment ids than tokens.
        n_first = min(len(tokens1), self.maxlen)
        seg_ids_tensor = torch.tensor([0] * n_first + [1] * (self.maxlen - n_first))

        return tokens_ids_tensor, attn_mask, seg_ids_tensor

In [None]:
# Release cached CUDA memory held by PyTorch, then run Python's garbage
# collector; the cell displays gc.collect()'s return value.
torch.cuda.empty_cache()
gc.collect()

21

In [None]:
# (evidence_id, evidence_text) for every passage in the reduced evidence pool.
EVIDENCE = [(evid_id, evidence[evid_id]) for evid_id in scientific_claims_id]

In [None]:
def get_retrievals(claims, file_name, SIZE=32, top_k=5):
  """Rank every passage in EVIDENCE against each claim and keep the best ones.

  Results are checkpointed to
  ``predictions/Retrievals/{file_name}.pickle`` after every claim, and claims
  already present in that file are skipped, so an interrupted run can resume.

  Args:
      claims: dict mapping claim id -> dict with a 'claim_text' entry.
      file_name: base name (no extension) of the pickle checkpoint.
      SIZE: batch size used when scoring (claim, evidence) pairs.
      top_k: number of highest-scoring passages kept per claim.

  Relies on the notebook globals `EVIDENCE`, `net`, `gpu` and `PredictDataset`.
  """
  retrieval_path = f'./drive/My Drive/LAB/COMP90042 A3/predictions/Retrievals/{file_name}.pickle'

  # Resume from a previous run when a checkpoint exists.
  # SECURITY: pickle.load can execute arbitrary code -- only point this at
  # files written by this notebook.
  try:
    with open(retrieval_path, 'rb') as f:
      retrievals = pickle.load(f)
  except FileNotFoundError:
    retrievals = {}
  except (EOFError, pickle.UnpicklingError) as err:
    # e.g. a write that was interrupted half-way; say so instead of hiding it
    print(f'Could not read checkpoint {retrieval_path} ({err!r}); starting from scratch')
    retrievals = {}

  # Sorted copy: fixed candidate order without mutating the global list.
  candidates = sorted(EVIDENCE, key = lambda pair: pair[0])

  # Make sure dropout is off while scoring.
  net.eval()
  use_cuda = torch.cuda.is_available()

  for claim_id in claims:
    if claim_id in retrievals:
      print('pass:', claim_id)
      continue

    print(claim_id)

    claim_text = claims[claim_id]['claim_text']
    data_for_predict = [(claim_text, candidate[1]) for candidate in candidates]

    set_for_predict = PredictDataset(data_for_predict, maxlen = 512)

    # No shuffling: the i-th score must line up with the i-th candidate.
    predict_loader = DataLoader(set_for_predict, batch_size = SIZE, num_workers = 2)

    scores = []
    with torch.no_grad():
      # PredictDataset yields three tensors and the classifier takes three.
      for tokens_ids_tensor, attn_mask, seg_ids_tensor in predict_loader:
        if use_cuda:
          tokens_ids_tensor, attn_mask, seg_ids_tensor = tokens_ids_tensor.cuda(gpu), attn_mask.cuda(gpu), seg_ids_tensor.cuda(gpu)

        # [B, 1] logits -> flat list of B floats
        scores.extend(net(tokens_ids_tensor, attn_mask, seg_ids_tensor).squeeze(-1).tolist())

    assert len(scores) == len(candidates), "one score per candidate expected"

    predicted_logit = sorted(zip(candidates, scores), key = lambda pair: pair[1], reverse = True)
    predicted_logit = predicted_logit[:top_k]

    # NOTE(review): each stored entry is the whole (evidence_id, evidence_text)
    # tuple from EVIDENCE, exactly as the previous implementation stored it.
    # Confirm with the downstream consumer whether bare ids were intended.
    retrievals[claim_id] = {'evidences': [candidate for candidate, _ in predicted_logit]}

    # Checkpoint after every claim so progress survives an interruption.
    with open(retrieval_path, 'wb') as f:
      pickle.dump(retrievals, f)

    # Free memory once per claim rather than once per batch.
    torch.cuda.empty_cache()
    gc.collect()

In [None]:
# Retrieve evidence for the development claims.
get_retrievals(claims=dev_claims, file_name='Retrieval_BERT_reduced_dev', SIZE=32)

In [None]:
# Retrieve evidence for the labelled test claims.
get_retrievals(claims=test_claims, file_name='Retrieval_BERT_reduced_test', SIZE=32)

In [None]:
# Retrieve evidence for the unlabelled claims.
get_retrievals(claims=future_claims, file_name='Retrieval_BERT_reduced_future', SIZE=32)