# 6.2 Regressor for number of evidence to take (based on claim)

- Treat the number of evidence passages to use for each claim as an ordinal variable, and predict it from the claim text with a regression model.

In [None]:
# Mount Google Drive into the Colab runtime; the data-loading cells below read JSON files from paths under "My Drive".
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import json
import numpy as np

In [None]:
# Training claims. The file is decoded explicitly as UTF-8 so parsing does not
# depend on the runtime's locale default encoding.
with open('./drive/My Drive/LAB/COMP90042 A3/data/raw/train-claims.json', encoding='utf-8') as f:
    train_claims = json.load(f)

In [None]:
# Development claims. The file is decoded explicitly as UTF-8 so parsing does not
# depend on the runtime's locale default encoding.
with open('./drive/My Drive/LAB/COMP90042 A3/data/raw/dev-claims.json', encoding='utf-8') as f:
    dev_claims = json.load(f)

In [None]:
# Evidence collection. The file is decoded explicitly as UTF-8 so parsing does not
# depend on the runtime's locale default encoding.
with open('./drive/My Drive/LAB/COMP90042 A3/data/raw/evidence.json', encoding='utf-8') as f:
    evidence = json.load(f)

In [None]:
import random
# Seeds only Python's built-in `random` generator; it does not touch PyTorch's RNG.
# NOTE(review): no call into `random` is visible in this part of the notebook - confirm it is still needed.
random.seed(19260817)

In [None]:
# Snapshot of what iterating over `evidence` yields, materialised as a list.
evid_id_list = list(evidence)

In [None]:
# One (claim_text, number_of_gold_evidences) pair per training claim.
# Built with a comprehension so no loop variable named `id` is left behind in the
# notebook namespace, where it would shadow the builtin `id()`.
training_data = [
  (claim['claim_text'], len(claim['evidences']))
  for claim in train_claims.values()
]

In [None]:
# One (claim_text, number_of_gold_evidences) pair per development claim.
# Built with a comprehension so no loop variable named `id` is left behind in the
# notebook namespace, where it would shadow the builtin `id()`.
dev_data = [
  (claim['claim_text'], len(claim['evidences']))
  for claim in dev_claims.values()
]

In [None]:
!pip install torch torchvision transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import BertModel

In [None]:
# Stand-alone pretrained 'bert-base-uncased' encoder.
# NOTE(review): the only references to `bert_model` in this notebook section are in
# commented-out cells, and nEvidRegressor loads its own encoder - confirm this load is still needed.
bert_model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from transformers import BertTokenizer

# Module-level tokenizer loaded from the same 'bert-base-uncased' checkpoint name as the encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# sentence1 = training_data[0][0][0]
# sentence2 = training_data[0][0][1]

In [None]:
# tokens1 = tokenizer.tokenize(sentence1)
# tokens2 = tokenizer.tokenize(sentence2)

In [None]:
# tokens1 = ['[CLS]'] + tokens1 + ['[SEP]']
# tokens2 = tokens2 + ['[SEP]']
# tokens = tokens1 + tokens2
# print(tokens)

In [None]:
# len(tokens)

In [None]:
# T = 512

# padded_tokens = tokens + ['[PAD]' for _ in range(T-len(tokens))]
# print(padded_tokens)

# attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]
# print(attn_mask)

In [None]:
# seg_ids = [0 for _ in range(len(tokens1))]
# seg_ids2 = [1 for _ in range(512-len(tokens1))]
# seg_ids.extend(seg_ids2)
# print(seg_ids)

In [None]:
# token_ids = tokenizer.convert_tokens_to_ids(padded_tokens)
# print(token_ids)

In [None]:
# import torch

# token_ids_t = torch.tensor(token_ids).unsqueeze(0)
# attn_mask_t = torch.tensor(attn_mask).unsqueeze(0)
# seg_ids_t = torch.tensor(seg_ids).unsqueeze(0)

In [None]:
# outputs = bert_model(token_ids_t, attention_mask = attn_mask_t, token_type_ids = seg_ids_t, return_dict = True)

In [None]:
# hidden_reps = outputs.last_hidden_state

In [None]:
# print(hidden_reps[0,0,:10])

In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
import pandas as pd

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that turns (sentence, label) pairs into fixed-length BERT inputs.

    The base class is referenced by its qualified name because this class
    reuses the name ``Dataset``.

    Args:
        data: Indexable sequence of ``(sentence, label)`` pairs.
        maxlen: Length of every returned token sequence, counting [CLS] and
            [SEP]. Shorter inputs are padded with [PAD]; longer ones are
            truncated and closed with [SEP].
        tokenizer: Optional object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``. When omitted, a
            'bert-base-uncased' ``BertTokenizer`` is loaded.
    """

    def __init__(self, data, maxlen, tokenizer=None):
        #Sequence of (sentence, label) pairs
        self.data = data

        #Tokenizer used by __getitem__; loaded only when the caller did not supply one
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer = tokenizer

        self.maxlen = maxlen

    def __len__(self):
        """Return the number of (sentence, label) pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(token_ids, attention_mask, label)`` for the pair at ``index``.

        ``token_ids`` and ``attention_mask`` are 1-D long tensors of length
        ``maxlen``; ``label`` is returned exactly as stored in ``data``.
        """
        sentence = self.data[index][0]
        label = self.data[index][1]

        #Tokenize with this instance's tokenizer (not a notebook-level global)
        #and wrap the sentence in the [CLS] ... [SEP] markers
        tokens = ['[CLS]'] + self.tokenizer.tokenize(sentence) + ['[SEP]']

        if len(tokens) < self.maxlen:
            n_real = len(tokens)
            tokens = tokens + ['[PAD]'] * (self.maxlen - n_real) #Padding sentences
        else:
            tokens = tokens[:self.maxlen-1] + ['[SEP]'] #Truncating to the specified max length
            n_real = len(tokens)

        #Indices of the tokens in the tokenizer's vocabulary
        tokens_ids_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))

        #Attention mask: 1 for real tokens, 0 for padding. Built from positions so it
        #does not depend on the numeric id the vocabulary assigns to [PAD]
        attn_mask = torch.tensor([1] * n_real + [0] * (len(tokens) - n_real), dtype=torch.long)

        return tokens_ids_tensor, attn_mask, label

In [None]:
from torch.utils.data import DataLoader

#Creating instances of training and development set
#maxlen sets the maximum length a sentence can have
#any sentence longer than this length is truncated to the maxlen size
train_set = Dataset(training_data, maxlen = 512)
dev_set = Dataset(dev_data, maxlen = 512)

#Creating instances of training and development dataloaders
#Training batches are reshuffled every epoch; the dedicated fixed-seed generator keeps
#the shuffle order repeatable across runs. The dev loader keeps the original order.
train_loader = DataLoader(
    train_set,
    batch_size = 16,
    shuffle = True,
    generator = torch.Generator().manual_seed(19260817),
    num_workers = 2,
)
dev_loader = DataLoader(dev_set, batch_size = 16, num_workers = 2)

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

class nEvidRegressor(nn.Module):
    """BERT encoder followed by a single linear unit that emits one real value per sequence."""

    def __init__(self):
        super().__init__()

        # Linear head: 768-dimensional [CLS] representation in, one scalar out
        # (a regression output, not a class score).
        self.cls_layer = nn.Linear(768, 1)

        # Pretrained encoder loaded from the 'bert-base-uncased' checkpoint.
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')

    def forward(self, seq, attn_masks):
        '''
        Run the encoder and regress from the first token's representation.

        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks so PAD tokens do not contribute

        Returns:
            Tensor of shape [B, 1] holding one prediction per sequence.
        '''
        encoded = self.bert_layer(seq, attention_mask = attn_masks, return_dict=True)

        # Position 0 of the last hidden state is the [CLS] token.
        first_token_state = encoded.last_hidden_state[:, 0]

        return self.cls_layer(first_token_state)

In [None]:
gpu = 0 #gpu ID

# Seed PyTorch's random number generators before the model is built so the randomly
# initialised linear head starts from the same weights on every run.
torch.manual_seed(19260817)

print("Creating the sentiment regressor, initialised with pretrained BERT-BASE parameters...")
net = nEvidRegressor()
net.cuda(gpu) #Enable gpu support for the model
print("Done creating the sentiment regressor.")

Creating the sentiment regressor, initialised with pretrained BERT-BASE parameters...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done creating the sentiment regressor.


In [None]:
import torch.nn as nn
import torch.optim as optim

# Mean-squared-error objective: the evidence count is fitted as a real-valued target.
criterion = nn.MSELoss()
# Adam over every parameter returned by net.parameters(), with learning rate 2e-5.
opti = optim.Adam(net.parameters(), lr = 2e-5)

In [None]:
!pip install torchmetrics
from torchmetrics import R2Score
r2score = R2Score().to(gpu)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
def get_accuracy_from_logits(logits, labels):
    """Return the fraction of entries whose thresholded sigmoid(logit) equals the label.

    A logit counts as class 1 when its sigmoid exceeds 0.5, otherwise class 0.
    The result is a 0-dim float tensor.
    """
    predicted = torch.sigmoid(logits.unsqueeze(-1)).gt(0.5).long().squeeze()
    hits = predicted.eq(labels)
    return hits.float().mean()
    

def evaluate(net, criterion, dataloader, gpu):
    """Score `net` on every batch of `dataloader` without updating its weights.

    Args:
        net: Model called as ``net(seq, attn_masks)``; its output is squeezed on
            the last dimension before being compared with the labels.
        criterion: Loss callable applied per batch as ``criterion(prediction, labels.float())``.
        dataloader: Iterable yielding ``(seq, attn_masks, labels)`` tensor triples.
        gpu: CUDA device id the batches are moved to, or ``None`` to leave the
            tensors on their current device.

    Returns:
        Tuple ``(r2, mean_loss)``. ``r2`` is a 0-dim tensor holding the coefficient
        of determination computed over ALL examples seen (not only the final
        batch). ``mean_loss`` is the float average of the per-batch criterion values.

    Raises:
        ValueError: If `dataloader` yields no batches.

    Note:
        The model is left in eval mode. If every label is identical the total sum
        of squares is zero and ``r2`` is not finite.
    """
    def _place(tensor):
        # gpu=None means "do not move"; otherwise behave like the original .cuda(gpu)
        return tensor if gpu is None else tensor.cuda(gpu)

    net.eval()

    total_loss, n_batches = 0.0, 0
    all_preds, all_targets = [], []

    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            seq, attn_masks, labels = _place(seq), _place(attn_masks), _place(labels)
            prediction = net(seq, attn_masks).squeeze(-1)
            target = labels.float()
            total_loss += criterion(prediction, target).item()
            n_batches += 1
            # Keep every batch so R^2 reflects the whole set
            all_preds.append(prediction)
            all_targets.append(target)

    if n_batches == 0:
        raise ValueError("evaluate() received a dataloader with no batches")

    preds = torch.cat(all_preds)
    targets = torch.cat(all_targets)

    # R^2 = 1 - SS_res / SS_tot over the full evaluation set
    ss_res = torch.sum((targets - preds) ** 2)
    ss_tot = torch.sum((targets - targets.mean()) ** 2)
    r2 = 1 - ss_res / ss_tot

    return r2, total_loss / n_batches

In [None]:
import time

def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):
    """Fine-tune `net` for `max_eps` epochs, checkpointing whenever dev R^2 improves.

    Args:
        net: Model called as ``net(seq, attn_masks)``.
        criterion: Loss callable applied to ``(prediction.squeeze(-1), labels.float())``.
        opti: Optimizer stepped once per training batch.
        train_loader: Iterable of ``(seq, attn_masks, labels)`` training batches.
        dev_loader: Batches passed to ``evaluate`` at the end of every epoch.
        max_eps: Number of epochs to run.
        gpu: CUDA device id the training batches are moved to.

    Side effects:
        Prints progress every 100 iterations and after every epoch, and writes
        ``sstcls_<epoch>.dat`` (the model's state dict) each time the development
        R^2 beats the best value seen so far.
    """
    # R^2 has no lower bound (it is negative whenever the model does worse than
    # predicting the mean), so start from -inf: the first epoch always produces a
    # checkpoint and later ones are compared against a real score.
    best_r2 = float('-inf')
    st = time.time()
    for ep in range(max_eps):

        net.train()
        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()
            #Converting these to cuda tensors
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)

            #Obtaining the predictions from the model
            prediction = net(seq, attn_masks)

            #Computing loss
            loss = criterion(prediction.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()

            if it % 100 == 0:
                #R^2 of the current batch only, reported for progress monitoring
                r2 = r2score(prediction.squeeze(-1), labels)
                print("Iteration {} of epoch {} complete. Loss: {}; R2: {}; Time taken (s): {}".format(it, ep, loss.item(), r2, (time.time()-st)))
                st = time.time()

        dev_r2, dev_loss = evaluate(net, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development R2: {}; Development Loss: {}".format(ep, dev_r2, dev_loss))
        if dev_r2 > best_r2:
            print("Best development R2 improved from {} to {}, saving model...".format(best_r2, dev_r2))
            best_r2 = dev_r2
            torch.save(net.state_dict(), 'sstcls_{}.dat'.format(ep))

In [None]:
# Number of epochs passed to train() as max_eps.
num_epoch = 3

# Fine-tune the model; train() prints the development-set metrics after each epoch.
train(net, criterion, opti, train_loader, dev_loader, num_epoch, gpu)

Iteration 0 of epoch 0 complete. Loss: 1.4166837930679321; Accuracy: 0.5118828415870667; Time taken (s): 1.5630509853363037
Epoch 0 complete! Development Accuracy: -0.26622307300567627; Development Loss: 2.9280277967453
Iteration 0 of epoch 1 complete. Loss: 0.49700045585632324; Accuracy: 0.828758955001831; Time taken (s): 111.84511637687683
Epoch 1 complete! Development Accuracy: -0.4728083610534668; Development Loss: 3.0915986776351927
Iteration 0 of epoch 2 complete. Loss: 0.4696519374847412; Accuracy: 0.8381818532943726; Time taken (s): 111.70906257629395
Epoch 2 complete! Development Accuracy: -0.7219902276992798; Development Loss: 3.4682044982910156
