# 4.3 Retrieval SBERT

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Read in Data

In [2]:
import json
import numpy as np
import gc
import pickle

In [3]:
# Curated training claims, keyed by claim id.
with open('./drive/My Drive/LAB/COMP90042 A3/data/curated/train_claims2.json') as claims_file:
    train_claims = json.load(claims_file)

In [4]:
# Raw development claims, keyed by claim id.
with open('./drive/My Drive/LAB/COMP90042 A3/data/raw/dev-claims.json') as claims_file:
    dev_claims = json.load(claims_file)

In [5]:
# Curated held-out test claims, keyed by claim id.
with open('./drive/My Drive/LAB/COMP90042 A3/data/curated/test_claims2.json') as claims_file:
    test_claims = json.load(claims_file)

In [6]:
# Unlabelled claims to produce final retrievals for, keyed by claim id.
with open('./drive/My Drive/LAB/COMP90042 A3/data/raw/test-claims-unlabelled.json') as claims_file:
    future_claims = json.load(claims_file)

In [7]:
# Evidence corpus: evidence id -> passage text.
with open('./drive/My Drive/LAB/COMP90042 A3/data/raw/evidence.json') as evidence_file:
    evidence = json.load(evidence_file)

In [9]:
import random
random.seed(19260817)

## Reduce claims

In [10]:
# Flat list of every evidence id, so negatives can be drawn with random.choice.
evid_id_list = list(evidence)

### Create dataset used to train retriever

In [12]:
NEG_SAMPLE_FACTOR = 1

In [13]:
# Retriever training pairs: each gold evidence passage of a claim is a positive
# (label 1), and for every positive NEG_SAMPLE_FACTOR randomly drawn non-gold
# passages are added as negatives (label 0).
training_data = []

for claim in train_claims.values():
  claim_text = claim['claim_text']
  gold_evid_ids = claim['evidences']

  # Positives: one pair per gold evidence id.
  training_data.extend(((claim_text, evidence[evid_id]), 1) for evid_id in gold_evid_ids)

  # Negatives: rejection-sample from the whole corpus, discarding gold ids.
  n_neg_needed = len(gold_evid_ids) * NEG_SAMPLE_FACTOR
  n_neg_drawn = 0
  while n_neg_drawn < n_neg_needed:
    candidate_id = random.choice(evid_id_list)
    if candidate_id in gold_evid_ids:
      continue
    training_data.append(((claim_text, evidence[candidate_id]), 0))
    n_neg_drawn += 1

In [14]:
# Development pairs, built the same way as the training pairs: gold evidence
# as positives (label 1) plus NEG_SAMPLE_FACTOR random non-gold passages per
# positive as negatives (label 0).
dev_data = []

for claim in dev_claims.values():
  claim_text = claim['claim_text']
  gold_evid_ids = claim['evidences']

  # Positives: one pair per gold evidence id.
  dev_data.extend(((claim_text, evidence[evid_id]), 1) for evid_id in gold_evid_ids)

  # Negatives: rejection-sample from the whole corpus, discarding gold ids.
  n_neg_needed = len(gold_evid_ids) * NEG_SAMPLE_FACTOR
  n_neg_drawn = 0
  while n_neg_drawn < n_neg_needed:
    candidate_id = random.choice(evid_id_list)
    if candidate_id in gold_evid_ids:
      continue
    dev_data.append(((claim_text, evidence[candidate_id]), 0))
    n_neg_drawn += 1

In [15]:
# Held-out test pairs, built the same way as the training pairs: gold evidence
# as positives (label 1) plus NEG_SAMPLE_FACTOR random non-gold passages per
# positive as negatives (label 0).
test_data = []

for claim in test_claims.values():
  claim_text = claim['claim_text']
  gold_evid_ids = claim['evidences']

  # Positives: one pair per gold evidence id.
  test_data.extend(((claim_text, evidence[evid_id]), 1) for evid_id in gold_evid_ids)

  # Negatives: rejection-sample from the whole corpus, discarding gold ids.
  n_neg_needed = len(gold_evid_ids) * NEG_SAMPLE_FACTOR
  n_neg_drawn = 0
  while n_neg_drawn < n_neg_needed:
    candidate_id = random.choice(evid_id_list)
    if candidate_id in gold_evid_ids:
      continue
    test_data.append(((claim_text, evidence[candidate_id]), 0))
    n_neg_drawn += 1

In [16]:
!pip install torch torchvision transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m82.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.28.1


## Build model, dataloader etc

In [17]:
from transformers import BertModel

In [18]:
from transformers import BertTokenizer

# Module-level bert-base-uncased tokenizer. Dataset classes defined in later cells
# call this global inside __getitem__, so this cell must run before any of their
# DataLoaders is iterated.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [19]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
import pandas as pd

class Dataset():
    """Map-style dataset of labelled (claim, evidence) sentence pairs for BERT.

    Each element of ``data`` is ``((sentence1, sentence2), label)``. Both
    sentences are tokenised independently, wrapped in ``[CLS]`` / ``[SEP]`` and
    padded or truncated to exactly ``maxlen`` tokens.

    Note: this class name shadows ``torch.utils.data.Dataset`` imported in the
    same cell; it only relies on ``__len__`` / ``__getitem__``.
    """

    def __init__(self, data, maxlen):
        """
        Inputs:
            -data : indexable sequence of ((sentence1, sentence2), label) items
            -maxlen : fixed token length every encoded sentence is padded/truncated to
        """
        #Keep a reference to the list of pairs
        self.data = data

        #Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        return len(self.data)

    def _encode(self, sentence):
        """Return (token_ids, attention_mask), both 1-D tensors of length maxlen."""
        #Use the instance tokenizer, not a notebook global, so the dataset is self-contained
        tokens = ['[CLS]'] + self.tokenizer.tokenize(sentence) + ['[SEP]']

        if len(tokens) < self.maxlen:
            #Padding sentences
            tokens = tokens + ['[PAD]'] * (self.maxlen - len(tokens))
        else:
            #Pruning the list to the specified max length, keeping a closing [SEP]
            tokens = tokens[:self.maxlen - 1] + ['[SEP]']

        #Indices of the tokens in the BERT vocabulary, as a pytorch tensor
        token_ids = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))

        #1s for real tokens and 0s for padding; relies on [PAD] having vocabulary id 0
        attn_mask = (token_ids != 0).long()

        return token_ids, attn_mask

    def __getitem__(self, index):
        """Return (ids1, mask1, ids2, mask2, label) for the pair at ``index``."""
        sentence1 = self.data[index][0][0]
        sentence2 = self.data[index][0][1]
        label = self.data[index][1]

        tokens_ids_tensor1, attn_mask1 = self._encode(sentence1)
        tokens_ids_tensor2, attn_mask2 = self._encode(sentence2)

        return tokens_ids_tensor1, attn_mask1, tokens_ids_tensor2, attn_mask2,  label

In [20]:
from torch.utils.data import DataLoader

#Wrap each split in a Dataset. maxlen is the fixed token length per sentence:
#shorter sentences are padded up to it and longer ones are truncated to it.
train_set, dev_set, test_set = (
    Dataset(split_pairs, maxlen = 512)
    for split_pairs in (training_data, dev_data, test_data)
)

#One shuffled DataLoader per split, four pairs per batch
train_loader, dev_loader, test_loader = (
    DataLoader(split_set, batch_size = 4, shuffle = True, num_workers = 2)
    for split_set in (train_set, dev_set, test_set)
)

In [21]:
import torch
import torch.nn as nn
from transformers import BertModel

class RelatednessClassifier(nn.Module):
    """Two-encoder BERT scorer for (claim, evidence) relatedness.

    Claims and evidence are encoded by separate BERT models. The two [CLS]
    vectors and their Euclidean distance are concatenated and fed through a
    single linear layer that outputs one logit per pair.
    """

    def __init__(self):
        super(RelatednessClassifier, self).__init__()
        #One BERT encoder per side: bert_layer1 for claims, bert_layer2 for evidence
        self.bert_layer1 = BertModel.from_pretrained('bert-base-uncased')
        self.bert_layer2 = BertModel.from_pretrained('bert-base-uncased')

        #Classification layer
        #input dimension is 1537 = 768 (claim [CLS]) + 768 (evidence [CLS]) + 1 (distance)
        #output dimension is 1 because we're working with a binary classification problem
        self.cls_layer = nn.Linear(1537, 1)

    def forward(self, seq1, attn_masks1, seq2, attn_masks2):
        '''
        Inputs:
            -seq1, seq2 : Tensors of shape [B, T] containing token ids of the claim / evidence sequences
            -attn_masks1, attn_masks2 : Tensors of shape [B, T] containing attention masks to be used to avoid contribution of PAD tokens
        Returns:
            -logits : Tensor of shape [B, 1]
        '''
        claim_cls_reps = self.get_claim_embedding(seq1, attn_masks1)
        evid_cls_reps = self.get_evid_embedding(seq2, attn_masks2)

        return self.neural_layer(claim_cls_reps, evid_cls_reps, seq1.size(0))


    def get_claim_embedding(self, seq1, attn_masks1):
        '''Return the [CLS] representation of each claim sequence from the claim encoder.'''
        #Feeding the input to BERT model to obtain contextualized representations
        claim_outputs = self.bert_layer1(seq1, attention_mask = attn_masks1, return_dict=True)

        #Obtaining the representation of [CLS] head (the first token)
        return claim_outputs.last_hidden_state[:, 0]


    def get_evid_embedding(self, seq2, attn_masks2):
        '''Return the [CLS] representation of each evidence sequence from the evidence encoder.'''
        #Feeding the input to BERT model to obtain contextualized representations
        evid_outputs = self.bert_layer2(seq2, attention_mask = attn_masks2, return_dict=True)

        #Obtaining the representation of [CLS] head (the first token)
        return evid_outputs.last_hidden_state[:, 0]


    def neural_layer(self, claim_cls_reps, evid_cls_reps, batch_size=None):
        '''
        Score pre-computed claim / evidence [CLS] embeddings.

        Inputs:
            -claim_cls_reps, evid_cls_reps : embeddings whose last dimension is the
             feature dimension; shapes must match or be broadcastable (for example
             one claim of shape [1, D] against [B, D] evidence)
            -batch_size : kept for backward compatibility; the batch dimension is
             taken from the tensors themselves, so any batch size (including 1) works
        Returns:
            -logits : one logit per pair in the last dimension ([B, 1] for [B, D] inputs)
        '''
        #Allow a single claim to be scored against a batch of evidence
        claim_cls_reps, evid_cls_reps = torch.broadcast_tensors(claim_cls_reps, evid_cls_reps)

        #Row-wise Euclidean distance; keepdim gives a [..., 1] column ready to concatenate
        distances = torch.linalg.vector_norm(claim_cls_reps - evid_cls_reps, ord=2, dim=-1, keepdim=True)

        #Concatenate along the features dimension: claim, evidence, distance
        concat_output = torch.cat((claim_cls_reps, evid_cls_reps, distances), dim=-1)

        #Feeding the combined features to the classifier layer
        logits = self.cls_layer(concat_output)

        return logits

In [22]:
torch.cuda.empty_cache()
gc.collect()

12

In [23]:
gpu = 0 #gpu ID, reused by the training and inference cells below

# Both BERT encoders start from the pretrained bert-base-uncased weights.
print("Creating the relatedness classifier, initialised with pretrained BERT-BASE parameters...")
net = RelatednessClassifier()
net.cuda(gpu) #Enable gpu support for the model
print("Done creating the relatedness classifier.")

Creating the sentiment classifier, initialised with pretrained BERT-BASE parameters...


Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.pred

Done creating the sentiment classifier.


### Setup Training

In [None]:
import torch.nn as nn
import torch.optim as optim

# The model emits one raw logit per pair, so the sigmoid is folded into the loss.
criterion = nn.BCEWithLogitsLoss()
# Adam over every parameter of the model (both encoders and the linear head).
opti = optim.Adam(params = net.parameters(), lr = 2e-5)

In [23]:
def get_accuracy_from_logits(logits, labels):
    """Return the fraction of correct binary predictions as a 0-dim tensor.

    A pair is predicted positive when sigmoid(logit) exceeds 0.5; the
    predictions are squeezed before being compared element-wise to ``labels``.
    """
    hard_preds = (torch.sigmoid(logits) > 0.5).long().squeeze()
    return (hard_preds == labels).float().mean()

def evaluate(net, criterion, dataloader, gpu):
    """Return (accuracy, loss) of ``net`` over ``dataloader``, averaged per sample.

    Inputs:
        -net : model called as net(ids1, mask1, ids2, mask2) and returning logits
        -criterion : loss applied to (logits.squeeze(-1), label.float()); assumed
         to return the mean over the batch, as the default BCEWithLogitsLoss does
        -dataloader : yields (ids1, mask1, ids2, mask2, label) batches
        -gpu : CUDA device id the batches are moved to
    Returns:
        -(mean_acc, mean_loss) : accuracy as a 0-dim tensor, loss as a float
    Raises:
        -ValueError if the dataloader yields no samples
    """
    net.eval()

    total_acc, total_loss = 0, 0
    n_samples = 0

    with torch.no_grad():
        for batch in dataloader:
            tokens_ids_tensor1, attn_mask1, tokens_ids_tensor2, attn_mask2, label = (t.cuda(gpu) for t in batch)
            logits = net(tokens_ids_tensor1, attn_mask1, tokens_ids_tensor2, attn_mask2)

            # Weight each batch by its size so a short final batch does not
            # count as much as a full one in the overall average.
            batch_n = label.size(0)
            total_loss += criterion(logits.squeeze(-1), label.float()).item() * batch_n
            total_acc += get_accuracy_from_logits(logits, label) * batch_n
            n_samples += batch_n

    if n_samples == 0:
        raise ValueError("evaluate() received a dataloader with no samples")

    return total_acc / n_samples, total_loss / n_samples

In [24]:
import time

def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu,
          state_dict_path = './drive/My Drive/LAB/COMP90042 A3/models/Retrievers/Retriever_SBert.dat',
          model_path = './drive/My Drive/LAB/COMP90042 A3/models/Retrievers/Retriever_SBert.pt'):
    """Fine-tune ``net`` and checkpoint it whenever development accuracy improves.

    Inputs:
        -net : model called as net(ids1, mask1, ids2, mask2) and returning logits
        -criterion : loss applied to (logits.squeeze(-1), label.float())
        -opti : optimizer over net's parameters
        -train_loader, dev_loader : yield (ids1, mask1, ids2, mask2, label) batches
        -max_eps : number of epochs to run
        -gpu : CUDA device id the batches are moved to
        -state_dict_path : where the best model's state_dict is written
        -model_path : where the best model is written as a whole pickled module
    """
    best_acc = 0
    st = time.time()
    for ep in range(max_eps):

        net.train()
        for it, batch in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()
            #Converting these to cuda tensors
            tokens_ids_tensor1, attn_mask1, tokens_ids_tensor2, attn_mask2, label = (t.cuda(gpu) for t in batch)

            #Obtaining the logits from the model
            logits = net(tokens_ids_tensor1, attn_mask1, tokens_ids_tensor2, attn_mask2)

            #Computing loss
            loss = criterion(logits.squeeze(-1), label.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()

            if it % 100 == 0:
                #Progress report on the current batch; the timer restarts after
                #each report, so "Time taken" covers the last 100 iterations
                acc = get_accuracy_from_logits(logits, label)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep, loss.item(), acc, (time.time()-st)))
                st = time.time()

        #Model selection: keep only the checkpoint with the best development accuracy
        dev_acc, dev_loss = evaluate(net, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(net.state_dict(), state_dict_path)
            torch.save(net, model_path)

In [None]:
torch.cuda.empty_cache()
gc.collect()

0

In [None]:
# Number of passes over the training pairs.
num_epoch = 5

# Fine-tune the model; the best checkpoint by development accuracy is saved by train().
train(net, criterion, opti, train_loader, dev_loader, max_eps = num_epoch, gpu = gpu)

Iteration 0 of epoch 0 complete. Loss: 0.01396762952208519; Accuracy: 1.0; Time taken (s): 0.9922280311584473
Iteration 100 of epoch 0 complete. Loss: 0.0022834932897239923; Accuracy: 1.0; Time taken (s): 76.93600249290466
Iteration 200 of epoch 0 complete. Loss: 0.03231086581945419; Accuracy: 1.0; Time taken (s): 76.4855682849884
Iteration 300 of epoch 0 complete. Loss: 0.048212192952632904; Accuracy: 1.0; Time taken (s): 76.37822246551514
Iteration 400 of epoch 0 complete. Loss: 0.025565508753061295; Accuracy: 1.0; Time taken (s): 76.409738779068
Iteration 500 of epoch 0 complete. Loss: 0.033778924494981766; Accuracy: 1.0; Time taken (s): 76.39283323287964
Iteration 600 of epoch 0 complete. Loss: 0.007882692851126194; Accuracy: 1.0; Time taken (s): 76.2576060295105
Iteration 700 of epoch 0 complete. Loss: 0.008956797420978546; Accuracy: 1.0; Time taken (s): 76.21597027778625
Iteration 800 of epoch 0 complete. Loss: 0.3136775493621826; Accuracy: 0.75; Time taken (s): 76.29948234558105

## Read In Model

In [None]:
# Restore the fine-tuned weights into the existing `net`; this is the path that
# train() writes the best model's state_dict to by default.
net.load_state_dict(torch.load('./drive/My Drive/LAB/COMP90042 A3/models/Retrievers/Retriever_SBert.dat'))

<All keys matched successfully>

In [24]:
# Alternative restore: load the whole module that train() saved with torch.save(net, ...).
# This rebinds `net`, superseding the load_state_dict call in the previous cell.
# NOTE(review): unpickling presumably needs the RelatednessClassifier class to be
# defined in this session first — confirm.
net = torch.load('./drive/My Drive/LAB/COMP90042 A3/models/Retrievers/Retriever_SBert.pt')

In [25]:
net.eval()

RelatednessClassifier(
  (bert_layer1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

## Make Predictions (to get Retrievals)

In [26]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
import pandas as pd

class EmbeddingDataset():
    """Map-style dataset of (id, sentence) items tokenised for a BERT encoder.

    Each sentence is wrapped in ``[CLS]`` / ``[SEP]`` and padded or truncated to
    exactly ``maxlen`` tokens; the id is passed through unchanged so embeddings
    can be keyed by it.
    """

    def __init__(self, data, maxlen):
        """
        Inputs:
            -data : indexable sequence of (id, sentence) items
            -maxlen : fixed token length every encoded sentence is padded/truncated to
        """
        #Keep a reference to the list of (id, sentence) items
        self.data = data

        #Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return (id, token_ids, attention_mask) for the item at ``index``."""
        id = self.data[index][0]
        sentence1 = self.data[index][1]

        #Use the instance tokenizer, not a notebook global, so the dataset is self-contained
        tokens1 = ['[CLS]'] + self.tokenizer.tokenize(sentence1) + ['[SEP]']
        if len(tokens1) < self.maxlen:
            #Padding sentences
            tokens1 = tokens1 + ['[PAD]'] * (self.maxlen - len(tokens1))
        else:
            #Pruning the list to the specified max length, keeping a closing [SEP]
            tokens1 = tokens1[:self.maxlen - 1] + ['[SEP]']

        #Indices of the tokens in the BERT vocabulary, as a pytorch tensor
        tokens_ids_tensor1 = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens1))

        #1s for real tokens and 0s for padding; relies on [PAD] having vocabulary id 0
        attn_mask1 = (tokens_ids_tensor1 != 0).long()

        return id, tokens_ids_tensor1, attn_mask1

In [27]:
# (claim id, claim text) items for each claim set, in file order.
dev_embedding_data = [(claim_id, claim['claim_text']) for claim_id, claim in dev_claims.items()]

test_embedding_data = [(claim_id, claim['claim_text']) for claim_id, claim in test_claims.items()]

future_embedding_data = [(claim_id, claim['claim_text']) for claim_id, claim in future_claims.items()]

# (evidence id, passage text) items for the whole evidence corpus.
evid_embedding_data = list(evidence.items())

In [28]:
#One EmbeddingDataset per collection, every sentence padded/truncated to 512 tokens
dev_embed_set, test_embed_set, future_embed_set, evid_embed_set = (
    EmbeddingDataset(items, maxlen = 512)
    for items in (dev_embedding_data, test_embedding_data, future_embedding_data, evid_embedding_data)
)

#Unshuffled loaders: ids and embeddings come back in the order of the source lists
dev_embed_loader, test_embed_loader, future_embed_loader, evid_embed_loader = (
    DataLoader(embed_set, batch_size = 4, shuffle = False, num_workers = 2)
    for embed_set in (dev_embed_set, test_embed_set, future_embed_set, evid_embed_set)
)

In [29]:
import gc
torch.cuda.empty_cache()
gc.collect()

33

In [None]:
# Encode every dev claim with the claim encoder and store claim id -> [CLS] embedding.
dev_embed = {}

# Inference only: no_grad stops autograd from recording a graph for every batch.
with torch.no_grad():
  for claim_ids, tokens_ids_tensor1, attn_mask1 in dev_embed_loader:
    tokens_ids_tensor1, attn_mask1 = tokens_ids_tensor1.cuda(gpu), attn_mask1.cuda(gpu)
    embed = net.get_claim_embedding(tokens_ids_tensor1, attn_mask1).tolist()

    dev_embed.update(zip(claim_ids, embed))

# NOTE(review): the Prediction section reads dev_embed.json from
# embeddings/other_embeddings/, not from this folder — confirm the file is moved there.
with open('./drive/My Drive/LAB/COMP90042 A3/dev_embed.json', 'w') as f:
  json.dump(dev_embed, f)

In [30]:
# Encode every test claim with the claim encoder and store claim id -> [CLS] embedding.
test_embed = {}

# Inference only: no_grad stops autograd from recording a graph for every batch.
with torch.no_grad():
  for claim_ids, tokens_ids_tensor1, attn_mask1 in test_embed_loader:
    tokens_ids_tensor1, attn_mask1 = tokens_ids_tensor1.cuda(gpu), attn_mask1.cuda(gpu)
    embed = net.get_claim_embedding(tokens_ids_tensor1, attn_mask1).tolist()

    test_embed.update(zip(claim_ids, embed))

# NOTE(review): the Prediction section reads test_embed.json from
# embeddings/other_embeddings/, not from this folder — confirm the file is moved there.
with open('./drive/My Drive/LAB/COMP90042 A3/test_embed.json', 'w') as f:
  json.dump(test_embed, f)

In [31]:
# Encode every unlabelled claim with the claim encoder and store claim id -> [CLS] embedding.
future_embed = {}

# Inference only: no_grad stops autograd from recording a graph for every batch.
with torch.no_grad():
  for claim_ids, tokens_ids_tensor1, attn_mask1 in future_embed_loader:
    tokens_ids_tensor1, attn_mask1 = tokens_ids_tensor1.cuda(gpu), attn_mask1.cuda(gpu)
    embed = net.get_claim_embedding(tokens_ids_tensor1, attn_mask1).tolist()

    future_embed.update(zip(claim_ids, embed))

# NOTE(review): the Prediction section reads future_embed.json from
# embeddings/other_embeddings/, not from this folder — confirm the file is moved there.
with open('./drive/My Drive/LAB/COMP90042 A3/future_embed.json', 'w') as f:
  json.dump(future_embed, f)

In [None]:
torch.cuda.empty_cache()
gc.collect()

67

In [None]:
# Encode the evidence corpus with the evidence encoder and write it out in shards
# of SHARD_EVERY batches (evid_embed<batch count>.json), so an interrupted run can
# be resumed from the last shard that was written.
RESUME_FROM_BATCH = 172800  # batches already embedded and saved by an earlier run
SHARD_EVERY = 6400          # batches per output shard

evid_embed = {}
j = 0  # number of batches consumed so far (skipped or embedded)

# Inference only: no_grad stops autograd from recording a graph for every batch.
with torch.no_grad():
  for evid_ids, tokens_ids_tensor1, attn_mask1 in evid_embed_loader:

    # Skip exactly the first RESUME_FROM_BATCH batches (indices 0 .. RESUME_FROM_BATCH-1).
    # The shard named evid_embed<RESUME_FROM_BATCH>.json ends with batch index
    # RESUME_FROM_BATCH-1, so batch index RESUME_FROM_BATCH itself must be embedded.
    if j < RESUME_FROM_BATCH:
      j += 1
      if j % SHARD_EVERY == 0:
        print('pass', j)

      continue

    tokens_ids_tensor1, attn_mask1 = tokens_ids_tensor1.cuda(gpu), attn_mask1.cuda(gpu)
    embed = net.get_evid_embedding(tokens_ids_tensor1, attn_mask1).tolist()

    evid_embed.update(zip(evid_ids, embed))

    j += 1
    if j % SHARD_EVERY == 0:
      print(j)
      with open(f'./drive/My Drive/LAB/COMP90042 A3/evid_embed{j}.json', 'w') as f:
        json.dump(evid_embed, f)
      evid_embed = {}

      # Release cached memory once per shard rather than once per batch.
      torch.cuda.empty_cache()
      gc.collect()

print(j)
# Flush the final partial shard. Skipped when empty so that a batch count that is an
# exact multiple of SHARD_EVERY does not overwrite the shard just written with {}.
if evid_embed:
  with open(f'./drive/My Drive/LAB/COMP90042 A3/evid_embed{j}.json', 'w') as f:
    json.dump(evid_embed, f)
  evid_embed = {}

pass 6400
pass 12800
pass 19200
pass 25600
pass 32000
pass 38400
pass 44800
pass 51200
pass 57600
pass 64000
pass 70400
pass 76800
pass 83200
pass 89600
pass 96000
pass 102400
pass 108800
pass 115200
pass 121600
pass 128000
pass 134400
pass 140800
pass 147200
pass 153600
pass 160000
pass 166400
pass 172800
179200
185600
192000
198400
204800
211200
217600
224000
230400
236800
243200
249600
256000
262400
268800
275200
281600
288000
294400
300800
302207


Prediction

In [32]:
# Dev claim embeddings saved earlier: claim id -> embedding vector.
with open('./drive/My Drive/LAB/COMP90042 A3/embeddings/other_embeddings/dev_embed.json', 'r') as embed_file:
  dev_embed = json.load(embed_file)

In [33]:
# Test claim embeddings saved earlier: claim id -> embedding vector.
with open('./drive/My Drive/LAB/COMP90042 A3/embeddings/other_embeddings/test_embed.json', 'r') as embed_file:
  test_embed = json.load(embed_file)

In [34]:
# Unlabelled claim embeddings saved earlier: claim id -> embedding vector.
with open('./drive/My Drive/LAB/COMP90042 A3/embeddings/other_embeddings/future_embed.json', 'r') as embed_file:
  future_embed = json.load(embed_file)

In [35]:
import os
# File names of the evidence-embedding shards. os.listdir returns them in arbitrary
# order, which is fine here because every shard is scored regardless of order.
evid_embed_list = os.listdir('./drive/My Drive/LAB/COMP90042 A3/embeddings/evid_embedding')

In [36]:
class PredictDataset():
    """Map-style view over a sequence of (evidence id, embedding vector) items."""

    def __init__(self, data, maxlen):
        #maxlen is accepted for signature parity with the other datasets but is not used
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return (evidence id, embedding vector) for the item at ``index``."""
        item = self.data[index]
        return item[0], item[1]

In [37]:
BATCH_SIZE = 1024

torch.cuda.empty_cache()
gc.collect()

32

In [38]:
import copy

In [37]:
top_5_dev_evid = {}
processed_dev_embed = []

In [39]:
# Resume state for the dev retrieval pass: the running top-5 per claim and the
# names of the evidence shards that have already been scored.
with open('./drive/My Drive/LAB/COMP90042 A3/predictions/Retrieval_BERT_dev.pickle', 'rb') as top5_file:
    top_5_dev_evid = pickle.load(top5_file)

with open('./drive/My Drive/LAB/COMP90042 A3/embeddings/processed_dev_embed.json', 'r') as processed_file:
    processed_dev_embed = json.load(processed_file)

In [42]:
# Score every dev claim against each evidence shard and keep a running top-5 per
# claim. Shards named in processed_dev_embed are skipped, and the results are
# checkpointed after each shard, so an interrupted run can be resumed.
for evid_embed_dir in evid_embed_list:

  if evid_embed_dir in processed_dev_embed:
    print('pass', evid_embed_dir)
    continue

  print(evid_embed_dir)

  with open(f'./drive/My Drive/LAB/COMP90042 A3/embeddings/evid_embedding/{evid_embed_dir}', 'r') as f:
    evid_embed = json.load(f)

  # Convert the shard to one tensor once, instead of rebuilding it for every claim.
  evid_ids = list(evid_embed)
  evid_matrix = torch.Tensor([evid_embed[evid_id] for evid_id in evid_ids]).cuda(gpu)

  with torch.no_grad():
    for dev_id in dev_embed:

      # The same claim embedding repeated on every row of a BATCH_SIZE batch.
      dev_embed_vector = torch.Tensor([dev_embed[dev_id]]).cuda(gpu).expand(BATCH_SIZE, -1)

      # Start from the best candidates found in earlier shards, if any.
      tmp_evid = list(top_5_dev_evid.get(dev_id, []))

      for start in range(0, len(evid_ids), BATCH_SIZE):
        evid_embed_vectors = evid_matrix[start:start + BATCH_SIZE]
        n_real = evid_embed_vectors.size(0)

        if n_real < BATCH_SIZE:
          # Final partial batch: pad by repeating the last row so the batch keeps
          # its fixed size; the padded rows' scores are discarded below. Without
          # this the tail of the shard would never be scored.
          padding = evid_embed_vectors[-1:].expand(BATCH_SIZE - n_real, -1)
          evid_embed_vectors = torch.cat((evid_embed_vectors, padding), dim=0)

        predict = net.neural_layer(dev_embed_vector, evid_embed_vectors, BATCH_SIZE).tolist()

        for j in range(n_real):
          tmp_evid.append((evid_ids[start + j], predict[j][0]))

      # Highest logit first; keep the five best (evidence id, score) candidates.
      tmp_evid.sort(reverse=True, key = lambda x:x[1])
      top_5_dev_evid[dev_id] = tmp_evid[:5]

  with open('./drive/My Drive/LAB/COMP90042 A3/predictions/Retrieval_BERT_dev.pickle', 'wb') as f:
    pickle.dump(top_5_dev_evid, f)

  processed_dev_embed.append(evid_embed_dir)
  with open('./drive/My Drive/LAB/COMP90042 A3/embeddings/processed_dev_embed.json', 'w') as f:
    json.dump(processed_dev_embed, f)

pass evid_embed6400.json
pass evid_embed12800.json
pass evid_embed19200.json
pass evid_embed25600.json
pass evid_embed32000.json
pass evid_embed38400.json
pass evid_embed44800.json
pass evid_embed51200.json
pass evid_embed57600.json
pass evid_embed64000.json
pass evid_embed70400.json
pass evid_embed76800.json
pass evid_embed83200.json
pass evid_embed89600.json
pass evid_embed96000.json
pass evid_embed102400.json
pass evid_embed108800.json
pass evid_embed115200.json
pass evid_embed121600.json
pass evid_embed128000.json
pass evid_embed134400.json
pass evid_embed140800.json
pass evid_embed147200.json
pass evid_embed153600.json
pass evid_embed160000.json
pass evid_embed166400.json
pass evid_embed172800.json
pass evid_embed179200.json
pass evid_embed185600.json
pass evid_embed192000.json
pass evid_embed198400.json
pass evid_embed204800.json
pass evid_embed211200.json
pass evid_embed217600.json
pass evid_embed224000.json
pass evid_embed230400.json
pass evid_embed236800.json
pass evid_embed24

In [41]:
BATCH_SIZE = 1024

torch.cuda.empty_cache()
gc.collect()

0

In [46]:
top_5_test_evid = {}
processed_test_embed = []

In [None]:
# Resume state for the test retrieval pass: the running top-5 per claim and the
# names of the evidence shards that have already been scored.
with open('./drive/My Drive/LAB/COMP90042 A3/predictions/Retrieval_BERT_test.pickle', 'rb') as top5_file:
    top_5_test_evid = pickle.load(top5_file)

with open('./drive/My Drive/LAB/COMP90042 A3/embeddings/processed_test_embed.json', 'r') as processed_file:
    processed_test_embed = json.load(processed_file)

In [None]:
# Score every test claim against each evidence shard and keep a running top-5 per
# claim. Shards named in processed_test_embed are skipped, and the results are
# checkpointed after each shard, so an interrupted run can be resumed.
for evid_embed_dir in evid_embed_list:

  if evid_embed_dir in processed_test_embed:
    print('pass', evid_embed_dir)
    continue

  print(evid_embed_dir)

  with open(f'./drive/My Drive/LAB/COMP90042 A3/embeddings/evid_embedding/{evid_embed_dir}', 'r') as f:
    evid_embed = json.load(f)

  # Convert the shard to one tensor once, instead of rebuilding it for every claim.
  evid_ids = list(evid_embed)
  evid_matrix = torch.Tensor([evid_embed[evid_id] for evid_id in evid_ids]).cuda(gpu)

  with torch.no_grad():
    for test_id in test_embed:

      # The same claim embedding repeated on every row of a BATCH_SIZE batch.
      test_embed_vector = torch.Tensor([test_embed[test_id]]).cuda(gpu).expand(BATCH_SIZE, -1)

      # Start from the best candidates found in earlier shards, if any.
      tmp_evid = list(top_5_test_evid.get(test_id, []))

      for start in range(0, len(evid_ids), BATCH_SIZE):
        evid_embed_vectors = evid_matrix[start:start + BATCH_SIZE]
        n_real = evid_embed_vectors.size(0)

        if n_real < BATCH_SIZE:
          # Final partial batch: pad by repeating the last row so the batch keeps
          # its fixed size; the padded rows' scores are discarded below. Without
          # this the tail of the shard would never be scored.
          padding = evid_embed_vectors[-1:].expand(BATCH_SIZE - n_real, -1)
          evid_embed_vectors = torch.cat((evid_embed_vectors, padding), dim=0)

        predict = net.neural_layer(test_embed_vector, evid_embed_vectors, BATCH_SIZE).tolist()

        for j in range(n_real):
          tmp_evid.append((evid_ids[start + j], predict[j][0]))

      # Highest logit first; keep the five best (evidence id, score) candidates.
      tmp_evid.sort(reverse=True, key = lambda x:x[1])
      top_5_test_evid[test_id] = tmp_evid[:5]

  with open('./drive/My Drive/LAB/COMP90042 A3/predictions/Retrieval_BERT_test.pickle', 'wb') as f:
    pickle.dump(top_5_test_evid, f)

  processed_test_embed.append(evid_embed_dir)
  with open('./drive/My Drive/LAB/COMP90042 A3/embeddings/processed_test_embed.json', 'w') as f:
    json.dump(processed_test_embed, f)

evid_embed6400.json
evid_embed12800.json
evid_embed19200.json
evid_embed25600.json
evid_embed32000.json
evid_embed38400.json
evid_embed44800.json
evid_embed51200.json
evid_embed57600.json
evid_embed64000.json
evid_embed70400.json
evid_embed76800.json
evid_embed83200.json
evid_embed89600.json
evid_embed96000.json
evid_embed102400.json
evid_embed108800.json
evid_embed115200.json
evid_embed121600.json
evid_embed128000.json


In [None]:
BATCH_SIZE = 1024

torch.cuda.empty_cache()
gc.collect()

0

In [None]:
top_5_future_evid = {}
processed_future_embed = []

In [None]:
# Score every unlabelled claim against each evidence shard and keep a running
# top-5 per claim. Shards named in processed_future_embed are skipped, and the
# results are checkpointed after each shard.
for evid_embed_dir in evid_embed_list:

  if evid_embed_dir in processed_future_embed:
    print('pass', evid_embed_dir)
    continue

  print(evid_embed_dir)

  with open(f'./drive/My Drive/LAB/COMP90042 A3/embeddings/evid_embedding/{evid_embed_dir}', 'r') as f:
    evid_embed = json.load(f)

  # Convert the shard to one tensor once, instead of rebuilding it for every claim.
  evid_ids = list(evid_embed)
  evid_matrix = torch.Tensor([evid_embed[evid_id] for evid_id in evid_ids]).cuda(gpu)

  with torch.no_grad():
    for future_id in future_embed:

      # The same claim embedding repeated on every row of a BATCH_SIZE batch.
      future_embed_vector = torch.Tensor([future_embed[future_id]]).cuda(gpu).expand(BATCH_SIZE, -1)

      # Start from the best candidates found in earlier shards, if any.
      tmp_evid = list(top_5_future_evid.get(future_id, []))

      for start in range(0, len(evid_ids), BATCH_SIZE):
        evid_embed_vectors = evid_matrix[start:start + BATCH_SIZE]
        n_real = evid_embed_vectors.size(0)

        if n_real < BATCH_SIZE:
          # Final partial batch: pad by repeating the last row so the batch keeps
          # its fixed size; the padded rows' scores are discarded below. Without
          # this the tail of the shard would never be scored.
          padding = evid_embed_vectors[-1:].expand(BATCH_SIZE - n_real, -1)
          evid_embed_vectors = torch.cat((evid_embed_vectors, padding), dim=0)

        predict = net.neural_layer(future_embed_vector, evid_embed_vectors, BATCH_SIZE).tolist()

        for j in range(n_real):
          tmp_evid.append((evid_ids[start + j], predict[j][0]))

      # Highest logit first; keep the five best (evidence id, score) candidates.
      tmp_evid.sort(reverse=True, key = lambda x:x[1])
      top_5_future_evid[future_id] = tmp_evid[:5]

  with open('./drive/My Drive/LAB/COMP90042 A3/predictions/Retrieval_BERT_future.pickle', 'wb') as f:
    pickle.dump(top_5_future_evid, f)

  # NOTE(review): the dev/test passes store this list as JSON; this pass pickles it.
  # The format is left as-is so an existing progress file stays readable.
  processed_future_embed.append(evid_embed_dir)
  with open('./drive/My Drive/LAB/COMP90042 A3/embeddings/processed_future_embed.pickle', 'wb') as f:
    pickle.dump(processed_future_embed, f)

pass evid_embed6400.json
evid_embed12800.json
evid_embed19200.json
evid_embed25600.json
evid_embed32000.json
evid_embed38400.json
evid_embed44800.json
evid_embed51200.json
evid_embed57600.json
evid_embed64000.json
evid_embed70400.json
evid_embed76800.json
evid_embed83200.json
evid_embed89600.json
evid_embed96000.json
evid_embed102400.json
evid_embed108800.json
evid_embed115200.json
evid_embed121600.json
evid_embed128000.json
evid_embed134400.json
evid_embed140800.json
evid_embed147200.json
evid_embed153600.json
evid_embed160000.json
evid_embed166400.json
evid_embed172800.json
evid_embed179200.json
evid_embed185600.json
evid_embed192000.json
evid_embed198400.json
evid_embed204800.json
evid_embed211200.json
evid_embed217600.json
evid_embed224000.json
evid_embed230400.json
evid_embed236800.json
evid_embed243200.json
evid_embed249600.json
evid_embed256000.json
evid_embed262400.json
evid_embed268800.json
evid_embed275200.json
evid_embed281600.json
evid_embed288000.json
evid_embed294400.json

F score

In [None]:
class PredictDataset():
    """Dataset that tokenizes the two sentences of each record separately.

    Each record in ``data`` may be either

    * ``((sentence1, sentence2), label)`` -- the layout used for the training
      data in this notebook (the label is ignored here), or
    * ``(sentence1, sentence2)`` -- a plain pair of strings.

    ``__getitem__`` returns, for each of the two sentences, a tensor of
    ``maxlen`` token ids and the matching attention mask.

    Args:
        data: indexable collection of records in one of the layouts above.
        maxlen: fixed length every token sequence is padded or truncated to.
    """

    def __init__(self, data, maxlen):

        #Store the records to encode (any indexable collection)
        self.data = data

        #Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        return len(self.data)

    def _encode(self, sentence):
        """Tokenize one sentence into (token id tensor, attention mask), both of length maxlen."""
        #Use the instance tokenizer, not a notebook-level global, so the class works on a fresh kernel
        tokens = ['[CLS]'] + self.tokenizer.tokenize(sentence) + ['[SEP]']

        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]'] * (self.maxlen - len(tokens)) #Padding sentences
        else:
            tokens = tokens[:self.maxlen-1] + ['[SEP]'] #Pruning the list to the specified max length

        #Obtaining the indices of the tokens in the BERT vocabulary, as a pytorch tensor
        tokens_ids_tensor = torch.tensor(self.tokenizer.convert_tokens_to_ids(tokens))

        #Attention mask: 1 for real tokens, 0 for padding.
        #Relies on '[PAD]' having id 0 in the tokenizer vocabulary (true for bert-base-uncased).
        attn_mask = (tokens_ids_tensor != 0).long()

        return tokens_ids_tensor, attn_mask

    def __getitem__(self, index):
        """Return (ids1, mask1, ids2, mask2) for the sentence pair at ``index``."""
        record = self.data[index]

        if isinstance(record[0], str):
            #Flat (sentence1, sentence2) pair; indexing [0][0] here would yield single characters
            sentence1, sentence2 = record[0], record[1]
        else:
            #((sentence1, sentence2), label) record; the label is not needed for prediction
            sentence1, sentence2 = record[0][0], record[0][1]

        tokens_ids_tensor1, attn_mask1 = self._encode(sentence1)
        tokens_ids_tensor2, attn_mask2 = self._encode(sentence2)

        return tokens_ids_tensor1, attn_mask1, tokens_ids_tensor2, attn_mask2

In [None]:
# Release the unused GPU memory cached by PyTorch before the prediction runs below.
torch.cuda.empty_cache()
# Force a full Python garbage collection; as the last expression of the cell,
# its return value (number of unreachable objects found) is shown as the cell output.
gc.collect()

9

In [None]:
# Candidate pool for retrieval: one (key, text) tuple per entry of scientific_claims_id,
# with the text looked up in `evidence` (so the keys are presumably evidence ids -- confirm).
EVIDENCE = [
    (evid_id, evidence[evid_id])
    for evid_id in scientific_claims_id
]

In [None]:
def get_retrievals(claims, file_name, SIZE=32):
  """Score every EVIDENCE entry against each claim and checkpoint the top 5 per claim.

  Results are stored in a pickle named ``{file_name}.pickle`` under the
  ``predictions/Retrievals`` folder. The file is re-written after every claim,
  and claims already present in it are skipped, so an interrupted run can be
  resumed by calling the function again with the same arguments.

  Args:
    claims: dict mapping claim id -> dict with at least a 'claim_text' entry.
    file_name: base name (without extension) of the checkpoint pickle.
    SIZE: batch size used for the prediction DataLoader.

  Returns:
    None. The retrievals are only written to the checkpoint file.

  Side effects:
    Sorts the module-level EVIDENCE list in place by its first element and
    prints one progress line per claim.
  """
  retrievals_path = f'./drive/My Drive/LAB/COMP90042 A3/predictions/Retrievals/{file_name}.pickle'

  # Resume from an earlier checkpoint when one exists. Only a missing or truncated
  # file means "start fresh"; any other error (e.g. an interrupt) should surface.
  # NOTE: pickle.load can execute arbitrary code -- only load checkpoints written by this notebook.
  try:
    with open(retrievals_path, 'rb') as f:
      retrievals = pickle.load(f)
  except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    retrievals = {}

  # Fixed ordering so that batch number `it` below maps back onto EVIDENCE[it*SIZE:(it+1)*SIZE]
  EVIDENCE.sort(key = lambda x:x[0])

  for claim_id in claims:
    if claim_id in retrievals:
      print('pass:', claim_id)
      continue

    print(claim_id)

    claim_text = claims[claim_id]['claim_text']

    # PredictDataset indexes each record as record[0][0] / record[0][1], i.e. it expects
    # ((sentence1, sentence2), label); a flat pair would be read character by character.
    data_for_predict = [((claim_text, evid[1]), None) for evid in EVIDENCE]

    set_for_predict = PredictDataset(data_for_predict, maxlen = 512)

    # No shuffling, so batches arrive in EVIDENCE order
    predict_loader = DataLoader(set_for_predict, batch_size = SIZE, num_workers = 2)

    predicted_logit = list()
    with torch.no_grad():
      for it, (tokens_ids_tensor1, attn_mask1, tokens_ids_tensor2, attn_mask2) in enumerate(predict_loader):

        # Free cached GPU memory between batches
        torch.cuda.empty_cache()
        gc.collect()
        tokens_ids_tensor1, attn_mask1, tokens_ids_tensor2, attn_mask2 = tokens_ids_tensor1.cuda(gpu), attn_mask1.cuda(gpu), tokens_ids_tensor2.cuda(gpu), attn_mask2.cuda(gpu)

        logits = net(tokens_ids_tensor1, attn_mask1, tokens_ids_tensor2, attn_mask2).tolist()
        logits = [x[0] for x in logits]

        # Pair each logit with the EVIDENCE entry of the same position in this batch
        batch_evidence = EVIDENCE[it*SIZE:(it+1)*SIZE]
        predicted_logit.extend(zip(batch_evidence, logits))

    # Keep the five highest-scoring entries
    predicted_logit.sort(key = lambda x:x[1], reverse = True)
    predicted_logit = predicted_logit[:5]

    # NOTE(review): each stored item is the whole EVIDENCE entry (key, text), not just the key.
    # Kept as-is so existing checkpoints stay consistent -- confirm what downstream readers expect.
    retrievals[claim_id] = {'evidences': [x[0] for x in predicted_logit]}

    # Checkpoint after every claim so a crashed or disconnected session loses at most one claim
    with open(retrievals_path, 'wb') as f:
      pickle.dump(retrievals, f)

In [None]:
# Retrieve the top-5 evidence entries for each dev claim (checkpointed to Retrieval_SBERT_reduced_dev.pickle).
get_retrievals(dev_claims, 'Retrieval_SBERT_reduced_dev', SIZE=32)

pass: claim-752
pass: claim-375
pass: claim-1266
pass: claim-871
pass: claim-2164
pass: claim-1607
pass: claim-761
pass: claim-1718
pass: claim-1273
pass: claim-1786
pass: claim-2796
pass: claim-2580
pass: claim-1219
pass: claim-75
pass: claim-2813
pass: claim-2335
pass: claim-161
pass: claim-2243
pass: claim-1256
pass: claim-506
pass: claim-369
pass: claim-2184
pass: claim-1057
pass: claim-104
pass: claim-1975
pass: claim-139
pass: claim-2062
pass: claim-1160
pass: claim-2679
pass: claim-2662
pass: claim-1490
pass: claim-2768
pass: claim-2168
pass: claim-785
pass: claim-2426
pass: claim-1292
pass: claim-993
pass: claim-2593
pass: claim-1567
pass: claim-1834
pass: claim-856
pass: claim-540
pass: claim-757
pass: claim-1407
pass: claim-3070
pass: claim-1745
pass: claim-1515
pass: claim-1519
pass: claim-3069
pass: claim-677
pass: claim-765
pass: claim-2275
pass: claim-1113
pass: claim-2611
pass: claim-2060
pass: claim-2326
pass: claim-1087
pass: claim-2867
pass: claim-2300
pass: claim-225

In [None]:
# Retrieve the top-5 evidence entries for each test claim (checkpointed to Retrieval_SBERT_reduced_test.pickle).
get_retrievals(test_claims, 'Retrieval_SBERT_reduced_test', SIZE=32)

In [None]:
# Retrieve the top-5 evidence entries for each unlabelled ("future") claim (checkpointed to Retrieval_SBERT_reduced_future.pickle).
get_retrievals(future_claims, 'Retrieval_SBERT_reduced_future', SIZE=32)

pass: claim-2967
pass: claim-979
pass: claim-1609
pass: claim-1020
pass: claim-2599
pass: claim-2110
pass: claim-1135
pass: claim-712
pass: claim-1307
pass: claim-148
pass: claim-903
pass: claim-2942
pass: claim-1001
pass: claim-1034
pass: claim-1009
pass: claim-770
pass: claim-3074
pass: claim-1761
pass: claim-1475
pass: claim-477
pass: claim-1378
pass: claim-503
pass: claim-2751
pass: claim-2575
pass: claim-30
pass: claim-2994
pass: claim-55
pass: claim-1271
pass: claim-2248
pass: claim-532
pass: claim-556
pass: claim-1173
pass: claim-539
pass: claim-893
pass: claim-2857
pass: claim-109
pass: claim-2476
pass: claim-3038
pass: claim-3127
pass: claim-474
pass: claim-2464
pass: claim-2427
pass: claim-2167
pass: claim-812
pass: claim-2590
pass: claim-404
pass: claim-2977
pass: claim-2673
pass: claim-2509
pass: claim-138
pass: claim-952
pass: claim-1691
pass: claim-1741
pass: claim-1202
pass: claim-1028
pass: claim-28
pass: claim-275
pass: claim-350
pass: claim-2204
pass: claim-1604
pass:

NameError: ignored

In [None]:
# with open('./drive/My Drive/LAB/COMP90042 A3/notebooks/test_predictions_sentence.pickle', 'rb') as f:
#   final_test_predictions_sentence = pickle.load(f)

In [None]:
# final_test_predictions_sentence = {}

In [None]:
# SIZE = 32

# EVIDENCE.sort(key = lambda x:x[0])

# i = 0
# for id in test_claims:
#   if id in final_test_predictions_sentence:
#     print('pass:', id)
#     continue

#   print(id)
#   test_data_for_predict = []

#   claim_text = test_claims[id]['claim_text']

#   for evid in EVIDENCE:

#     evid_text = evid[1]

#     test_data_for_predict.append((claim_text, evid_text))

#   test_set_for_predict = PredictDataset(test_data_for_predict, maxlen = 512)

#   test_predict_loader = DataLoader(test_set_for_predict, batch_size = SIZE, num_workers = 2)
  
#   predicted_logit = list()
#   with torch.no_grad():
#     for it, (tokens_ids_tensor1, attn_mask1, tokens_ids_tensor2, attn_mask2) in enumerate(dev_predict_loader):
      
#       torch.cuda.empty_cache()
#       gc.collect()
#       tokens_ids_tensor1, attn_mask1, tokens_ids_tensor2, attn_mask2 = tokens_ids_tensor1.cuda(gpu), attn_mask1.cuda(gpu), tokens_ids_tensor2.cuda(gpu), attn_mask2.cuda(gpu)

#       logits = net(tokens_ids_tensor1, attn_mask1, tokens_ids_tensor2, attn_mask2).tolist()
#       logits = [x[0] for x in logits]
      
#       for prediction in zip([claim_id for claim_id in EVIDENCE[it*SIZE:(it+1)*SIZE+1]], logits):
#         predicted_logit.append(prediction)
    
#     predicted_logit.sort(key = lambda x:x[1], reverse = True)
#     predicted_logit = predicted_logit[:5]

#     final_test_predictions_sentence[id] = {'evidences': [x[0] for x in predicted_logit]}

#     with open('./drive/My Drive/LAB/COMP90042 A3/notebooks/test_predictions_sentence.pickle', 'wb') as f:
#         pickle.dump(final_test_predictions_sentence, f)

pass: claim-1898
pass: claim-2276
pass: claim-564
pass: claim-3003
pass: claim-2173
pass: claim-1818
pass: claim-2903
pass: claim-1362
pass: claim-2726
pass: claim-1466
pass: claim-2040
pass: claim-311
pass: claim-1855
pass: claim-72
pass: claim-840
pass: claim-1075
pass: claim-2374
pass: claim-2305
pass: claim-904
pass: claim-1276
pass: claim-447
pass: claim-1673
pass: claim-2181
pass: claim-1360
pass: claim-2901
pass: claim-586
pass: claim-788
pass: claim-3009
pass: claim-2837
pass: claim-1553
pass: claim-1649
pass: claim-2682
pass: claim-1719
pass: claim-787
pass: claim-2430
pass: claim-3062
pass: claim-1286
pass: claim-1465
pass: claim-1067
pass: claim-2745
pass: claim-2720
pass: claim-2032
pass: claim-1991
pass: claim-920
pass: claim-1421
pass: claim-1555
pass: claim-2358
pass: claim-1565
pass: claim-582
pass: claim-1399
pass: claim-555
pass: claim-1923
pass: claim-1658
pass: claim-512
pass: claim-248
pass: claim-1980
pass: claim-1492
pass: claim-948
pass: claim-2912
pass: claim-2