In [17]:
import json
import torch
import random
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, BertForSequenceClassification, BertTokenizer

## Read in Data

In [18]:
# Directory holding the project JSON files, relative to this notebook
DATA_DIR = '../project-data'


def load_json(file_name):
    """Parse one JSON file located in DATA_DIR and return the decoded object.

    The file is opened as UTF-8 explicitly so that parsing does not depend
    on the platform's default text encoding.
    """
    with open(f'{DATA_DIR}/{file_name}', 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


# Read in training data (claim)
tclaim_data = load_json('train-claims.json')

# Read in development data (claim)
dclaim_data = load_json('dev-claims.json')

# Read in test data (claim)
uclaim_data = load_json('test-claims-unlabelled.json')

# Read in evidence data; keep the keys as a list so they can be sampled from later
evi_data = load_json('evidence.json')
evi_keys = list(evi_data.keys())

## Create Training & Development Pairs

In [19]:
def build_claim_evidence_pairs(claim_data, evidence_lookup, evidence_ids):
    """Build labelled claim-evidence pairs for one claim set.

    Positive pairs (label 1) are created first: one per evidence listed under
    each claim's "evidences". Negative pairs (label 0) follow: for each claim,
    as many evidences as it has positives are drawn with ``random.sample``
    from the ids that are NOT listed for that claim.

    Args:
        claim_data: dict whose values carry "claim_text" and "evidences".
        evidence_lookup: dict mapping an evidence id to its content.
        evidence_ids: list of all evidence ids available for negative sampling.

    Returns:
        (pairs, labels) where pairs[k] is [claim_text, evidence] and
        labels[k] is 1 for a listed evidence, 0 for a sampled one.
    """
    pairs = []
    labels = []

    # Positive pairs: every evidence listed for the claim
    for claim in claim_data.values():
        for evidence_id in claim["evidences"]:
            pairs.append([claim["claim_text"], evidence_lookup[evidence_id]])
            labels.append(1)

    # Negative pairs: random evidences that are not listed for the claim
    for claim in claim_data.values():
        # Set membership keeps the filter linear in the number of evidence ids;
        # the candidate list keeps its original order, so sampling is unchanged.
        linked_ids = set(claim["evidences"])
        candidate_ids = [eid for eid in evidence_ids if eid not in linked_ids]
        for evidence_id in random.sample(candidate_ids, len(claim["evidences"])):
            pairs.append([claim["claim_text"], evidence_lookup[evidence_id]])
            labels.append(0)

    return pairs, labels


# Seed once so the sampled negatives are reproducible (training first, then development)
random.seed(1)

# Create positive and negative claim-evidence pairs for the training set
train_pairs, train_labels = build_claim_evidence_pairs(tclaim_data, evi_data, evi_keys)

# Split the training pairs into parallel claim / evidence lists
train_claim_sentence_list = [pair[0] for pair in train_pairs]
train_evi_sentence_list = [pair[1] for pair in train_pairs]

# Create positive and negative claim-evidence pairs for the development set
dev_pairs, dev_labels = build_claim_evidence_pairs(dclaim_data, evi_data, evi_keys)

# Split the development pairs into parallel claim / evidence lists
dev_claim_sentence_list = [pair[0] for pair in dev_pairs]
dev_evi_sentence_list = [pair[1] for pair in dev_pairs]

## Load the Pre-trained BERT Model for Fine-Tuning

In [20]:
# Single checkpoint name used for both the tokenizer and the classifier,
# so the two cannot drift apart when the checkpoint is changed.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# The classification head gets one output per distinct label in the training pairs
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(set(train_labels)))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Define the Dataset Structure & Obtain DataLoader for Batch Usage

In [None]:
# Define Dataset
class MyDataset(Dataset):
    """Claim-evidence pairs, tokenised once up front, with their labels.

    Args:
        claims: list of claim texts.
        evidences: list of evidence texts, parallel to ``claims``.
        labels: list of labels, parallel to ``claims``.
        max_length: length every pair is padded / truncated to (default 256).
        encoder: callable used to tokenise the pairs; when omitted, the
            notebook-level ``tokenizer`` is used.

    Raises:
        ValueError: if the three input lists do not have the same length.
    """

    def __init__(self, claims, evidences, labels, max_length=256, encoder=None):
        # Misaligned inputs would silently pair claims with the wrong evidence/label
        if not (len(claims) == len(evidences) == len(labels)):
            raise ValueError(
                "claims, evidences and labels must have the same length "
                f"(got {len(claims)}, {len(evidences)}, {len(labels)})"
            )
        self.claims = claims
        self.evidences = evidences
        self.labels = labels

        # Fall back to the notebook-level tokenizer unless one is injected
        encode = tokenizer if encoder is None else encoder
        self.encoding = encode(claims, evidences, padding='max_length', truncation=True, max_length=max_length)

    def __len__(self):
        """Return the number of claim-evidence pairs."""
        return len(self.claims)

    def __getitem__(self, idx):
        """Return the encoded fields of pair ``idx`` plus its "label", all as tensors."""
        instance = {key_type: torch.tensor(tokens[idx]) for key_type, tokens in self.encoding.items()}
        instance["label"] = torch.tensor(self.labels[idx])
        return instance

# Wrap the training and development pairs in tokenised Dataset objects
train_data = MyDataset(
    claims=train_claim_sentence_list,
    evidences=train_evi_sentence_list,
    labels=train_labels,
)
dev_data = MyDataset(
    claims=dev_claim_sentence_list,
    evidences=dev_evi_sentence_list,
    labels=dev_labels,
)

# Batch both sets in groups of 16; only the training loader shuffles
train_data_loader = DataLoader(dataset=train_data, shuffle=True, batch_size=16)
dev_data_loader = DataLoader(dataset=dev_data, shuffle=False, batch_size=16)

## Fine-Tune the BERT Model

In [None]:
# Select the GPU (cuda in Colab) when available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 10
performance = 0  # best development F1 seen so far

# Fine-tune for num_epochs epochs, evaluating on the development set after each one
for epoch in range(num_epochs):

    # Evaluation below switches to eval mode, so re-enable training mode every epoch
    model.train()

    # Iterate over shuffled mini-batches so the data never has to fit on the device at once
    for batch_info in train_data_loader:
        input_ids = batch_info['input_ids'].to(device)
        attention_mask = batch_info['attention_mask'].to(device)
        token_type_ids = batch_info["token_type_ids"].to(device)
        labels = batch_info['label'].to(device)

        # Pass everything by keyword: positionally, the fourth argument of the
        # BERT forward signature is position_ids, not labels, so the labels
        # would be misread and no loss would be returned.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=labels,
        )
        loss = outputs.loss

        # Clear gradients left over from the previous step before back-propagating
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Check the performance with the development set
    model.eval()
    # Collected under a separate name so the notebook-level dev_labels list is not rebound
    gold_labels = []
    pred_labels = []
    # Gradients are not needed for evaluation; skipping them saves memory and time
    with torch.no_grad():
        for dev_batch_info in dev_data_loader:
            # Load the development pairs (pos / neg)
            dev_input_ids = dev_batch_info['input_ids'].to(device)
            dev_attention_mask = dev_batch_info['attention_mask'].to(device)
            dev_token_type_ids = dev_batch_info["token_type_ids"].to(device)
            gold_labels += dev_batch_info["label"].tolist()

            # Predict the development set evidence-claim pairs
            outputs = model(
                input_ids=dev_input_ids,
                attention_mask=dev_attention_mask,
                token_type_ids=dev_token_type_ids,
            )
            pred_labels += torch.argmax(outputs.logits, dim=-1).cpu().tolist()

    # Save the tokenizer and model whenever the development F1 matches or beats the best so far
    current_performance = f1_score(gold_labels, pred_labels)
    if current_performance >= performance:
        performance = current_performance
        tokenizer.save_pretrained("fine_tuned_bert_model")
        model.save_pretrained("fine_tuned_bert_model")