In [16]:
import json
import torch
import random
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from sklearn.metrics import f1_score
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AdamW, BertForSequenceClassification, BertTokenizer
from nlp_function import pick_random_keys, stopwords_func, lower_processing, most_frequent_element

## Read in Data

In [17]:
# Load the three claim splits and the evidence corpus from the project data folder
def _read_json(path):
    """Return the parsed contents of the JSON file at `path`."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Claims used for training
tclaim_data = _read_json('../project-data/train-claims.json')

# Claims used for development / model selection
dclaim_data = _read_json('../project-data/dev-claims.json')

# Unlabelled test claims
uclaim_data = _read_json('../project-data/test-claims-unlabelled.json')

# Evidence corpus, plus its keys as a list for later sampling
evi_data = _read_json('../project-data/evidence.json')
evi_keys = list(evi_data)

## Create Training & Development Pairs

In [18]:
# Build labelled claim-evidence pairs for the training and development splits.
# Every gold evidence of a claim gives a positive pair (label 1); an equal number of
# randomly sampled non-gold evidence passages gives the negative pairs (label 0).
random.seed(1)
evidence_keys = []  # key of every evidence passage used in any pair, in insertion order


def _make_pairs(claims):
    """Return (pairs, labels) for `claims` and record the used keys in `evidence_keys`.

    `pairs` is a list of [claim_text, evidence_text]; all positive pairs come first,
    followed by the same number of negative pairs.
    """
    pairs, labels = [], []

    # Positive pairs: the gold evidence listed for each claim
    for claim in claims.values():
        for key in claim["evidences"]:
            evidence_keys.append(key)
            pairs.append([claim["claim_text"], evi_data[key]])
            labels.append(1)

    # Negative pairs: random evidence that is not gold for the claim
    for claim in claims.values():
        candidates = [key for key in evi_keys if key not in claim["evidences"]]
        for key in random.sample(candidates, len(claim["evidences"])):
            evidence_keys.append(key)
            pairs.append([claim["claim_text"], evi_data[key]])
            labels.append(0)

    return pairs, labels


# Training split: pairs, labels and the two parallel sentence lists
train_pairs, train_labels = _make_pairs(tclaim_data)
train_claim_sentence_list = [pair[0] for pair in train_pairs]
train_evi_sentence_list = [pair[1] for pair in train_pairs]

# Development split: pairs, labels and the two parallel sentence lists
dev_pairs, dev_labels = _make_pairs(dclaim_data)
dev_claim_sentence_list = [pair[0] for pair in dev_pairs]
dev_evi_sentence_list = [pair[1] for pair in dev_pairs]

## Load the Pre-trained BERT Model for Fine-Tuning

In [19]:
# Load the pre-trained tokenizer and a BERT sequence classifier from one checkpoint.
# `model_name` is the single source of truth so the tokenizer and model cannot drift apart.
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
# One output unit per distinct label present in the training pairs
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(set(train_labels)))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Define the Dataset Structure & Obtain DataLoader for Batch Usage

In [20]:
# Define Dataset
# Implement based on the Tutorial of BERT (10-bert.ipynb)
# https://canvas.lms.unimelb.edu.au/courses/151109/pages/worksheets-slash-notebooks?module_item_id=4589208

class MyDataset(Dataset):
    """Claim-evidence pair dataset whose sentences are tokenised once at construction.

    Uses the module-level `tokenizer`. Each item is a dict holding one tensor per
    tokenizer output field plus a "label" tensor.
    """

    def __init__(self, claims, evidences, labels):
        self.claims = claims
        self.evidences = evidences
        self.labels = labels
        # Encode every (claim, evidence) pair jointly, padded / truncated to 256 tokens
        self.encoding = tokenizer(
            claims,
            evidences,
            padding='max_length',
            truncation=True,
            max_length=256,
        )

    def __len__(self):
        # One example per claim entry
        return len(self.claims)

    def __getitem__(self, idx):
        # Slice example `idx` out of every encoded field and convert it to a tensor
        instance = {
            field: torch.tensor(values[idx])
            for field, values in self.encoding.items()
        }
        instance["label"] = torch.tensor(self.labels[idx])
        return instance

# Wrap the parallel claim / evidence / label lists of each split in a MyDataset
train_data = MyDataset(
    claims=train_claim_sentence_list,
    evidences=train_evi_sentence_list,
    labels=train_labels,
)
dev_data = MyDataset(
    claims=dev_claim_sentence_list,
    evidences=dev_evi_sentence_list,
    labels=dev_labels,
)

# Batch both splits; only the training batches are shuffled
train_data_loader = DataLoader(dataset=train_data, batch_size=16, shuffle=True)
dev_data_loader = DataLoader(dataset=dev_data, batch_size=16, shuffle=False)

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

## Start Training the Bert Model (Fine-Tune)

In [23]:
# Fine-tune BERT on the claim-evidence pairs, using the GPU (cuda in Colab) when available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 10
performance = 0  # best development-set F1 seen so far

for epoch in range(num_epochs):

    # ---- Training pass: one optimizer step per mini-batch ----
    model.train()
    for batch_info in train_data_loader:
        input_ids = batch_info['input_ids'].to(device)
        attention_mask = batch_info['attention_mask'].to(device)
        token_type_ids = batch_info["token_type_ids"].to(device)
        labels = batch_info['label'].to(device)

        # Clear the gradients left over from the previous step before back-propagating
        optimizer.zero_grad()
        outputs = model(input_ids = input_ids, attention_mask = attention_mask,
                        token_type_ids = token_type_ids, labels = labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # ---- Evaluation pass on the development pairs (pos / neg) ----
    model.eval()
    # Separate names so the notebook-level `dev_labels` list is not overwritten
    gold_labels = []
    pred_labels = []
    # no_grad: gradients are not needed for evaluation, so do not build autograd graphs
    with torch.no_grad():
        for dev_batch_info in dev_data_loader:
            dev_input_ids = dev_batch_info['input_ids'].to(device)
            dev_attention_mask = dev_batch_info['attention_mask'].to(device)
            dev_token_type_ids = dev_batch_info["token_type_ids"].to(device)
            gold_labels.extend(dev_batch_info["label"].numpy())

            # Predicted class = index of the largest logit
            outputs = model(dev_input_ids, attention_mask = dev_attention_mask,
                            token_type_ids = dev_token_type_ids)
            pred_labels.extend(torch.argmax(outputs.logits, dim = -1).cpu().numpy())

    # Checkpoint whenever the development F1 matches or beats the best so far
    current_performance = f1_score(gold_labels, pred_labels)
    if current_performance >= performance:
        performance = current_performance
        tokenizer.save_pretrained("../fine_tuned_bert_model")
        model.save_pretrained("../fine_tuned_bert_model")

## Matching Evidence for Test Set Claim Text

In [24]:
# Load the best checkpoint saved during fine-tuning
tokenizer = BertTokenizer.from_pretrained("../fine_tuned_bert_model")
model = BertForSequenceClassification.from_pretrained("../fine_tuned_bert_model")

# Pick the accelerator by testing the backend that will actually be used:
# CUDA first, then Apple-silicon MPS, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# Candidate evidence pool: every passage that was used in a training / development pair.
# dict.fromkeys removes duplicates while keeping first-seen order, so the key list and the
# text list stay index-aligned and the ordering is the same on every run.
evidence_keys = list(dict.fromkeys(evidence_keys))
potential_evidence = [evi_data[key] for key in evidence_keys]

# Prepare test claims
test_keys = list(uclaim_data.keys())
test_sentence = [claim["claim_text"] for claim in uclaim_data.values()]

# Score every (claim, candidate evidence) pair with the fine-tuned classifier
k = 3             # number of evidence passages kept per claim
batch_size = 64   # pairs scored per forward pass; lower this if memory runs out
model.eval()
for i in range(len(test_sentence)):
    softmax_prob = []
    # no_grad: inference only, and .numpy() cannot be called on tensors that require grad
    with torch.no_grad():
        for start in range(0, len(potential_evidence), batch_size):
            evidence_batch = potential_evidence[start:start + batch_size]
            # return_tensors='pt' yields tensors (not Python lists) that can be moved to `device`
            pair_encode = tokenizer([test_sentence[i]] * len(evidence_batch), evidence_batch,
                                    truncation = True, padding = 'max_length', max_length = 256,
                                    return_tensors = 'pt')
            outputs = model(input_ids = pair_encode["input_ids"].to(device),
                            attention_mask = pair_encode["attention_mask"].to(device),
                            token_type_ids = pair_encode["token_type_ids"].to(device))
            # Probability of the positive class (label 1) for each pair in the batch
            probability = torch.softmax(outputs.logits, dim = -1)
            softmax_prob.extend(probability[:, 1].cpu().numpy())

    # Candidate indices sorted from highest to lowest probability
    sort_idx = np.argsort(softmax_prob)[::-1]

    # Assign the top-k evidences to the claim, stored as evidence keys like the gold
    # "evidences" field of the training / development claims
    uclaim_data[test_keys[i]]['evidences'] = [evidence_keys[evi] for evi in sort_idx[:k]]

# Save the predictions to json
file_path = '../Performance/FineTune-Bert/test-claims-predictions.json'
with open(file_path, 'w') as json_file:
    json.dump(uclaim_data, json_file)

## Task 2: Claim Label Classification (TF-IDF Nearest Neighbours)

In [None]:
# Store the claim and evidence sentence(s) in lists for the embedding step
claim_train_sentence = [claim["claim_text"] for claim in tclaim_data.values()]
claim_dep_sentence = [claim["claim_text"] for claim in dclaim_data.values()]
claim_test_sentence = [claim["claim_text"] for claim in uclaim_data.values()]
evidence_full_sentence = list(evi_data.values())

# Gold evidence text and key for every training claim.
# The entries of train_pairs are [claim_text, evidence_text] only, so indexing positions
# 2 and 3 raised IndexError. The first half of train_pairs is exactly the positive pairs,
# which follow this claim / evidence order, so both lists are rebuilt from the claims.
# NOTE(review): presumably position 2 was the evidence text and 3 its key - confirm.
evidence_sample_sentence = []
tfidf_keys = []
for claim in tclaim_data.values():
    for evi_key in claim["evidences"]:
        evidence_sample_sentence.append(evi_data[evi_key])
        tfidf_keys.append(evi_key)

In [None]:
# Corpus used to fit the vocabulary / IDF weights: training + development claims
train_data = claim_train_sentence + claim_dep_sentence

# Claims to be labelled
test_data = claim_test_sentence

# Build the TF-IDF vectorizer and fit it on the training corpus
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(train_data)

# Embed both corpora with the fitted vectorizer as dense arrays
train_embeddings, test_embeddings = (
    tfidf_vectorizer.transform(corpus).toarray()
    for corpus in (train_data, test_data)
)

# Keep only the training-claim rows (the development rows follow them) and all test rows
n_train_claims = len(claim_train_sentence)
train_np = train_embeddings[:n_train_claims]
test_np = test_embeddings

In [None]:
# For every test claim, collect the indices of the most similar training claims
# (TF-IDF cosine similarity) for the majority-voting operation
n_neighbours = 11
if len(test_np):
    # One vectorised call yields the whole (n_test, n_train) similarity matrix
    # instead of one cosine_similarity call per (test, train) pair
    similarity_matrix = cosine_similarity(test_np, train_np)
    # argsort is ascending, so the last n_neighbours columns are the closest claims
    label_list = [list(row) for row in np.argsort(similarity_matrix, axis=1)[:, -n_neighbours:]]
else:
    label_list = []

# Reload the evidence predictions so the labels are added to the same records
with open('../Performance/FineTune-Bert/test-claims-predictions.json', 'r') as final_json:
    final_test = json.load(final_json)

# Obtain the most frequent label from the closest claim(s)
potential_label_list = []
train_key_list = list(tclaim_data.keys())
test_key_list = list(final_test.keys())
for i, neighbours in enumerate(label_list):
    # Labels of the nearest training claims of test claim i
    label_list_potential = [tclaim_data[train_key_list[j]]["claim_label"] for j in neighbours]
    potential_label_list.append(label_list_potential)
    final_test[test_key_list[i]]["claim_label"] = most_frequent_element(label_list_potential)

# Store to json
file_path = '../Performance/FineTune-Bert/test-claims-predictions.json'
with open(file_path, 'w') as json_file:
    json.dump(final_test, json_file)