In [1]:
import os

import torch
from datasets import load_dataset
from evaluate import load
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import BertTokenizer, BertModel
device = "cuda" if torch.cuda.is_available() else "cpu"
# You can install and import any other libraries if needed
from transformers import RobertaModel, RobertaTokenizer, GPT2Model, GPT2Tokenizer

In [2]:
# Full-width (Chinese) punctuation marks may be tokenized as [UNK],
# so each one is mapped to its ASCII counterpart: [original, replacement].
token_replacement = [
    ["：", ":"],
    ["，", ","],
    ["“", '"'],
    ["”", '"'],
    ["？", "?"],
    ["……", "..."],
    ["！", "!"],
]

In [3]:
# Tokenizer for the "google-bert/bert-base-uncased" checkpoint; downloaded files are cached under ./cache/.
tokenizer = BertTokenizer.from_pretrained(
    "google-bert/bert-base-uncased",
    cache_dir="./cache/",
)

In [4]:
class SemevalDataset(Dataset):
    """One split of "sem_eval_2014_task_1", held in memory as a list of dicts.

    Indexing returns a single record whose "premise" and "hypothesis" strings
    have had every substitution in `token_replacement` applied.
    """

    def __init__(self, split="train") -> None:
        super().__init__()
        assert split in ["train", "validation", "test"]
        # Materialise the requested split as a plain Python list of records.
        loaded = load_dataset(
            "sem_eval_2014_task_1",
            split=split,
            trust_remote_code=True,
            cache_dir="./cache/",
        )
        self.data = loaded.to_list()

    def __len__(self):
        """Number of records in this split."""
        return len(self.data)

    def __getitem__(self, index):
        """Return record `index` with its two sentences punctuation-normalised."""
        record = self.data[index]
        # The stored dict is updated in place, so `self.data` sees the change too.
        for field in ("premise", "hypothesis"):
            text = record[field]
            for source, target in token_replacement:
                text = text.replace(source, target)
            record[field] = text
        return record

# Preview the first three raw training records to show the available fields.
data_sample = SemevalDataset(split="train").data[:3]
print(
    f"Dataset example: \n{data_sample[0]} \n{data_sample[1]} \n{data_sample[2]}"
)

Dataset example: 
{'sentence_pair_id': 1, 'premise': 'A group of kids is playing in a yard and an old man is standing in the background', 'hypothesis': 'A group of boys in a yard is playing and a man is standing in the background', 'relatedness_score': 4.5, 'entailment_judgment': 0} 
{'sentence_pair_id': 2, 'premise': 'A group of children is playing in the house and there is no man standing in the background', 'hypothesis': 'A group of kids is playing in a yard and an old man is standing in the background', 'relatedness_score': 3.200000047683716, 'entailment_judgment': 0} 
{'sentence_pair_id': 3, 'premise': 'The young boys are playing outdoors and the man is smiling nearby', 'hypothesis': 'The kids are playing outdoors near a man with a smile', 'relatedness_score': 4.699999809265137, 'entailment_judgment': 1}


In [5]:
# Hyperparameters for fine-tuning (adjust as needed)
lr = 2e-5  # learning rate handed to the optimizer
epochs = 5  # number of passes over the training split
# Training and validation currently share one batch size.
train_batch_size = validation_batch_size = 16

In [None]:
# TODO1: Create batched data for DataLoader
# `collate_fn` is a function that defines how the data batch should be packed.
# This function will be called in the DataLoader to pack the data batch.

def collate_fn(batch):
    """Pack a list of dataset records into one tokenized batch plus label tensors.

    Args:
        batch: sequence of dicts carrying the keys 'premise', 'hypothesis',
            'relatedness_score' and 'entailment_judgment'.

    Returns:
        dict with
            'input_text': the tokenizer output for the (premise, hypothesis) pairs,
            'label_1': tensor of relatedness scores (regression target),
            'label_2': tensor of entailment labels (classification target).
    """
    premises = [example['premise'] for example in batch]
    hypotheses = [example['hypothesis'] for example in batch]
    scores = [example['relatedness_score'] for example in batch]
    judgments = [example['entailment_judgment'] for example in batch]

    # Encode every premise together with its hypothesis as a sentence pair;
    # the pairs are padded/truncated (max_length=512) and returned as PyTorch tensors.
    encoded = tokenizer(
        premises,
        hypotheses,
        padding=True,
        truncation=True,
        max_length=512,
        return_token_type_ids=True,
        return_tensors='pt',
    )

    labels_regression = torch.tensor(scores)
    labels_classification = torch.tensor(judgments)
    return {
        'input_text': encoded,
        'label_1': labels_regression,
        'label_2': labels_classification,
    }

# TODO1-2: Define your DataLoader
# Pass the Dataset objects themselves rather than their raw `.data` lists:
# iterating `.data` skips SemevalDataset.__getitem__, so the punctuation
# replacement defined there would never be applied.
# Only the training loader is shuffled; validation and test keep dataset order.
dl_train = DataLoader(
    dataset=SemevalDataset(split='train'),
    batch_size=train_batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
dl_validation = DataLoader(
    dataset=SemevalDataset(split='validation'),
    batch_size=validation_batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)
dl_test = DataLoader(
    dataset=SemevalDataset(split='test'),
    batch_size=validation_batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

In [7]:
# TODO2: Construct your model
class MultiLabelModel(torch.nn.Module):
    """BERT encoder with two small MLP heads sharing one pooled representation.

    `forward` returns a dict with
        'relatedness_score': output of the regression head (1 value per example),
        'entailment_judgment': output of the classification head (3 logits per example).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Same checkpoint id and cache directory as the tokenizer, so both
        # come from one source and share one download location.
        self.bert = BertModel.from_pretrained(
            "google-bert/bert-base-uncased", cache_dir="./cache/"
        )
        # Read the encoder width from its config instead of hard-coding 768.
        hidden_size = self.bert.config.hidden_size
        self.dropout = torch.nn.Dropout(p=0.2)

        # Attribute names and layer positions are kept so existing
        # state_dict checkpoints still load.
        self.regression = self._make_head(hidden_size, out_features=1)
        self.classifier = self._make_head(hidden_size, out_features=3)

    @staticmethod
    def _make_head(in_features, out_features):
        """Build an in_features -> 64 -> 32 -> out_features MLP with ReLU activations."""
        return torch.nn.Sequential(
            torch.nn.Linear(in_features=in_features, out_features=64),
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=64, out_features=32),
            torch.nn.ReLU(),
            torch.nn.Linear(in_features=32, out_features=out_features),
        )

    def forward(self, **kwargs):
        """Run the encoder on the tokenizer outputs passed as keyword arguments.

        Returns:
            dict with keys 'relatedness_score' and 'entailment_judgment'.
        """
        encoder_output = self.bert(**kwargs)
        # Both heads consume the same dropout-regularised pooled output.
        pooled = self.dropout(encoder_output.pooler_output)

        return {
            'relatedness_score': self.regression(pooled),
            'entailment_judgment': self.classifier(pooled),
        }

In [8]:
# TODO3: Define your optimizer and loss function

# Build the network and move it to the selected device before creating the optimizer.
model = MultiLabelModel()
model = model.to(device)

# TODO3-1: Optimizer over every model parameter (encoder and both heads)
optimizer = AdamW(model.parameters(), lr=lr)

# TODO3-2: One loss per sub-task
criterion_reg = torch.nn.MSELoss()  # relatedness score (regression)
criterion_class = torch.nn.CrossEntropyLoss()  # entailment judgment (classification)

# Metrics used in the validation and test loops
psr = load("pearsonr")
acc = load("accuracy")

In [None]:
# Fine-tune for `epochs` epochs. After each epoch the validation split is scored
# and the weights with the best (Pearson + accuracy) sum are checkpointed.
checkpoint_path = './saved_models/best_model.ckpt'
# Make sure the checkpoint directory exists before the first save.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Start at -inf so the first epoch with a valid score is always saved,
# even when that score is zero or negative.
best_score = float("-inf")
for ep in range(epochs):
    pbar = tqdm(dl_train)
    pbar.set_description(f"Training epoch [{ep+1}/{epochs}]")
    model.train()
    # TODO4: Write the training loop
    for batch in pbar:
        # clear gradient
        optimizer.zero_grad()

        # forward pass: `**` unpacks the tokenizer output into keyword arguments
        output_y = model(**batch['input_text'].to(device))

        # compute loss: one term per sub-task; the model outputs are already on `device`
        loss_reg = criterion_reg(
            output_y['relatedness_score'].squeeze(1), batch['label_1'].to(device)
        )
        loss_logit = criterion_class(
            output_y['entailment_judgment'], batch['label_2'].to(device)
        )

        # back-propagation: a single backward pass over the summed loss accumulates
        # the same gradients as two separate passes, without retain_graph=True
        (loss_reg + loss_logit).backward()

        # model optimization
        optimizer.step()

    pbar = tqdm(dl_validation)
    pbar.set_description(f"Validation epoch [{ep+1}/{epochs}]")
    model.eval()
    # TODO5: Write the evaluation loop
    PRED_REG = list()
    PRED_LOGIT = list()
    TRUE_REG = list()
    TRUE_LOGIT = list()

    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for batch in pbar:
            TRUE_REG.extend(batch['label_1'].tolist())
            TRUE_LOGIT.extend(batch['label_2'].tolist())

            output_y = model(**batch['input_text'].to(device))

            PRED_REG.extend(output_y['relatedness_score'].squeeze(1).tolist())
            # Predicted class = index of the largest logit.
            PRED_LOGIT.extend(torch.argmax(output_y['entailment_judgment'], dim=1).tolist())

    pearson_corr = psr.compute(predictions=PRED_REG, references=TRUE_REG)['pearsonr']
    accuracy = acc.compute(predictions=PRED_LOGIT, references=TRUE_LOGIT)['accuracy']
    print(f"##### EPOCH {ep+1} #####\nPearson Coefficient: {pearson_corr}\nAccuracy: {accuracy}\n")

    # Keep only the checkpoint with the highest combined validation score.
    if pearson_corr + accuracy > best_score:
        best_score = pearson_corr + accuracy
        torch.save(model.state_dict(), checkpoint_path)

Training epoch [1/5]: 100%|██████████| 282/282 [00:12<00:00, 22.91it/s]
Validation epoch [1/5]: 100%|██████████| 32/32 [00:00<00:00, 92.26it/s]


##### EPOCH 1 #####
Pearson Coefficient: 0.23714808261230286
Accuracy: 0.564



Training epoch [2/5]: 100%|██████████| 282/282 [00:11<00:00, 24.91it/s]
Validation epoch [2/5]: 100%|██████████| 32/32 [00:00<00:00, 95.70it/s]


##### EPOCH 2 #####
Pearson Coefficient: 0.7496506457298246
Accuracy: 0.564



Training epoch [3/5]: 100%|██████████| 282/282 [00:11<00:00, 24.25it/s]
Validation epoch [3/5]: 100%|██████████| 32/32 [00:00<00:00, 88.68it/s]


##### EPOCH 3 #####
Pearson Coefficient: 0.7775288295848769
Accuracy: 0.702



Training epoch [4/5]: 100%|██████████| 282/282 [00:12<00:00, 23.18it/s]
Validation epoch [4/5]: 100%|██████████| 32/32 [00:00<00:00, 84.61it/s]


##### EPOCH 4 #####
Pearson Coefficient: 0.8253423896996699
Accuracy: 0.824



Training epoch [5/5]: 100%|██████████| 282/282 [00:12<00:00, 23.17it/s]
Validation epoch [5/5]: 100%|██████████| 32/32 [00:00<00:00, 92.69it/s]


##### EPOCH 5 #####
Pearson Coefficient: 0.8492353943299349
Accuracy: 0.848



In [None]:
# Load the best checkpoint saved during training.
model = MultiLabelModel().to(device)
# map_location lets the checkpoint load even if it was saved on a different device.
model.load_state_dict(
    torch.load(f"./saved_models/best_model.ckpt", map_location=device, weights_only=True)
)

# Test Loop
pbar = tqdm(dl_test, desc="Test")
model.eval()

# TODO6: Write the test loop
# Collect predictions and references over the whole test split, then score them once.
true_y_reg = list()
pred_y_reg = list()

true_y_logit = list()
pred_y_logit = list()

# Inference only: disable gradient tracking to save memory and time.
with torch.no_grad():
    for batch in pbar:
        true_y_reg.extend(batch['label_1'].tolist())
        true_y_logit.extend(batch['label_2'].tolist())

        predict_y = model(**batch['input_text'].to(device))

        pred_y_reg.extend(predict_y['relatedness_score'].squeeze(1).tolist())
        # Predicted class = index of the largest logit.
        pred_y_logit.extend(torch.argmax(predict_y['entailment_judgment'], dim=1).tolist())

pea_corr = psr.compute(predictions=pred_y_reg, references=true_y_reg)['pearsonr']
accuracy = acc.compute(predictions=pred_y_logit, references=true_y_logit)['accuracy']

print(f"\n----- PERFORMANCE RESULT -----\nPearson Correlation: {pea_corr}\nAccuracy: {accuracy}\n")


Test: 100%|██████████| 308/308 [00:03<00:00, 90.22it/s] 


----- PERFORMANCE RESULT -----
Pearson Correlation: 0.8604370326699358
Accuracy: 0.86198498071849




