In [78]:
import transformers
import torch
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from torchmetrics import SpearmanCorrCoef, Accuracy, F1Score

In [79]:
# 有些中文的標點符號在tokenizer編碼以後會變成[UNK]，所以將其換成英文標點

token_replacement = [
    ["：" , ":"],
    ["，" , ","],
    ["“" , "\""],
    ["”" , "\""],
    ["？" , "?"],
    ["……" , "..."],
    ["！" , "!"]
]

In [80]:
class SemevalDataset(Dataset):
    def __init__(self, split="train") -> None:
        super().__init__()
        assert split in ["train", "validation", "test"]
        self.data = load_dataset(
            "sem_eval_2014_task_1", split=split, cache_dir="./cache/", trust_remote_code=True
        ).to_list()

    def __getitem__(self, index):
        d = self.data[index]
        for k in ["premise", "hypothesis"]:
            for tok in token_replacement:
                d[k] = d[k].replace(tok[0], tok[1])
        return d

    def __len__(self):
        return len(self.data)

data_sample = SemevalDataset(split="train").data[:3]
print(f"Dataset example: \n{data_sample[0]} \n{data_sample[1]} \n{data_sample[2]}")

train_dataset = SemevalDataset(split="train").data
validation_dataset = SemevalDataset(split="validation").data
test_dataset = SemevalDataset(split="test").data

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(validation_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Dataset example: 
{'sentence_pair_id': 1, 'premise': 'A group of kids is playing in a yard and an old man is standing in the background', 'hypothesis': 'A group of boys in a yard is playing and a man is standing in the background', 'relatedness_score': 4.5, 'entailment_judgment': 0} 
{'sentence_pair_id': 2, 'premise': 'A group of children is playing in the house and there is no man standing in the background', 'hypothesis': 'A group of kids is playing in a yard and an old man is standing in the background', 'relatedness_score': 3.200000047683716, 'entailment_judgment': 0} 
{'sentence_pair_id': 3, 'premise': 'The young boys are playing outdoors and the man is smiling nearby', 'hypothesis': 'The kids are playing outdoors near a man with a smile', 'relatedness_score': 4.699999809265137, 'entailment_judgment': 1}
Train dataset size: 4500
Validation dataset size: 500
Test dataset size: 4927


In [81]:
# Define the hyperparameters
lr = 3e-5
epochs = 3
train_batch_size = 16
validation_batch_size = 16
pretrain_model = 'microsoft/deberta-v3-base'
alpha = 0.75
beta = 0.25

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


In [82]:
tokenizer = transformers.DebertaV2Tokenizer.from_pretrained(pretrain_model, cache_dir='./cache/')

In [83]:
# TODO1: Create batched data for DataLoader
# `collate_fn` is a function that defines how the data batch should be packed.
# This function will be called in the DataLoader to pack the data batch.

def collate_fn(batch):
    premises = [item['premise'] for item in batch]
    hypotheses = [item['hypothesis'] for item in batch]
    relatedness_scores = [item['relatedness_score'] for item in batch]
    entailment_judgments = [item['entailment_judgment'] for item in batch]

    # Tokenize premises and hypotheses, padding to max length within batch
    tokenized_data = tokenizer(
        premises, 
        hypotheses, 
        padding=True, 
        truncation=True, 
        return_tensors="pt"
    )

    # Convert relatedness_scores and entailment_judgments to tensors
    labels_relatedness = torch.tensor(relatedness_scores, dtype=torch.float32)
    labels_entailment = torch.tensor(entailment_judgments, dtype=torch.long)

    # Return tokenized data and labels
    return {
        "input_ids": tokenized_data["input_ids"],
        "attention_mask": tokenized_data["attention_mask"],
        "relatedness_score": labels_relatedness,
        "entailment_judgment": labels_entailment
    }


# TODO1-2: Define your DataLoader
dl_train = DataLoader(
    SemevalDataset(split="train"),
    batch_size=train_batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
dl_validation = DataLoader(
    SemevalDataset(split="validation"),
    batch_size=validation_batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)
dl_test = DataLoader(
    SemevalDataset(split="test"),
    batch_size=validation_batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

In [84]:
# TODO2: Construct your model
class MultiLabelModel(torch.nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Write your code here
        # Define what modules you will use in the model
        self.encoder = transformers.AutoModel.from_pretrained(pretrain_model, cache_dir="./cache/")
        self.regression_head = torch.nn.Linear(self.encoder.config.hidden_size, 1)
        self.classification_head = torch.nn.Linear(self.encoder.config.hidden_size, 3)
    def forward(self, **kwargs):
        # Write your code here
        # Forward pass

        output = self.encoder(**kwargs)
        
        last_hidden_state = output.last_hidden_state  # 可以使用 last_hidden_state 或 pooler_output
        pooled_output = last_hidden_state[:, 0]  # 使用 [CLS] token 作為 pooled_output
        
        regression_logits = self.regression_head(pooled_output)
        classification_logits = self.classification_head(pooled_output)

        return regression_logits, classification_logits

In [85]:
model = MultiLabelModel().to(device)

In [86]:
# TODO3: Define your optimizer and loss function

# TODO3-1: Define your Optimizer
optimizer = AdamW(model.parameters(), lr=lr, weight_decay=0.01)

num_training_steps = epochs * len(dl_train)
num_warmup_steps = int(0.1 * num_training_steps)  # 10% of steps for warmup
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)


# TODO3-2: Define your loss functions (you should have two)
# Write your code here
mse_loss = torch.nn.MSELoss()
cross_entropy_loss = torch.nn.CrossEntropyLoss()

# scoring functions
spc = SpearmanCorrCoef().to(device)
acc = Accuracy(task="multiclass", num_classes=3).to(device)
f1 = F1Score(task="multiclass", num_classes=3, average='macro').to(device)

In [87]:
i = 0
for ep in range(epochs):
    pbar = tqdm(dl_train)
    pbar.set_description(f"Training epoch [{ep+1}/{epochs}]")
    model.train()
    # TODO4: Write the training loop
    # Write your code here
    # train your model
    # clear gradient
    # forward pass
    # compute loss
    # back-propagation
    # model optimization

    total_loss = 0
    for batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # token_type_ids = batch['token_type_ids'].to(device)
        relatedness_score = batch['relatedness_score'].to(device)
        entailment_judgment = batch['entailment_judgment'].to(device)

        # Forward pass
        pred_relatedness_score, pred_entailment_judgment = model(
            input_ids=input_ids, 
            attention_mask=attention_mask,
            # token_type_ids=token_type_ids
        )

        # if (i == 0):
        #     print(f"Relatedness Score: {relatedness_score}")
        #     print(f"Predicted Relatedness Score: {pred_relatedness_score.squeeze()}")
        #     print(f"Entailment Judgment: {entailment_judgment}")
        #     print(f"Predicted Entailment Judgment: {pred_entailment_judgment}")

        # Compute loss
        loss_relatedness = mse_loss(pred_relatedness_score.squeeze(), relatedness_score)
        loss_entailment = cross_entropy_loss(pred_entailment_judgment, entailment_judgment)
        loss = alpha * loss_relatedness + beta * loss_entailment

        if (i == 0):
            print(f"Loss Relatedness: {loss_relatedness}")
            print(f"Loss Entailment: {loss_entailment}")

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        i += 1

    pbar = tqdm(dl_validation)
    pbar.set_description(f"Validation epoch [{ep+1}/{epochs}]")
    model.eval()
    # TODO5: Write the evaluation loop
    # Write your code here
    # Evaluate your model
    # Output all the evaluation scores (SpearmanCorrCoef, Accuracy, F1Score)

    val_loss = 0
    total_spc, total_acc, total_f1 = 0, 0, 0
    with torch.no_grad():
        for batch in pbar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            relatedness_score = batch['relatedness_score'].to(device)
            entailment_judgment = batch['entailment_judgment'].to(device)

            pred_relatedness_score, pred_entailment_judgment = model(
                input_ids=input_ids, 
                attention_mask=attention_mask,
            )

            # Validation loss calculation
            val_loss_relatedness = mse_loss(pred_relatedness_score.squeeze(), relatedness_score)
            val_loss_entailment = cross_entropy_loss(pred_entailment_judgment, entailment_judgment)
            val_loss += alpha * val_loss_relatedness + beta * val_loss_entailment

            # Metrics calculation
            total_spc += spc(pred_relatedness_score.squeeze(), relatedness_score)
            total_acc += acc(pred_entailment_judgment, entailment_judgment)
            total_f1 += f1(pred_entailment_judgment, entailment_judgment)

            

    print(f"Epoch {ep+1}")
    print(f"Validation Loss:     {val_loss / len(dl_validation)}")
    print(f"Spearman:            {total_spc / len(dl_validation)}")
    print(f"Accuracy:            {total_acc / len(dl_validation)}")
    print(f"F1 Score:            {total_f1 / len(dl_validation)}")
    print("")
        
    # torch.save(model, f'./models/ep{ep}.ckpt')

Training epoch [1/3]:   0%|          | 0/282 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Training epoch [1/3]:   1%|          | 2/282 [00:00<00:29,  9.44it/s]

Loss Relatedness: 8.928752899169922
Loss Entailment: 1.193906545639038


Training epoch [1/3]: 100%|██████████| 282/282 [00:29<00:00,  9.58it/s]
Validation epoch [1/3]: 100%|██████████| 32/32 [00:00<00:00, 38.22it/s]


Epoch 1
Validation Loss:     0.2964879870414734
Spearman:            0.822015643119812
Accuracy:            0.87890625
F1 Score:            0.8349679112434387



Training epoch [2/3]: 100%|██████████| 282/282 [00:29<00:00,  9.68it/s]
Validation epoch [2/3]: 100%|██████████| 32/32 [00:01<00:00, 31.86it/s]


Epoch 2
Validation Loss:     0.27398166060447693
Spearman:            0.8647626042366028
Accuracy:            0.873046875
F1 Score:            0.8335893750190735



Training epoch [3/3]: 100%|██████████| 282/282 [00:29<00:00,  9.56it/s]
Validation epoch [3/3]: 100%|██████████| 32/32 [00:00<00:00, 40.19it/s]

Epoch 3
Validation Loss:     0.21579082310199738
Spearman:            0.8524675965309143
Accuracy:            0.87890625
F1 Score:            0.8415563702583313






In [88]:
pbar = tqdm(dl_test)
model.eval()

total_spc, total_acc, total_f1 = 0, 0, 0
with torch.no_grad():
    for batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        relatedness_score = batch['relatedness_score'].to(device)
        entailment_judgment = batch['entailment_judgment'].to(device)

        pred_relatedness_score, pred_entailment_judgment = model(
            input_ids=input_ids, 
            attention_mask=attention_mask,
        )

        # Metrics calculation
        total_spc += spc(pred_relatedness_score.squeeze(), relatedness_score)
        total_acc += acc(pred_entailment_judgment, entailment_judgment)
        total_f1 += f1(pred_entailment_judgment, entailment_judgment)

        loss_relatedness = mse_loss(pred_relatedness_score.squeeze(), relatedness_score)
        loss_entailment = cross_entropy_loss(pred_entailment_judgment, entailment_judgment)
        loss = alpha * loss_relatedness + beta * loss_entailment

        # if loss > 0.85:
        #     decoded_inputs = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]

        #     for i in range(len(decoded_inputs)):
        #         print(f"Premise: {decoded_inputs[i]}")
        #         print(f"Relatedness Score: {relatedness_score[i]}")
        #         print(f"Predicted Relatedness Score: {pred_relatedness_score[i][0]}")
        #         print(f"Entailment Judgment: {entailment_judgment[i]}")
        #         print(f"Predicted Entailment Judgment: {torch.argmax(pred_entailment_judgment[i])}")
        #         print("")


print(f"Spearman: {total_spc / len(dl_test)}")
print(f"Accuracy: {total_acc / len(dl_test)}")
print(f"F1 Score: {total_f1 / len(dl_test)}")
print("")

100%|██████████| 308/308 [00:09<00:00, 33.48it/s]

Spearman: 0.8274429440498352
Accuracy: 0.8997564911842346
F1 Score: 0.8680368661880493




