In [7]:
import transformers
import torch
from datasets import load_dataset
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
from torchmetrics import SpearmanCorrCoef, Accuracy, F1Score

In [8]:
# 有些中文的標點符號在tokenizer編碼以後會變成[UNK]，所以將其換成英文標點

token_replacement = [
    ["：" , ":"],
    ["，" , ","],
    ["“" , "\""],
    ["”" , "\""],
    ["？" , "?"],
    ["……" , "..."],
    ["！" , "!"]
]

In [9]:
class SemevalDataset(Dataset):
    def __init__(self, split="train") -> None:
        super().__init__()
        assert split in ["train", "validation", "test"]
        self.data = load_dataset(
            "sem_eval_2014_task_1", split=split, cache_dir="./cache/", trust_remote_code=True
        ).to_list()

    def __getitem__(self, index):
        d = self.data[index]
        for k in ["premise", "hypothesis"]:
            for tok in token_replacement:
                d[k] = d[k].replace(tok[0], tok[1])
        return d

    def __len__(self):
        return len(self.data)

data_sample = SemevalDataset(split="train").data[:3]
print(f"Dataset example: \n{data_sample[0]} \n{data_sample[1]} \n{data_sample[2]}")

train_dataset = SemevalDataset(split="train").data
validation_dataset = SemevalDataset(split="validation").data
test_dataset = SemevalDataset(split="test").data

print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(validation_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Dataset example: 
{'sentence_pair_id': 1, 'premise': 'A group of kids is playing in a yard and an old man is standing in the background', 'hypothesis': 'A group of boys in a yard is playing and a man is standing in the background', 'relatedness_score': 4.5, 'entailment_judgment': 0} 
{'sentence_pair_id': 2, 'premise': 'A group of children is playing in the house and there is no man standing in the background', 'hypothesis': 'A group of kids is playing in a yard and an old man is standing in the background', 'relatedness_score': 3.200000047683716, 'entailment_judgment': 0} 
{'sentence_pair_id': 3, 'premise': 'The young boys are playing outdoors and the man is smiling nearby', 'hypothesis': 'The kids are playing outdoors near a man with a smile', 'relatedness_score': 4.699999809265137, 'entailment_judgment': 1}
Train dataset size: 4500
Validation dataset size: 500
Test dataset size: 4927


In [10]:
# Define the hyperparameters
lr = 3e-5
epochs = 4
train_batch_size = 16
validation_batch_size = 16
pretrain_model = 'google-bert/bert-base-uncased'
alpha = 0.7
beta = 0.3

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

Device: cuda


In [11]:
tokenizer = transformers.AutoTokenizer.from_pretrained(pretrain_model, cache_dir='./cache/')

In [12]:
# TODO1: Create batched data for DataLoader
# `collate_fn` is a function that defines how the data batch should be packed.
# This function will be called in the DataLoader to pack the data batch.

def collate_fn(batch):
    premises = [item['premise'] for item in batch]
    hypotheses = [item['hypothesis'] for item in batch]
    relatedness_scores = [item['relatedness_score'] for item in batch]
    entailment_judgments = [item['entailment_judgment'] for item in batch]

    # Tokenize premises and hypotheses, padding to max length within batch
    tokenized_data = tokenizer(
        premises, 
        hypotheses, 
        padding=True, 
        truncation=True, 
        return_tensors="pt"
    )

    # Convert relatedness_scores and entailment_judgments to tensors
    labels_relatedness = torch.tensor(relatedness_scores, dtype=torch.float32)
    labels_entailment = torch.tensor(entailment_judgments, dtype=torch.long)

    # Return tokenized data and labels
    return {
        "input_ids": tokenized_data["input_ids"],
        "attention_mask": tokenized_data["attention_mask"],
        "token_type_ids": tokenized_data["token_type_ids"],
        "relatedness_score": labels_relatedness,
        "entailment_judgment": labels_entailment
    }


# TODO1-2: Define your DataLoader
dl_train = DataLoader(
    SemevalDataset(split="train"),
    batch_size=train_batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
dl_validation = DataLoader(
    SemevalDataset(split="validation"),
    batch_size=validation_batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)
dl_test = DataLoader(
    SemevalDataset(split="test"),
    batch_size=validation_batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

In [13]:
# TODO2: Construct your model
class MultiLabelModel(torch.nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Write your code here
        # Define what modules you will use in the model
        self.encoder = transformers.AutoModel.from_pretrained(pretrain_model, cache_dir="./cache/")
        self.regression_head = torch.nn.Linear(self.encoder.config.hidden_size, 1)
        self.classification_head = torch.nn.Linear(self.encoder.config.hidden_size, 3)
    def forward(self, **kwargs):
        # Write your code here
        # Forward pass

        output = self.encoder(**kwargs)
        
        last_hidden_state = output.last_hidden_state  # 可以使用 last_hidden_state 或 pooler_output
        pooled_output = last_hidden_state[:, 0]  # 使用 [CLS] token 作為 pooled_output
        
        regression_logits = self.regression_head(pooled_output)
        classification_logits = self.classification_head(pooled_output)

        return regression_logits, classification_logits

In [14]:
model = MultiLabelModel().to(device)

In [15]:
# TODO3: Define your optimizer and loss function

# TODO3-1: Define your Optimizer
optimizer = AdamW(model.parameters(), lr=lr, weight_decay=0.01)

num_training_steps = epochs * len(dl_train)
num_warmup_steps = int(0.1 * num_training_steps)  # 10% of steps for warmup
scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)


# TODO3-2: Define your loss functions (you should have two)
# Write your code here
mse_loss = torch.nn.MSELoss()
cross_entropy_loss = torch.nn.CrossEntropyLoss()

# scoring functions
spc = SpearmanCorrCoef().to(device)
acc = Accuracy(task="multiclass", num_classes=3).to(device)
f1 = F1Score(task="multiclass", num_classes=3, average='macro').to(device)



In [16]:
i = 0
for ep in range(epochs):
    pbar = tqdm(dl_train)
    pbar.set_description(f"Training epoch [{ep+1}/{epochs}]")
    model.train()
    # TODO4: Write the training loop
    # Write your code here
    # train your model
    # clear gradient
    # forward pass
    # compute loss
    # back-propagation
    # model optimization

    total_loss = 0
    for batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        relatedness_score = batch['relatedness_score'].to(device)
        entailment_judgment = batch['entailment_judgment'].to(device)

        # Forward pass
        pred_relatedness_score, pred_entailment_judgment = model(
            input_ids=input_ids, 
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )

        # if (i == 0):
        #     print(f"Relatedness Score: {relatedness_score}")
        #     print(f"Predicted Relatedness Score: {pred_relatedness_score.squeeze()}")
        #     print(f"Entailment Judgment: {entailment_judgment}")
        #     print(f"Predicted Entailment Judgment: {pred_entailment_judgment}")

        # Compute loss
        loss_relatedness = mse_loss(pred_relatedness_score.squeeze(), relatedness_score)
        loss_entailment = cross_entropy_loss(pred_entailment_judgment, entailment_judgment)
        loss = alpha * loss_relatedness + beta * loss_entailment

        # if (i == 0):
        #     print(f"Loss Relatedness: {loss_relatedness}")
        #     print(f"Loss Entailment: {loss_entailment}")

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        i += 1

    pbar = tqdm(dl_validation)
    pbar.set_description(f"Validation epoch [{ep+1}/{epochs}]")
    model.eval()
    # TODO5: Write the evaluation loop
    # Write your code here
    # Evaluate your model
    # Output all the evaluation scores (SpearmanCorrCoef, Accuracy, F1Score)

    val_loss = 0
    with torch.no_grad():
        for batch in pbar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            relatedness_score = batch['relatedness_score'].to(device)
            entailment_judgment = batch['entailment_judgment'].to(device)

            pred_relatedness_score, pred_entailment_judgment = model(
                input_ids=input_ids, 
                attention_mask=attention_mask,
                token_type_ids=token_type_ids
            )

            # Validation loss calculation
            val_loss_relatedness = mse_loss(pred_relatedness_score.squeeze(), relatedness_score)
            val_loss_entailment = cross_entropy_loss(pred_entailment_judgment, entailment_judgment)
            val_loss += alpha * val_loss_relatedness + beta * val_loss_entailment

            # Metrics calculation
            spc.update(pred_relatedness_score.squeeze(), relatedness_score)
            acc.update(pred_entailment_judgment, entailment_judgment)
            f1.update(pred_entailment_judgment, entailment_judgment)

            

    print(f"Epoch {ep+1}")
    print(f"Validation Loss:     {val_loss / len(dl_validation)}")
    print(f"Spearman:            {spc.compute()}")
    print(f"Accuracy:            {acc.compute()}")
    print(f"F1 Score:            {f1.compute()}")
    print("")
    # torch.save(model, f'./models/ep{ep}.ckpt')

    spc.reset()
    acc.reset()
    f1.reset()  

  attn_output = torch.nn.functional.scaled_dot_product_attention(
Training epoch [1/4]: 100%|██████████| 282/282 [00:15<00:00, 18.41it/s]
Validation epoch [1/4]: 100%|██████████| 32/32 [00:00<00:00, 85.06it/s]


Epoch 1
Validation Loss:     0.45260924100875854
Spearman:            0.7951542139053345
Accuracy:            0.8100000023841858
F1 Score:            0.812621533870697



Training epoch [2/4]: 100%|██████████| 282/282 [00:14<00:00, 19.20it/s]
Validation epoch [2/4]: 100%|██████████| 32/32 [00:00<00:00, 91.72it/s]


Epoch 2
Validation Loss:     0.29361456632614136
Spearman:            0.823934018611908
Accuracy:            0.843999981880188
F1 Score:            0.8428548574447632



Training epoch [3/4]: 100%|██████████| 282/282 [00:14<00:00, 19.04it/s]
Validation epoch [3/4]: 100%|██████████| 32/32 [00:00<00:00, 86.78it/s]


Epoch 3
Validation Loss:     0.27632591128349304
Spearman:            0.8364449143409729
Accuracy:            0.8579999804496765
F1 Score:            0.8562049865722656



Training epoch [4/4]: 100%|██████████| 282/282 [00:14<00:00, 19.18it/s]
Validation epoch [4/4]: 100%|██████████| 32/32 [00:00<00:00, 83.99it/s]

Epoch 4
Validation Loss:     0.31952816247940063
Spearman:            0.838647186756134
Accuracy:            0.8560000061988831
F1 Score:            0.8535650968551636






In [18]:
pbar = tqdm(dl_test)
model.eval()

with torch.no_grad():
    for batch in pbar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        relatedness_score = batch['relatedness_score'].to(device)
        entailment_judgment = batch['entailment_judgment'].to(device)

        pred_relatedness_score, pred_entailment_judgment = model(
            input_ids=input_ids, 
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )

        # Metrics calculation
        spc.update(pred_relatedness_score.squeeze(), relatedness_score)
        acc.update(pred_entailment_judgment, entailment_judgment)
        f1.update(pred_entailment_judgment, entailment_judgment)

        loss_relatedness = mse_loss(pred_relatedness_score.squeeze(), relatedness_score)
        loss_entailment = cross_entropy_loss(pred_entailment_judgment, entailment_judgment)
        loss = alpha * loss_relatedness + beta * loss_entailment

        # if loss > 0.85:
        #     decoded_inputs = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]

        #     for i in range(len(decoded_inputs)):
        #         print(f"Premise: {decoded_inputs[i]}")
        #         print(f"Relatedness Score: {relatedness_score[i]}")
        #         print(f"Predicted Relatedness Score: {pred_relatedness_score[i][0]}")
        #         print(f"Entailment Judgment: {entailment_judgment[i]}")
        #         print(f"Predicted Entailment Judgment: {torch.argmax(pred_entailment_judgment[i])}")
        #         print("")

print(f"Spearman: {spc.compute()}")
print(f"Accuracy: {acc.compute()}")
print(f"F1 Score: {f1.compute()}")
print("")

spc.reset()
acc.reset()
f1.reset()

100%|██████████| 308/308 [00:03<00:00, 94.80it/s] 

Spearman: 0.831084668636322
Accuracy: 0.8652324080467224
F1 Score: 0.8599553108215332




