<a href="https://colab.research.google.com/github/THH1118/-1-/blob/main/assignment3_sample.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import transformers as T
!pip install datasets==2.21.0
import datasets
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from tqdm import tqdm
!pip install torchmetrics
import torchmetrics
from torchmetrics import SpearmanCorrCoef, Accuracy, F1Score
# Pearson correlation metric instance created once at notebook start-up.
pearson_corr = torchmetrics.PearsonCorrCoef()
# Use the first CUDA device when one is available; otherwise run on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"



In [None]:
# Print the installed `datasets` version to confirm which release is active.
print(datasets.__version__)

2.21.0


In [None]:
# Some full-width (Chinese) punctuation marks are encoded as [UNK] by the
# tokenizer, so each one is paired with an ASCII replacement.
# Every entry is a two-item list: [original, replacement].
token_replacement = [
    [full_width, ascii_mark]
    for full_width, ascii_mark in (
        ("：", ":"),
        ("，", ","),
        ("“", "\""),
        ("”", "\""),
        ("？", "?"),
        ("……", "..."),
        ("！", "!"),
    )
]

In [None]:
# Instantiate the multi-task model and move it to the selected device.
# NOTE(review): `MultiLabelModel` is defined in a later cell of this notebook, so on a
# fresh kernel (Restart & Run All) this line raises NameError unless that cell runs first.
model = MultiLabelModel().to(device)
# Load the uncased BERT tokenizer for the same checkpoint; downloads are cached under ./cache/.
tokenizer = T.BertTokenizer.from_pretrained("google-bert/bert-base-uncased", cache_dir="./cache/")



In [None]:
class SemevalDataset(Dataset):
    """SemEval-2014 Task 1 sentence pairs exposed as a map-style dataset.

    Every item is a dict holding (among others) the keys ``premise``,
    ``hypothesis``, ``relatedness_score`` and ``entailment_judgment``.
    """

    def __init__(self, split="train") -> None:
        """Load one split of ``sem_eval_2014_task_1`` into memory.

        Args:
            split: Either ``"train"`` or ``"validation"``.
        """
        super().__init__()
        assert split in ["train", "validation"]
        self.data = load_dataset(
            "sem_eval_2014_task_1", split=split, cache_dir="./cache/"
        ).to_list()

    def __getitem__(self, index):
        """Return record ``index`` with its punctuation normalised.

        The pairs in the module-level ``token_replacement`` list are applied
        to the ``premise`` and ``hypothesis`` strings.

        Args:
            index: Position of the record in ``self.data``.

        Returns:
            A new dict; the record stored in ``self.data`` is not modified.
        """
        # Work on a shallow copy so that reading an item never rewrites the
        # cached dataset as a side effect.
        record = dict(self.data[index])
        for key in ("premise", "hypothesis"):
            for original, replacement in token_replacement:
                record[key] = record[key].replace(original, replacement)
        return record

    def __len__(self):
        """Return the number of records in the loaded split."""
        return len(self.data)

# Peek at the first three training records. They are read straight from `.data`,
# so `__getitem__` (and its punctuation replacement) is not involved here.
data_sample = SemevalDataset(split="train").data[:3]
print(f"Dataset example: \n{data_sample[0]} \n{data_sample[1]} \n{data_sample[2]}")

Dataset example: 
{'sentence_pair_id': 1, 'premise': 'A group of kids is playing in a yard and an old man is standing in the background', 'hypothesis': 'A group of boys in a yard is playing and a man is standing in the background', 'relatedness_score': 4.5, 'entailment_judgment': 0} 
{'sentence_pair_id': 2, 'premise': 'A group of children is playing in the house and there is no man standing in the background', 'hypothesis': 'A group of kids is playing in a yard and an old man is standing in the background', 'relatedness_score': 3.200000047683716, 'entailment_judgment': 0} 
{'sentence_pair_id': 3, 'premise': 'The young boys are playing outdoors and the man is smiling nearby', 'hypothesis': 'The kids are playing outdoors near a man with a smile', 'relatedness_score': 4.699999809265137, 'entailment_judgment': 1}


In [None]:
# Define the hyperparameters
lr = 3e-5  # learning rate handed to the optimizer
epochs = 3  # number of passes over the training DataLoader
train_batch_size = 8  # batch size of the training DataLoader
validation_batch_size = 8  # batch size of the validation DataLoader

In [None]:
# TODO1: Create batched data for DataLoader
# `collate_fn` is a function that defines how the data batch should be packed.
# This function will be called in the DataLoader to pack the data batch.

def collate_fn(batch):
    """Pack a list of dataset records into model inputs and label tensors.

    TODO1-1: this is the function the DataLoader calls to assemble a batch.
    The module-level ``tokenizer`` encodes each (premise, hypothesis) pair,
    padding and truncating so the batch can be returned as PyTorch tensors.

    Args:
        batch: Sequence of dicts, each providing ``premise``, ``hypothesis``,
            ``relatedness_score`` and ``entailment_judgment``.

    Returns:
        A 3-tuple ``(inputs, relatedness_scores, entailment_judgments)``:
        the tokenizer output, a float32 tensor of regression targets and an
        int64 tensor of class labels.
    """
    premises, hypotheses = [], []
    score_values, label_values = [], []
    for example in batch:
        premises.append(example["premise"])
        hypotheses.append(example["hypothesis"])
        score_values.append(example["relatedness_score"])
        label_values.append(example["entailment_judgment"])

    # Encode the sentence pairs; padding/truncation give every row the same length.
    encoded = tokenizer(
        premises,
        hypotheses,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

    # Continuous targets for the regression task, discrete labels for classification.
    score_tensor = torch.tensor(score_values, dtype=torch.float32)
    label_tensor = torch.tensor(label_values, dtype=torch.long)

    return encoded, score_tensor, label_tensor

# TODO1-2: Define your DataLoader

# Load both splits first, then wrap each one in a DataLoader that batches with collate_fn.
train_dataset = SemevalDataset(split="train")
validation_dataset = SemevalDataset(split="validation")

# Training DataLoader: shuffling enabled.
dl_train = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True, collate_fn=collate_fn)

# Validation DataLoader: shuffling disabled.
dl_validation = DataLoader(validation_dataset, batch_size=validation_batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
# Sanity check: pull a single batch from the training DataLoader and print it.
print(next(iter(dl_train)))

({'input_ids': tensor([[  101,  2093,  2111,  2024,  3061,  2011,  1037,  6847,  2104,  2070,
          5340,  3628,   102,  2093,  2111,  2024,  3564,  2006,  1037,  6847,
          2104,  2070,  5340,  3628,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1037,  2158,  1998,  1037,  2450,  2024,  2770,  2362,  1998,
          3173,  2398,   102,  1037,  2158,  1998,  1037,  2450,  2024,  3061,
          2362,  1998,  3173,  2398,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  1037,  2158,  2003,  2652,  1996,  3846,   102,  2858,  2003,
          2108,  2209,  2011,  1037,  2158,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  101,  2093,  2273,  2024,  3564,  2006,  1037,  4020,  5178,  199

In [None]:
# TODO2: Construct your model
class MultiLabelModel(torch.nn.Module):
    """BERT encoder with two task heads fed by the [CLS] hidden state.

    * ``regression_head`` produces one value per example (relatedness score).
    * ``classification_head`` produces three logits per example (entailment
      judgement).
    """

    def __init__(self, *args, **kwargs):
        """Load the pre-trained encoder and create both linear heads."""
        super().__init__(*args, **kwargs)

        # Pre-trained BERT encoder shared by both tasks.
        self.bert = T.BertModel.from_pretrained("google-bert/bert-base-uncased")

        hidden_size = self.bert.config.hidden_size
        # Head 1: regression (relatedness_score).
        self.regression_head = torch.nn.Linear(hidden_size, 1)
        # Head 2: 3-way classification (entailment_judgement).
        self.classification_head = torch.nn.Linear(hidden_size, 3)

    def forward(self, **kwargs):
        """Encode the batch and apply both heads.

        Args:
            **kwargs: Keyword arguments forwarded unchanged to the BERT
                encoder, e.g. ``input_ids`` and ``attention_mask``.

        Returns:
            ``(relatedness_score, entailment_judgement)``: the regression
            head output (last dimension 1) and the classification logits
            (last dimension 3).
        """
        # Use the tensors passed to *this call*. Referring to bare names such
        # as `input_ids` here would silently pick up same-named notebook
        # globals (or raise NameError when none exist).
        bert_outputs = self.bert(**kwargs)

        # Hidden state of the first token ([CLS]) summarises the sentence pair.
        cls_output = bert_outputs.last_hidden_state[:, 0, :]

        relatedness_score = self.regression_head(cls_output)
        entailment_judgement = self.classification_head(cls_output)

        return relatedness_score, entailment_judgement


In [None]:
# TODO3: Define your optimizer and loss function

# TODO3-1: Define your Optimizer

# AdamW over all model parameters, using the learning rate `lr` defined with the hyperparameters.
optimizer = AdamW(model.parameters(), lr=lr)  # Write your code here

# TODO3-2: Define your loss functions (you should have two)

regression_loss_fn = torch.nn.MSELoss()  # loss for the regression task
classification_loss_fn = torch.nn.CrossEntropyLoss()  # loss for the classification task

# Write your code here

# scoring functions
spc = SpearmanCorrCoef()
acc = Accuracy(task="multiclass", num_classes=3)
f1 = F1Score(task="multiclass", num_classes=3, average='macro')

In [None]:
import os

# torch.save does not create missing parent directories, so make sure the
# checkpoint folder exists before the first checkpoint is written.
os.makedirs('./content/save_models', exist_ok=True)

for ep in range(epochs):
    # ---------------------------------------------------------------- train
    pbar = tqdm(dl_train)
    pbar.set_description(f"Training epoch [{ep+1}/{epochs}]")
    model.train()

    # TODO4: training loop (clear gradients, forward, loss, backward, step)
    total_loss = 0
    for batch in pbar:
        # Unpack the batch produced by collate_fn and move it to the device.
        input_ids = batch[0]["input_ids"].to(device)
        attention_mask = batch[0]["attention_mask"].to(device)
        relatedness_score = batch[1].to(device)         # regression targets
        entailment_judgement = batch[2].to(device)      # classification targets

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass.
        pred_relatedness, pred_entailment = model(input_ids=input_ids, attention_mask=attention_mask)

        # squeeze(-1) drops only the trailing size-1 dimension of the
        # regression output, so a batch containing a single example keeps
        # its batch dimension (a bare squeeze() would collapse it to 0-d).
        loss_relatedness = regression_loss_fn(pred_relatedness.squeeze(-1), relatedness_score.float())
        loss_entailment = classification_loss_fn(pred_entailment, entailment_judgement.long())

        # Joint objective: unweighted sum of both task losses.
        loss = loss_relatedness + loss_entailment
        total_loss += loss.item()

        # Back-propagation and parameter update.
        loss.backward()
        optimizer.step()

        # Show the current batch loss on the progress bar.
        pbar.set_postfix({"loss": loss.item()})

    # Report the mean batch loss of this epoch.
    avg_loss = total_loss / len(dl_train)
    print(f"Epoch [{ep+1}/{epochs}] - Average Loss: {avg_loss:.4f}")

    # ----------------------------------------------------------- validation
    pbar = tqdm(dl_validation)
    pbar.set_description(f"Validation epoch [{ep+1}/{epochs}]")
    model.eval()

    # TODO5: evaluation loop (SpearmanCorrCoef, Accuracy, F1Score)
    # Metric objects accumulate state across update() calls. Build them ONCE
    # per epoch (and on the same device as the tensors) so that compute()
    # covers the whole validation set rather than only the final batch.
    spearman_corr = SpearmanCorrCoef().to(device)
    pearson_corr = torchmetrics.PearsonCorrCoef().to(device)
    acc = Accuracy(task="multiclass", num_classes=3).to(device)
    f1 = F1Score(task="multiclass", num_classes=3, average='macro').to(device)

    with torch.no_grad():
        for batch in pbar:
            input_ids = batch[0]["input_ids"].to(device)
            attention_mask = batch[0]["attention_mask"].to(device)
            relatedness_score = batch[1].to(device)
            entailment_judgement = batch[2].to(device)

            # Forward pass.
            pred_relatedness, pred_entailment = model(input_ids=input_ids, attention_mask=attention_mask)

            # Accumulate regression metrics.
            pred_score = pred_relatedness.squeeze(-1)
            spearman_corr.update(pred_score, relatedness_score)
            pearson_corr.update(pred_score, relatedness_score)

            # Accumulate classification metrics.
            acc.update(pred_entailment, entailment_judgement)
            f1.update(pred_entailment, entailment_judgement)

        # Aggregate over every validation batch seen in this epoch.
        spearman_score = spearman_corr.compute()
        pearson_score = pearson_corr.compute()
        accuracy_score = acc.compute()
        f1_score = f1.compute()

        # Print the evaluation results.
        print(f"Validation Results - Epoch [{ep+1}/{epochs}]")
        print(f"Spearman Correlation: {spearman_score:.4f}")
        print(f"Pearson Correlation: {pearson_score:.4f}")
        print(f"Accuracy: {accuracy_score:.4f}")
        print(f"F1 Score: {f1_score:.4f}")


    torch.save(model, f'./content/save_models/ep{ep}.ckpt')

Training epoch [1/3]: 100%|██████████| 563/563 [00:22<00:00, 24.63it/s, loss=0.164]


Epoch [1/3] - Average Loss: 0.2520


Validation epoch [1/3]: 100%|██████████| 63/63 [00:01<00:00, 58.69it/s]


Validation Results - Epoch [1/3]
Spearman Correlation: 0.8166
Pearson Correlation: 0.9986
Accuracy: 1.0000
F1 Score: 1.0000


Training epoch [2/3]: 100%|██████████| 563/563 [00:22<00:00, 24.55it/s, loss=0.0121]


Epoch [2/3] - Average Loss: 0.2339


Validation epoch [2/3]: 100%|██████████| 63/63 [00:01<00:00, 60.52it/s]


Validation Results - Epoch [2/3]
Spearman Correlation: 0.8188
Pearson Correlation: 0.9977
Accuracy: 1.0000
F1 Score: 1.0000


Training epoch [3/3]: 100%|██████████| 563/563 [00:22<00:00, 24.65it/s, loss=0.308]


Epoch [3/3] - Average Loss: 0.1868


Validation epoch [3/3]: 100%|██████████| 63/63 [00:01<00:00, 61.02it/s]


Validation Results - Epoch [3/3]
Spearman Correlation: 0.8078
Pearson Correlation: 0.9974
Accuracy: 1.0000
F1 Score: 1.0000


In [None]:
import os

# Create the checkpoint directory if it does not exist yet.
save_dir = './content/save_models'
os.makedirs(save_dir, exist_ok=True)

For test set predictions, you can perform an evaluation similar to the one in #TODO5.