# **GAI Project 3 PEFT on GLUE benchmarks - full-finetune**

## 環境設置

In [41]:
# 若沒有安裝 transformers 和 datasets 套件，請取消以下註解並執行
! pip install transformers
! pip install datasets
! pip install torch
! pip install peft
! pip install evaluate
! pip install transformers[torch]



In [42]:
# 1. Report the installed versions of the libraries this notebook relies on
import torch
import transformers
import datasets
import peft

# (message template, module) pairs, printed in this order
version_lines = (
    ("PyTorch 的版本為: {}", torch),
    ("Hugging Face Transformers 的版本為: {}", transformers),
    ("Hugging Face Datasets 的版本為: {}", datasets),
    ("PEFT 的版本為: {}", peft),
)
for template, module in version_lines:
    print(template.format(module.__version__))

PyTorch 的版本為: 2.2.1+cu121
Hugging Face Transformers 的版本為: 4.40.2
Hugging Face Datasets 的版本為: 2.19.1
PEFT 的版本為: 0.10.0


In [43]:
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
    TaskType,
)

In [44]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import re
import warnings
import json
import os
import evaluate
import numpy as np
from pathlib import Path # (Python3.4+)
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# "cuda" when torch reports an available CUDA device, otherwise "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

## 加入tokenizer

In [45]:
# Identifier of the pretrained checkpoint; the tokenizer here and the three
# task models in this notebook are all loaded from it.
checkpoint = "google-bert/bert-base-uncased"
# Tokenizer matching the checkpoint, shared by every task below.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [46]:
# Padding collator built from the shared tokenizer.
# NOTE(review): none of the Trainer(...) calls in this notebook pass it explicitly.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## **SST2**

### 資料處理

In [71]:
# Load the "sst2" configuration of GLUE; the bare expression displays its splits.
data_sst = load_dataset("GLUE", "sst2")
data_sst

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [72]:
# Fraction of every split that is kept for this run.
dataset_ratio = 0.1

# Number of leading rows to keep per split (int() truncates toward zero).
train_size, validation_size, test_size = (
    int(len(data_sst[split_name]) * dataset_ratio)
    for split_name in ("train", "validation", "test")
)

# Keep only the first `size` rows of each split, overwriting data_sst in place.
# NOTE: data_sst is both read and overwritten here, so re-running this cell
# shrinks the already-reduced splits again.
for split_name, size in (
    ("train", train_size),
    ("validation", validation_size),
    ("test", test_size),
):
    data_sst[split_name] = data_sst[split_name].select(range(size))

data_sst

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 6734
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 87
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 182
    })
})

In [73]:
def preprocess_function(batch):
    """Tokenize the "sentence" field of a batch with the shared tokenizer.

    Args:
        batch: Mapping with a "sentence" entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        ``truncation=True``.
    """
    sentences = batch["sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [74]:
# Tokenize every split in batches; the tokenizer's output is added as new
# columns next to the original ones. The bare expression displays the result.
sst2_tokenized_datasets = data_sst.map(preprocess_function, batched=True)
sst2_tokenized_datasets

Map:   0%|          | 0/6734 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6734
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 87
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 182
    })
})

### 引入模型

In [75]:
# Sequence-classification model loaded from the shared checkpoint, configured for 2 labels.
sst2_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Evaluate and Train

In [76]:
def sst2_compute_metrics(eval_preds):
    """Score predictions with the GLUE "sst2" metric.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class is the
            argmax of ``logits`` over its last axis.

    Returns:
        The result of ``metric.compute`` for the predictions against ``labels``.
    """
    # Load the metric on the first call only and reuse it afterwards, instead
    # of reloading it at every evaluation.
    metric = getattr(sst2_compute_metrics, "_metric", None)
    if metric is None:
        metric = evaluate.load("glue", "sst2")
        sst2_compute_metrics._metric = metric

    logits, labels = eval_preds
    # Last-axis argmax, the same reduction the MRPC metric function uses.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [77]:
# Training configuration for SST-2.
# Checkpoints and logs go to task-specific sub-directories: the CoLA and MRPC
# runs in this notebook would otherwise share "./results" and "./logs", and
# checkpoint rotation (save_total_limit) could delete another task's checkpoints.
sst2_training_args = TrainingArguments(
    output_dir="./results/sst2",          # where checkpoints are written
    num_train_epochs=5,                   # total number of training epochs
    learning_rate=1e-5,                   # learning rate
    per_device_train_batch_size=16,       # training batch size per device
    per_device_eval_batch_size=16,        # evaluation batch size per device
    gradient_accumulation_steps=1,        # number of gradient-accumulation steps
    warmup_steps=500,                     # warmup steps of the learning-rate scheduler
    weight_decay=0.01,                    # weight decay used by the optimizer
    evaluation_strategy="epoch",          # evaluate at the end of every epoch
    save_strategy="epoch",                # save a checkpoint at the end of every epoch
    save_total_limit=10,                  # maximum number of checkpoints kept
    logging_dir='./logs/sst2',            # where logs are written
    logging_steps=200,                    # log every 200 steps
    seed=42,
)

In [78]:
# Trainer for SST-2: ties together the model, the arguments, the tokenized splits and the metric.
sst2_trainer = Trainer(
    model=sst2_model,                     # model to fine-tune
    args=sst2_training_args,                  # training arguments defined for SST-2
    train_dataset=sst2_tokenized_datasets["train"],      # tokenized training split
    eval_dataset=sst2_tokenized_datasets["validation"],    # tokenized validation split, evaluated during training
    tokenizer=tokenizer,
    compute_metrics=sst2_compute_metrics,           # custom evaluation metric
)

In [79]:
# Make the Trainer treat this run as single-GPU.
# NOTE(review): `_n_gpu` is a private attribute of the arguments object — confirm
# it is still honoured by the installed transformers version.
sst2_trainer.args._n_gpu=1

# Start fine-tuning; the returned value is displayed as the cell output.
sst2_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.4271,0.259497,0.885057
2,0.2562,0.156745,0.954023
3,0.1709,0.304329,0.896552
4,0.1076,0.254708,0.931034
5,0.0738,0.333233,0.91954


TrainOutput(global_step=2105, training_loss=0.23229222648783704, metrics={'train_runtime': 300.4026, 'train_samples_per_second': 112.083, 'train_steps_per_second': 7.007, 'total_flos': 605639510558760.0, 'train_loss': 0.23229222648783704, 'epoch': 5.0})

## **CoLA**

### 資料處理

In [56]:
# Load the "cola" configuration of GLUE; the bare expression displays its splits.
data_cola = load_dataset("GLUE", "cola")
data_cola

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [57]:
def cola_preprocess_function(batch):
    """Tokenize the "sentence" field of a CoLA batch with the shared tokenizer.

    Args:
        batch: Mapping with a "sentence" entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with
        ``truncation=True``.
    """
    sentences = batch["sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [58]:
# Tokenize every CoLA split in batches; the tokenizer's output is added as new
# columns next to the original ones. The bare expression displays the result.
cola_tokenized_datasets = data_cola.map(cola_preprocess_function, batched=True)
cola_tokenized_datasets

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1063
    })
})

### 引入模型

In [59]:
# Fresh sequence-classification model for CoLA, loaded from the shared checkpoint with 2 labels.
cola_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Evaluate and train

In [60]:
def cola_compute_metrics(eval_preds):
    """Score predictions with the GLUE "cola" metric.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class is the
            argmax of ``logits`` over its last axis.

    Returns:
        The result of ``metric.compute`` for the predictions against ``labels``.
    """
    # Load the metric on the first call only and reuse it afterwards, instead
    # of reloading it at every evaluation.
    metric = getattr(cola_compute_metrics, "_metric", None)
    if metric is None:
        metric = evaluate.load("glue", "cola")
        cola_compute_metrics._metric = metric

    logits, labels = eval_preds
    # Last-axis argmax, the same reduction the MRPC metric function uses.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [61]:
# Training configuration for CoLA.
# Checkpoints and logs go to task-specific sub-directories: the SST-2 and MRPC
# runs in this notebook would otherwise share "./results" and "./logs", and
# checkpoint rotation (save_total_limit) could delete another task's checkpoints.
cola_training_args = TrainingArguments(
    output_dir="./results/cola",          # where checkpoints are written
    num_train_epochs=5,                   # total number of training epochs
    learning_rate=1e-5,                   # learning rate
    per_device_train_batch_size=16,       # training batch size per device
    per_device_eval_batch_size=16,        # evaluation batch size per device
    gradient_accumulation_steps=1,        # number of gradient-accumulation steps
    warmup_steps=500,                     # warmup steps of the learning-rate scheduler
    weight_decay=0.01,                    # weight decay used by the optimizer
    evaluation_strategy="epoch",          # evaluate at the end of every epoch
    save_strategy="epoch",                # save a checkpoint at the end of every epoch
    save_total_limit=10,                  # maximum number of checkpoints kept
    logging_dir='./logs/cola',            # where logs are written
    logging_steps=250,                    # log every 250 steps
    seed=42,
)

In [62]:
# Trainer for CoLA: ties together the model, the arguments, the tokenized splits and the metric.
cola_trainer = Trainer(
    model=cola_model,                         # model to fine-tune
    args=cola_training_args,                  # training arguments defined for CoLA
    train_dataset=cola_tokenized_datasets["train"],          # tokenized training split
    eval_dataset=cola_tokenized_datasets["validation"],      # tokenized validation split, evaluated during training
    tokenizer=tokenizer,
    compute_metrics=cola_compute_metrics,     # custom evaluation metric
)

In [63]:
# Make the Trainer treat this run as single-GPU.
# NOTE(review): `_n_gpu` is a private attribute of the arguments object — confirm
# it is still honoured by the installed transformers version.
cola_trainer.args._n_gpu=1

# Start fine-tuning; the returned value is displayed as the cell output.
cola_trainer.train()

Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.5235,0.487015,0.461338
2,0.4085,0.421025,0.568367
3,0.2655,0.498915,0.567605
4,0.2204,0.710624,0.541659
5,0.1723,0.727704,0.562907


TrainOutput(global_step=2675, training_loss=0.32723475589930456, metrics={'train_runtime': 315.5691, 'train_samples_per_second': 135.485, 'train_steps_per_second': 8.477, 'total_flos': 454848611954580.0, 'train_loss': 0.32723475589930456, 'epoch': 5.0})

## **MRPC**

### 資料處理

In [64]:
# Load the "mrpc" configuration of GLUE; the bare expression displays its splits.
# NOTE(review): `data` is a generic name next to data_sst / data_cola.
data = load_dataset("GLUE", "mrpc")
data

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [65]:
# Tokenize each (sentence1, sentence2) pair together, in batches.
# truncation=True matches the SST-2 and CoLA preprocessing in this notebook and
# keeps over-long pairs within the tokenizer's maximum length.
mrpc_tokenized_datasets = data.map(
    lambda batch: tokenizer(batch["sentence1"], batch["sentence2"], truncation=True),
    batched=True,
)
mrpc_tokenized_datasets

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

### 引入模型

In [66]:
# Fresh sequence-classification model for MRPC, loaded from the shared checkpoint with 2 labels.
mrpc_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Evaluate and train

In [67]:
def mrpc_compute_metrics(eval_preds):
    """Score predictions with the GLUE "mrpc" metric.

    Args:
        eval_preds: A ``(logits, labels)`` pair. The predicted class is the
            argmax of ``logits`` over its last axis.

    Returns:
        The result of ``metric.compute`` for the predictions against ``labels``.
    """
    # Load the metric on the first call only and reuse it afterwards, instead
    # of reloading it at every evaluation.
    metric = getattr(mrpc_compute_metrics, "_metric", None)
    if metric is None:
        metric = evaluate.load("glue", "mrpc")
        mrpc_compute_metrics._metric = metric

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [68]:
# Training configuration for MRPC.
# Checkpoints and logs go to task-specific sub-directories: the SST-2 and CoLA
# runs in this notebook would otherwise share "./results" and "./logs", and
# checkpoint rotation (save_total_limit) could delete another task's checkpoints.
mrpc_training_args = TrainingArguments(
    output_dir="./results/mrpc",          # where checkpoints are written
    num_train_epochs=5,                   # total number of training epochs
    learning_rate=1e-5,                   # learning rate
    per_device_train_batch_size=16,       # training batch size per device
    per_device_eval_batch_size=16,        # evaluation batch size per device
    gradient_accumulation_steps=1,        # number of gradient-accumulation steps
    warmup_steps=500,                     # warmup steps of the learning-rate scheduler
    weight_decay=0.01,                    # weight decay used by the optimizer
    evaluation_strategy="epoch",          # evaluate at the end of every epoch
    save_strategy="epoch",                # save a checkpoint at the end of every epoch
    save_total_limit=10,                  # maximum number of checkpoints kept
    logging_dir='./logs/mrpc',            # where logs are written
    logging_steps=150,                    # log every 150 steps
    seed=42,
)

In [69]:
# Trainer for MRPC: ties together the model, the arguments, the tokenized splits and the metric.
mrpc_trainer = Trainer(
    model=mrpc_model,                         # model to fine-tune
    args=mrpc_training_args,                  # training arguments defined for MRPC
    train_dataset=mrpc_tokenized_datasets["train"],          # tokenized training split
    eval_dataset=mrpc_tokenized_datasets["validation"],      # tokenized validation split, evaluated during training
    tokenizer=tokenizer,
    compute_metrics=mrpc_compute_metrics,     # custom evaluation metric
)

In [70]:
# Make the Trainer treat this run as single-GPU.
# NOTE(review): `_n_gpu` is a private attribute of the arguments object — confirm
# it is still honoured by the installed transformers version.
mrpc_trainer.args._n_gpu=1

# Start fine-tuning; the returned value is displayed as the cell output.
mrpc_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7018,0.582835,0.683824,0.812227
2,0.5258,0.496472,0.77451,0.855799
3,0.3979,0.396778,0.840686,0.886562
4,0.2276,0.466933,0.845588,0.892675
5,0.1644,0.510191,0.85049,0.895369


TrainOutput(global_step=1150, training_loss=0.3935712092855702, metrics={'train_runtime': 304.6447, 'train_samples_per_second': 60.201, 'train_steps_per_second': 3.775, 'total_flos': 714950848507680.0, 'train_loss': 0.3935712092855702, 'epoch': 5.0})