# **GAI Project 3 PEFT on GLUE benchmarks - bitfit**

## 環境設置

In [65]:
# 若沒有安裝 transformers 和 datasets 套件，請取消以下註解並執行
! pip install transformers
! pip install datasets
! pip install torch
! pip install peft
! pip install evaluate
! pip install transformers[torch]



In [66]:
# 1. Report the versions of the required packages
import torch
import transformers
import datasets
import peft

# Each message template is paired with the module whose __version__ it reports.
for version_template, package in (
    ("PyTorch 的版本為: {}", torch),
    ("Hugging Face Transformers 的版本為: {}", transformers),
    ("Hugging Face Datasets 的版本為: {}", datasets),
    ("PEFT 的版本為: {}", peft),
):
    print(version_template.format(package.__version__))

PyTorch 的版本為: 2.2.1+cu121
Hugging Face Transformers 的版本為: 4.40.2
Hugging Face Datasets 的版本為: 2.19.1
PEFT 的版本為: 0.10.0


In [67]:
from peft import (
    get_peft_config,
    get_peft_model,
    get_peft_model_state_dict,
    set_peft_model_state_dict,
    LoraConfig,
    PeftType,
    PrefixTuningConfig,
    PromptEncoderConfig,
    TaskType,
)

In [68]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import re
import warnings
import json
import os
import evaluate
import numpy as np
from pathlib import Path # (Python3.4+)
# "cuda" when torch reports an available CUDA device, otherwise "cpu".
device = "cuda" if torch.cuda.is_available() else "cpu"
# Silences every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

## 加入tokenizer

In [69]:
# Name of the pretrained checkpoint; reused for the tokenizer here and for
# every AutoModelForSequenceClassification.from_pretrained call below.
checkpoint = "google-bert/bert-base-uncased"
# Tokenizer matching the checkpoint, shared by all three tasks.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [70]:
# Padding collator built on the shared tokenizer.
# NOTE(review): none of the Trainer(...) calls visible in this notebook pass
# `data_collator`; confirm whether this object is still needed.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## **SST2**

### 資料處理

In [139]:
# Load the "sst2" configuration of GLUE; the bare name on the last line
# displays the resulting DatasetDict (train / validation / test splits).
data_sst = load_dataset("GLUE", "sst2")
data_sst

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [140]:
# Fraction of every split to keep.
dataset_ratio = 0.5

# Number of examples kept per split (int() truncates the product).
train_size, validation_size, test_size = (
    int(len(data_sst[split_name]) * dataset_ratio)
    for split_name in ("train", "validation", "test")
)

# Keep only the leading rows of each split.
# NOTE: data_sst is overwritten with its own subset, so re-running this cell
# shrinks the splits again.
for split_name, kept_rows in (
    ("train", train_size),
    ("validation", validation_size),
    ("test", test_size),
):
    data_sst[split_name] = data_sst[split_name].select(range(kept_rows))

data_sst

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 33674
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 436
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 910
    })
})

In [141]:
def preprocess_function(batch):
    """Tokenize the ``"sentence"`` column of ``batch`` with the shared tokenizer.

    Truncation is enabled; the tokenizer's output is returned unchanged.
    """
    sentences = batch["sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [142]:
# Apply preprocess_function to every split in batched mode; the original
# columns are kept and the tokenizer outputs are added alongside them.
sst2_tokenized_datasets = data_sst.map(preprocess_function, batched=True)
sst2_tokenized_datasets

Map:   0%|          | 0/33674 [00:00<?, ? examples/s]

Map:   0%|          | 0/436 [00:00<?, ? examples/s]

Map:   0%|          | 0/910 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 33674
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 436
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 910
    })
})

### 引入模型

In [143]:
# Sequence-classification model for SST-2, loaded from the shared checkpoint
# with two output labels.
sst2_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### PEFT(bitfit) and Evaluate

In [144]:
def sst2_compute_metrics(eval_preds):
    """Score SST-2 predictions with the GLUE "sst2" metric.

    Args:
        eval_preds: Pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        The dictionary produced by the metric's ``compute`` call.
    """
    glue_metric = evaluate.load("glue", "sst2")
    scores, gold_labels = eval_preds
    # The predicted class is the index of the largest logit along axis 1.
    predicted_classes = np.argmax(scores, axis=1)
    result = glue_metric.compute(predictions=predicted_classes, references=gold_labels)
    return result

In [145]:
def count_parameters(model):
    """Return the total element count of the parameters of ``model`` that have ``requires_grad`` set."""
    trainable_total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            trainable_total += tensor.numel()
    return trainable_total

def get_nb_trainable_parameters(model=None):
    r"""
    Returns the number of trainable parameters and number of all parameters in the model.

    Args:
        model: Module whose ``named_parameters()`` are counted. When omitted,
            the notebook-level ``sst2_model`` is used, so the previous
            zero-argument call keeps working.

    Returns:
        A ``(trainable_params, all_param)`` tuple of integers.
    """
    if model is None:
        model = sst2_model

    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        # Due to the design of 4bit linear layers from bitsandbytes
        # one needs to multiply the number of parameters by 2 to get
        # the correct number of parameters
        if param.__class__.__name__ == "Params4bit":
            num_params = num_params * 2

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    return trainable_params, all_param


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Module to inspect. The counts are taken from this argument
            rather than from a fixed global model.
    """
    trainable_params, all_param = get_nb_trainable_parameters(model)
    # Avoid a ZeroDivisionError for a model without any parameters.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0

    print(f"trainable params: {trainable_params}  all params: {all_param}  trainable%: {trainable_pct}")

In [146]:
# BitFit-style freezing: only parameters whose name contains "bias" stay trainable.
# NOTE(review): this also freezes the newly initialized `classifier.weight`
# (its name does not contain "bias"); confirm that is intended.
for param_name, parameter in sst2_model.named_parameters():
    if "bias" in param_name:
        continue
    parameter.requires_grad = False

print_trainable_parameters(sst2_model)

trainable params: 102914  all params: 109483778  trainable%: 0.09399931376135011


### Train

In [150]:
# Train the model

# Configure TrainingArguments.
# The checkpoint and log folders are task-specific: sharing one folder between
# tasks mixes their checkpoint-* directories, and save_total_limit rotation
# could then delete checkpoints that belong to another task.
sst2_training_args = TrainingArguments(
    output_dir="./results/sst2",         # checkpoint folder for this task
    num_train_epochs=5,                  # total number of training epochs
    learning_rate=5e-4,                  # learning rate
    per_device_train_batch_size=16,      # per-device batch size during training
    per_device_eval_batch_size=16,       # per-device batch size during evaluation
    gradient_accumulation_steps=5,       # number of gradient accumulation steps
    warmup_steps=500,                    # warm-up steps of the learning-rate scheduler
    weight_decay=0.01,                   # weight decay used by the optimizer
    evaluation_strategy="epoch",         # evaluate once per epoch
    save_strategy="epoch",               # save a checkpoint once per epoch
    save_total_limit=10,                 # maximum number of checkpoints to keep
    logging_dir='./logs/sst2',           # log folder for this task
    logging_steps=150,
    seed= 42,
)

In [151]:
# Trainer for SST-2: frozen-except-bias model, tokenized splits and metric callback.
sst2_trainer = Trainer(
    args=sst2_training_args,                                 # training hyper-parameters
    model=sst2_model,                                        # the 🤗 model to train
    tokenizer=tokenizer,
    compute_metrics=sst2_compute_metrics,                    # custom evaluation metric
    train_dataset=sst2_tokenized_datasets["train"],          # training split
    eval_dataset=sst2_tokenized_datasets["validation"],      # validation split, evaluated during training
)

In [152]:
# Use a single GPU for training: overrides the private `_n_gpu` attribute on
# the trainer's arguments.
sst2_trainer.args._n_gpu=1

# Start training; the returned TrainOutput is displayed as the cell result.
sst2_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.567,0.345414,0.864679
2,0.3184,0.282223,0.901376
3,0.2877,0.265837,0.915138
4,0.2683,0.26886,0.908257
5,0.265,0.257,0.912844


TrainOutput(global_step=2105, training_loss=0.34142463949117413, metrics={'train_runtime': 738.2021, 'train_samples_per_second': 228.081, 'train_steps_per_second': 2.852, 'total_flos': 3048834298421040.0, 'train_loss': 0.34142463949117413, 'epoch': 5.0})

## CoLA

### 資料處理

In [103]:
# Load the "cola" configuration of GLUE; the bare name on the last line
# displays the resulting DatasetDict (train / validation / test splits).
data_cola = load_dataset("GLUE", "cola")
data_cola

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [104]:
def cola_preprocess_function(batch):
    """Tokenize the ``"sentence"`` column of ``batch`` with the shared tokenizer.

    Truncation is enabled; the tokenizer's output is returned unchanged.
    """
    sentences = batch["sentence"]
    encoded = tokenizer(sentences, truncation=True)
    return encoded

In [105]:
# Apply cola_preprocess_function to every split in batched mode; the original
# columns are kept and the tokenizer outputs are added alongside them.
cola_tokenized_datasets = data_cola.map(cola_preprocess_function, batched=True)
cola_tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1063
    })
})

### 引入模型

In [106]:
# Sequence-classification model for CoLA, loaded from the shared checkpoint
# with two output labels.
cola_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### PEFT(bitfit) and Evaluate

In [107]:
def cola_compute_metrics(eval_preds):
    """Score CoLA predictions with the GLUE "cola" metric.

    Args:
        eval_preds: Pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        The dictionary produced by the metric's ``compute`` call.
    """
    glue_metric = evaluate.load("glue", "cola")
    scores, gold_labels = eval_preds
    # The predicted class is the index of the largest logit along axis 1.
    predicted_classes = np.argmax(scores, axis=1)
    result = glue_metric.compute(predictions=predicted_classes, references=gold_labels)
    return result

In [108]:
def count_parameters(model):
    """Return the total element count of the parameters of ``model`` that have ``requires_grad`` set."""
    trainable_total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            trainable_total += tensor.numel()
    return trainable_total

def get_nb_trainable_parameters(model=None):
    r"""
    Returns the number of trainable parameters and number of all parameters in the model.

    Args:
        model: Module whose ``named_parameters()`` are counted. When omitted,
            the notebook-level ``cola_model`` is used, so the previous
            zero-argument call keeps working.

    Returns:
        A ``(trainable_params, all_param)`` tuple of integers.
    """
    if model is None:
        model = cola_model

    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        # Due to the design of 4bit linear layers from bitsandbytes
        # one needs to multiply the number of parameters by 2 to get
        # the correct number of parameters
        if param.__class__.__name__ == "Params4bit":
            num_params = num_params * 2

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    return trainable_params, all_param


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Module to inspect. The counts are taken from this argument
            rather than from a fixed global model.
    """
    trainable_params, all_param = get_nb_trainable_parameters(model)
    # Avoid a ZeroDivisionError for a model without any parameters.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0

    print(f"trainable params: {trainable_params}  all params: {all_param}  trainable%: {trainable_pct}")

In [109]:
# BitFit-style freezing: only parameters whose name contains "bias" stay trainable.
# NOTE(review): this also freezes the newly initialized `classifier.weight`
# (its name does not contain "bias"); confirm that is intended.
for param_name, parameter in cola_model.named_parameters():
    if "bias" in param_name:
        continue
    parameter.requires_grad = False

print_trainable_parameters(cola_model)

trainable params: 102914  all params: 109483778  trainable%: 0.09399931376135011


### Train

In [110]:
# Train the model

# Configure TrainingArguments.
# The checkpoint and log folders are task-specific: sharing one folder between
# tasks mixes their checkpoint-* directories, and save_total_limit rotation
# could then delete checkpoints that belong to another task.
cola_training_args = TrainingArguments(
    output_dir="./results/cola",         # checkpoint folder for this task
    num_train_epochs=5,                  # total number of training epochs
    learning_rate=1e-3,                  # learning rate
    per_device_train_batch_size=16,      # per-device batch size during training
    per_device_eval_batch_size=16,       # per-device batch size during evaluation
    gradient_accumulation_steps=1,       # number of gradient accumulation steps
    warmup_steps=500,                    # warm-up steps of the learning-rate scheduler
    weight_decay=0.01,                   # weight decay used by the optimizer
    evaluation_strategy="epoch",         # evaluate once per epoch
    save_strategy="epoch",               # save a checkpoint once per epoch
    save_total_limit=10,                 # maximum number of checkpoints to keep
    logging_dir='./logs/cola',           # log folder for this task
    logging_steps=250,
    seed= 42,
)

In [111]:
# Trainer for CoLA: frozen-except-bias model, tokenized splits and metric callback.
cola_trainer = Trainer(
    args=cola_training_args,                                 # training hyper-parameters
    model=cola_model,                                        # the 🤗 model to train
    tokenizer=tokenizer,
    compute_metrics=cola_compute_metrics,                    # custom evaluation metric
    train_dataset=cola_tokenized_datasets["train"],          # training split
    eval_dataset=cola_tokenized_datasets["validation"],      # validation split, evaluated during training
)

In [112]:
# Use a single GPU for training: overrides the private `_n_gpu` attribute on
# the trainer's arguments.
cola_trainer.args._n_gpu=1

# Start training; the returned TrainOutput is displayed as the cell result.
cola_trainer.train()

Epoch,Training Loss,Validation Loss,Matthews Correlation
1,0.5799,0.518015,0.380333
2,0.5119,0.45757,0.509659
3,0.4496,0.535753,0.431873
4,0.4261,0.487779,0.483757
5,0.4083,0.47627,0.515291


TrainOutput(global_step=2675, training_loss=0.4829868951244889, metrics={'train_runtime': 167.9875, 'train_samples_per_second': 254.513, 'train_steps_per_second': 15.924, 'total_flos': 454848611954580.0, 'train_loss': 0.4829868951244889, 'epoch': 5.0})

## MRPC

### 資料處理

In [113]:
# Load the "mrpc" configuration of GLUE; the bare name on the last line
# displays the resulting DatasetDict (train / validation / test splits).
# NOTE(review): the generic name `data` differs from the `data_sst` /
# `data_cola` naming used for the other tasks.
data = load_dataset("GLUE", "mrpc")
data

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [114]:
# Tokenize every (sentence1, sentence2) pair in batched mode.
# truncation=True matches the SST2 and CoLA preprocessing functions and keeps
# an over-long pair from exceeding the model's maximum input length.
mrpc_tokenized_datasets = data.map(
    lambda x: tokenizer(x["sentence1"], x["sentence2"], truncation=True),
    batched=True,
)
mrpc_tokenized_datasets

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

### 引入模型

In [115]:
# Sequence-classification model for MRPC, loaded from the shared checkpoint
# with two output labels.
mrpc_model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### PEFT(bitfit) and Evaluate

In [116]:
def mrpc_compute_metrics(eval_preds):
    """Score MRPC predictions with the GLUE "mrpc" metric.

    Args:
        eval_preds: Pair ``(logits, labels)`` as handed over by the Trainer.

    Returns:
        The dictionary produced by the metric's ``compute`` call.
    """
    glue_metric = evaluate.load("glue", "mrpc")
    scores, gold_labels = eval_preds
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    result = glue_metric.compute(predictions=predicted_classes, references=gold_labels)
    return result

In [117]:
def count_parameters(model):
    """Return the total element count of the parameters of ``model`` that have ``requires_grad`` set."""
    trainable_total = 0
    for tensor in model.parameters():
        if tensor.requires_grad:
            trainable_total += tensor.numel()
    return trainable_total

def get_nb_trainable_parameters(model=None):
    r"""
    Returns the number of trainable parameters and number of all parameters in the model.

    Args:
        model: Module whose ``named_parameters()`` are counted. When omitted,
            the notebook-level ``mrpc_model`` is used, so the previous
            zero-argument call keeps working.

    Returns:
        A ``(trainable_params, all_param)`` tuple of integers.
    """
    if model is None:
        model = mrpc_model

    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        # Due to the design of 4bit linear layers from bitsandbytes
        # one needs to multiply the number of parameters by 2 to get
        # the correct number of parameters
        if param.__class__.__name__ == "Params4bit":
            num_params = num_params * 2

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    return trainable_params, all_param


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: Module to inspect. The counts are taken from this argument
            rather than from a fixed global model.
    """
    trainable_params, all_param = get_nb_trainable_parameters(model)
    # Avoid a ZeroDivisionError for a model without any parameters.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0

    print(f"trainable params: {trainable_params}  all params: {all_param}  trainable%: {trainable_pct}")

In [118]:
# BitFit-style freezing: only parameters whose name contains "bias" stay trainable.
# NOTE(review): this also freezes the newly initialized `classifier.weight`
# (its name does not contain "bias"); confirm that is intended.
for param_name, parameter in mrpc_model.named_parameters():
    if "bias" in param_name:
        continue
    parameter.requires_grad = False

print_trainable_parameters(mrpc_model)

trainable params: 102914  all params: 109483778  trainable%: 0.09399931376135011


### Train

In [119]:
# Train the model

# Configure TrainingArguments.
# The checkpoint and log folders are task-specific: sharing one folder between
# tasks mixes their checkpoint-* directories, and save_total_limit rotation
# could then delete checkpoints that belong to another task.
mrpc_training_args = TrainingArguments(
    output_dir="./results/mrpc",         # checkpoint folder for this task
    num_train_epochs=5,                  # total number of training epochs
    learning_rate=5e-4,                  # learning rate
    per_device_train_batch_size=16,      # per-device batch size during training
    per_device_eval_batch_size=16,       # per-device batch size during evaluation
    gradient_accumulation_steps=1,       # number of gradient accumulation steps
    warmup_steps=500,                    # warm-up steps of the learning-rate scheduler
    weight_decay=0.01,                   # weight decay used by the optimizer
    evaluation_strategy="epoch",         # evaluate once per epoch
    save_strategy="epoch",               # save a checkpoint once per epoch
    save_total_limit=10,                 # maximum number of checkpoints to keep
    logging_dir='./logs/mrpc',           # log folder for this task
    logging_steps=150,
    seed= 42,
)

In [120]:
# Trainer for MRPC: frozen-except-bias model, tokenized splits and metric callback.
mrpc_trainer = Trainer(
    args=mrpc_training_args,                                 # training hyper-parameters
    model=mrpc_model,                                        # the 🤗 model to train
    tokenizer=tokenizer,
    compute_metrics=mrpc_compute_metrics,                    # custom evaluation metric
    train_dataset=mrpc_tokenized_datasets["train"],          # training split
    eval_dataset=mrpc_tokenized_datasets["validation"],      # validation split, evaluated during training
)

In [121]:
# Use a single GPU for training: overrides the private `_n_gpu` attribute on
# the trainer's arguments.
mrpc_trainer.args._n_gpu=1

# Start training; the returned TrainOutput is displayed as the cell result.
mrpc_trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7098,0.619579,0.683824,0.812227
2,0.5989,0.542391,0.70098,0.820059
3,0.5702,0.525442,0.784314,0.855738
4,0.5401,0.508603,0.789216,0.858553
5,0.5242,0.507368,0.794118,0.862295


TrainOutput(global_step=1150, training_loss=0.580606686135997, metrics={'train_runtime': 203.8877, 'train_samples_per_second': 89.951, 'train_steps_per_second': 5.64, 'total_flos': 714950848507680.0, 'train_loss': 0.580606686135997, 'epoch': 5.0})