In [None]:
# 若沒有安裝 transformers 和 datasets 套件，請取消以下註解並執行
!pip install transformers==4.38.0
!pip install datasets
!pip install torch==2.0.1+cu110
!pip install peft

!git clone https://github.com/NVIDIA/apex
%cd apex
!pip install -r requirements.txt
!pip install -v --disable-pip-version-check --no-cache-dir ./


In [None]:
# 1. Import the core libraries whose versions matter for this notebook
import torch
import transformers
import datasets
import peft

# 2. Import the remaining general-purpose packages
import os
import json
import numpy as np
from pathlib import Path  # (Python 3.4+)

# Report the installed versions so a run can be reproduced later
print("PyTorch 的版本為: {}".format(torch.__version__))
print("Hugging Face Transformers 的版本為: {}".format(transformers.__version__))
print("Hugging Face Datasets 的版本為: {}".format(datasets.__version__))
print("PEFT 的版本為: {}".format(peft.__version__))

PyTorch 的版本為: 2.2.1+cu121
Hugging Face Transformers 的版本為: 4.38.0
Hugging Face Datasets 的版本為: 2.19.1
PEFT 的版本為: 0.10.0


In [None]:
from datasets import load_dataset

# Fetch the two GLUE tasks used in this notebook: SST-2 first, then RTE
dataset_sst2, dataset_rte = (
    load_dataset("glue", task_name) for task_name in ("sst2", "rte")
)

In [None]:
# # Collect every key of the RTE dataset
# keys2 = dataset_rte.keys()

# # Walk over all keys and show the structure of the dataset stored under each
# for key in keys2:
#     print(f"Dataset: {key}")
#     print(dataset_rte[key])
#     print("\n")

In [None]:
# 8. Load the tokenizer

# Hugging Face classes expose .from_pretrained() to load pretrained artefacts
tokenizer = transformers.AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='roberta-base',
)

In [None]:
# 9. Tokenize each of the three SST-2 splits (train / validation / test)
# truncation=True cuts sequences down to max_length.
# max_length can be passed as a tokenizer argument; when it is not given,
# the maximum sequence length of the model in use applies.
# padding=True pads every sequence to the longest one in the batch handed to
# the tokenizer (here: the whole split; see the source code for details).

sst2_train_encodings, sst2_val_encodings, sst2_test_encodings = (
    tokenizer(dataset_sst2[split]['sentence'], truncation=True, padding=True)
    for split in ('train', 'validation', 'test')
)


In [None]:
# Tokenize the RTE sentence pairs of every split; the result holds PyTorch tensors
rte_train_encodings, rte_val_encodings, rte_test_encodings = (
    tokenizer(
        dataset_rte[split]['sentence1'],
        dataset_rte[split]['sentence2'],
        return_tensors='pt',
        truncation=True,
        padding=True,
    )
    for split in ('train', 'validation', 'test')
)

In [None]:
# 13. 透過 PyTorch Dataset 來建立能夠進行方便資料存取的格式

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with per-example labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            collection. Values may hold plain Python lists or tensors
            (e.g. when the tokenizer was called with ``return_tensors='pt'``).
        labels: Indexable collection with one label per example.
    """

    def __init__(self, encodings, labels):
        # Keep references to the tokenized inputs and to their labels
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as an independent tensor.

        Existing tensors are copied with ``detach().clone()``; calling
        ``torch.tensor`` on them would emit a UserWarning for every item.
        """
        if isinstance(value, torch.Tensor):
            return value.detach().clone()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including ``'labels'``."""
        # The tokenized data is a dict: pick row ``idx`` from every field
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])

        return item

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

# Wrap each SST-2 split (encodings + labels) in the Dataset class
sst2_train_dataset = Dataset(
    encodings=sst2_train_encodings,
    labels=dataset_sst2['train']['label'],
)
sst2_val_dataset = Dataset(
    encodings=sst2_val_encodings,
    labels=dataset_sst2['validation']['label'],
)
sst2_test_dataset = Dataset(
    encodings=sst2_test_encodings,
    labels=dataset_sst2['test']['label'],
)

In [None]:
# Wrap each RTE split (encodings + labels) in the Dataset class
rte_train_dataset = Dataset(
    encodings=rte_train_encodings,
    labels=dataset_rte['train']['label'],
)
rte_val_dataset = Dataset(
    encodings=rte_val_encodings,
    labels=dataset_rte['validation']['label'],
)
rte_test_dataset = Dataset(
    encodings=rte_test_encodings,
    labels=dataset_rte['test']['label'],
)

In [None]:
# Pretrained RoBERTa encoder with a sequence-classification head on top
model = transformers.RobertaForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="roberta-base",
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# # LoRA (disabled): wrap the model with a PEFT LoRA adapter
# from peft import LoraConfig, TaskType, get_peft_model

# NOTE(review): the model in this notebook is a sequence classifier, so the
# task type below should presumably be TaskType.SEQ_CLS rather than CAUSAL_LM;
# confirm against the PEFT documentation before enabling this cell.
# config = LoraConfig(task_type=TaskType.CAUSAL_LM)
# model = get_peft_model(model, config)

In [None]:
# Disabled helper: print the name of every parameter of the model
# for name, parameter in model.named_parameters():
#     print(name)

In [None]:
# 18. 建立自定的評估的指標 (定義 function)
# 將作為 transformers.Trainer 的 parameters 之一

# Scikit-learn 的 precision_recall_fscore_support 套件可以一次計算 F1 score, precision, 和 recall
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# 請參考GLUE benchmark的官方網頁，使用和資料集對應的evaluation matrics

def compute_metrics(pred):
    """Compute accuracy for an evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores) and
            ``label_ids`` (reference labels).

    Returns:
        A dict with the single key ``'accuracy'``: the fraction of examples
        whose highest-scoring class equals the reference label.
    """
    # The predicted class is the index of the largest score along the last axis
    predicted_classes = pred.predictions.argmax(-1)
    hits = predicted_classes == pred.label_ids
    return {'accuracy': hits.mean()}


In [None]:
# 19. Train the model

# Options handed to transformers.Trainer through TrainingArguments
training_args = transformers.TrainingArguments(
    # --- output, logging and reproducibility ---
    output_dir="./results",            # directory for checkpoints and results
    logging_dir="./logs",              # directory for the log files
    logging_steps=10,
    report_to="tensorboard",           # send training logs to TensorBoard
    seed=42,
    # --- optimisation ---
    num_train_epochs=3,                # total number of training epochs
    learning_rate=2e-5,                # learning rate
    warmup_steps=500,                  # learning-rate scheduler warm-up steps
    weight_decay=0.01,                 # weight decay used by the optimizer
    per_device_train_batch_size=8,     # training batch size per device
    per_device_eval_batch_size=8,      # evaluation batch size per device
    gradient_accumulation_steps=2,     # steps over which gradients are accumulated
    # --- evaluation and checkpointing ---
    evaluation_strategy="steps",       # when to run evaluation
    eval_steps=500,                    # evaluate every 500 steps
    save_strategy="epoch",             # when to save a checkpoint
    # NOTE(review): save_steps presumably has no effect while
    # save_strategy is "epoch"; confirm against the TrainingArguments docs.
    save_steps=500,                    # save every 500 steps
    save_total_limit=1,                # keep at most this many checkpoints
)

In [None]:
# BitFit: keep only parameters whose name contains "bias" trainable and
# count how many scalar values those parameters hold.
# NOTE(review): this also freezes the newly initialised classifier weights
# ('classifier.dense.weight', 'classifier.out_proj.weight'), because their
# names do not contain "bias"; confirm that this is intended.
num_param = 0
for name, param in model.named_parameters():
    if "bias" in name:
        # Bias term: left as is and added to the count
        num_param += param.numel()
        continue
    # Everything else is excluded from gradient updates
    param.requires_grad = False
num_param

102914

In [None]:
# Total number of scalar values over all model parameters
sum(map(lambda weight: weight.numel(), model.parameters()))

124647170

In [None]:
# Assemble the Trainer from the model, its arguments, the SST-2 splits
# (PyTorch Datasets) and the custom metric function. Passing eval_dataset
# lets the Trainer evaluate while training.
trainer = transformers.Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=sst2_train_dataset,
    eval_dataset=sst2_val_dataset,
)

# Restrict training to a single GPU
trainer.args._n_gpu = 1

# Run the training loop; the returned TrainOutput is displayed as the cell result
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Accuracy
500,0.6996,0.692876,0.521789
1000,0.6819,0.693897,0.509174
1500,0.6755,0.693602,0.509174
2000,0.6747,0.682883,0.509174
2500,0.4797,0.485322,0.848624
3000,0.4871,0.405401,0.858945
3500,0.4482,0.379682,0.862385
4000,0.465,0.36983,0.866972
4500,0.3588,0.356569,0.87156
5000,0.3916,0.351986,0.872706


TrainOutput(global_step=12627, training_loss=0.43447850812989175, metrics={'train_runtime': 1769.146, 'train_samples_per_second': 114.206, 'train_steps_per_second': 7.137, 'total_flos': 6955865942774760.0, 'train_loss': 0.43447850812989175, 'epoch': 3.0})