In [None]:
# !pip install google.colab  # 在google colab上執行時安裝
# !pip install ipykernel  # 若在jupyter notebook,則是裝ipykernel

In [23]:
!pip install torch torchvision torchaudio



In [24]:
##### 設定GPU
import torch

# 檢查 PyTorch 版本
print("PyTorch 版本:", torch.__version__)

# 檢查 Metal GPU 是否可用(Mac版)
mps_available = torch.backends.mps.is_available()
print("Metal GPU 是否可用:", torch.backends.mps.is_available())

# 設定裝置
device = torch.device("mps") if mps_available else torch.device("cpu")
print("使用裝置:", device)

PyTorch 版本: 2.6.0
Metal GPU 是否可用: True
使用裝置: mps


In [25]:
# 相關套件版本號
import importlib.metadata

def get_package_version(package_name):
    try:
        version = importlib.metadata.version(package_name)
        return f"{package_name} 版本: {version}"
    except importlib.metadata.PackageNotFoundError:
        return f"{package_name} 未安裝"

# 要查詢的套件
packages = ["torch", "torchvision", "torchaudio", "transformers", "datasets", "evaluate", "accelerate", "scikit-learn","importlib.metadata"]

# 取得版本資訊
for package in packages:
    print(get_package_version(package))


torch 版本: 2.6.0
torchvision 版本: 0.21.0
torchaudio 版本: 2.6.0
transformers 版本: 4.49.0
datasets 版本: 3.3.1
evaluate 版本: 0.4.3
accelerate 版本: 1.4.0
scikit-learn 版本: 1.6.1
importlib.metadata 版本: 8.5.0


In [26]:
# 載入套件 & 取得資料
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
    AutoTokenizer, # 斷詞
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,)
import random
from sklearn.metrics import f1_score # 分類模型的評估指標：Precision 和 Recall 的 調和平均數

# 讀取資料
data = load_dataset("Johnson8187/Chinese_Multi-Emotion_Dialogue_Dataset")

# 情緒分類
emotion_mapping = {
    "平淡語氣": 0,
    "關切語調": 1,
    "開心語調": 2,
    "憤怒語調": 3,
    "悲傷語調": 4,
    "疑問語調": 5,
    "驚奇語調": 6,
    "厭惡語調": 7
}

# 將情緒分類轉換成數字
data = data.map(lambda x: {"emotion": emotion_mapping[x["emotion"]]})


In [27]:
##### 微調模型
'''
函式
'''
# 讀取 dataset
def load_dataset_from_file(file_path, seed=42):
    # 洗牌 (記得設定 random seed，確保每次洗牌結果一樣)
    random.seed(seed)
    random.shuffle(file_path)

    # 整合訓練資料
    sentences = [] # 給句子用
    labels = [] # 給答案用

    for text, index in zip(data["train"]["text"], data["train"]["emotion"]):
        sentences.append(text)
        labels.append(index)

    return sentences, labels # 回傳文字內容與標籤


# 轉換成 huggingface trainer 可以使用的 datasets
def convert_to_dataset(sentences, labels, tokenizer, max_seq_length):
    # 建立 Dataset
    dataset = Dataset.from_dict({
        'sentences': sentences,
        'labels': labels
    })

    # 回傳切分資料 (訓練 和 驗證)
    dataset = dataset.train_test_split(test_size=0.2)

    # 預處理資料
    def preprocess_data(dataset):
        # 將句子轉換為 token (tokenization)
        return tokenizer(
            dataset['sentences'], # 詞
            truncation=True,
            padding=True, # 如果沒有這一段就需要用校正器
            return_tensors='pt', # pytorch
            max_length=max_seq_length # 樣本數
        )

    # 轉換資料
    train_data = dataset['train'].map(preprocess_data, batched=True)
    valid_data = dataset['test'].map(preprocess_data, batched=True)

    return DatasetDict({ # 轉成huggingface可看懂的格式
        'train': train_data,
        'test': valid_data
    })

# 計算模型評估指標
def compute_metrics(predicted_results):
    labels = predicted_results.label_ids
    preds = predicted_results.predictions.argmax(-1)

    f1 = f1_score(labels, preds, average='weighted') # binary（二元類別）, micro, macro, weighted（多類別）
    return {
        'f1': f1,
    }


In [28]:
from sklearn.metrics import f1_score

# 驗證 F1 score 算法
y_true = [0,0,1,1,1,0,0]
y_pred = [0,1,0,1,1,1,0]
# y_true = [0,2,1,2,1,0,1]
# y_pred = [0,1,0,2,1,1,2]
print(f1_score(y_true, y_pred, average='weighted')) # binary, micro, macro

0.5714285714285714


In [29]:
# 主程式 - 微調模型
if __name__ == "__main__":
    '''
    設定 hyperparameters
    '''
    model_name = 'google-bert/bert-base-chinese' # 預訓練模型名稱
    max_seq_length = 512 # 可訓練的序列最大長度
    num_labels = 8 # 八元分類
    output_dir = './output' # 輸出模型資料夾

    # 讀取訓練資料
    sentences, labels = load_dataset_from_file(data)

    # 載入 tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # 將資料轉換為 huggingface 可以使用的格式
    dataset = convert_to_dataset(
        sentences, 
        labels, 
        tokenizer, 
        max_seq_length
    )

    # 讀取模型
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, 
        num_labels=num_labels
    )

    # 設定訓練參數
    training_args = TrainingArguments(
        output_dir=output_dir, # 輸出資料夾
        overwrite_output_dir=True,
        num_train_epochs=3, # 訓練回合數
        per_device_train_batch_size=32, # 批次大小
        per_device_eval_batch_size=32, # 批次大小
        gradient_accumulation_steps=2,
        learning_rate=0.00003, # 學習率 5e-5
        warmup_steps=100,
        weight_decay=0.01,
        eval_strategy="steps", # epoch, steps, no
        eval_steps=50,
        save_strategy="steps", # epoch, steps, no
        save_steps=50,
        save_total_limit=2,
        load_best_model_at_end=True,
        seed=42, # 隨機種子
        # lr_scheduler_type="linear", # https://blog.csdn.net/muyao987/article/details/139319466
        # report_to='wandb', # https://wandb.ai/
    )

    # 設定 Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['test'],
        data_collator=DataCollatorWithPadding(tokenizer),
        compute_metrics=compute_metrics,
    )

    # 開始訓練
    trainer.train()

    # 儲存模型
    trainer.save_model(output_dir) # , safe_serialization=True

    # 儲存 tokenizer
    tokenizer.save_pretrained(output_dir)

Map: 100%|██████████| 3327/3327 [00:00<00:00, 15903.83 examples/s]
Map: 100%|██████████| 832/832 [00:00<00:00, 16854.68 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,F1
50,No log,1.384277,0.688566
100,No log,0.513814,0.846187
150,No log,0.410678,0.870775


In [30]:
##### 測試微調好的模型

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
)
from pprint import pprint

model_dir = './output'
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)
pipe = pipeline(
    task='text-classification', 
    model=model, 
    tokenizer=tokenizer, 
    device=0
    )

list_text = [
    "我每天都能跟她一起上學，我好開心！",
	"最好的朋友要離開臺灣了，以後可能不容易再見面...",
	"我覺得我快不行了",
	"剛剛收到研究所錄取的通知書！",
	"今年的冬天好像比較晚來。",
    "有人在背後說我胖！幹! "  
]
result = pipe(list_text)
reverse_emotion_mapping = {j: k for k, j in emotion_mapping.items()}

for i in range(len(list_text)):
    print("-"*60)
    print(f"中文句子:{list_text[i]}")
    label_No = int(result[i]['label'][-1])
    print(f"句子情緒:{reverse_emotion_mapping[label_No]}")
    print(f"情緒分數: {round(result[i]['score'], 2)}")


Device set to use mps:0


------------------------------------------------------------
中文句子:我每天都能跟她一起上學，我好開心！
句子情緒:開心語調
情緒分數: 0.97
------------------------------------------------------------
中文句子:最好的朋友要離開臺灣了，以後可能不容易再見面...
句子情緒:悲傷語調
情緒分數: 0.95
------------------------------------------------------------
中文句子:我覺得我快不行了
句子情緒:悲傷語調
情緒分數: 0.84
------------------------------------------------------------
中文句子:剛剛收到研究所錄取的通知書！
句子情緒:開心語調
情緒分數: 0.95
------------------------------------------------------------
中文句子:今年的冬天好像比較晚來。
句子情緒:平淡語氣
情緒分數: 0.93
------------------------------------------------------------
中文句子:有人在背後說我胖！幹! 
句子情緒:憤怒語調
情緒分數: 0.94
