# 各種 Import

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback

# 資料集處理 (單純的 pandas 切分)

In [None]:
df = pd.read_csv('IMDB Dataset.csv')

for i in range(len(df['sentiment'])):
    if df.at[i, 'sentiment'] == 'positive':
        df.at[i, 'sentiment'] = 1
    else:
        df.at[i, 'sentiment'] = 0

groups = df.groupby(df.sentiment)
data_positive = groups.get_group(1)
data_negative = groups.get_group(0)

data_positive = data_positive.sample(frac=1.0)
data_negative = data_negative.sample(frac=1.0)

test_positive = data_positive.iloc[20000:, :]
test_negative = data_negative.iloc[20000:, :]
test_data = pd.concat([test_positive, test_negative], axis = 0, ignore_index=True).sample(frac=1)
train_positive = data_positive.iloc[:20000, :]
train_negative = data_negative.iloc[:20000, :]
train_data = pd.concat([train_positive, train_negative], axis = 0, ignore_index=True).sample(frac=1)

test_data.to_csv('test.csv', index = None)
train_data.to_csv('train.csv', index = None)

# 資料前處理
1. BERT cased vs. BERT uncased  
   BERT cased: 字母保留原樣  
   BERT uncased: 在做 WordPiece tokenization 前，會先將所有字母變小寫 (lowercased)、附加符號 (Accent markers) 拔掉  
2. BertModel vs. BertForSequenceClassification  
   BertModel: 基本 BERT 模型  
   BertForSequenceClassification: 針對序列 (文本) 分類做過 Fine-tune 的 BERT 模型

In [None]:
# Read data
data = pd.read_csv("train.csv")

# Define pretrained tokenizer and model
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2) # num_labels: 分類類別數量

# ----- 1. Preprocess data -----#
# Preprocess data
X = list(data["review"])
y = list(data["sentiment"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

# 建立 Model
* [Train](https://huggingface.co/docs/transformers/training#train)
  * [```Trainer```](https://huggingface.co/docs/transformers/v4.24.0/en/main_classes/trainer#transformers.Trainer)  
    一些常用參數如下：
    * ```model```: 使用的 Model
    * ```args```: 訓練的相關參數
    * ```train_dataset```: 訓練資料集
    * ```eval_dataset```: 驗證資料集
    * ```compute_metrics```: 設定評估指標
    * ```callbacks```: (資料型態：list) 設定訓練模型過程中的觸發事件，
  * [```TrainingArguments```](https://huggingface.co/docs/transformers/v4.24.0/en/main_classes/trainer#transformers.TrainingArguments)  
    一些常用的參數如下：
    * 

In [None]:
# ----- 2. Fine-tune pretrained model -----#
# Define Trainer parameters
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred)
    precision = precision_score(y_true=labels, y_pred=pred)
    f1 = f1_score(y_true=labels, y_pred=pred)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

# Define Trainer
args = TrainingArguments(
    output_dir="output",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    seed=0,
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],
)

# Train pre-trained model
trainer.train()

***** Running training *****
  Num examples = 32000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 12000
  Number of trainable parameters = 109483778


  0%|          | 0/12000 [00:00<?, ?it/s]

***** Running Evaluation *****
  Num examples = 8000
  Batch size = 8


{'loss': 0.441, 'learning_rate': 4.791666666666667e-05, 'epoch': 0.12}


  0%|          | 0/1000 [00:00<?, ?it/s]

Saving model checkpoint to output\checkpoint-500
Configuration saved in output\checkpoint-500\config.json


{'eval_loss': 0.26855573058128357, 'eval_accuracy': 0.90275, 'eval_precision': 0.9030662710187932, 'eval_recall': 0.9044081228330857, 'eval_f1': 0.9037366988369214, 'eval_runtime': 122.7531, 'eval_samples_per_second': 65.171, 'eval_steps_per_second': 8.146, 'epoch': 0.12}


Model weights saved in output\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 8


{'loss': 0.3591, 'learning_rate': 4.5833333333333334e-05, 'epoch': 0.25}


  0%|          | 0/1000 [00:00<?, ?it/s]

Saving model checkpoint to output\checkpoint-1000
Configuration saved in output\checkpoint-1000\config.json


{'eval_loss': 0.4665004014968872, 'eval_accuracy': 0.840875, 'eval_precision': 0.9678510998307953, 'eval_recall': 0.70827142149579, 'eval_f1': 0.8179608179608181, 'eval_runtime': 122.3712, 'eval_samples_per_second': 65.375, 'eval_steps_per_second': 8.172, 'epoch': 0.25}


Model weights saved in output\checkpoint-1000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 8


{'loss': 0.3308, 'learning_rate': 4.375e-05, 'epoch': 0.38}


  0%|          | 0/1000 [00:00<?, ?it/s]

Saving model checkpoint to output\checkpoint-1500
Configuration saved in output\checkpoint-1500\config.json


{'eval_loss': 0.2645694613456726, 'eval_accuracy': 0.915375, 'eval_precision': 0.9266311246509267, 'eval_recall': 0.903912828132739, 'eval_f1': 0.9151310016296853, 'eval_runtime': 121.7257, 'eval_samples_per_second': 65.722, 'eval_steps_per_second': 8.215, 'epoch': 0.38}


Model weights saved in output\checkpoint-1500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 8


{'loss': 0.3269, 'learning_rate': 4.166666666666667e-05, 'epoch': 0.5}


  0%|          | 0/1000 [00:00<?, ?it/s]

Saving model checkpoint to output\checkpoint-2000
Configuration saved in output\checkpoint-2000\config.json


{'eval_loss': 0.2602947950363159, 'eval_accuracy': 0.908625, 'eval_precision': 0.8693321420594148, 'eval_recall': 0.9638434868746905, 'eval_f1': 0.9141514973576043, 'eval_runtime': 121.7033, 'eval_samples_per_second': 65.734, 'eval_steps_per_second': 8.217, 'epoch': 0.5}


Model weights saved in output\checkpoint-2000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 8


{'loss': 0.2777, 'learning_rate': 3.958333333333333e-05, 'epoch': 0.62}


  0%|          | 0/1000 [00:00<?, ?it/s]

Saving model checkpoint to output\checkpoint-2500
Configuration saved in output\checkpoint-2500\config.json


{'eval_loss': 0.24669182300567627, 'eval_accuracy': 0.918, 'eval_precision': 0.9167077378018729, 'eval_recall': 0.9212481426448736, 'eval_f1': 0.9189723320158102, 'eval_runtime': 122.4313, 'eval_samples_per_second': 65.343, 'eval_steps_per_second': 8.168, 'epoch': 0.62}


Model weights saved in output\checkpoint-2500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 8


{'loss': 0.3055, 'learning_rate': 3.7500000000000003e-05, 'epoch': 0.75}


  0%|          | 0/1000 [00:00<?, ?it/s]

Saving model checkpoint to output\checkpoint-3000
Configuration saved in output\checkpoint-3000\config.json


{'eval_loss': 0.2281709909439087, 'eval_accuracy': 0.918875, 'eval_precision': 0.8932466929682061, 'eval_recall': 0.9531946508172363, 'eval_f1': 0.922247514076914, 'eval_runtime': 123.8701, 'eval_samples_per_second': 64.584, 'eval_steps_per_second': 8.073, 'epoch': 0.75}


Model weights saved in output\checkpoint-3000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 8


{'loss': 0.2711, 'learning_rate': 3.541666666666667e-05, 'epoch': 0.88}


  0%|          | 0/1000 [00:00<?, ?it/s]

Saving model checkpoint to output\checkpoint-3500
Configuration saved in output\checkpoint-3500\config.json


{'eval_loss': 0.2700549364089966, 'eval_accuracy': 0.927375, 'eval_precision': 0.9333166207069441, 'eval_recall': 0.9219910846953938, 'eval_f1': 0.9276192849134173, 'eval_runtime': 122.4329, 'eval_samples_per_second': 65.342, 'eval_steps_per_second': 8.168, 'epoch': 0.88}


Model weights saved in output\checkpoint-3500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 8


{'loss': 0.2843, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


  0%|          | 0/1000 [00:00<?, ?it/s]

Saving model checkpoint to output\checkpoint-4000
Configuration saved in output\checkpoint-4000\config.json


{'eval_loss': 0.22953100502490997, 'eval_accuracy': 0.933, 'eval_precision': 0.9319190922545634, 'eval_recall': 0.9356116889549282, 'eval_f1': 0.9337617399901137, 'eval_runtime': 123.0603, 'eval_samples_per_second': 65.009, 'eval_steps_per_second': 8.126, 'epoch': 1.0}


Model weights saved in output\checkpoint-4000\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 8000
  Batch size = 8


{'loss': 0.1726, 'learning_rate': 3.125e-05, 'epoch': 1.12}


  0%|          | 0/1000 [00:00<?, ?it/s]

Saving model checkpoint to output\checkpoint-4500
Configuration saved in output\checkpoint-4500\config.json


{'eval_loss': 0.28655487298965454, 'eval_accuracy': 0.934, 'eval_precision': 0.920863309352518, 'eval_recall': 0.950965824665676, 'eval_f1': 0.935672514619883, 'eval_runtime': 122.7089, 'eval_samples_per_second': 65.195, 'eval_steps_per_second': 8.149, 'epoch': 1.12}


Model weights saved in output\checkpoint-4500\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from output\checkpoint-3000 (score: 0.2281709909439087).


{'train_runtime': 2838.2394, 'train_samples_per_second': 33.824, 'train_steps_per_second': 4.228, 'train_loss': 0.3076588389078776, 'epoch': 1.12}


TrainOutput(global_step=4500, training_loss=0.3076588389078776, metrics={'train_runtime': 2838.2394, 'train_samples_per_second': 33.824, 'train_steps_per_second': 4.228, 'train_loss': 0.3076588389078776, 'epoch': 1.12})

# 預測

In [None]:
# ----- 3. Predict -----#
# Load test data
test_data = pd.read_csv("test.csv")
X_test = list(test_data["review"])
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

# Create torch dataset
test_dataset = Dataset(X_test_tokenized)

# Load trained model
model_path = "output/checkpoint-4500"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2)

# Define test trainer
test_trainer = Trainer(model)

# Make prediction
raw_pred, _, _ = test_trainer.predict(test_dataset)

# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

loading configuration file output/checkpoint-4500\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file output/checkpoint-4500\pytorch_model.bin
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceC

  0%|          | 0/1250 [00:00<?, ?it/s]