In [1]:
# @title 1. Setup Environment & Mount Drive
!pip install -q transformers datasets evaluate scikit-learn accelerate psutil

import os
import time
import psutil
import torch
import pandas as pd
import numpy as np
from google.colab import drive
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from transformers.trainer_utils import get_last_checkpoint
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from scipy.special import softmax

# 1. Mount Drive
drive.mount('/content/drive')
SAVE_PATH = '/content/drive/My Drive/SLM_Research/SST2_BERT_Base_FTT'
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print("✅ Phần 1: Cài đặt hoàn tất!")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m81.9/84.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
Using device: cuda
✅ Phần 1: Cài đặt hoàn tất!


In [2]:
# @title 2. Load Data & Tokenization
print("--- Loading SST-2 Dataset ---")
dataset = load_dataset("glue", "sst2")

MODEL_NAME = 'bert-base-uncased'
print(f"--- Loading Tokenizer: {MODEL_NAME} ---")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    # Max length 512, Truncation=True, Padding=max_length
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=512)

print("--- Tokenizing Dataset ---")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Format PyTorch (Quan trọng: đổi tên cột label -> labels)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

dataset_train = tokenized_datasets["train"]
dataset_val = tokenized_datasets["validation"]

print(f"Train size: {len(dataset_train)} | Val size: {len(dataset_val)}")
print("✅ Phần 2: Data sẵn sàng!")

--- Loading SST-2 Dataset ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

--- Loading Tokenizer: bert-base-uncased ---


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

--- Tokenizing Dataset ---


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

Train size: 67349 | Val size: 872
✅ Phần 2: Data sẵn sàng!


In [3]:
# @title 3. Data Audit (Kiểm tra dữ liệu trước Train)
import seaborn as sns
import matplotlib.pyplot as plt

def get_split_stats(dataset_split, split_name):
    cols = dataset_split.column_names
    target_col = "labels" if "labels" in cols else "label"

    data = dataset_split[target_col]
    if isinstance(data, torch.Tensor):
        labels = data.cpu().numpy()
    else:
        labels = np.array(data)

    total = len(labels)
    neg_count = np.sum(labels == 0)
    pos_count = np.sum(labels == 1)

    return {
        "Phân tập": split_name,
        "Tổng số mẫu": total,
        "Negative (0)": neg_count,
        "Positive (1)": pos_count,
        "Tỉ lệ Neg": f"{(neg_count/total)*100:.2f}%",
        "Tỉ lệ Pos": f"{(pos_count/total)*100:.2f}%"
    }

print("⏳ Đang phân tích dữ liệu BERT SST-2...")
stats_train = get_split_stats(dataset_train, "Train Set")
stats_val = get_split_stats(dataset_val, "Validation Set")
df_report = pd.DataFrame([stats_train, stats_val])

print("\n" + "="*60)
print("📊 BẢNG KIỂM TOÁN DỮ LIỆU (BERT-base / SST-2)")
print("="*60)
display(df_report)

# Soi mẫu thực tế
idx = 0
sample = dataset_train[idx]
print("\n🔍 SOI DỮ LIỆU MẪU ĐẦU TIÊN:")
print(f"Label: {sample['labels'].item()}")
print(f"Input IDs Length: {len(sample['input_ids'])} (Should be 512)")
print("-" * 30)
print(tokenizer.decode(sample['input_ids'])[:200] + "...")
print("✅ Phần 3: Đã kiểm tra dữ liệu!")

⏳ Đang phân tích dữ liệu BERT SST-2...

📊 BẢNG KIỂM TOÁN DỮ LIỆU (BERT-base / SST-2)


Unnamed: 0,Phân tập,Tổng số mẫu,Negative (0),Positive (1),Tỉ lệ Neg,Tỉ lệ Pos
0,Train Set,67349,29780,37569,44.22%,55.78%
1,Validation Set,872,428,444,49.08%,50.92%



🔍 SOI DỮ LIỆU MẪU ĐẦU TIÊN:
Label: 0
Input IDs Length: 512 (Should be 512)
------------------------------
[CLS] hide new secretions from the parental units [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] ...
✅ Phần 3: Đã kiểm tra dữ liệu!


In [4]:
# @title 4. Training (Save by Epoch & Resume)
# 1. Load Model (Full Fine-Tuning)
print(f"--- Loading Model: {MODEL_NAME} ---")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(device)

# 2. Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    probs = softmax(logits, axis=1)[:, 1]

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    roc_auc = roc_auc_score(labels, probs)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall, 'roc_auc': roc_auc}

# 3. Config (Lưu theo Epoch)
training_args = TrainingArguments(
    output_dir=SAVE_PATH,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    learning_rate=5e-5,               # LR chuẩn cho BERT
    weight_decay=0.01,

    # --- CẤU HÌNH LƯU THEO EPOCH ---
    save_strategy="epoch",
    eval_strategy="epoch",
    save_total_limit=1,               # Giữ 1 bản tốt nhất
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    # -------------------------------

    fp16=True,
    report_to="none"
)

# 4. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# 5. Auto Resume Logic
print(f"--- Kiểm tra checkpoint tại: {SAVE_PATH} ---")
last_checkpoint = get_last_checkpoint(SAVE_PATH)
start_train_time = time.time()

if last_checkpoint:
    print(f"🔄 Resume from: {last_checkpoint}")
    trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    print("✨ Start New Training...")
    trainer.train()

training_time = time.time() - start_train_time
trainer.save_model(SAVE_PATH)
print("✅ Phần 4: Huấn luyện xong!")

--- Loading Model: bert-base-uncased ---


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


--- Kiểm tra checkpoint tại: /content/drive/My Drive/SLM_Research/SST2_BERT_Base_FTT ---
✨ Start New Training...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.1999,0.266338,0.905963,0.906606,0.917051,0.896396,0.970031
2,0.1464,0.353157,0.91055,0.912946,0.904867,0.921171,0.968382
3,0.0979,0.41022,0.90711,0.906574,0.929078,0.885135,0.967974
4,0.0643,0.394046,0.90367,0.905192,0.90724,0.903153,0.968179


✅ Phần 4: Huấn luyện xong!


In [6]:
# @title 5. Final Report (BERT-base FTT - Full 10 Metrics)
import os
import time
import pandas as pd
import psutil
import torch

print("--- Đang đánh giá lần cuối trên tập Validation (SST-2) ---")

# 1. Prediction & Latency
start_pred = time.time()
predictions_output = trainer.predict(dataset_val)
end_pred = time.time()

metrics = predictions_output.metrics
latency = ((end_pred - start_pred) / len(dataset_val)) * 1000  # ms/sample

# 2. Tính kích thước Model (Full Fine-Tuning)
# Với FTT, ta tính kích thước toàn bộ file model (pytorch_model.bin)
model_bin = os.path.join(SAVE_PATH, 'pytorch_model.bin')
model_safe = os.path.join(SAVE_PATH, 'model.safetensors')

size_mb = 0
if os.path.exists(model_bin):
    size_mb = os.path.getsize(model_bin)
elif os.path.exists(model_safe):
    size_mb = os.path.getsize(model_safe)

size_mb /= (1024**2) # Đổi sang MB

# 3. Tài nguyên hệ thống
process = psutil.Process(os.getpid())
ram_mb = process.memory_info().rss / (1024 ** 2)
vram_mb = torch.cuda.max_memory_allocated() / (1024 ** 2) if torch.cuda.is_available() else 0

# Lấy thời gian train (nếu biến còn tồn tại)
curr_train_time = training_time if 'training_time' in locals() else 0.0

# 4. In Báo Cáo Chi Tiết (Full 10 Metrics)
print("\n====== REPORT: BERT-base Full Fine-Tuning (SST-2) ======")
print(f"1. Classification Metrics:")
print(f"   - Accuracy:  {metrics.get('test_accuracy', 0):.4f}")
print(f"   - Precision: {metrics.get('test_precision', 0):.4f}")
print(f"   - Recall:    {metrics.get('test_recall', 0):.4f}")
print(f"   - F1-Score:  {metrics.get('test_f1', 0):.4f}")
print(f"   - ROC-AUC:   {metrics.get('test_roc_auc', 0):.4f}")

print(f"\n2. Efficiency Metrics:")
print(f"   - Training Time:      {curr_train_time:.2f} s")
print(f"   - Inference Latency:  {latency:.4f} ms/sample")
print(f"   - Model Size (Disk):  {size_mb:.2f} MB") # Sẽ rất lớn (~400MB)
print(f"   - Peak RAM Usage:     {ram_mb:.2f} MB")
print(f"   - Peak VRAM Usage:    {vram_mb:.2f} MB")

# 5. Lưu File CSV
results_df = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1", "ROC-AUC",
               "Training Time (s)", "Inference Latency (ms)", "Model Size (MB)",
               "Peak RAM (MB)", "Peak VRAM (MB)"],
    "Value": [
        metrics.get('test_accuracy', 0),
        metrics.get('test_precision', 0),
        metrics.get('test_recall', 0),
        metrics.get('test_f1', 0),
        metrics.get('test_roc_auc', 0),
        curr_train_time,
        latency,
        size_mb,
        ram_mb,
        vram_mb
    ]
})

results_file = os.path.join(SAVE_PATH, 'sst2_bert_base_ftt_full_report.csv')
results_df.to_csv(results_file, index=False)
print(f"\nBáo cáo đầy đủ đã được lưu tại: {results_file}")

--- Đang đánh giá lần cuối trên tập Validation (SST-2) ---



1. Classification Metrics:
   - Accuracy:  0.9106
   - Precision: 0.9049
   - Recall:    0.9212
   - F1-Score:  0.9129
   - ROC-AUC:   0.9684

2. Efficiency Metrics:
   - Training Time:      6578.87 s
   - Inference Latency:  11.3011 ms/sample
   - Model Size (Disk):  417.67 MB
   - Peak RAM Usage:     2367.82 MB
   - Peak VRAM Usage:    4761.45 MB

Báo cáo đầy đủ đã được lưu tại: /content/drive/My Drive/SLM_Research/SST2_BERT_Base_FTT/sst2_bert_base_ftt_full_report.csv
