In [1]:
# @title 1. Setup, Load Data & Cleaning (New Session)
# Cài đặt các thư viện cần thiết
!pip install -q transformers datasets evaluate scikit-learn accelerate

import os
import re
import time
import psutil
import torch
import numpy as np
import pandas as pd
from google.colab import drive
from datasets import load_dataset
from transformers import (
    AlbertTokenizerFast,
    AlbertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from scipy.special import softmax

# 1. Mount Google Drive
drive.mount('/content/drive')
SAVE_PATH = '/content/drive/My Drive/SLM_Research/IMDB_ALBERT'
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

# Thiết lập device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Load Dataset (IMDB - Stanford)
print("--- Loading Stanford IMDB Dataset ---")
dataset = load_dataset("imdb")

# 3. Define Cleaning Function (Bao gồm xóa URL như yêu cầu)
def clean_text(example):
    text = example['text']
    # Chuyển về chữ thường
    text = text.lower()
    # Loại bỏ URL (http, https, www)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Loại bỏ thẻ HTML
    text = re.sub(r'<br\s*/>', ' ', text)
    # Loại bỏ khoảng trắng thừa
    text = re.sub(r'\s+', ' ', text).strip()
    example['text'] = text
    return example

print("--- Cleaning Dataset (Removing URLs & HTML) ---")
dataset = dataset.map(clean_text)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
Using device: cuda
--- Loading Stanford IMDB Dataset ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

--- Cleaning Dataset (Removing URLs & HTML) ---


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [2]:
# @title 2. Tokenization & Data Splits for ALBERT

# 1. Load Tokenizer ALBERT
MODEL_NAME = 'albert-base-v2'
print(f"--- Loading Tokenizer: {MODEL_NAME} ---")
tokenizer = AlbertTokenizerFast.from_pretrained(MODEL_NAME)

# 2. Tokenize Function
def tokenize_function(examples):
    # ALBERT input max length cũng là 512
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

print("--- Tokenizing Dataset ---")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 3. Format cho PyTorch
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# 4. Split Train/Val (90/10)
# IMDB gốc chỉ có Train/Test, ta cần tạo Validation set từ Train để chạy Early Stopping
print("--- Splitting Train set into Train/Val ---")
train_val_split = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)
dataset_train = train_val_split["train"]
dataset_val = train_val_split["test"]
dataset_test = tokenized_datasets["test"]

print(f"Dataset ready: Train({len(dataset_train)}), Val({len(dataset_val)}), Test({len(dataset_test)})")

--- Loading Tokenizer: albert-base-v2 ---


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

--- Tokenizing Dataset ---


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

--- Splitting Train set into Train/Val ---
Dataset ready: Train(22500), Val(2500), Test(25000)


In [5]:
# @title 3. Training ALBERT (Corrected: Epoch-based Early Stopping)

# Định nghĩa lại tham số huấn luyện chuẩn
training_args = TrainingArguments(
    output_dir='./results_albert',
    num_train_epochs=5,              # Tổng số epoch
    per_device_train_batch_size=16,  # GIỮ NGUYÊN
    per_device_eval_batch_size=32,   # GIỮ NGUYÊN
    learning_rate=5e-5,              # GIỮ NGUYÊN (Khai báo tường minh)
    warmup_steps=500,                # GIỮ NGUYÊN
    weight_decay=0.01,               # GIỮ NGUYÊN
    logging_dir='./logs_albert',
    logging_steps=100,               # Log tiến độ mỗi 100 bước (chỉ để xem, không ảnh hưởng train)

    # --- THAY ĐỔI QUAN TRỌNG: TÍNH THEO EPOCH ---
    eval_strategy="epoch",           # Đánh giá sau mỗi Epoch
    save_strategy="epoch",           # Lưu checkpoint sau mỗi Epoch
    # eval_steps - Đã bỏ (không dùng nữa)
    # save_steps - Đã bỏ (không dùng nữa)
    # --------------------------------------------

    load_best_model_at_end=True,     # Load model tốt nhất (theo epoch tốt nhất)
    metric_for_best_model="accuracy",
    save_total_limit=2,
    fp16=True,
    report_to="none"
)

# Khởi tạo lại Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    # Patience = 3 Epochs (Mô hình sẽ dừng nếu 3 Epoch liên tiếp không cải thiện)
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# Bắt đầu Train lại
print("--- Starting Training (Epoch-based Strategy) ---")
start_train_time = time.time()
trainer.train()
end_train_time = time.time()

training_time = end_train_time - start_train_time
print(f"\nTraining completed in: {training_time:.2f} seconds")

# Lưu model đè lên thư mục cũ
print(f"Saving model to {SAVE_PATH}...")
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
print("Model saved successfully!")

--- Starting Training (Epoch-based Strategy) ---


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.2847,0.257944,0.8996,0.896324,0.932131,0.863166,0.962815
2,0.2067,0.294777,0.9184,0.915842,0.951157,0.883055,0.976886
3,0.1677,0.270242,0.9264,0.925385,0.943755,0.907717,0.977214


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.2847,0.257944,0.8996,0.896324,0.932131,0.863166,0.962815
2,0.2067,0.294777,0.9184,0.915842,0.951157,0.883055,0.976886
3,0.1677,0.270242,0.9264,0.925385,0.943755,0.907717,0.977214
4,0.0584,0.4687,0.9132,0.908323,0.968468,0.855211,0.974406
5,0.0196,0.43487,0.928,0.927711,0.93674,0.918854,0.974778



Training completed in: 3866.78 seconds
Saving model to /content/drive/My Drive/SLM_Research/IMDB_ALBERT...
Model saved successfully!


In [8]:
# @title 4. Final Evaluation on Test Set (FIXED for .safetensors)
import os
import time
import psutil
import torch
import pandas as pd

print("--- Running Evaluation on Test Set ---")

# Kiểm tra các biến bắt buộc
if 'trainer' not in locals() or 'dataset_test' not in locals():
    raise ValueError("Lỗi: Biến 'trainer' hoặc 'dataset_test' chưa được định nghĩa. Hãy chạy các bước training trước.")

# 1. Classification Metrics
start_pred_time = time.time()
predictions_output = trainer.predict(dataset_test)
end_pred_time = time.time()
metrics = predictions_output.metrics

# 2. Efficiency Metrics
total_samples = len(dataset_test)
total_inference_time = end_pred_time - start_pred_time
latency_per_sample = (total_inference_time / total_samples) * 1000 # ms

# --- SỬA LỖI Ở ĐÂY: Kiểm tra cả 2 định dạng file ---
safetensors_path = os.path.join(SAVE_PATH, 'model.safetensors')
bin_path = os.path.join(SAVE_PATH, 'pytorch_model.bin')

if os.path.exists(safetensors_path):
    model_size = os.path.getsize(safetensors_path) / (1024 * 1024) # MB
    print(f"Detected model format: safetensors ({model_size:.2f} MB)")
elif os.path.exists(bin_path):
    model_size = os.path.getsize(bin_path) / (1024 * 1024) # MB
    print(f"Detected model format: pytorch_model.bin ({model_size:.2f} MB)")
else:
    print(f"WARNING: Không tìm thấy file model tại {SAVE_PATH}. Kích thước tính là 0.")
    model_size = 0
# ---------------------------------------------------

# Xử lý biến training_time (Nếu lỡ mất biến do restart session)
current_training_time = training_time if 'training_time' in locals() else 0.0

# Đo RAM/VRAM
process = psutil.Process(os.getpid())
ram_usage = process.memory_info().rss / 1024 ** 2
vram_usage = torch.cuda.memory_allocated() / 1024 ** 2 if torch.cuda.is_available() else 0

# 3. Report
print("\n====== REPORT ======")
print(f"Model Path: {SAVE_PATH}")
print(f"1. Classification Metrics:")
print(f"   - Accuracy:  {metrics.get('test_accuracy', 0):.4f}")
print(f"   - Precision: {metrics.get('test_precision', 0):.4f}")
print(f"   - Recall:    {metrics.get('test_recall', 0):.4f}")
print(f"   - F1-Score:  {metrics.get('test_f1', 0):.4f}")
print(f"   - ROC-AUC:   {metrics.get('test_roc_auc', 0):.4f}")

print(f"\n2. Efficiency Metrics:")
print(f"   - Training Time:      {current_training_time:.2f} s")
print(f"   - Inference Latency:  {latency_per_sample:.4f} ms/sample")
print(f"   - Model Size (Disk):  {model_size:.2f} MB")
print(f"   - Peak RAM Usage:     {ram_usage:.2f} MB")
print(f"   - Peak VRAM Usage:    {vram_usage:.2f} MB")

# 4. Save CSV
results_df = pd.DataFrame({
    "Metric": ["Accuracy", "F1", "ROC-AUC", "Training Time (s)", "Inference Latency (ms)", "Model Size (MB)"],
    "Value": [
        metrics.get('test_accuracy', 0),
        metrics.get('test_f1', 0),
        metrics.get('test_roc_auc', 0),
        current_training_time,
        latency_per_sample,
        model_size
    ]
})

# Tự động lấy tên folder để đặt tên file CSV
model_folder_name = os.path.basename(SAVE_PATH)
results_file = os.path.join(SAVE_PATH, f'{model_folder_name}_results.csv')
results_df.to_csv(results_file, index=False)
print(f"\nReport saved to {results_file}")

--- Running Evaluation on Test Set ---


Detected model format: safetensors (44.58 MB)

Model Path: /content/drive/My Drive/SLM_Research/IMDB_ALBERT
1. Classification Metrics:
   - Accuracy:  0.9324
   - Precision: 0.9277
   - Recall:    0.9380
   - F1-Score:  0.9328
   - ROC-AUC:   0.9769

2. Efficiency Metrics:
   - Training Time:      3866.78 s
   - Inference Latency:  11.9002 ms/sample
   - Model Size (Disk):  44.58 MB
   - Peak RAM Usage:     2574.69 MB
   - Peak VRAM Usage:    250.15 MB

Report saved to /content/drive/My Drive/SLM_Research/IMDB_ALBERT/IMDB_ALBERT_results.csv
