In [None]:
# @title 1. Setup, Load Data & Cleaning
!pip install -q transformers datasets evaluate scikit-learn accelerate

import os
import re
import time
import psutil
import torch
import numpy as np
import pandas as pd
from google.colab import drive
from datasets import load_dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from scipy.special import softmax

# 1. Mount Google Drive and make sure the output directory exists.
drive.mount('/content/drive')
SAVE_PATH = '/content/drive/My Drive/SLM_Research/IMDB_DistilBERT'
# exist_ok=True replaces the check-then-create pattern: it is a no-op when the
# directory is already there (e.g. on a re-run) and has no race window.
os.makedirs(SAVE_PATH, exist_ok=True)

# Device: prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Load Dataset (IMDB - Stanford)
print("--- Loading Stanford IMDB Dataset ---")
dataset = load_dataset("imdb")

# 3. Clean Text (lower-case, strip <br> tags and URLs, collapse whitespace)
# NOTE(review): the original comment says this normalization mirrors the ALBERT
# notebook; keep the two in sync so the model comparison stays fair.
def clean_text(example):
    """Return a copy of ``example`` whose ``'text'`` field has been normalized.

    Steps, in order:
      1. lower-case the text;
      2. replace ``<br>`` / ``<br/>`` / ``<br />`` tags with a space;
      3. delete URLs (tokens starting with ``http`` or ``www.``);
      4. collapse runs of whitespace and trim both ends.

    Line-break tags are removed *before* URLs: the URL pattern consumes every
    non-space character, so a URL directly followed by ``<br />`` would
    otherwise swallow ``<br`` and leave a stray ``/>`` in the text.

    Args:
        example: mapping with at least a ``'text'`` string entry.

    Returns:
        A new dict with all entries of ``example`` and the cleaned ``'text'``.
        The input mapping is not modified.
    """
    text = example['text'].lower()
    # Line-break tags become a space so the words on either side stay separated.
    text = re.sub(r'<br\s*/?\s*>', ' ', text)
    # \b keeps the pattern from firing inside ordinary words such as "awww";
    # requiring the dot after "www" restricts it to things that look like hosts.
    text = re.sub(r'\bhttp\S+|\bwww\.\S+', '', text)
    # Collapse the whitespace left behind by the removals above.
    text = re.sub(r'\s+', ' ', text).strip()
    return {**example, 'text': text}

print("--- Cleaning Dataset (Removing URLs & HTML) ---")
# Only the "train" and "test" splits are used further down. Drop the 50k-row
# "unsupervised" split so it is not cleaned here (and tokenized later) for nothing.
dataset.pop("unsupervised", None)
dataset = dataset.map(clean_text)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda
--- Loading Stanford IMDB Dataset ---
--- Cleaning Dataset (Removing URLs & HTML) ---


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# @title 2. Tokenization & Data Splits for DistilBERT

# 1. Load Tokenizer
# MODEL_NAME is reused later when the classification model itself is loaded,
# so the tokenizer and the model always come from the same checkpoint.
MODEL_NAME = 'distilbert-base-uncased'
print(f"--- Loading Tokenizer: {MODEL_NAME} ---")
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

# 2. Tokenize Function
def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of a batch with the module-level tokenizer.

    Every sequence is truncated to at most 512 tokens and padded up to exactly
    512, so all encoded examples have the same length.

    Args:
        examples: batch mapping that contains a ``"text"`` entry.

    Returns:
        Whatever the tokenizer call returns for that batch.
    """
    encoding_options = dict(padding="max_length", truncation=True, max_length=512)
    return tokenizer(examples["text"], **encoding_options)

print("--- Tokenizing Dataset ---")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 3. Format for PyTorch: drop the raw text, rename the target column to
#    "labels", and have the datasets return torch tensors.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("label", "labels")
)
tokenized_datasets.set_format("torch")

# 4. Split Train/Val (90/10) with a fixed seed; the test split is used as-is.
print("--- Splitting Train set into Train/Val ---")
train_val_split = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)
dataset_train, dataset_val = train_val_split["train"], train_val_split["test"]
dataset_test = tokenized_datasets["test"]

print(f"Dataset ready: Train({len(dataset_train)}), Val({len(dataset_val)}), Test({len(dataset_test)})")

--- Loading Tokenizer: distilbert-base-uncased ---
--- Tokenizing Dataset ---


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

--- Splitting Train set into Train/Val ---
Dataset ready: Train(22500), Val(2500), Test(25000)


In [None]:
# @title 3. Training DistilBERT (Golden Config)

# 1. Metrics
def compute_metrics(eval_pred):
    """Compute binary-classification metrics from an evaluation prediction.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` is indexed as a 2-D
            array whose column 1 is the score of the positive class.

    Returns:
        dict with keys ``accuracy``, ``f1``, ``precision``, ``recall`` and
        ``roc_auc``. The first four are based on the argmax prediction; ROC-AUC
        uses the softmax probability of class 1.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Probability assigned to class 1, used as the ranking score for ROC-AUC.
    positive_probs = softmax(logits, axis=1)[:, 1]

    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predictions, average='binary'
    )
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'roc_auc': roc_auc_score(labels, positive_probs),
    }

# 2. Model Init
print(f"--- Loading Model: {MODEL_NAME} ---")
# The classification head is not part of the pretrained checkpoint and is
# randomly initialised on load. Seed torch first so that initialisation is
# repeatable across runs (same value as the train/val split seed).
torch.manual_seed(42)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(device)

# 3. Training Arguments (identical to the ALBERT run)
training_args = TrainingArguments(
    output_dir='./results_distilbert',
    num_train_epochs=5,              # 5 epochs
    per_device_train_batch_size=16,  # train batch size 16
    per_device_eval_batch_size=32,   # eval batch size 32
    learning_rate=5e-5,              # LR 5e-5
    warmup_steps=500,                # 500 warmup steps
    weight_decay=0.01,               # weight decay 0.01
    logging_dir='./logs_distilbert',
    logging_steps=100,

    # --- EPOCH STRATEGY ---
    eval_strategy="epoch",           # evaluate after every epoch
    save_strategy="epoch",           # checkpoint after every epoch
    load_best_model_at_end=True,     # reload the best checkpoint when training ends
    metric_for_best_model="accuracy",
    save_total_limit=2,
    # Mixed-precision training needs a GPU. The notebook explicitly allows a CPU
    # fallback for `device`, so only enable fp16 when CUDA is actually available.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# 4. Trainer
# Stop once the monitored metric has failed to improve for 3 consecutive
# evaluations (evaluation runs once per epoch with the arguments above).
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# 5. Start Training
print("--- Starting Training ---")
# perf_counter() is monotonic, so the measured interval cannot be distorted by
# wall-clock adjustments during the run (time.time() can jump).
start_train_time = time.perf_counter()
trainer.train()
end_train_time = time.perf_counter()

# Elapsed training time in seconds; the final evaluation cell reports it.
training_time = end_train_time - start_train_time
print(f"\nTraining completed in: {training_time:.2f} seconds")

# 6. Save Model (weights/config via the trainer, plus the tokenizer files)
print(f"Saving model to {SAVE_PATH}...")
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
print("DistilBERT Model saved successfully!")

--- Loading Model: distilbert-base-uncased ---


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Starting Training ---


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.2469,0.246576,0.9068,0.905553,0.92314,0.888624,0.967998
2,0.1806,0.246668,0.9264,0.925747,0.939394,0.91249,0.975286
3,0.0889,0.325773,0.9248,0.923948,0.939918,0.908512,0.975377
4,0.0279,0.360916,0.928,0.927419,0.940311,0.914877,0.975312
5,0.0084,0.438733,0.922,0.921844,0.928918,0.914877,0.974457



Training completed in: 1521.91 seconds
Saving model to /content/drive/My Drive/SLM_Research/IMDB_DistilBERT...
DistilBERT Model saved successfully!


In [None]:
# @title 4. Final Evaluation on Test Set (FIXED)
import os
import sys
import time
import psutil
import torch
import pandas as pd

try:
    import resource  # Unix-only; used to read the process's peak resident set size
except ImportError:
    resource = None

print("--- Running Evaluation on Test Set ---")

# Fail fast with a clear message when the earlier cells have not been run.
if 'trainer' not in globals() or 'dataset_test' not in globals():
    raise ValueError("Lỗi: Biến 'trainer' hoặc 'dataset_test' chưa được định nghĩa. Hãy chạy các bước trên trước.")

# 1. Classification Metrics
# perf_counter() is monotonic, which makes it the right clock for intervals.
start_pred_time = time.perf_counter()
predictions_output = trainer.predict(dataset_test)
end_pred_time = time.perf_counter()
metrics = predictions_output.metrics

# 2. Efficiency Metrics
total_samples = len(dataset_test)
total_inference_time = end_pred_time - start_pred_time
latency_per_sample = (total_inference_time / total_samples) * 1000 # ms

# Model size on disk: supports both the .bin and the .safetensors weight file.
# The .bin file is checked first, as before.
bin_path = os.path.join(SAVE_PATH, 'pytorch_model.bin')
safe_path = os.path.join(SAVE_PATH, 'model.safetensors')

for weights_path in (bin_path, safe_path):
    if os.path.exists(weights_path):
        model_size = os.path.getsize(weights_path) / (1024 * 1024) # MB
        break
else:
    # Neither file exists: report 0 rather than aborting the whole report.
    print("Cảnh báo: Không tìm thấy file model để tính kích thước.")
    model_size = 0

# training_time comes from the training cell; fall back to 0.0 when it was lost
# (e.g. the session was restarted and only this cell was re-run).
current_training_time = training_time if 'training_time' in globals() else 0.0

# RAM / VRAM. The report labels these values "Peak", so measure peaks rather
# than the usage at this instant.
process = psutil.Process(os.getpid())
ram_usage = process.memory_info().rss / 1024 ** 2  # current RSS in MB (fallback)
if resource is not None:
    peak_rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # ru_maxrss is reported in kilobytes on Linux but in bytes on macOS.
    peak_rss_mb = peak_rss / 1024 ** 2 if sys.platform == "darwin" else peak_rss / 1024
    ram_usage = max(ram_usage, peak_rss_mb)
# max_memory_allocated() is the high-water mark of tensor memory, whereas
# memory_allocated() would only give what is still allocated right now.
vram_usage = torch.cuda.max_memory_allocated() / 1024 ** 2 if torch.cuda.is_available() else 0

# 3. Report
print("\n====== REPORT: DistilBERT on IMDB ======")
print(f"1. Classification Metrics:")
print(f"   - Accuracy:  {metrics.get('test_accuracy', 0):.4f}")
print(f"   - Precision: {metrics.get('test_precision', 0):.4f}")
print(f"   - Recall:    {metrics.get('test_recall', 0):.4f}")
print(f"   - F1-Score:  {metrics.get('test_f1', 0):.4f}")
print(f"   - ROC-AUC:   {metrics.get('test_roc_auc', 0):.4f}")

print(f"\n2. Efficiency Metrics:")
print(f"   - Training Time:      {current_training_time:.2f} s")
print(f"   - Inference Latency:  {latency_per_sample:.4f} ms/sample")
print(f"   - Model Size (Disk):  {model_size:.2f} MB")
print(f"   - Peak RAM Usage:     {ram_usage:.2f} MB")
print(f"   - Peak VRAM Usage:    {vram_usage:.2f} MB")

# 4. Save CSV (same rows as before so it stays comparable with earlier reports)
results_df = pd.DataFrame({
    "Metric": ["Accuracy", "F1", "ROC-AUC", "Training Time (s)", "Inference Latency (ms)", "Model Size (MB)"],
    "Value": [
        metrics.get('test_accuracy', 0),
        metrics.get('test_f1', 0),
        metrics.get('test_roc_auc', 0),
        current_training_time,
        latency_per_sample,
        model_size
    ]
})
results_file = os.path.join(SAVE_PATH, 'imdb_distilbert_results.csv')
results_df.to_csv(results_file, index=False)
print(f"\nReport saved to {results_file}")

--- Running Evaluation on Test Set ---


Cảnh báo: Không tìm thấy file model để tính kích thước.

1. Classification Metrics:
   - Accuracy:  0.9290
   - Precision: 0.9306
   - Recall:    0.9270
   - F1-Score:  0.9288
   - ROC-AUC:   0.9786

2. Efficiency Metrics:
   - Training Time:      1521.91 s
   - Inference Latency:  3.7598 ms/sample
   - Model Size (Disk):  0.00 MB
   - Peak RAM Usage:     2471.23 MB
   - Peak VRAM Usage:    805.41 MB

Report saved to /content/drive/My Drive/SLM_Research/IMDB_DistilBERT/imdb_distilbert_results.csv
