In [1]:
# @title 1. Setup, Load Data & Cleaning (LoRA)
!pip install -q transformers datasets evaluate scikit-learn accelerate peft

import os
import re
import time
import psutil
import torch
import numpy as np
import pandas as pd
from google.colab import drive
from datasets import load_dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from scipy.special import softmax

# 1. Mount Drive
drive.mount('/content/drive')
SAVE_PATH = '/content/drive/My Drive/SLM_Research/IMDB_DistilBERT_LoRA'
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Load Dataset
print("--- Loading Stanford IMDB Dataset ---")
dataset = load_dataset("imdb")

# 3. Clean Text (Quy chuẩn chung)
def clean_text(example):
    text = example['text']
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<br\s*/>', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    example['text'] = text
    return example

print("--- Cleaning Dataset ---")
dataset = dataset.map(clean_text)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
Using device: cuda
--- Loading Stanford IMDB Dataset ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

--- Cleaning Dataset ---


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [2]:
# @title 2. Tokenization & Data Splits

# 1. Load Tokenizer
MODEL_NAME = 'distilbert-base-uncased'
print(f"--- Loading Tokenizer: {MODEL_NAME} ---")
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

# 2. Tokenize Function
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# 3. Split Train/Val (90/10)
train_val_split = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)
dataset_train = train_val_split["train"]
dataset_val = train_val_split["test"]
dataset_test = tokenized_datasets["test"]

print(f"Dataset ready: Train({len(dataset_train)}), Val({len(dataset_val)}), Test({len(dataset_test)})")

--- Loading Tokenizer: distilbert-base-uncased ---


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset ready: Train(22500), Val(2500), Test(25000)


In [3]:
# @title 3. DistilBERT Setup with LoRA

# 1. Load Base Model
print(f"--- Loading Base Model: {MODEL_NAME} ---")
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# 2. Define LoRA Config
# r=8: Rank thấp để tiết kiệm tham số
# lora_alpha=16: Scaling factor thường gấp đôi rank
# target_modules: DistilBERT dùng "q_lin" và "v_lin" trong attention layer
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_lin", "v_lin"]
)

# 3. Inject LoRA Adapters
model = get_peft_model(model, peft_config)
model.to(device)

# 4. Print Trainable Parameters
# Đây là số liệu quan trọng cho bài báo (chứng minh Efficiency)
print("\n--- LoRA Parameter Efficiency ---")
model.print_trainable_parameters()

--- Loading Base Model: distilbert-base-uncased ---


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- LoRA Parameter Efficiency ---
trainable params: 739,586 || all params: 67,694,596 || trainable%: 1.0925


In [4]:
# @title 4. Training DistilBERT + LoRA

# 1. Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    probs = softmax(logits, axis=1)[:, 1]

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    roc_auc = roc_auc_score(labels, probs)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall, 'roc_auc': roc_auc}

# 2. Training Arguments (Golden Config - Giữ nguyên LR 5e-5)
# Lưu ý: LoRA thường có thể dùng LR cao hơn (vd 1e-3),
# nhưng để so sánh công bằng tuyệt đối với Full-FT, ta giữ 5e-5 như yêu cầu.
training_args = TrainingArguments(
    output_dir='./results_distilbert_lora',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,              # Cố định như Full FT
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_distilbert_lora',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    fp16=True,
    report_to="none"
)

# 3. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# 4. Start Training
print("--- Starting Training (DistilBERT + LoRA) ---")
start_train_time = time.time()
trainer.train()
end_train_time = time.time()

training_time = end_train_time - start_train_time
print(f"\nTraining completed in: {training_time:.2f} seconds")

# 5. Save Model
# Lưu ý: Với PEFT, hàm save_model chỉ lưu file adapter_model.bin (rất nhẹ)
print(f"Saving LoRA adapters to {SAVE_PATH}...")
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
print("LoRA Adapters saved successfully!")

--- Starting Training (DistilBERT + LoRA) ---


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.2666,0.291372,0.8876,0.880983,0.942029,0.827367,0.9618
2,0.2561,0.272455,0.898,0.892992,0.944938,0.84646,0.967415
3,0.2506,0.236444,0.9104,0.911532,0.905098,0.918059,0.968806
4,0.2038,0.238057,0.9104,0.909312,0.925804,0.893397,0.969726
5,0.2052,0.233845,0.9128,0.912026,0.925471,0.898966,0.969893



Training completed in: 1184.57 seconds
Saving LoRA adapters to /content/drive/My Drive/SLM_Research/IMDB_DistilBERT_LoRA...
LoRA Adapters saved successfully!


In [5]:
# @title 5. Final Evaluation on Test Set (LoRA Robust)
import os
import time
import psutil
import torch
import pandas as pd

print("--- Running Evaluation on Test Set ---")

# 1. Classification Metrics
start_pred_time = time.time()
predictions_output = trainer.predict(dataset_test)
end_pred_time = time.time()
metrics = predictions_output.metrics

# 2. Efficiency Metrics
total_samples = len(dataset_test)
total_inference_time = end_pred_time - start_pred_time
latency_per_sample = (total_inference_time / total_samples) * 1000 # ms

# 3. Model Size Check (Adapter Size Only)
# PEFT lưu adapter_model.safetensors hoặc adapter_model.bin
adapter_bin = os.path.join(SAVE_PATH, 'adapter_model.bin')
adapter_safe = os.path.join(SAVE_PATH, 'adapter_model.safetensors')

if os.path.exists(adapter_safe):
    model_size = os.path.getsize(adapter_safe) / (1024 * 1024)
    format_type = "safetensors (adapter)"
elif os.path.exists(adapter_bin):
    model_size = os.path.getsize(adapter_bin) / (1024 * 1024)
    format_type = "bin (adapter)"
else:
    model_size = 0
    format_type = "unknown"

current_training_time = training_time if 'training_time' in locals() else 0.0

process = psutil.Process(os.getpid())
ram_usage = process.memory_info().rss / 1024 ** 2
vram_usage = torch.cuda.memory_allocated() / 1024 ** 2 if torch.cuda.is_available() else 0

# 4. Report
print("\n====== REPORT: DistilBERT + LoRA on IMDB ======")
print(f"1. Classification Metrics:")
print(f"   - Accuracy:  {metrics.get('test_accuracy', 0):.4f}")
print(f"   - F1-Score:  {metrics.get('test_f1', 0):.4f}")
print(f"   - ROC-AUC:   {metrics.get('test_roc_auc', 0):.4f}")

print(f"\n2. Efficiency Metrics:")
print(f"   - Training Time:      {current_training_time:.2f} s")
print(f"   - Inference Latency:  {latency_per_sample:.4f} ms/sample")
print(f"   - Adapter Size ({format_type}): {model_size:.2f} MB") # Chỉ kích thước phần học thêm
print(f"   - Peak RAM Usage:     {ram_usage:.2f} MB")
print(f"   - Peak VRAM Usage:    {vram_usage:.2f} MB")

# 5. Save CSV
results_df = pd.DataFrame({
    "Metric": ["Accuracy", "F1", "ROC-AUC", "Training Time (s)", "Inference Latency (ms)", "Adapter Size (MB)"],
    "Value": [
        metrics.get('test_accuracy', 0),
        metrics.get('test_f1', 0),
        metrics.get('test_roc_auc', 0),
        current_training_time,
        latency_per_sample,
        model_size
    ]
})
results_file = os.path.join(SAVE_PATH, 'imdb_distilbert_lora_results.csv')
results_df.to_csv(results_file, index=False)
print(f"\nReport saved to {results_file}")

--- Running Evaluation on Test Set ---



1. Classification Metrics:
   - Accuracy:  0.9182
   - F1-Score:  0.9184
   - ROC-AUC:   0.9740

2. Efficiency Metrics:
   - Training Time:      1184.57 s
   - Inference Latency:  3.7926 ms/sample
   - Adapter Size (safetensors (adapter)): 2.82 MB
   - Peak RAM Usage:     2648.33 MB
   - Peak VRAM Usage:    282.23 MB

Report saved to /content/drive/My Drive/SLM_Research/IMDB_DistilBERT_LoRA/imdb_distilbert_lora_results.csv
