In [1]:
# @title 1. Setup, Load Data & Cleaning (LoRA Base)
# Cài đặt nếu ở phiên mới
!pip install -q transformers datasets evaluate scikit-learn accelerate peft

import os
import re
import time
import psutil
import torch
import numpy as np
import pandas as pd
from google.colab import drive
from datasets import load_dataset
from transformers import (
    AlbertTokenizerFast,
    AlbertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from peft import get_peft_model, LoraConfig, TaskType
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from scipy.special import softmax

# 1. Mount Drive & Path
drive.mount('/content/drive')
SAVE_PATH = '/content/drive/My Drive/SLM_Research/IMDB_ALBERT_LoRA'
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Load Dataset
print("--- Loading Stanford IMDB Dataset ---")
dataset = load_dataset("imdb")

# 3. Clean Text (Quy chuẩn chung)
def clean_text(example):
    text = example['text']
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<br\s*/>', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    example['text'] = text
    return example

print("--- Cleaning Dataset ---")
dataset = dataset.map(clean_text)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
Using device: cuda
--- Loading Stanford IMDB Dataset ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

--- Cleaning Dataset ---


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [2]:
# @title 2. Tokenization (ALBERT) & Data Splits

# 1. Load Tokenizer (ALBERT)
MODEL_NAME = 'albert-base-v2'
print(f"--- Loading Tokenizer: {MODEL_NAME} ---")
tokenizer = AlbertTokenizerFast.from_pretrained(MODEL_NAME)

# 2. Tokenize Function
def tokenize_function(examples):
    # ALBERT max length cũng là 512
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

print("--- Tokenizing Dataset for ALBERT ---")
tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# 3. Split Train/Val (90/10)
train_val_split = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)
dataset_train = train_val_split["train"]
dataset_val = train_val_split["test"]
dataset_test = tokenized_datasets["test"]

print(f"Dataset ready: Train({len(dataset_train)}), Val({len(dataset_val)}), Test({len(dataset_test)})")

--- Loading Tokenizer: albert-base-v2 ---


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

--- Tokenizing Dataset for ALBERT ---


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset ready: Train(22500), Val(2500), Test(25000)


In [3]:
# @title 3. ALBERT Setup with LoRA (Specific Target Modules)

# 1. Load Base Model
print(f"--- Loading Base Model: {MODEL_NAME} ---")
model = AlbertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# 2. Define LoRA Config cho ALBERT
# Lưu ý: Target modules của ALBERT là "query" và "value"
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,                    # Rank giữ nguyên = 8
    lora_alpha=16,          # Alpha giữ nguyên = 16
    lora_dropout=0.1,
    target_modules=["query", "value"] # <-- THAY ĐỔI QUAN TRỌNG CHO ALBERT
)

# 3. Inject LoRA Adapters
model = get_peft_model(model, peft_config)
model.to(device)

# 4. Print Trainable Parameters
# Hãy quan sát kỹ con số này. Vì ALBERT chia sẻ tham số,
# số lượng tham số LoRA thêm vào cũng sẽ rất nhỏ.
print("\n--- LoRA Parameter Efficiency (ALBERT) ---")
model.print_trainable_parameters()

--- Loading Base Model: albert-base-v2 ---


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- LoRA Parameter Efficiency (ALBERT) ---
trainable params: 26,114 || all params: 11,711,236 || trainable%: 0.2230


In [4]:
# @title 4. Training ALBERT + LoRA (Golden Config)

# 1. Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    probs = softmax(logits, axis=1)[:, 1]

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    roc_auc = roc_auc_score(labels, probs)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall, 'roc_auc': roc_auc}

# 2. Training Arguments (Golden Config - Giữ nguyên LR 5e-5 để so sánh công bằng)
training_args = TrainingArguments(
    output_dir='./results_albert_lora',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,              # STRICTLY KEPT at 5e-5 for fair comparison
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs_albert_lora',
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    fp16=True,
    report_to="none"
)

# 3. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# 4. Start Training
print("--- Starting Training (ALBERT + LoRA) ---")
start_train_time = time.time()
trainer.train()
end_train_time = time.time()

training_time = end_train_time - start_train_time
print(f"\nTraining completed in: {training_time:.2f} seconds")

# 5. Save Model (Only Adapters)
print(f"Saving LoRA adapters to {SAVE_PATH}...")
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
print("ALBERT LoRA Adapters saved successfully!")

--- Starting Training (ALBERT + LoRA) ---


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.2413,0.278136,0.894,0.888515,0.942857,0.840095,0.964996
2,0.2129,0.251056,0.914,0.911413,0.945299,0.879873,0.971054
3,0.2535,0.222506,0.9208,0.921429,0.91924,0.923628,0.973966
4,0.1608,0.230716,0.922,0.920764,0.94103,0.901352,0.97432
5,0.1748,0.225333,0.924,0.923572,0.934093,0.913286,0.974837



Training completed in: 3799.79 seconds
Saving LoRA adapters to /content/drive/My Drive/SLM_Research/IMDB_ALBERT_LoRA...
ALBERT LoRA Adapters saved successfully!


In [6]:
# @title 5. Final Evaluation on Test Set (LoRA Robust - Updated)
import os
import time
import psutil
import torch
import pandas as pd

print("--- Running Evaluation on Test Set ---")

# 1. Classification Metrics
start_pred_time = time.time()
predictions_output = trainer.predict(dataset_test)
end_pred_time = time.time()
metrics = predictions_output.metrics

# 2. Efficiency Metrics
total_samples = len(dataset_test)
total_inference_time = end_pred_time - start_pred_time
latency_per_sample = (total_inference_time / total_samples) * 1000 # ms

# 3. Model Size Check (Adapter Size Only)
adapter_bin = os.path.join(SAVE_PATH, 'adapter_model.bin')
adapter_safe = os.path.join(SAVE_PATH, 'adapter_model.safetensors')

if os.path.exists(adapter_safe):
    model_size = os.path.getsize(adapter_safe) / (1024 * 1024)
    format_type = "safetensors (adapter)"
elif os.path.exists(adapter_bin):
    model_size = os.path.getsize(adapter_bin) / (1024 * 1024)
    format_type = "bin (adapter)"
else:
    model_size = 0
    format_type = "unknown"

current_training_time = training_time if 'training_time' in locals() else 0.0

process = psutil.Process(os.getpid())
ram_usage = process.memory_info().rss / 1024 ** 2
vram_usage = torch.cuda.memory_allocated() / 1024 ** 2 if torch.cuda.is_available() else 0

# 4. Report (ĐÃ BỔ SUNG PRECISION & RECALL)
print("\n====== REPORT: ALBERT + LoRA on IMDB ======")
print(f"1. Classification Metrics:")
print(f"   - Accuracy:  {metrics.get('test_accuracy', 0):.4f}")
print(f"   - Precision: {metrics.get('test_precision', 0):.4f}") # Mới
print(f"   - Recall:    {metrics.get('test_recall', 0):.4f}")    # Mới
print(f"   - F1-Score:  {metrics.get('test_f1', 0):.4f}")
print(f"   - ROC-AUC:   {metrics.get('test_roc_auc', 0):.4f}")

print(f"\n2. Efficiency Metrics:")
print(f"   - Training Time:      {current_training_time:.2f} s")
print(f"   - Inference Latency:  {latency_per_sample:.4f} ms/sample")
print(f"   - Adapter Size ({format_type}): {model_size:.2f} MB")
print(f"   - Peak RAM Usage:     {ram_usage:.2f} MB")
print(f"   - Peak VRAM Usage:    {vram_usage:.2f} MB")

# 5. Save CSV (ĐÃ BỔ SUNG)
results_df = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1", "ROC-AUC", "Training Time (s)", "Inference Latency (ms)", "Adapter Size (MB)"],
    "Value": [
        metrics.get('test_accuracy', 0),
        metrics.get('test_precision', 0), # Mới
        metrics.get('test_recall', 0),    # Mới
        metrics.get('test_f1', 0),
        metrics.get('test_roc_auc', 0),
        current_training_time,
        latency_per_sample,
        model_size
    ]
})
results_file = os.path.join(SAVE_PATH, 'imdb_albert_lora_results.csv')
results_df.to_csv(results_file, index=False)
print(f"\nReport saved to {results_file}")

--- Running Evaluation on Test Set ---



1. Classification Metrics:
   - Accuracy:  0.9311
   - Precision: 0.9322
   - Recall:    0.9298
   - F1-Score:  0.9310
   - ROC-AUC:   0.9800

2. Efficiency Metrics:
   - Training Time:      3799.79 s
   - Inference Latency:  12.5747 ms/sample
   - Adapter Size (safetensors (adapter)): 0.10 MB
   - Peak RAM Usage:     2728.00 MB
   - Peak VRAM Usage:    62.76 MB

Report saved to /content/drive/My Drive/SLM_Research/IMDB_ALBERT_LoRA/imdb_albert_lora_results.csv
