In [9]:
# @title 1. Setup Environment
!pip install -q transformers peft datasets evaluate scikit-learn accelerate

import os
import re
import time
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from peft import get_peft_model, PromptTuningConfig, TaskType
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from scipy.special import softmax
from google.colab import drive

# Mount Drive
drive.mount('/content/drive')
SAVE_PATH = '/content/drive/My Drive/SLM_Research/IMDB_ALBERT_PromptTuning'
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda


In [10]:
# @title 2. Load Data & Tokenize (FIXED LENGTH for ALBERT)
print("--- Loading Stanford IMDB Dataset ---")
dataset = load_dataset("imdb")

def clean_text(example):
    text = example['text']
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<br\s*/>', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    example['text'] = text
    return example

dataset = dataset.map(clean_text)

MODEL_NAME = 'albert-base-v2'
print(f"--- Loading Tokenizer: {MODEL_NAME} ---")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# --- GIẢI PHÁP AN TOÀN ---
# Tokenize tối đa 502 tokens để chừa chỗ cho 10 tokens ảo của Prompt Tuning
# 502 + 10 = 512 (Giới hạn của ALBERT)
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=502)
# -------------------------

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

train_val_split = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)
dataset_train = train_val_split["train"]
dataset_val = train_val_split["test"]
dataset_test = tokenized_datasets["test"]

print("Data ready (max_len=502).")

--- Loading Stanford IMDB Dataset ---


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

--- Loading Tokenizer: albert-base-v2 ---


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Data ready (max_len=502).


In [11]:
# @title 3. ALBERT Setup with Prompt Tuning (FIXED DIMENSIONS)

# 1. Load Base Model
print(f"--- Loading Base Model: {MODEL_NAME} ---")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# 2. Define Prompt Tuning Config
peft_config = PromptTuningConfig(
    task_type=TaskType.SEQ_CLS,
    num_virtual_tokens=10,

    # --- GIẢI PHÁP SỬA LỖI TENSOR MATCHING ---
    # ALBERT có Embedding Size = 128 (nhỏ hơn Hidden Size 768)
    # Ta BẮT BUỘC phải set token_dim = 128 để nối được vào embedding layer
    token_dim=model.config.embedding_size,        # Lấy tự động (128)
    num_layers=model.config.num_hidden_layers,    # 12 layers
    num_attention_heads=model.config.num_attention_heads, # 12 heads
    # -----------------------------------------
)

# 3. Inject Prompt Tuning
model = get_peft_model(model, peft_config)
model.to(device)

print("\n--- Prompt Tuning Efficiency (ALBERT) ---")
# Bạn sẽ thấy con số này SIÊU NHỎ: 10 tokens * 128 dim = 1280 tham số!
model.print_trainable_parameters()

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Loading Base Model: albert-base-v2 ---

--- Prompt Tuning Efficiency (ALBERT) ---
trainable params: 2,818 || all params: 11,687,940 || trainable%: 0.0241


In [12]:
# @title 4. Training (Golden Config)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    probs = softmax(logits, axis=1)[:, 1]
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    roc_auc = roc_auc_score(labels, probs)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall, 'roc_auc': roc_auc}

training_args = TrainingArguments(
    output_dir='./results_albert_prompt_tuning',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,              # Golden Config (Lưu ý: LR này thấp)
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    fp16=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("--- Starting Training (ALBERT + Prompt Tuning) ---")
start_train_time = time.time()
trainer.train()
end_train_time = time.time()
training_time = end_train_time - start_train_time
print(f"\nTraining completed in: {training_time:.2f} seconds")

# Save
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

--- Starting Training (ALBERT + Prompt Tuning) ---


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.6751,0.67935,0.5436,0.270927,0.688312,0.168656,0.667186


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.6751,0.67935,0.5436,0.270927,0.688312,0.168656,0.667186
2,0.651,0.649982,0.6476,0.62622,0.670909,0.587112,0.712998
3,0.6417,0.635907,0.668,0.676034,0.663602,0.688942,0.728461
4,0.6377,0.62801,0.684,0.678339,0.694746,0.662689,0.746933
5,0.6314,0.625757,0.686,0.674139,0.704861,0.645982,0.74906



Training completed in: 3584.50 seconds


('/content/drive/My Drive/SLM_Research/IMDB_ALBERT_PromptTuning/tokenizer_config.json',
 '/content/drive/My Drive/SLM_Research/IMDB_ALBERT_PromptTuning/special_tokens_map.json',
 '/content/drive/My Drive/SLM_Research/IMDB_ALBERT_PromptTuning/spiece.model',
 '/content/drive/My Drive/SLM_Research/IMDB_ALBERT_PromptTuning/added_tokens.json',
 '/content/drive/My Drive/SLM_Research/IMDB_ALBERT_PromptTuning/tokenizer.json')

In [14]:
# @title 5. Final Evaluation on Test Set (ALBERT Prompt Tuning - Full Metrics)
import os
import time
import psutil
import torch
import pandas as pd
import numpy as np

# Đảm bảo đường dẫn đúng (phải khớp với bước training của bạn)
SAVE_PATH = '/content/drive/My Drive/SLM_Research/IMDB_ALBERT_PromptTuning'

print("--- Running Evaluation on Test Set ---")

# Kiểm tra biến môi trường
if 'trainer' not in locals() or 'dataset_test' not in locals():
     raise ValueError("Lỗi: Biến 'trainer' hoặc 'dataset_test' chưa được định nghĩa. Hãy chạy bước Training trước.")

# 1. Classification Metrics & Latency
start_pred_time = time.time()
predictions_output = trainer.predict(dataset_test)
end_pred_time = time.time()

# Lấy các metrics đã tính trong hàm compute_metrics
metrics = predictions_output.metrics

# Tính toán Latency
total_samples = len(dataset_test)
total_inference_time = end_pred_time - start_pred_time
latency_per_sample = (total_inference_time / total_samples) * 1000 # ms

# 2. Model Size Check (Chỉ tính trọng số Prompt)
# PEFT lưu file adapter (prompt) riêng, kích thước rất nhỏ
adapter_bin = os.path.join(SAVE_PATH, 'adapter_model.bin')
adapter_safe = os.path.join(SAVE_PATH, 'adapter_model.safetensors')

model_size = 0
format_type = "unknown"

if os.path.exists(adapter_safe):
    model_size = os.path.getsize(adapter_safe) / (1024 * 1024) # MB
    format_type = "safetensors"
elif os.path.exists(adapter_bin):
    model_size = os.path.getsize(adapter_bin) / (1024 * 1024) # MB
    format_type = "bin"

# Lấy thời gian training (nếu có biến từ phiên trước)
current_training_time = training_time if 'training_time' in locals() else 0.0

# Đo RAM/VRAM
process = psutil.Process(os.getpid())
ram_usage = process.memory_info().rss / 1024 ** 2
vram_usage = torch.cuda.memory_allocated() / 1024 ** 2 if torch.cuda.is_available() else 0

# 3. Full Report
print("\n====== REPORT: ALBERT + Prompt Tuning ======")
print(f"1. Classification Metrics:")
print(f"   - Accuracy:  {metrics.get('test_accuracy', 0):.4f}")
print(f"   - Precision: {metrics.get('test_precision', 0):.4f}")
print(f"   - Recall:    {metrics.get('test_recall', 0):.4f}")
print(f"   - F1-Score:  {metrics.get('test_f1', 0):.4f}")
print(f"   - ROC-AUC:   {metrics.get('test_roc_auc', 0):.4f}")

print(f"\n2. Efficiency Metrics:")
print(f"   - Training Time:      {current_training_time:.2f} s")
print(f"   - Inference Latency:  {latency_per_sample:.4f} ms/sample")
print(f"   - Prompt Size ({format_type}): {model_size:.6f} MB") # 6 số lẻ vì size rất nhỏ
print(f"   - Peak RAM Usage:     {ram_usage:.2f} MB")
print(f"   - Peak VRAM Usage:    {vram_usage:.2f} MB")

# 4. Save to CSV
results_df = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1", "ROC-AUC", "Training Time (s)", "Inference Latency (ms)", "Prompt Size (MB)"],
    "Value": [
        metrics.get('test_accuracy', 0),
        metrics.get('test_precision', 0),
        metrics.get('test_recall', 0),
        metrics.get('test_f1', 0),
        metrics.get('test_roc_auc', 0),
        current_training_time,
        latency_per_sample,
        model_size
    ]
})
results_file = os.path.join(SAVE_PATH, 'imdb_albert_pt_results.csv')
results_df.to_csv(results_file, index=False)
print(f"\nReport saved to {results_file}")

--- Running Evaluation on Test Set ---



1. Classification Metrics:
   - Accuracy:  0.6669
   - Precision: 0.6781
   - Recall:    0.6354
   - F1-Score:  0.6560
   - ROC-AUC:   0.7295

2. Efficiency Metrics:
   - Training Time:      3584.50 s
   - Inference Latency:  12.4695 ms/sample
   - Prompt Size (safetensors): 0.011024 MB
   - Peak RAM Usage:     2776.16 MB
   - Peak VRAM Usage:    120.60 MB

Report saved to /content/drive/My Drive/SLM_Research/IMDB_ALBERT_PromptTuning/imdb_albert_pt_results.csv
