In [9]:
# @title 1. Setup Environment
!pip install -q transformers peft datasets evaluate scikit-learn accelerate

import os
import re
import time
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from peft import get_peft_model, PromptTuningConfig, TaskType
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from scipy.special import softmax
from google.colab import drive

# Mount Drive
drive.mount('/content/drive')
SAVE_PATH = '/content/drive/My Drive/SLM_Research/IMDB_DistilBERT_PromptTuning'
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda


In [10]:
# @title 2. Load Data & Tokenize (FIXED LENGTH)
print("--- Loading Stanford IMDB Dataset ---")
dataset = load_dataset("imdb")

def clean_text(example):
    text = example['text']
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'<br\s*/>', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    example['text'] = text
    return example

dataset = dataset.map(clean_text)

MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# --- SỬA LỖI Ở ĐÂY ---
# Prompt Tuning thêm 10 tokens, nên ta chỉ tokenize tối đa 502 tokens
# 502 (text) + 10 (prompt) = 512 (max limit của DistilBERT)
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=502)
# ---------------------

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

train_val_split = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)
dataset_train = train_val_split["train"]
dataset_val = train_val_split["test"]
dataset_test = tokenized_datasets["test"]

print("Data ready (max_len=502).")

--- Loading Stanford IMDB Dataset ---


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Data ready (max_len=502).


In [12]:
# @title 3. DistilBERT Setup with Prompt Tuning

# 1. Load Base Model
print(f"--- Loading Base Model: {MODEL_NAME} ---")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# 2. Define Prompt Tuning Config
# Mapping thủ công các tham số của DistilBERT cho PEFT
peft_config = PromptTuningConfig(
    task_type=TaskType.SEQ_CLS,
    num_virtual_tokens=10,                      # 10 soft tokens
    num_layers=model.config.n_layers,           # Map từ 'n_layers'
    token_dim=model.config.dim,                 # Map từ 'dim'
    num_attention_heads=model.config.n_heads    # Map từ 'n_heads'
)

# 3. Inject Prompt Tuning
model = get_peft_model(model, peft_config)
model.to(device)

print("\n--- Prompt Tuning Efficiency ---")
model.print_trainable_parameters()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Loading Base Model: distilbert-base-uncased ---

--- Prompt Tuning Efficiency ---
trainable params: 599,810 || all params: 67,554,820 || trainable%: 0.8879


In [13]:
# @title 4. Training (Golden Config)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    probs = softmax(logits, axis=1)[:, 1]
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    roc_auc = roc_auc_score(labels, probs)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall, 'roc_auc': roc_auc}

training_args = TrainingArguments(
    output_dir='./results_distilbert_prompt_tuning',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,              # Golden Config
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    fp16=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("--- Starting Training (DistilBERT + Prompt Tuning) ---")
start_train_time = time.time()
trainer.train()
end_train_time = time.time()
training_time = end_train_time - start_train_time
print(f"\nTraining completed in: {training_time:.2f} seconds")

# Save
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

--- Starting Training (DistilBERT + Prompt Tuning) ---


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.5157,0.536412,0.7332,0.686413,0.83908,0.580748,0.834248
2,0.4893,0.479415,0.7736,0.757913,0.819611,0.704853,0.85593
3,0.482,0.451996,0.7812,0.784221,0.777778,0.790772,0.871279
4,0.4531,0.436136,0.7972,0.792128,0.817259,0.768496,0.880744
5,0.4309,0.431775,0.8016,0.79402,0.830582,0.760541,0.884222



Training completed in: 1061.97 seconds


('/content/drive/My Drive/SLM_Research/IMDB_DistilBERT_PromptTuning/tokenizer_config.json',
 '/content/drive/My Drive/SLM_Research/IMDB_DistilBERT_PromptTuning/special_tokens_map.json',
 '/content/drive/My Drive/SLM_Research/IMDB_DistilBERT_PromptTuning/vocab.txt',
 '/content/drive/My Drive/SLM_Research/IMDB_DistilBERT_PromptTuning/added_tokens.json',
 '/content/drive/My Drive/SLM_Research/IMDB_DistilBERT_PromptTuning/tokenizer.json')

In [15]:
# @title 5. Final Evaluation on Test Set (DistilBERT Prompt Tuning - Full Metrics)
import os
import time
import psutil
import torch
import pandas as pd

# Đảm bảo đường dẫn đúng (phải khớp với đường dẫn lúc train)
SAVE_PATH = '/content/drive/My Drive/SLM_Research/IMDB_DistilBERT_PromptTuning'

print("--- Running Evaluation on Test Set ---")

# Kiểm tra xem trainer đã tồn tại chưa
if 'trainer' not in locals() or 'dataset_test' not in locals():
     raise ValueError("Lỗi: Biến 'trainer' hoặc 'dataset_test' chưa được định nghĩa. Hãy chạy bước Training trước.")

# 1. Classification Metrics
start_pred_time = time.time()
predictions_output = trainer.predict(dataset_test)
end_pred_time = time.time()

metrics = predictions_output.metrics

# 2. Efficiency Metrics
total_samples = len(dataset_test)
total_inference_time = end_pred_time - start_pred_time
latency_per_sample = (total_inference_time / total_samples) * 1000 # ms

# 3. Model Size Check (Prompt Weights Only)
# PEFT lưu file dưới dạng adapter_model.safetensors hoặc adapter_model.bin
adapter_bin = os.path.join(SAVE_PATH, 'adapter_model.bin')
adapter_safe = os.path.join(SAVE_PATH, 'adapter_model.safetensors')

if os.path.exists(adapter_safe):
    model_size = os.path.getsize(adapter_safe) / (1024 * 1024)
    format_type = "safetensors"
elif os.path.exists(adapter_bin):
    model_size = os.path.getsize(adapter_bin) / (1024 * 1024)
    format_type = "bin"
else:
    model_size = 0
    format_type = "unknown"

# Lấy thời gian train từ phiên trước (nếu còn)
current_training_time = training_time if 'training_time' in locals() else 0.0

# Đo RAM/VRAM
process = psutil.Process(os.getpid())
ram_usage = process.memory_info().rss / 1024 ** 2
vram_usage = torch.cuda.memory_allocated() / 1024 ** 2 if torch.cuda.is_available() else 0

# 4. Report
print("\n====== REPORT: DistilBERT + Prompt Tuning =====")
print(f"1. Classification Metrics:")
print(f"   - Accuracy:  {metrics.get('test_accuracy', 0):.4f}")
print(f"   - Precision: {metrics.get('test_precision', 0):.4f}")
print(f"   - Recall:    {metrics.get('test_recall', 0):.4f}")
print(f"   - F1-Score:  {metrics.get('test_f1', 0):.4f}")
print(f"   - ROC-AUC:   {metrics.get('test_roc_auc', 0):.4f}")

print(f"\n2. Efficiency Metrics:")
print(f"   - Training Time:      {current_training_time:.2f} s")
print(f"   - Inference Latency:  {latency_per_sample:.4f} ms/sample")
print(f"   - Prompt Size ({format_type}): {model_size:.6f} MB") # Hiển thị 6 số lẻ vì rất nhỏ
print(f"   - Peak RAM Usage:     {ram_usage:.2f} MB")
print(f"   - Peak VRAM Usage:    {vram_usage:.2f} MB")

# 5. Save CSV
results_df = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1", "ROC-AUC", "Training Time (s)", "Inference Latency (ms)", "Prompt Size (MB)"],
    "Value": [
        metrics.get('test_accuracy', 0),
        metrics.get('test_precision', 0),
        metrics.get('test_recall', 0),
        metrics.get('test_f1', 0),
        metrics.get('test_roc_auc', 0),
        current_training_time,
        latency_per_sample,
        model_size
    ]
})
results_file = os.path.join(SAVE_PATH, 'imdb_distilbert_pt_results.csv')
results_df.to_csv(results_file, index=False)
print(f"\nReport saved to {results_file}")

--- Running Evaluation on Test Set ---



1. Classification Metrics:
   - Accuracy:  0.8009
   - Precision: 0.8111
   - Recall:    0.7846
   - F1-Score:  0.7976
   - ROC-AUC:   0.8847

2. Efficiency Metrics:
   - Training Time:      1061.97 s
   - Inference Latency:  3.5723 ms/sample
   - Prompt Size (safetensors): 2.288551 MB
   - Peak RAM Usage:     2690.83 MB
   - Peak VRAM Usage:    566.91 MB

Report saved to /content/drive/My Drive/SLM_Research/IMDB_DistilBERT_PromptTuning/imdb_distilbert_pt_results.csv
