In [1]:
# @title 1. Setup Environment & Load SST-2
!pip install -q transformers peft datasets evaluate scikit-learn accelerate psutil

import os
import time
import psutil
import torch
import pandas as pd
import numpy as np
from google.colab import drive
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from peft import get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from scipy.special import softmax

# 1. Mount Drive
drive.mount('/content/drive')
SAVE_PATH = '/content/drive/My Drive/SLM_Research/SST2_DistilBERT_PromptTuning'
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Load SST-2 Dataset
print("--- Loading SST-2 ---")
dataset = load_dataset("glue", "sst2")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
Using device: cuda
--- Loading SST-2 ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

sst2/train-00000-of-00001.parquet:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

sst2/validation-00000-of-00001.parquet:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

sst2/test-00000-of-00001.parquet:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [2]:
# @title 2. Tokenization (DistilBERT - 502 + 10 virtual)
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    # Max length 502 để chừa chỗ cho soft prompts (tổng cộng 512)
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=502)

print("--- Tokenizing Dataset ---")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Chuẩn hóa format
tokenized_datasets = tokenized_datasets.remove_columns(["sentence", "idx"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

dataset_train = tokenized_datasets["train"]
dataset_val = tokenized_datasets["validation"]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

--- Tokenizing Dataset ---


Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [4]:
# @title 3. DistilBERT + Prompt Tuning Configuration (Fixed ValueError)

# 1. Load Base Model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# 2. Trích xuất số lượng tầng (layers) để fix lỗi ValueError
if hasattr(model.config, "n_layers"):
    num_layers = model.config.n_layers
elif hasattr(model.config, "num_hidden_layers"):
    num_layers = model.config.num_hidden_layers
else:
    num_layers = 6 # Mặc định cho DistilBERT nếu không tìm thấy

# 3. Định nghĩa Prompt Tuning Config với tham số num_layers
peft_config = PromptTuningConfig(
    task_type=TaskType.SEQ_CLS,
    num_virtual_tokens=10,
    prompt_tuning_init=PromptTuningInit.RANDOM,
    num_layers=num_layers,          # Dòng quan trọng để sửa lỗi
    token_dim=model.config.dim,      # Đảm bảo khớp với kích thước của DistilBERT (768)
    num_attention_heads=model.config.n_heads,
    num_transformer_submodules=1
)

# 4. Chèn Prompt Tuning vào mô hình
model = get_peft_model(model, peft_config)
model.to(device)

print("\n--- Prompt Tuning Efficiency (DistilBERT) ---")
model.print_trainable_parameters()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



--- Prompt Tuning Efficiency (DistilBERT) ---
trainable params: 599,810 || all params: 67,554,820 || trainable%: 0.8879


In [5]:
# @title 4. Training (Prompt Tuning on SST-2)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    probs = softmax(logits, axis=1)[:, 1]

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    roc_auc = roc_auc_score(labels, probs)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall, 'roc_auc': roc_auc}

training_args = TrainingArguments(
    output_dir='./results_sst2_distilbert_prompt',
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

print("--- Starting Training (DistilBERT Prompt Tuning) ---")
start_train_time = time.time()
trainer.train()
training_time = time.time() - start_train_time

# Save Adapter weights (Soft Prompts)
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)

--- Starting Training (DistilBERT Prompt Tuning) ---


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.459,0.397173,0.813073,0.814983,0.82151,0.808559,0.901035
2,0.4318,0.387709,0.819954,0.822197,0.826879,0.817568,0.90725
3,0.4108,0.380518,0.819954,0.82016,0.834499,0.806306,0.910344
4,0.4081,0.380058,0.817661,0.821948,0.817372,0.826577,0.911357
5,0.419,0.378222,0.817661,0.820744,0.82167,0.81982,0.911815


('/content/drive/My Drive/SLM_Research/SST2_DistilBERT_PromptTuning/tokenizer_config.json',
 '/content/drive/My Drive/SLM_Research/SST2_DistilBERT_PromptTuning/special_tokens_map.json',
 '/content/drive/My Drive/SLM_Research/SST2_DistilBERT_PromptTuning/vocab.txt',
 '/content/drive/My Drive/SLM_Research/SST2_DistilBERT_PromptTuning/added_tokens.json',
 '/content/drive/My Drive/SLM_Research/SST2_DistilBERT_PromptTuning/tokenizer.json')

In [6]:
# @title 5. Final Evaluation (Full Metrics)
print("--- Running Final Evaluation on Validation Set ---")

# 1. Prediction & Latency
start_pred_time = time.time()
predictions_output = trainer.predict(dataset_val)
end_pred_time = time.time()

metrics = predictions_output.metrics
total_samples = len(dataset_val)
latency = ((end_pred_time - start_pred_time) / total_samples) * 1000

# 2. Prompt Size Check (Thường cực nhỏ)
adapter_file = os.path.join(SAVE_PATH, 'adapter_model.safetensors')
if not os.path.exists(adapter_file):
    adapter_file = os.path.join(SAVE_PATH, 'adapter_model.bin')
adapter_size = os.path.getsize(adapter_file) / (1024 * 1024)

# 3. Resource Usage
process = psutil.Process(os.getpid())
ram_usage = process.memory_info().rss / 1024 ** 2
vram_usage = torch.cuda.memory_allocated() / 1024 ** 2 if torch.cuda.is_available() else 0

# 4. Final Report
print("\n====== REPORT: DistilBERT + Prompt Tuning on SST-2 ======")
print(f"1. Classification Metrics:")
print(f"   - Accuracy:  {metrics.get('test_accuracy', 0):.4f}")
print(f"   - Precision: {metrics.get('test_precision', 0):.4f}")
print(f"   - Recall:    {metrics.get('test_recall', 0):.4f}")
print(f"   - F1-Score:  {metrics.get('test_f1', 0):.4f}")
print(f"   - ROC-AUC:   {metrics.get('test_roc_auc', 0):.4f}")

print(f"\n2. Efficiency Metrics:")
print(f"   - Training Time:      {training_time:.2f} s")
print(f"   - Inference Latency:  {latency:.4f} ms/sample")
print(f"   - Prompt Size (MB):   {adapter_size:.6f} MB")
print(f"   - Peak RAM Usage:     {ram_usage:.2f} MB")
print(f"   - Peak VRAM Usage:    {vram_usage:.2f} MB")

# 5. Save CSV
results_df = pd.DataFrame({
    "Metric": ["Accuracy", "Precision", "Recall", "F1", "ROC-AUC", "Training Time (s)", "Inference Latency (ms)", "Prompt Size (MB)"],
    "Value": [
        metrics.get('test_accuracy', 0),
        metrics.get('test_precision', 0),
        metrics.get('test_recall', 0),
        metrics.get('test_f1', 0),
        metrics.get('test_roc_auc', 0),
        training_time,
        latency,
        adapter_size
    ]
})
results_file = os.path.join(SAVE_PATH, 'sst2_distilbert_prompt_results.csv')
results_df.to_csv(results_file, index=False)
print(f"\nReport saved to {results_file}")

--- Running Final Evaluation on Validation Set ---



1. Classification Metrics:
   - Accuracy:  0.8200
   - Precision: 0.8269
   - Recall:    0.8176
   - F1-Score:  0.8222
   - ROC-AUC:   0.9072

2. Efficiency Metrics:
   - Training Time:      2967.52 s
   - Inference Latency:  3.4550 ms/sample
   - Prompt Size (MB):   2.288551 MB
   - Peak RAM Usage:     2454.32 MB
   - Peak VRAM Usage:    280.62 MB

Report saved to /content/drive/My Drive/SLM_Research/SST2_DistilBERT_PromptTuning/sst2_distilbert_prompt_results.csv
