In [1]:
# @title 1. Setup, Load Data & Cleaning
!pip install -q transformers datasets evaluate scikit-learn accelerate

import os
import re
import time
import psutil
import torch
import numpy as np
import pandas as pd
from google.colab import drive
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from scipy.special import softmax

# 1. Mount Drive
drive.mount('/content/drive')
# Đặt đường dẫn riêng cho TinyBERT
SAVE_PATH = '/content/drive/My Drive/SLM_Research/IMDB_TinyBERT'
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

# Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 2. Load Dataset (IMDB)
print("--- Loading Stanford IMDB Dataset ---")
dataset = load_dataset("imdb")

# 3. Clean Text (Chuẩn hóa: Xóa URL, HTML, Lowercase)
def clean_text(example):
    text = example['text']
    text = text.lower()
    # Xóa URL
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Xóa HTML
    text = re.sub(r'<br\s*/>', ' ', text)
    # Xóa khoảng trắng thừa
    text = re.sub(r'\s+', ' ', text).strip()
    example['text'] = text
    return example

print("--- Cleaning Dataset (Removing URLs & HTML) ---")
dataset = dataset.map(clean_text)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
Using device: cuda
--- Loading Stanford IMDB Dataset ---


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

--- Cleaning Dataset (Removing URLs & HTML) ---


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [2]:
# @title 2. Tokenization & Data Splits for TinyBERT

# 1. Load Tokenizer
# Sử dụng phiên bản TinyBERT 4 lớp của Huawei Noah's Ark Lab
MODEL_NAME = 'huawei-noah/TinyBERT_General_4L_312D'
print(f"--- Loading Tokenizer: {MODEL_NAME} ---")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# 2. Tokenize Function
def tokenize_function(examples):
    # Max length 512, Truncation, Padding
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

print("--- Tokenizing Dataset ---")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 3. Format PyTorch
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# 4. Split Train/Val (90/10)
print("--- Splitting Train set into Train/Val ---")
train_val_split = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)
dataset_train = train_val_split["train"]
dataset_val = train_val_split["test"]
dataset_test = tokenized_datasets["test"]

print(f"Dataset ready: Train({len(dataset_train)}), Val({len(dataset_val)}), Test({len(dataset_test)})")

--- Loading Tokenizer: huawei-noah/TinyBERT_General_4L_312D ---


config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

--- Tokenizing Dataset ---


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

--- Splitting Train set into Train/Val ---
Dataset ready: Train(22500), Val(2500), Test(25000)


In [3]:
# @title 3. Training TinyBERT (Golden Config)

# 1. Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    probs = softmax(logits, axis=1)[:, 1]

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    roc_auc = roc_auc_score(labels, probs)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall, 'roc_auc': roc_auc}

# 2. Model Init
print(f"--- Loading Model: {MODEL_NAME} ---")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
model.to(device)

# 3. Training Arguments (Giống hệt DistilBERT/ALBERT)
training_args = TrainingArguments(
    output_dir='./results_tinybert',
    num_train_epochs=5,              # 5 Epochs
    per_device_train_batch_size=16,  # Batch 16
    per_device_eval_batch_size=32,   # Batch 32
    learning_rate=5e-5,              # LR 5e-5
    warmup_steps=500,                # Warmup 500
    weight_decay=0.01,               # Decay 0.01
    logging_dir='./logs_tinybert',
    logging_steps=100,

    # --- CHIẾN LƯỢC EPOCH ---
    eval_strategy="epoch",           # Đánh giá sau mỗi Epoch
    save_strategy="epoch",           # Lưu sau mỗi Epoch
    load_best_model_at_end=True,     # Load model tốt nhất
    metric_for_best_model="accuracy",
    save_total_limit=2,
    fp16=True,
    report_to="none"
)

# 4. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# 5. Start Training
print("--- Starting Training (TinyBERT) ---")
start_train_time = time.time()
trainer.train()
end_train_time = time.time()

training_time = end_train_time - start_train_time
print(f"\nTraining completed in: {training_time:.2f} seconds")

# 6. Save Model
print(f"Saving model to {SAVE_PATH}...")
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
print("TinyBERT Model saved successfully!")

--- Loading Model: huawei-noah/TinyBERT_General_4L_312D ---


pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


--- Starting Training (TinyBERT) ---


model.safetensors:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.3191,0.31658,0.8792,0.873216,0.924444,0.827367,0.947372
2,0.2722,0.320364,0.8952,0.897013,0.886558,0.907717,0.958431
3,0.2151,0.449319,0.874,0.862265,0.957282,0.784407,0.961973
4,0.1617,0.392452,0.9,0.898208,0.919933,0.877486,0.96179
5,0.125,0.480837,0.8964,0.892665,0.931661,0.856802,0.961456



Training completed in: 1041.26 seconds
Saving model to /content/drive/My Drive/SLM_Research/IMDB_TinyBERT...
TinyBERT Model saved successfully!


In [4]:
# @title 4. Final Evaluation on Test Set (Robust Version)
import os
import time
import psutil
import torch
import pandas as pd

print("--- Running Evaluation on Test Set ---")

# Kiểm tra các biến bắt buộc
if 'trainer' not in locals() or 'dataset_test' not in locals():
    raise ValueError("Lỗi: Biến 'trainer' hoặc 'dataset_test' chưa được định nghĩa.")

# 1. Classification Metrics
start_pred_time = time.time()
predictions_output = trainer.predict(dataset_test)
end_pred_time = time.time()
metrics = predictions_output.metrics

# 2. Efficiency Metrics
total_samples = len(dataset_test)
total_inference_time = end_pred_time - start_pred_time
latency_per_sample = (total_inference_time / total_samples) * 1000 # ms

# 3. Model Size Check (Support both .bin and .safetensors)
safetensors_path = os.path.join(SAVE_PATH, 'model.safetensors')
bin_path = os.path.join(SAVE_PATH, 'pytorch_model.bin')

if os.path.exists(safetensors_path):
    model_size = os.path.getsize(safetensors_path) / (1024 * 1024)
    format_type = "safetensors"
elif os.path.exists(bin_path):
    model_size = os.path.getsize(bin_path) / (1024 * 1024)
    format_type = "bin"
else:
    model_size = 0
    format_type = "unknown"

# Handle training_time variable if lost
current_training_time = training_time if 'training_time' in locals() else 0.0

# Resource Usage
process = psutil.Process(os.getpid())
ram_usage = process.memory_info().rss / 1024 ** 2
vram_usage = torch.cuda.memory_allocated() / 1024 ** 2 if torch.cuda.is_available() else 0

# 4. Report
print("\n====== REPORT: TinyBERT on IMDB ======")
print(f"1. Classification Metrics:")
print(f"   - Accuracy:  {metrics.get('test_accuracy', 0):.4f}")
print(f"   - F1-Score:  {metrics.get('test_f1', 0):.4f}")
print(f"   - ROC-AUC:   {metrics.get('test_roc_auc', 0):.4f}")

print(f"\n2. Efficiency Metrics:")
print(f"   - Training Time:      {current_training_time:.2f} s")
print(f"   - Inference Latency:  {latency_per_sample:.4f} ms/sample")
print(f"   - Model Size ({format_type}): {model_size:.2f} MB")
print(f"   - Peak RAM Usage:     {ram_usage:.2f} MB")
print(f"   - Peak VRAM Usage:    {vram_usage:.2f} MB")

# 5. Save CSV
results_df = pd.DataFrame({
    "Metric": ["Accuracy", "F1", "ROC-AUC", "Training Time (s)", "Inference Latency (ms)", "Model Size (MB)"],
    "Value": [
        metrics.get('test_accuracy', 0),
        metrics.get('test_f1', 0),
        metrics.get('test_roc_auc', 0),
        current_training_time,
        latency_per_sample,
        model_size
    ]
})
results_file = os.path.join(SAVE_PATH, 'imdb_tinybert_results.csv')
results_df.to_csv(results_file, index=False)
print(f"\nReport saved to {results_file}")

--- Running Evaluation on Test Set ---



1. Classification Metrics:
   - Accuracy:  0.8955
   - F1-Score:  0.8947
   - ROC-AUC:   0.9602

2. Efficiency Metrics:
   - Training Time:      1041.26 s
   - Inference Latency:  3.5716 ms/sample
   - Model Size (safetensors): 54.75 MB
   - Peak RAM Usage:     2718.29 MB
   - Peak VRAM Usage:    183.63 MB

Report saved to /content/drive/My Drive/SLM_Research/IMDB_TinyBERT/imdb_tinybert_results.csv
