In [1]:
# @title 1. Setup, Load Data & Cleaning
# Cài thư viện nếu chưa có (chạy 1 lần)
# !pip install transformers datasets evaluate scikit-learn accelerate matplotlib pandas

import os
import re
import time
import psutil
import torch
import numpy as np
import pandas as pd
import gc
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from scipy.special import softmax


# Chỉ định chỉ dùng 1 GPU (ví dụ GPU số 0)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# 1. Setup Path (Lưu kết quả ngay tại thư mục chứa file code trên máy)
SAVE_PATH = './Falcon_1B_IMDB_Results'
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

# 2. Check Device (Quan trọng cho VS Code local)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device} - Falcon-1B_FTT_IMDB.ipynb:34")
if device.type == 'cuda':
    print(f"GPU Name: {torch.cuda.get_device_name(0)} - Falcon-1B_FTT_IMDB.ipynb:36")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB - Falcon-1B_FTT_IMDB.ipynb:37")

# Dọn dẹp VRAM cũ
torch.cuda.empty_cache()
gc.collect()

# 3. Load Dataset
print("Loading IMDB Dataset - Falcon-1B_FTT_IMDB.ipynb:44")
# Load Full Dataset
dataset = load_dataset("imdb") 

# 4. Clean Text (Chuẩn hóa giống file mẫu)
def clean_text(example):
    text = example['text']
    text = text.lower()
    # Xóa URL
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Xóa HTML tags (<br />)
    text = re.sub(r'<br\s*/>', ' ', text)
    # Xóa khoảng trắng thừa
    text = re.sub(r'\s+', ' ', text).strip()
    example['text'] = text
    return example

print("Cleaning Dataset (Removing URLs & HTML) - Falcon-1B_FTT_IMDB.ipynb:61")
dataset = dataset.map(clean_text)

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda - Falcon-1B_FTT_IMDB.ipynb:34
GPU Name: NVIDIA RTX A5000 - Falcon-1B_FTT_IMDB.ipynb:36
VRAM: 25.76 GB - Falcon-1B_FTT_IMDB.ipynb:37
Loading IMDB Dataset - Falcon-1B_FTT_IMDB.ipynb:44
Cleaning Dataset (Removing URLs & HTML) - Falcon-1B_FTT_IMDB.ipynb:61


In [2]:
# @title 2. Tokenization & Data Splits for Falcon-1b

# 1. Load Tokenizer
MODEL_NAME = 'tiiuae/falcon-rw-1b'
print(f"Loading Tokenizer: {MODEL_NAME} - Falcon-1B_FTT_IMDB.ipynb:5")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# QUAN TRỌNG: Falcon không có pad_token mặc định, phải gán thủ công
tokenizer.pad_token = tokenizer.eos_token

# 2. Tokenize Function
def tokenize_function(examples):
    # Max length 512, padding max_length để đồng bộ tensor
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

print("Tokenizing Dataset (Có thể mất vài phút trên máy cá nhân) - Falcon-1B_FTT_IMDB.ipynb:16")
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# 3. Format PyTorch
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# 4. Split Train/Val (90/10)
print("Splitting Train set into Train/Val - Falcon-1B_FTT_IMDB.ipynb:25")
# IMDB gốc đã có sẵn 'test', ta chỉ chia 'train' thành 'train' và 'val'
train_val_split = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=42)
dataset_train = train_val_split["train"]
dataset_val = train_val_split["test"]
dataset_test = tokenized_datasets["test"]

print(f"Dataset ready: Train({len(dataset_train)}), Val({len(dataset_val)}), Test({len(dataset_test)}) - Falcon-1B_FTT_IMDB.ipynb:32")

Loading Tokenizer: tiiuae/falcon-rw-1b - Falcon-1B_FTT_IMDB.ipynb:5
Tokenizing Dataset (Có thể mất vài phút trên máy cá nhân) - Falcon-1B_FTT_IMDB.ipynb:16
Splitting Train set into Train/Val - Falcon-1B_FTT_IMDB.ipynb:25
Dataset ready: Train(22500), Val(2500), Test(25000) - Falcon-1B_FTT_IMDB.ipynb:32


In [3]:
# @title 3. Training Falcon-1b (Optimized for RTX A5000)

# 1. Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    if isinstance(logits, tuple):
        logits = logits[0]
        
    probs = softmax(logits, axis=1)[:, 1]

    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    acc = accuracy_score(labels, predictions)
    
    try:
        roc_auc = roc_auc_score(labels, probs)
    except:
        roc_auc = 0.0
        
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall, 'roc_auc': roc_auc}

# 2. Model Init (Tối ưu cho A5000: Dùng bfloat16)
print(f"Loading Model: {MODEL_NAME}... - Falcon-1B_FTT_IMDB.ipynb:24")
try:
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, 
        num_labels=2,
        torch_dtype=torch.bfloat16,  
        use_safetensors=True 
    )
    print(">>> Loaded successfully using SafeTensors (BF16). - Falcon-1B_FTT_IMDB.ipynb:32")
except Exception as e:
    print(f">>> SafeTensors failed ({e}). Attempting legacy load... - Falcon-1B_FTT_IMDB.ipynb:34")
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME, 
        num_labels=2,
        torch_dtype=torch.bfloat16,
        use_safetensors=False
    )

model.to(device)
model.config.pad_token_id = tokenizer.pad_token_id
model.gradient_checkpointing_enable() 

# 3. Training Arguments (Cấu hình Turbo cho A5000)
training_args = TrainingArguments(
    output_dir='./results_falcon',
    num_train_epochs=3,
    
    # --- CẤU HÌNH TỐI ƯU VRAM 24GB ---
    per_device_train_batch_size=16,   
    gradient_accumulation_steps=2,  
    per_device_eval_batch_size=32,    
    
    learning_rate=2e-5,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs_falcon',
    logging_steps=10,                 
    
    eval_strategy="epoch",
    save_strategy="epoch",
    
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    
    # --- QUAN TRỌNG CHO A5000 ---
    fp16=False,   
    bf16=True,    
    
    report_to="none",
    optim="adamw_torch",
    dataloader_num_workers=0,
)

# 4. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
)

# 5. Start Training
print("Starting Training (Turbo Mode with BF16 on A5000)... - Falcon-1B_FTT_IMDB.ipynb:89")
start_train_time = time.time()
trainer.train()
end_train_time = time.time()

training_time = end_train_time - start_train_time
print(f"\nTraining completed in: {training_time:.2f} seconds - Falcon-1B_FTT_IMDB.ipynb:95")

# 6. Save Final Model
print(f"Saving model to {SAVE_PATH}... - Falcon-1B_FTT_IMDB.ipynb:98")
trainer.save_model(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
print("Falcon1b Model saved successfully! - Falcon-1B_FTT_IMDB.ipynb:101")

Loading Model: tiiuae/falcon-rw-1b... - Falcon-1B_FTT_IMDB.ipynb:24


`torch_dtype` is deprecated! Use `dtype` instead!
Some weights of FalconForSequenceClassification were not initialized from the model checkpoint at tiiuae/falcon-rw-1b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


>>> Loaded successfully using SafeTensors (BF16). - Falcon-1B_FTT_IMDB.ipynb:32


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Starting Training (Turbo Mode with BF16 on A5000)... - Falcon-1B_FTT_IMDB.ipynb:89


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Roc Auc
1,0.0669,0.132447,0.956,0.955574,0.970468,0.94113,0.989965
2,0.012,0.157201,0.962,0.961954,0.968548,0.955449,0.990554
3,0.0039,0.17979,0.9648,0.964912,0.967226,0.962609,0.990505



Training completed in: 10376.99 seconds - Falcon-1B_FTT_IMDB.ipynb:95
Saving model to ./Falcon_1B_IMDB_Results... - Falcon-1B_FTT_IMDB.ipynb:98
Falcon1b Model saved successfully! - Falcon-1B_FTT_IMDB.ipynb:101


In [4]:
# @title 4. Final Evaluation on Test Set & Report
import pandas as pd

print("Running Evaluation on Test Set - Falcon-1B_FTT_IMDB.ipynb:4")

# 1. Classification Metrics
start_pred_time = time.time()
predictions_output = trainer.predict(dataset_test)
end_pred_time = time.time()
metrics = predictions_output.metrics

# 2. Efficiency Metrics (Đo hiệu năng)
total_samples = len(dataset_test)
total_inference_time = end_pred_time - start_pred_time
latency_per_sample = (total_inference_time / total_samples) * 1000 # đổi sang ms

# Tính kích thước Model
bin_path = os.path.join(SAVE_PATH, 'pytorch_model.bin')
safe_path = os.path.join(SAVE_PATH, 'model.safetensors')

if os.path.exists(bin_path):
    model_size = os.path.getsize(bin_path) / (1024 * 1024) # MB
elif os.path.exists(safe_path):
    model_size = os.path.getsize(safe_path) / (1024 * 1024) # MB
else:
    model_size = 0

# Đo RAM/VRAM hiện tại
process = psutil.Process(os.getpid())
ram_usage = process.memory_info().rss / 1024 ** 2
vram_usage = torch.cuda.memory_allocated() / 1024 ** 2 if torch.cuda.is_available() else 0

# 3. Print Report
print("\n====== REPORT: Falcon1b (FFT) on IMDB ====== - Falcon-1B_FTT_IMDB.ipynb:34")
print(f"1. Classification Metrics: - Falcon-1B_FTT_IMDB.ipynb:35")
print(f"Accuracy:  {metrics.get('test_accuracy', 0):.4f} - Falcon-1B_FTT_IMDB.ipynb:36")
print(f"Precision: {metrics.get('test_precision', 0):.4f} - Falcon-1B_FTT_IMDB.ipynb:37")
print(f"Recall:    {metrics.get('test_recall', 0):.4f} - Falcon-1B_FTT_IMDB.ipynb:38")
print(f"F1Score:  {metrics.get('test_f1', 0):.4f} - Falcon-1B_FTT_IMDB.ipynb:39")
print(f"ROCAUC:   {metrics.get('test_roc_auc', 0):.4f} - Falcon-1B_FTT_IMDB.ipynb:40")

print(f"\n2. Efficiency Metrics: - Falcon-1B_FTT_IMDB.ipynb:42")
print(f"Training Time:      {training_time:.2f} s - Falcon-1B_FTT_IMDB.ipynb:43")
print(f"Inference Latency:  {latency_per_sample:.4f} ms/sample - Falcon-1B_FTT_IMDB.ipynb:44")
print(f"Model Size (Disk):  {model_size:.2f} MB - Falcon-1B_FTT_IMDB.ipynb:45")
print(f"Peak RAM Usage:     {ram_usage:.2f} MB - Falcon-1B_FTT_IMDB.ipynb:46")
print(f"Peak VRAM Usage:    {vram_usage:.2f} MB - Falcon-1B_FTT_IMDB.ipynb:47")

# 4. Save CSV
results_df = pd.DataFrame({
    "Metric": ["Accuracy", "F1", "ROC-AUC", "Precision", "Recall", "Training Time (s)", "Inference Latency (ms)", "Model Size (MB)"],
    "Value": [
        metrics.get('test_accuracy', 0),
        metrics.get('test_f1', 0),
        metrics.get('test_roc_auc', 0),
        metrics.get('test_precision', 0),
        metrics.get('test_recall', 0),
        training_time,
        latency_per_sample,
        model_size
    ]
})
results_file = os.path.join(SAVE_PATH, 'imdb_falcon_results.csv')
results_df.to_csv(results_file, index=False)
print(f"\nReport saved to {results_file} - Falcon-1B_FTT_IMDB.ipynb:65")

Running Evaluation on Test Set - Falcon-1B_FTT_IMDB.ipynb:4



1. Classification Metrics: - Falcon-1B_FTT_IMDB.ipynb:35
Accuracy:  0.9625 - Falcon-1B_FTT_IMDB.ipynb:36
Precision: 0.9601 - Falcon-1B_FTT_IMDB.ipynb:37
Recall:    0.9652 - Falcon-1B_FTT_IMDB.ipynb:38
F1Score:  0.9626 - Falcon-1B_FTT_IMDB.ipynb:39
ROCAUC:   0.9921 - Falcon-1B_FTT_IMDB.ipynb:40

2. Efficiency Metrics: - Falcon-1B_FTT_IMDB.ipynb:42
Training Time:      10376.99 s - Falcon-1B_FTT_IMDB.ipynb:43
Inference Latency:  21.5771 ms/sample - Falcon-1B_FTT_IMDB.ipynb:44
Model Size (Disk):  2501.77 MB - Falcon-1B_FTT_IMDB.ipynb:45
Peak RAM Usage:     1592.88 MB - Falcon-1B_FTT_IMDB.ipynb:46
Peak VRAM Usage:    7521.45 MB - Falcon-1B_FTT_IMDB.ipynb:47

Report saved to ./Falcon_1B_IMDB_Results\imdb_falcon_results.csv - Falcon-1B_FTT_IMDB.ipynb:65
