In [1]:
# =====================================================
# FAKE NEWS DETECTION – ALBERT + PROMPT TUNING + LIAR 2
# Dataset: chengxuphd/liar2
# CONFIG: GIỐNG HỆT ĐỂ SO SÁNH (5 Epochs, LR 0.01)
# =====================================================

# 1. INSTALL
!pip install -q transformers datasets peft accelerate bitsandbytes scikit-learn pandas numpy psutil

# 2. IMPORT
import os, re, shutil, psutil, warnings
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback,
    DataCollatorWithPadding
)
from peft import PromptTuningConfig, PromptTuningInit, get_peft_model, TaskType
from google.colab import drive

warnings.filterwarnings("ignore")

# 3. GPU INFO
device_name = "CPU"
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    print(f"Device: {device_name} | VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("Device: CPU")

# 4. MOUNT GOOGLE DRIVE
try:
    drive.mount('/content/drive', force_remount=True)
except ValueError:
    print("Drive có thể đã được mount, tiếp tục...")

# 5. OUTPUT DIR
OUTPUT_DIR = "/content/drive/MyDrive/LIAR2_ALBERT_PromptTuning"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 6. LOAD DATA (LIAR 2)
print("Loading 'chengxuphd/liar2' dataset...")
try:
    dataset = load_dataset("chengxuphd/liar2")
except Exception as e:
    print(f"Lỗi tải dataset: {e}. Đang thử load bản gốc...")
    dataset = load_dataset("liar")

# Chuyển sang DataFrame
train_df = pd.DataFrame(dataset["train"])
val_df   = pd.DataFrame(dataset["validation"])
test_df  = pd.DataFrame(dataset["test"])

# --- XỬ LÝ NHÃN (6 Lớp -> 2 Lớp) ---
if train_df["label"].max() > 1:
    print("Mapping 6 labels -> Binary (Fake/Real)...")
    def map_labels(label):
        if label in [0, 4, 5]: return 1 # FAKE
        return 0 # REAL

    train_df["label"] = train_df["label"].apply(map_labels)
    val_df["label"]   = val_df["label"].apply(map_labels)
    test_df["label"]  = test_df["label"].apply(map_labels)

# --- TẠO NỘI DUNG ĐẦU VÀO ---
text_col = "statement" if "statement" in train_df.columns else "text"
context_col = "subject" if "subject" in train_df.columns else "context"

def create_content(row):
    stmt = str(row.get(text_col, ""))
    spkr = str(row.get("speaker", ""))
    ctxt = str(row.get(context_col, ""))
    return f"Statement: {stmt} | Speaker: {spkr} | Context: {ctxt}"

train_df["content"] = train_df.apply(create_content, axis=1)
val_df["content"]   = val_df.apply(create_content, axis=1)
test_df["content"]  = test_df.apply(create_content, axis=1)

# Tạo Dataset
train_dataset = Dataset.from_pandas(train_df[["content", "label"]])
val_dataset   = Dataset.from_pandas(val_df[["content", "label"]])
test_dataset  = Dataset.from_pandas(test_df[["content", "label"]])

dataset_dict = DatasetDict({"train": train_dataset, "validation": val_dataset, "test": test_dataset})

# 7. TOKENIZER (ALBERT)
MODEL_NAME = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    return tokenizer(batch["content"], truncation=True, max_length=384, padding=False)

tokenized = dataset_dict.map(tokenize_fn, batched=True, remove_columns=["content"])
tokenized = tokenized.rename_column("label", "labels")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 8. LOAD MODEL & CONFIG PROMPT TUNING
base_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# --- CẤU HÌNH FIX CỨNG CHO ALBERT ---
# ALBERT Base: Layers=12, Heads=12
# QUAN TRỌNG: Embedding Size = 128 (Khác với Hidden Size 768)
n_layers = 12
n_heads = 12
token_dim = 128  # <-- ĐIỂM KHÁC BIỆT KỸ THUẬT BẮT BUỘC

peft_config = PromptTuningConfig(
    task_type=TaskType.SEQ_CLS,
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text="Classify validity of the statement:",
    num_virtual_tokens=32,
    tokenizer_name_or_path=MODEL_NAME,
    num_layers=n_layers,
    token_dim=token_dim, # Phải là 128 cho ALBERT
    num_attention_heads=n_heads
)

model = get_peft_model(base_model, peft_config)
print("\nTham số huấn luyện (Prompt Tuning):")
model.print_trainable_parameters()

# 9. CLASS WEIGHTS
labels_array = train_df["label"].values
class_weights = compute_class_weight("balanced", classes=np.array([0,1]), y=labels_array)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# 10. METRICS
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=0)
    acc = accuracy_score(labels, preds)
    probs = torch.softmax(torch.tensor(logits), dim=1)[:,1].numpy()
    try:
        auc = roc_auc_score(labels, probs)
    except:
        auc = 0.0
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1, "auc": auc}

# 11. TRAINER
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(model.device))
        loss = loss_fct(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss

# 12. TRAINING ARGS (GIỮ NGUYÊN ĐỂ SO SÁNH)
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=5,                # Giữ nguyên 5
    per_device_train_batch_size=16,    # Giữ nguyên 16
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    learning_rate=1e-2,                # LR 0.01
    warmup_ratio=0.1,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# 13. TRAIN
print("\n==============================")
print("🚀 TRAINING LIAR 2 - ALBERT + PROMPT TUNING")
print("==============================\n")
trainer.train()

# 14. EVALUATE
print("\n🎯 TEST RESULTS")
results = trainer.evaluate(tokenized["test"])
for k,v in results.items():
    print(f"{k}: {v}")

# 15. SAVE
final_path = os.path.join(OUTPUT_DIR, "final_liar2_pt")
trainer.save_model(final_path)
tokenizer.save_pretrained(final_path)
print(f"Saved to: {final_path}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDevice: Tesla T4 | VRAM: 15.8 GB
Drive có thể đã được mount, tiếp tục...
Loading 'chengxuphd/liar2' dataset...


README.md: 0.00B [00:00, ?B/s]

train.csv:   0%|          | 0.00/19.0M [00:00<?, ?B/s]

valid.csv: 0.00B [00:00, ?B/s]

test.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/18369 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2297 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2296 [00:00<?, ? examples/s]

Mapping 6 labels -> Binary (Fake/Real)...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Map:   0%|          | 0/18369 [00:00<?, ? examples/s]

Map:   0%|          | 0/2297 [00:00<?, ? examples/s]

Map:   0%|          | 0/2296 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Tham số huấn luyện (Prompt Tuning):
trainable params: 5,634 || all params: 11,690,756 || trainable%: 0.0482

🚀 TRAINING LIAR 2 - ALBERT + PROMPT TUNING



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.9579,1.191045,0.393557,0.154887,0.393557,0.22229,0.517598
2,0.8249,0.780499,0.393992,0.501384,0.393992,0.225503,0.559601
3,0.7706,0.81152,0.40357,0.638212,0.40357,0.249297,0.60463
4,0.7142,0.688674,0.619939,0.599302,0.619939,0.585658,0.619518
5,0.6848,0.665352,0.575533,0.608034,0.575533,0.579761,0.630298



🎯 TEST RESULTS


eval_loss: 0.699445903301239
eval_accuracy: 0.6110627177700348
eval_precision: 0.5877151741810603
eval_recall: 0.6110627177700348
eval_f1: 0.5761029881290154
eval_auc: 0.6071743767153485
eval_runtime: 6.0903
eval_samples_per_second: 376.991
eval_steps_per_second: 11.822
epoch: 5.0
Saved to: /content/drive/MyDrive/LIAR2_ALBERT_PromptTuning/final_liar2_pt
