In [1]:
# =====================================================
# FAKE NEWS DETECTION – MiniLM PROMPT TUNING + GonzaloA/fake_news
# =====================================================

# 1. CÀI ĐẶT & IMPORT
# !pip install -q transformers datasets torch scikit-learn pandas numpy psutil accelerate peft

import os, re, shutil, psutil, warnings
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    BertTokenizerFast,
    AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback,
    DataCollatorWithPadding
)
from peft import get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit
from google.colab import drive

warnings.filterwarnings("ignore")

# 2. KIỂM TRA GPU
if torch.cuda.is_available():
    print(f"Device: {torch.cuda.get_device_name(0)} | CUDA: True")
else:
    print("Device: CPU")

# 3. MOUNT DRIVE
drive.mount('/content/drive', force_remount=False)
OUTPUT_DIR = "/content/drive/MyDrive/GonzaloA_MiniLM_PromptTuning_Pro" # <--- Tên thư mục mới
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Hàm quản lý checkpoint
def manage_checkpoints(output_dir, keep_latest=2):
    ckpts = [c for c in os.listdir(output_dir) if c.startswith("checkpoint-")]
    if len(ckpts) <= keep_latest: return
    def idx_from_name(n):
        try: return int(n.split("-")[-1])
        except: return os.path.getmtime(os.path.join(output_dir, n))
    ckpts_sorted = sorted(ckpts, key=idx_from_name)
    for old_ckpt in ckpts_sorted[:-keep_latest]:
        shutil.rmtree(os.path.join(output_dir, old_ckpt), ignore_errors=True)

def get_last_checkpoint(output_dir):
    ckpts = [c for c in os.listdir(output_dir) if c.startswith("checkpoint-")]
    if not ckpts: return None
    def idx_from_name(n):
        try: return int(n.split("-")[-1])
        except: return 0
    ckpts_sorted = sorted(ckpts, key=idx_from_name, reverse=True)
    return os.path.join(output_dir, ckpts_sorted[0])

last_checkpoint = get_last_checkpoint(OUTPUT_DIR)
print("Checkpoint gần nhất:", last_checkpoint or "Không có → Train từ đầu")

# 4. TẢI DATASET GonzaloA/fake_news
print("\nĐang tải dataset GonzaloA/fake_news...")
dataset = load_dataset("GonzaloA/fake_news")

# 5. XỬ LÝ DỮ LIỆU
def process_split(ds_split):
    df = pd.DataFrame(ds_split)
    # Ghép title + text
    df['content'] = df.get('title', '').fillna('') + " [SEP] " + df.get('text', '').fillna('')
    return df

if 'validation' in dataset and 'test' in dataset:
    print("Sử dụng các tập train/val/test có sẵn.")
    train_df = process_split(dataset['train'])
    val_df = process_split(dataset['validation'])
    test_df = process_split(dataset['test'])
else:
    print("Tự chia tập dữ liệu...")
    df = process_split(dataset['train'])
    from sklearn.model_selection import train_test_split
    train_df, temp_df = train_test_split(df, test_size=0.25, random_state=42, stratify=df['label'])
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

# Hàm làm sạch văn bản
def clean_text(text):
    if not isinstance(text, str): return ""
    t = text.lower()
    t = re.sub(r'https?://\S+|www\.\S+', ' ', t)
    t = re.sub(r'<.*?>', ' ', t)
    t = re.sub(r'[^a-zA-Z0-9\s]', ' ', t)
    return re.sub(r'\s+', ' ', t).strip()

print("Đang làm sạch văn bản...")
for df in [train_df, val_df, test_df]:
    df['content'] = df['content'].apply(clean_text)
    df.drop(df[df['content'].str.len() < 20].index, inplace=True)

print(f"Sizes -> Train: {len(train_df):,} | Val: {len(val_df):,} | Test: {len(test_df):,}")

# 6. TÍNH CLASS WEIGHTS
classes = np.array([0, 1])
class_weights = compute_class_weight('balanced', classes=classes, y=train_df['label'])
class_weight_dict = {0: float(class_weights[0]), 1: float(class_weights[1])}
print("Class weights:", class_weight_dict)

train_dataset = Dataset.from_pandas(train_df[['content', 'label']].reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df[['content', 'label']].reset_index(drop=True))
test_dataset = Dataset.from_pandas(test_df[['content', 'label']].reset_index(drop=True))

dataset_dict = DatasetDict({"train": train_dataset, "validation": val_dataset, "test": test_dataset})

# 7. TOKENIZER MiniLM
CHECKPOINT = "sentence-transformers/all-MiniLM-L12-v2"
tokenizer = BertTokenizerFast.from_pretrained(CHECKPOINT)

def tokenize_fn(batch):
    return tokenizer(batch["content"], truncation=True, max_length=384, padding=False)

print("Tokenizing...")
tokenized = dataset_dict.map(tokenize_fn, batched=True, batch_size=1000, remove_columns=['content'])
tokenized = tokenized.rename_column("label", "labels")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 8. MODEL & PROMPT TUNING CONFIGURATION
print("\nĐang cấu hình Model và Prompt Tuning...")

# Tải model gốc
base_model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT, num_labels=2)
base_model.config.id2label = {0: "Fake", 1: "Real"}
base_model.config.label2id = {"Fake": 0, "Real": 1}

# --- FIX LỖI "Please specify num_layers" ---
# Lấy trực tiếp thông số từ config của MiniLM
# MiniLM dùng kiến trúc BERT nên tên biến thường chuẩn (num_hidden_layers, hidden_size, num_attention_heads)
# Tuy nhiên, để chắc chắn, ta lấy từ config
n_layers = base_model.config.num_hidden_layers
token_dim = base_model.config.hidden_size
n_heads = base_model.config.num_attention_heads

# Cấu hình Prompt Tuning với thông số tường minh
peft_config = PromptTuningConfig(
    task_type=TaskType.SEQ_CLS,
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text="Classify if the news is real or fake:",
    num_virtual_tokens=8,
    tokenizer_name_or_path=CHECKPOINT,

    # Truyền tham số kiến trúc để tránh lỗi
    num_layers=n_layers,
    token_dim=token_dim,
    num_attention_heads=n_heads
)

# Áp dụng Prompt Tuning
model = get_peft_model(base_model, peft_config)
print("\n=== TRAINABLE PARAMETERS ===")
model.print_trainable_parameters()

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    warmup_ratio=0.1,
    weight_decay=0.01,
    learning_rate=1e-2, # LR cao cho Prompt Tuning

    eval_strategy="epoch",
    save_strategy="epoch",

    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=0)
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        weight_tensor = torch.tensor([class_weights[0], class_weights[1]], dtype=torch.float32, device=model.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weight_tensor)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# 9. TRAIN & EVAL
print("\nBẮT ĐẦU HUẤN LUYỆN MiniLM (Prompt Tuning)...")
trainer.train(resume_from_checkpoint=last_checkpoint)
manage_checkpoints(OUTPUT_DIR)

print("\nĐÁNH GIÁ TRÊN TẬP TEST...")
results = trainer.evaluate(tokenized["test"])
print(results)

# Lưu model
final_path = os.path.join(OUTPUT_DIR, "final_best_model")
trainer.save_model(final_path)
tokenizer.save_pretrained(final_path)
print(f"Đã lưu Prompt Tuning weights tại: {final_path}")

Device: Tesla T4 | CUDA: True
Mounted at /content/drive
Checkpoint gần nhất: Không có → Train từ đầu

Đang tải dataset GonzaloA/fake_news...


README.md: 0.00B [00:00, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


dataset_infos.json: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/38.8M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/13.0M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/13.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/24353 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8117 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8117 [00:00<?, ? examples/s]

Sử dụng các tập train/val/test có sẵn.
Đang làm sạch văn bản...
Sizes -> Train: 24,350 | Val: 8,115 | Test: 8,117
Class weights: {0: 1.0914388166741371, 1: 0.9226979916635089}


tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Tokenizing...


Map:   0%|          | 0/24350 [00:00<?, ? examples/s]

Map:   0%|          | 0/8115 [00:00<?, ? examples/s]

Map:   0%|          | 0/8117 [00:00<?, ? examples/s]


Đang cấu hình Model và Prompt Tuning...


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/all-MiniLM-L12-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== TRAINABLE PARAMETERS ===
trainable params: 3,842 || all params: 33,364,612 || trainable%: 0.0115

BẮT ĐẦU HUẤN LUYỆN MiniLM (Prompt Tuning)...


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2366,0.08711,0.977942,0.978517,0.977942,0.977966
2,0.0797,0.086117,0.977819,0.978052,0.977819,0.977835
3,0.0798,0.070023,0.979791,0.980421,0.979791,0.979814
4,0.071,0.063937,0.981269,0.981847,0.981269,0.98129


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2366,0.08711,0.977942,0.978517,0.977942,0.977966
2,0.0797,0.086117,0.977819,0.978052,0.977819,0.977835
3,0.0798,0.070023,0.979791,0.980421,0.979791,0.979814
4,0.071,0.063937,0.981269,0.981847,0.981269,0.98129
5,0.0671,0.066422,0.981516,0.981969,0.981516,0.981534



ĐÁNH GIÁ TRÊN TẬP TEST...


{'eval_loss': 0.06852205842733383, 'eval_accuracy': 0.9797954909449303, 'eval_precision': 0.9803485049918667, 'eval_recall': 0.9797954909449303, 'eval_f1': 0.9798125786256074, 'eval_runtime': 24.2643, 'eval_samples_per_second': 334.524, 'eval_steps_per_second': 10.468, 'epoch': 5.0}
Đã lưu Prompt Tuning weights tại: /content/drive/MyDrive/GonzaloA_MiniLM_PromptTuning_Pro/final_best_model
