In [1]:
# =========================================================
# FAKE NEWS DETECTION ‚Äì ALBERT + PROMPT TUNING x FAKENEWSNET
# (C·∫•u h√¨nh chu·∫©n cho albert-base-v2)
# =========================================================

# 1. C√ÄI ƒê·∫∂T TH∆Ø VI·ªÜN
!pip install -q transformers datasets peft accelerate bitsandbytes scikit-learn pandas numpy psutil sentencepiece

import os, re, shutil, psutil, warnings
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset, Dataset, DatasetDict, concatenate_datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback,
    DataCollatorWithPadding
)
from peft import PromptTuningConfig, PromptTuningInit, get_peft_model, TaskType
from google.colab import drive

warnings.filterwarnings("ignore")

# 2. KI·ªÇM TRA GPU & MOUNT DRIVE
if torch.cuda.is_available():
    print(f"‚úÖ Device: {torch.cuda.get_device_name(0)}")
else:
    print("‚ö†Ô∏è C·∫¢NH B√ÅO: ƒêang ch·∫°y CPU! Qu√° tr√¨nh train s·∫Ω r·∫•t ch·∫≠m.")

drive.mount('/content/drive', force_remount=True)
OUTPUT_DIR = "/content/drive/MyDrive/FakeNewsNet_ALBERT_PromptTuning"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 3. T·∫¢I DATASET FAKENEWSNET
print("\n‚è≥ ƒêang t·∫£i dataset FakeNewsNet...")
try:
    ds_gossip = load_dataset("rickstello/FakeNewsNet", "gossipcop", split="train")
    ds_politi = load_dataset("rickstello/FakeNewsNet", "politifact", split="train")
    dataset_full = concatenate_datasets([ds_gossip, ds_politi])
    df = pd.DataFrame(dataset_full)
except Exception as e:
    print(f"‚ö†Ô∏è T·∫£i config th·∫•t b·∫°i ({e}), t·∫£i b·∫£n default...")
    dataset = load_dataset("rickstello/FakeNewsNet", split="train")
    df = pd.DataFrame(dataset)

# 4. X·ª¨ L√ù D·ªÆ LI·ªÜU AN TO√ÄN
text_col = next((c for c in ['news_content', 'text', 'content', 'body'] if c in df.columns), None)
title_col = next((c for c in ['title', 'news_title', 'headline'] if c in df.columns), None)
label_col = next((c for c in ['real', 'label', 'class', 'fake'] if c in df.columns), None)

print(f"üîç Detected Columns: Text='{text_col}' | Title='{title_col}' | Label='{label_col}'")

if label_col is None:
    raise ValueError("‚ùå Dataset kh√¥ng c√≥ c·ªôt nh√£n!")

df['label'] = df[label_col]

# X·ª≠ l√Ω n·ªôi dung
title_data = df[title_col].astype(str).fillna('') if title_col else pd.Series([''] * len(df))
text_data = df[text_col].astype(str).fillna('') if text_col else pd.Series([''] * len(df))

if text_col is None:
    print("‚ö†Ô∏è C·∫¢NH B√ÅO: Kh√¥ng t√¨m th·∫•y c·ªôt n·ªôi dung. Ch·ªâ h·ªçc t·ª´ Ti√™u ƒë·ªÅ.")

def clean_text(s):
    if not isinstance(s, str): return ""
    s = s.lower()
    s = re.sub(r'https?://\S+', ' ', s)
    s = re.sub(r'<.*?>', ' ', s)
    s = re.sub(r'[^a-z0-9\s]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

print("üßπ ƒêang l√†m s·∫°ch d·ªØ li·ªáu...", end="")
df['content'] = (title_data + " [SEP] " + text_data).apply(clean_text)
df = df[df['content'].str.len() > 10].drop_duplicates(subset=['content'])
print(f" ‚Üí Ho√†n t·∫•t. S·ªë l∆∞·ª£ng m·∫´u: {len(df):,}")

# Class Weights
classes = np.unique(df['label'])
class_weights = compute_class_weight('balanced', classes=classes, y=df['label'])
class_weight_dict = {k: float(v) for k, v in zip(classes, class_weights)}
print("Class weights:", class_weight_dict)

# Split Data
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

dataset_dict = DatasetDict({
    "train": Dataset.from_pandas(train_df[['content','label']].reset_index(drop=True)),
    "validation": Dataset.from_pandas(val_df[['content','label']].reset_index(drop=True)),
    "test": Dataset.from_pandas(test_df[['content','label']].reset_index(drop=True))
})

# 5. TOKENIZER (ALBERT)
MODEL_NAME = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    return tokenizer(batch["content"], truncation=True, max_length=384, padding=False)

tokenized = dataset_dict.map(tokenize_fn, batched=True, remove_columns=['content'])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 6. MODEL & PROMPT TUNING SETUP
print(f"\n‚öôÔ∏è Kh·ªüi t·∫°o {MODEL_NAME} v·ªõi Prompt Tuning...")
base_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
base_model.config.id2label = {0: "Fake", 1: "Real"}
base_model.config.label2id = {"Fake": 0, "Real": 1}

# --- C·∫§U H√åNH PROMPT TUNING CHO ALBERT ---
peft_config = PromptTuningConfig(
    task_type=TaskType.SEQ_CLS,
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text="Classify if this news article is real or fake:",
    num_virtual_tokens=64,
    tokenizer_name_or_path=MODEL_NAME,

    # === MAPPING CONFIG CHO ALBERT (T∆∞∆°ng t·ª± BERT) ===
    num_layers=base_model.config.num_hidden_layers,      # ALBERT d√πng 'num_hidden_layers'
    token_dim=base_model.config.hidden_size,             # ALBERT d√πng 'hidden_size'
    num_attention_heads=base_model.config.num_attention_heads
)

model = get_peft_model(base_model, peft_config)
model.print_trainable_parameters()

# 7. CUSTOM TRAINER
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        w = torch.tensor(list(class_weight_dict.values()), dtype=torch.float32, device=model.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=w)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# 8. TRAINING ARGS
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=5,                # Prompt Tuning c·∫ßn nhi·ªÅu epochs h∆°n
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    learning_rate=1e-2,                 # LR cao (0.01) cho Prompt Tuning
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    report_to="none"
)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)]
)

print(f"\nüöÄ B·∫ÆT ƒê·∫¶U HU·∫§N LUY·ªÜN (ALBERT + Prompt Tuning)...")
trainer.train()

# 9. EVAL
print("\nüéØ K·∫æT QU·∫¢ TR√äN TEST SET:")
results = trainer.evaluate(tokenized["test"])
print(results)

final_path = os.path.join(OUTPUT_DIR, "final_albert_pt_model")
trainer.save_model(final_path)
print(f"\n‚úÖ ƒê√£ l∆∞u model t·∫°i: {final_path}")

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.1/59.1 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25h‚úÖ Device: Tesla T4
Mounted at /content/drive

‚è≥ ƒêang t·∫£i dataset FakeNewsNet...


README.md:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

‚ö†Ô∏è T·∫£i config th·∫•t b·∫°i (BuilderConfig 'gossipcop' not found. Available: ['default']), t·∫£i b·∫£n default...


FakeNewsNet.csv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/23196 [00:00<?, ? examples/s]

üîç Detected Columns: Text='None' | Title='title' | Label='real'
‚ö†Ô∏è C·∫¢NH B√ÅO: Kh√¥ng t√¨m th·∫•y c·ªôt n·ªôi dung. Ch·ªâ h·ªçc t·ª´ Ti√™u ƒë·ªÅ.
üßπ ƒêang l√†m s·∫°ch d·ªØ li·ªáu... ‚Üí Ho√†n t·∫•t. S·ªë l∆∞·ª£ng m·∫´u: 21,710
Class weights: {np.int64(0): 2.0676190476190475, np.int64(1): 0.6594775212636695}


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Map:   0%|          | 0/17368 [00:00<?, ? examples/s]

Map:   0%|          | 0/2171 [00:00<?, ? examples/s]

Map:   0%|          | 0/2171 [00:00<?, ? examples/s]


‚öôÔ∏è Kh·ªüi t·∫°o albert-base-v2 v·ªõi Prompt Tuning...


model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 9,730 || all params: 11,694,852 || trainable%: 0.0832

üöÄ B·∫ÆT ƒê·∫¶U HU·∫§N LUY·ªÜN (ALBERT + Prompt Tuning)...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.8784,0.952583,0.790419,0.742049,0.780931,0.790419
2,0.6913,0.797939,0.442653,0.435581,0.79989,0.442653
3,0.609,1.273529,0.775679,0.694486,0.815767,0.775679
4,0.5183,0.505941,0.73883,0.755439,0.80415,0.73883
5,0.4866,0.500605,0.796868,0.802277,0.810902,0.796868



üéØ K·∫æT QU·∫¢ TR√äN TEST SET:


{'eval_loss': 0.5191873908042908, 'eval_accuracy': 0.7835099032703823, 'eval_f1': 0.7904213857209916, 'eval_precision': 0.8021012071233304, 'eval_recall': 0.7835099032703823, 'eval_runtime': 4.8526, 'eval_samples_per_second': 447.391, 'eval_steps_per_second': 14.013, 'epoch': 5.0}

‚úÖ ƒê√£ l∆∞u model t·∫°i: /content/drive/MyDrive/FakeNewsNet_ALBERT_PromptTuning/final_albert_pt_model
