In [2]:
# =====================================================
# FAKE NEWS DETECTION ‚Äì ALBERT + PROMPT TUNING + WELFAKE
# =====================================================

# 1. INSTALL
!pip install -q transformers datasets peft accelerate bitsandbytes scikit-learn pandas numpy psutil

# 2. IMPORT
import os, re, shutil, psutil, warnings
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, EarlyStoppingCallback,
    DataCollatorWithPadding
)
from peft import PromptTuningConfig, PromptTuningInit, get_peft_model, TaskType
from google.colab import drive

warnings.filterwarnings("ignore")

# 3. GPU INFO
device_name = "CPU"
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    print(f"Device: {device_name} | VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
else:
    print("Device: CPU")

# 4. MOUNT GOOGLE DRIVE
try:
    drive.mount('/content/drive', force_remount=True)
except ValueError:
    print("Drive c√≥ th·ªÉ ƒë√£ ƒë∆∞·ª£c mount, ti·∫øp t·ª•c...")

# 5. OUTPUT DIR
OUTPUT_DIR = "/content/drive/MyDrive/WELFake_ALBERT_PromptTuning"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 6. LOAD DATA & CLEAN
print("Loading Data...")
dataset = load_dataset("davanstrien/WELFake")
df = pd.DataFrame(dataset["train"])

df["content"] = df.get("title","").fillna("") + " [SEP] " + df.get("text","").fillna("")

def clean_text(t):
    if not isinstance(t, str): return ""
    t = t.lower()
    t = re.sub(r'https?://\S+', ' ', t)
    t = re.sub(r'<.*?>', ' ', t)
    t = re.sub(r'[^a-z0-9\s]', ' ', t)
    t = re.sub(r'\s+', ' ', t).strip()
    return t

print("Cleaning text...")
df["content"] = df["content"].apply(clean_text)
df = df[df["content"].str.len() > 20].drop_duplicates(subset=["content"])
print("Dataset cleaned samples:", len(df))

# 7. SPLIT DATA
labels = df["label"].values
train_df, temp_df = train_test_split(df, test_size=0.25, random_state=42, stratify=labels)
val_df, test_df   = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df["label"])

train_dataset = Dataset.from_pandas(train_df[["content","label"]])
val_dataset   = Dataset.from_pandas(val_df[["content","label"]])
test_dataset  = Dataset.from_pandas(test_df[["content","label"]])

dataset_dict = DatasetDict({"train": train_dataset, "validation": val_dataset, "test": test_dataset})

# 8. TOKENIZER (ALBERT)
MODEL_NAME = "albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    return tokenizer(batch["content"], truncation=True, max_length=384, padding=False)

tokenized = dataset_dict.map(tokenize_fn, batched=True, remove_columns=["content"])
tokenized = tokenized.rename_column("label", "labels")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 9. LOAD MODEL & APPLY PROMPT TUNING
base_model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# --- TR√çCH XU·∫§T C·∫§U H√åNH ƒê·∫∂C BI·ªÜT C·ª¶A ALBERT ---
albert_config = base_model.config

# ALBERT d√πng 'num_hidden_layers' v√† 'num_attention_heads' nh∆∞ BERT
n_layers = getattr(albert_config, "num_hidden_layers", 12)
n_heads = getattr(albert_config, "num_attention_heads", 12)

# QUAN TR·ªåNG: V·ªõi ALBERT, Prompt Embedding ph·∫£i kh·ªõp v·ªõi 'embedding_size' (128)
# ch·ª© KH√îNG ph·∫£i 'hidden_size' (768)
token_dim = getattr(albert_config, "embedding_size", 128)

print(f"Detected ALBERT Config: Layers={n_layers}, Embedding Dim={token_dim} (Input), Heads={n_heads}")

peft_config = PromptTuningConfig(
    task_type=TaskType.SEQ_CLS,
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text="Classify if the news is real or fake:",
    num_virtual_tokens=8,
    tokenizer_name_or_path=MODEL_NAME,
    # Truy·ªÅn tham s·ªë th·ªß c√¥ng ƒë·ªÉ tr√°nh l·ªói
    num_layers=n_layers,
    token_dim=token_dim,          # L∆∞u √Ω: Gi√° tr·ªã n√†y l√† 128
    num_attention_heads=n_heads
)

model = get_peft_model(base_model, peft_config)
print("\nTham s·ªë hu·∫•n luy·ªán (Prompt Tuning):")
model.print_trainable_parameters()

# 10. CLASS WEIGHTS
class_weights = compute_class_weight("balanced", classes=np.array([0,1]), y=df["label"])
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# 11. METRICS
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="weighted", zero_division=0)
    acc = accuracy_score(labels, preds)
    probs = torch.softmax(torch.tensor(logits), dim=1)[:,1].numpy()
    try:
        auc = roc_auc_score(labels, probs)
    except:
        auc = 0.0
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1, "auc": auc}

# 12. WEIGHTED TRAINER
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(model.device))
        loss = loss_fct(outputs.logits, labels)
        return (loss, outputs) if return_outputs else loss

# 13. TRAINING ARGS
# ALBERT c√≥ xu h∆∞·ªõng train ch·∫≠m h∆°n m·ªôt ch√∫t do chia s·∫ª tr·ªçng s·ªë,
# nh∆∞ng Prompt Tuning r·∫•t nh·∫π n√™n v·∫´n ch·∫°y ·ªïn.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    learning_rate=1e-2,             # LR cao cho Prompt Tuning (0.01)
    warmup_ratio=0.1,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

# 14. TRAIN
print("\n==============================")
print("üöÄ TRAINING ALBERT + PROMPT TUNING")
print("==============================\n")
trainer.train()

# 15. EVALUATE
print("\nüéØ TEST RESULTS")
results = trainer.evaluate(tokenized["test"])
for k,v in results.items():
    print(f"{k}: {v}")

# 16. SAVE
final_path = os.path.join(OUTPUT_DIR, "final_prompt_tuning")
trainer.save_model(final_path)
tokenizer.save_pretrained(final_path)
print(f"Saved to: {final_path}")

Device: Tesla T4 | VRAM: 15.8 GB
Mounted at /content/drive
Loading Data...
Cleaning text...
Dataset cleaned samples: 63323


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Map:   0%|          | 0/47492 [00:00<?, ? examples/s]

Map:   0%|          | 0/7915 [00:00<?, ? examples/s]

Map:   0%|          | 0/7916 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected ALBERT Config: Layers=12, Embedding Dim=128 (Input), Heads=12

Tham s·ªë hu·∫•n luy·ªán (Prompt Tuning):
trainable params: 2,562 || all params: 11,687,684 || trainable%: 0.0219

üöÄ TRAINING ALBERT + PROMPT TUNING



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.2661,0.186664,0.933418,0.934166,0.933418,0.933503,0.983099
2,0.2356,0.148121,0.943146,0.943251,0.943146,0.943172,0.987254
3,0.1785,0.136435,0.94681,0.950062,0.94681,0.946917,0.991246


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.2661,0.186664,0.933418,0.934166,0.933418,0.933503,0.983099
2,0.2356,0.148121,0.943146,0.943251,0.943146,0.943172,0.987254
3,0.1785,0.136435,0.94681,0.950062,0.94681,0.946917,0.991246
4,0.1327,0.12328,0.951611,0.953584,0.951611,0.9517,0.992054
5,0.1126,0.105695,0.95856,0.959847,0.95856,0.958627,0.993146



üéØ TEST RESULTS


eval_loss: 0.10083331912755966
eval_accuracy: 0.9615967660434563
eval_precision: 0.9623359217429066
eval_recall: 0.9615967660434563
eval_f1: 0.9616473311811731
eval_auc: 0.9928913463268058
eval_runtime: 79.6975
eval_samples_per_second: 99.326
eval_steps_per_second: 3.112
epoch: 5.0
Saved to: /content/drive/MyDrive/WELFake_ALBERT_PromptTuning/final_prompt_tuning
