In [3]:
# =====================================================
# FAKE NEWS DETECTION ‚Äì (RoBERTa-base version) + WELFAKE
# =====================================================

# 1. INSTALL & IMPORT
!pip install -q transformers datasets torch scikit-learn pandas numpy psutil accelerate

import os, re, shutil, psutil, warnings
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset, Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, EarlyStoppingCallback,
    DataCollatorWithPadding
)
from google.colab import drive
from tqdm.auto import tqdm

warnings.filterwarnings("ignore")

# 2. GPU INFO
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
    vram_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
else:
    device_name = "CPU"
    vram_gb = 0.0

print(f"Device: {device_name} | CUDA: {torch.cuda.is_available()} | VRAM: {vram_gb:.1f} GB")

# 3. MOUNT DRIVE (ƒê√£ s·ª≠a l·ªói mount failed)
if not os.path.exists('/content/drive'):
    try:
        drive.mount('/content/drive', force_remount=True)
    except ValueError:
        print("Drive c√≥ th·ªÉ ƒë√£ ƒë∆∞·ª£c mount. B·ªè qua.")

def check_drive_space(path="/content/drive/MyDrive"):
    try:
        usage = psutil.disk_usage(path)
        used_gb = usage.used / 1e9
        total_gb = usage.total / 1e9
        pct = used_gb / total_gb * 100 if total_gb > 0 else 0
        print(f"Drive: {used_gb:.1f}GB / {total_gb:.1f}GB ({pct:.1f}%)")
        if pct > 90:
            print("‚ö†Ô∏è  C·∫¢NH B√ÅO: Drive g·∫ßn ƒë·∫ßy!")
    except Exception as e:
        print("Kh√¥ng th·ªÉ ki·ªÉm tra dung l∆∞·ª£ng Drive:", e)

check_drive_space()

# 4. OUTPUT_DIR + CONFIG
# Thay ƒë·ªïi t√™n folder ƒë·ªÉ kh√¥ng ƒë√® l√™n model BERT c≈©
OUTPUT_DIR = "/content/drive/MyDrive/WELFake_RoBERTa_base_Pro"
MODEL_NAME = "roberta-base"  # <-- ƒê·ªïi model t·∫°i ƒë√¢y

os.makedirs(OUTPUT_DIR, exist_ok=True)

def manage_checkpoints(output_dir, keep_latest=2):
    if not os.path.exists(output_dir): return
    ckpts = [c for c in os.listdir(output_dir) if c.startswith("checkpoint-")]
    if len(ckpts) <= keep_latest:
        return

    def idx(n):
        try:
            return int(n.split("-")[-1])
        except:
            return os.path.getmtime(os.path.join(output_dir, n))

    ckpts_sorted = sorted(ckpts, key=idx)
    for ck in ckpts_sorted[:-keep_latest]:
        shutil.rmtree(os.path.join(output_dir, ck), ignore_errors=True)
        print(f"ƒê√£ x√≥a checkpoint c≈©: {ck}")

def get_last_checkpoint(output_dir):
    if not os.path.exists(output_dir): return None
    ckpts = [c for c in os.listdir(output_dir) if c.startswith("checkpoint-")]
    if not ckpts: return None

    def idx(n):
        try:
            return int(n.split("-")[-1])
        except:
            return os.path.getmtime(os.path.join(output_dir, n))

    ckpts_sorted = sorted(ckpts, key=idx, reverse=True)
    manage_checkpoints(output_dir, keep_latest=2)
    return os.path.join(output_dir, ckpts_sorted[0])

last_checkpoint = get_last_checkpoint(OUTPUT_DIR)
print("Checkpoint g·∫ßn nh·∫•t:", last_checkpoint or "Kh√¥ng c√≥ ‚Üí train t·ª´ ƒë·∫ßu")

# 5. LOAD & CLEAN DATA (C·∫≠p nh·∫≠t logic cho RoBERTa)
print("\nLoading WELFake dataset...")
dataset = load_dataset("davanstrien/WELFake")
df = pd.DataFrame(dataset["train"])
print("S·ªë m·∫´u ban ƒë·∫ßu:", len(df))

# --- H√ÄM CLEAN M·ªöI ---
def clean_text(s):
    if not isinstance(s, str): return ""
    # L∆ØU √ù: Kh√¥ng d√πng .lower() v√¨ RoBERTa l√† case-sensitive
    # s = s.lower()

    s = re.sub(r'https?://\S+', ' ', s) # B·ªè URL
    s = re.sub(r'<.*?>', ' ', s)        # B·ªè HTML tags

    # Regex cho ph√©p a-z, A-Z, 0-9. Gi·ªØ l·∫°i ch·ªØ hoa.
    s = re.sub(r'[^a-zA-Z0-9\s]', ' ', s)

    s = re.sub(r'\s+', ' ', s).strip()
    return s

print("ƒêang x·ª≠ l√Ω text (Clean)...")
# Clean t·ª´ng ph·∫ßn TR∆Ø·ªöC khi gh√©p ƒë·ªÉ tr√°nh l·ªói x√≥a m·∫•t separator
df['title'] = df['title'].fillna('').apply(clean_text)
df['text'] = df['text'].fillna('').apply(clean_text)

# Gh√©p chu·ªói d√πng Separator c·ªßa RoBERTa l√† </s>
print("Gh√©p chu·ªói title + </s> + text...")
df['content'] = df['title'] + " </s> " + df['text']

# L·ªçc b·ªè m·∫´u qu√° ng·∫Øn & tr√πng l·∫∑p
df = df[df['content'].str.len() > 20].drop_duplicates(subset=['content'])

print("Sau x·ª≠ l√Ω:", len(df))
print(f"V√≠ d·ª• m·∫´u 0: {df['content'].iloc[0][:100]}...")

label_counts = df['label'].value_counts(normalize=True)
print(f"Fake={label_counts.get(0,0):.1%}, Real={label_counts.get(1,0):.1%}")

# Class weights
classes = np.array([0,1])
class_weights = compute_class_weight('balanced', classes=classes, y=df['label'])
print("Class weights:", class_weights)

# 6. SPLIT
labels = df['label'].values
train_df, temp_df = train_test_split(df, test_size=0.25, random_state=42, stratify=labels)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

print(f"Train={len(train_df)}, Val={len(val_df)}, Test={len(test_df)}")

train_dataset = Dataset.from_pandas(train_df[['content','label']].reset_index(drop=True))
val_dataset   = Dataset.from_pandas(val_df[['content','label']].reset_index(drop=True))
test_dataset  = Dataset.from_pandas(test_df[['content','label']].reset_index(drop=True))
dataset_dict = DatasetDict({"train": train_dataset, "validation": val_dataset, "test": test_dataset})

# 7. TOKENIZER (RoBERTa-base)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_fn(batch):
    # RoBERTa x·ª≠ l√Ω padding/truncation t∆∞∆°ng t·ª± BERT
    return tokenizer(batch["content"], truncation=True, max_length=384, padding=False)

print("Tokenizing...")
tokenized = dataset_dict.map(tokenize_fn, batched=True, batch_size=1000, remove_columns=['content'])
tokenized = tokenized.rename_column("label", "labels")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# 8. MODEL (RoBERTa-base)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2
)

model.config.id2label = {0:"Fake", 1:"Real"}
model.config.label2id = {"Fake":0, "Real":1}

# 9. TRAINING ARGS
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=8,   # RoBERTa t·ªën VRAM h∆°n m·ªôt ch√∫t, 8 v·∫´n an to√†n tr√™n T4
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    warmup_ratio=0.1,
    weight_decay=0.01,
    learning_rate=3e-5,              # RoBERTa th∆∞·ªùng ·ªïn ƒë·ªãnh ·ªü LR th·∫•p h∆°n ch√∫t (2e-5)
    lr_scheduler_type="linear",
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

# 10. METRICS
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted", zero_division=0
    )
    acc = accuracy_score(labels, preds)

    try:
        probs = torch.softmax(torch.tensor(logits), dim=1).numpy()[:,1]
        auc = roc_auc_score(labels, probs)
    except:
        auc = None

    return {
        "accuracy": float(acc),
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "auc": auc
    }

# 11. CUSTOM TRAINER with CLASS WEIGHT
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        w = torch.tensor(class_weights, dtype=torch.float32, device=model.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=w)
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# 12. TRAINER INIT
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

# 13. TRAIN
print("\n================================")
print("üöÄ B·∫ÆT ƒê·∫¶U TRAIN ROBERTA-BASE")
print("================================\n")
trainer.train(resume_from_checkpoint=last_checkpoint)

manage_checkpoints(OUTPUT_DIR, keep_latest=2)

# 14. TEST
print("\nüéØ ƒê√ÅNH GI√Å TR√äN TEST SET")
results = trainer.evaluate(tokenized["test"])
for k, v in results.items():
    print(f"{k}: {v}")

# 15. SAVE MODEL
final_model_path = os.path.join(OUTPUT_DIR, "final_best_model")
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
print("\n‚úÖ ƒê√É L∆ØU M√î H√åNH T·∫†I:\n", final_model_path)

# CLEAN UP (C·∫©n th·∫≠n khi d√πng)
for f in os.listdir(OUTPUT_DIR):
    if f.startswith("checkpoint-") or f == "runs":
        shutil.rmtree(os.path.join(OUTPUT_DIR, f), ignore_errors=True)

print("\nüßπ D·ªçn d·∫πp ho√†n t·∫•t!")

Device: Tesla T4 | CUDA: True | VRAM: 15.8 GB
Drive: 45.9GB / 120.9GB (37.9%)
Checkpoint g·∫ßn nh·∫•t: Kh√¥ng c√≥ ‚Üí train t·ª´ ƒë·∫ßu

Loading WELFake dataset...
S·ªë m·∫´u ban ƒë·∫ßu: 72134
ƒêang x·ª≠ l√Ω text (Clean)...
Gh√©p chu·ªói title + </s> + text...
Sau x·ª≠ l√Ω: 63332
V√≠ d·ª• m·∫´u 0: LAW ENFORCEMENT ON HIGH ALERT Following Threats Against Cops And Whites On 9 11By BlackLivesMatter A...
Fake=54.5%, Real=45.5%
Class weights: [0.91734987 1.0990178 ]
Train=47499, Val=7916, Test=7917


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizing...


Map:   0%|          | 0/47499 [00:00<?, ? examples/s]

Map:   0%|          | 0/7916 [00:00<?, ? examples/s]

Map:   0%|          | 0/7917 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



üöÄ B·∫ÆT ƒê·∫¶U TRAIN ROBERTA-BASE



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Auc
1,0.0531,0.042271,0.989515,0.989515,0.989515,0.989514,0.999092
2,0.0311,0.044708,0.992294,0.992326,0.992294,0.992297,0.999463
3,0.0093,0.043665,0.993052,0.993055,0.993052,0.993053,0.999356



üéØ ƒê√ÅNH GI√Å TR√äN TEST SET


eval_loss: 0.039608608931303024
eval_accuracy: 0.9936844764430971
eval_precision: 0.9936847508445357
eval_recall: 0.9936844764430971
eval_f1: 0.9936840381496029
eval_auc: 0.9997685719855648
eval_runtime: 42.2534
eval_samples_per_second: 187.37
eval_steps_per_second: 11.715
epoch: 3.0

‚úÖ ƒê√É L∆ØU M√î H√åNH T·∫†I:
 /content/drive/MyDrive/WELFake_RoBERTa_base_Pro/final_best_model

üßπ D·ªçn d·∫πp ho√†n t·∫•t!
