In [None]:
# ==============================================================================
# CELL 1: Clone Repo
# ==============================================================================
!git clone https://github.com/Vietchemistryyy/Fake_News_Detection_BERT.git
%cd Fake_News_Detection_BERT
!ls


In [None]:
# ==============================================================================
# CELL 2: Environment Setup
# ==============================================================================
print("="*80)
print("ENVIRONMENT SETUP & VERIFICATION")
print("="*80)

import os, sys
os.chdir("/content/Fake_News_Detection_BERT")
sys.path.append(os.getcwd())
print("üìÅ Working directory:", os.getcwd())

# Test imports
try:
    import transformers
    print(f"‚úÖ Transformers version: {transformers.__version__}")
    
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    print("‚úÖ Core transformers imports successful")
    
    # Test PhoBERT tokenizer
    test_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
    print("‚úÖ PhoBERT tokenizer test successful")
    print("\nüéâ Environment ready for training!")
    
except ImportError as e:
    print(f"‚ùå Import error: {e}")
    print("\nüí° Installing required packages...")
    !pip install -q transformers accelerate datasets torch scikit-learn
    print("‚úÖ Packages installed. Please restart kernel and run again.")

In [None]:
# ==============================================================================
# CELL 3: Complete Imports
# ==============================================================================
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import warnings
warnings.filterwarnings('ignore')

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import train_test_split

# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda" if has_cuda else "cpu")

banner = "=" * 80
print(banner, "IMPORTS COMPLETED", banner, sep="\n")
print(f"üñ•Ô∏è  Device: {device}")
if has_cuda:
    # Report the first GPU's name and total memory in GB (bytes / 1e9).
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   Memory: {gpu_props.total_memory / 1e9:.2f} GB")
print("ü§ñ Model: vinai/phobert-base")
print("üìä Ready to train!")
print(banner)

In [None]:
# ==============================================================================
# CELL 4: Mount Google Drive
# ==============================================================================
from google.colab import drive
drive.mount('/content/drive')

# Create the Drive folder where checkpoints and the final model are stored.
# os.makedirs replaces the `!mkdir -p {VAR}` shell interpolation: it behaves the
# same for this path and does not break if the path ever contains spaces.
MODEL_OUTPUT_DIR = '/content/drive/MyDrive/PhoBERT_VFND'
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)
print(f"‚úÖ Model s·∫Ω ƒë∆∞·ª£c l∆∞u v√†o: {MODEL_OUTPUT_DIR}")

In [None]:
# ==============================================================================
# CELL 5: Download the VFND dataset
# ==============================================================================
print("="*80)
print("DOWNLOADING VFND DATASET")
print("="*80)

import requests
from io import StringIO
import re  # used by the text-cleaning cell that follows

# URLs (GitHub raw content)
FAKE_URL = "https://raw.githubusercontent.com/WhySchools/VFND-vietnamese-fake-news-datasets/master/fake.csv"
REAL_URL = "https://raw.githubusercontent.com/WhySchools/VFND-vietnamese-fake-news-datasets/master/real.csv"


def _download_labeled_csv(url, label, kind, timeout=30):
    """Download one CSV and tag every row with a class label.

    Args:
        url: Raw URL of the CSV file.
        label: Integer class id written to the new ``label`` column.
        kind: Short name ("fake" or "real") used in progress/error messages
            and in the fallback hint (``<kind>.csv`` / ``<kind>_df``).
        timeout: Seconds to wait for the HTTP request.

    Returns:
        The parsed DataFrame with an added ``label`` column.

    Raises:
        Exception: Any download or parsing error is re-raised after a hint on
            how to load the file from a local clone has been printed.
    """
    print(f"\nüì• Downloading {kind} news...")
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        frame = pd.read_csv(StringIO(response.text))
        frame['label'] = label
        print(f"‚úÖ Downloaded {len(frame):,} {kind} news samples")
    except Exception as e:
        print(f"‚ùå Error downloading {kind}.csv: {e}")
        print("\nüí° Alternative: Clone repo v√† load t·ª´ file:")
        print("   !git clone https://github.com/WhySchools/VFND-vietnamese-fake-news-datasets.git")
        print(f"   {kind}_df = pd.read_csv('VFND-vietnamese-fake-news-datasets/{kind}.csv')")
        raise
    return frame


# Label convention used by the rest of the notebook: 1 = fake, 0 = real.
fake_df = _download_labeled_csv(FAKE_URL, label=1, kind="fake")
real_df = _download_labeled_csv(REAL_URL, label=0, kind="real")

# Combine
df = pd.concat([fake_df, real_df], ignore_index=True)
print(f"\nüìä Total samples: {len(df):,}")
print(f"   Fake: {(df['label'] == 1).sum():,} ({(df['label'] == 1).sum() / len(df) * 100:.2f}%)")
print(f"   Real: {(df['label'] == 0).sum():,} ({(df['label'] == 0).sum() / len(df) * 100:.2f}%)")

# Show the column names so it is clear which one holds the article text.
print(f"\nüìã Dataset columns: {df.columns.tolist()}")

In [None]:
# ==============================================================================
# CELL 5B: BACKUP - If the URLs do not work (run only if Cell 5 fails)
# ==============================================================================
# Uncomment and run if Cell 5 fails:

# print("üì• Using backup method: Clone repo...")
# !git clone https://github.com/WhySchools/VFND-vietnamese-fake-news-datasets.git
# 
# fake_df = pd.read_csv('VFND-vietnamese-fake-news-datasets/fake.csv')
# fake_df['label'] = 1
# print(f"‚úÖ Loaded {len(fake_df):,} fake news samples")
# 
# real_df = pd.read_csv('VFND-vietnamese-fake-news-datasets/real.csv')
# real_df['label'] = 0
# print(f"‚úÖ Loaded {len(real_df):,} real news samples")
# 
# df = pd.concat([fake_df, real_df], ignore_index=True)
# print(f"\nüìä Total samples: {len(df):,}")
# print(f"üìã Dataset columns: {df.columns.tolist()}")


In [None]:
# ==============================================================================
# CELL 6: Process the data
# ==============================================================================
# Section banner: title framed by two 80-character rules.
print("=" * 80, "PROCESSING DATASET", "=" * 80, sep="\n")

def clean_text(text):
    """Normalise one raw article into a single-line string of words.

    Steps:
      1. Missing values (``None`` / NaN) become the empty string.
      2. The text is converted to Unicode NFC, so a letter written as
         "base + combining accents" and the same letter written as one
         precomposed code point clean to the same output. Without this,
         visually identical Vietnamese strings differ, which defeats the
         later duplicate removal (and presumably the tokenizer vocabulary
         lookup — TODO confirm against PhoBERT's expected input form).
      3. Every character that is not a word character, whitespace, or in the
         U+00C0..U+1EF9 range is replaced by a space.
      4. Runs of whitespace collapse to single spaces; ends are trimmed.

    Args:
        text: Any value; non-strings are converted with ``str()``.

    Returns:
        The cleaned string ("" for missing input).
    """
    import unicodedata  # local import keeps this cell self-contained

    if pd.isna(text):
        return ""
    text = unicodedata.normalize("NFC", str(text))
    text = re.sub(r'[^\w\s\u00C0-\u1EF9]', ' ', text)
    # split()/join collapses whitespace and trims in one pass, so no separate
    # whitespace regex or strip() is needed.
    return ' '.join(text.split())

# Locate the text column: the first of the usual names that exists, otherwise
# fall back to the first column of the frame.
candidate_cols = ['content', 'text', 'article', 'body', 'Content', 'Text']
content_col = next((name for name in candidate_cols if name in df.columns), df.columns[0])

print(f"\nüî§ Using content column: '{content_col}'")

# Clean every article into a new column; the raw column is left untouched.
print("\nüßπ Cleaning text...")
df['cleaned_content'] = df[content_col].apply(clean_text)

# Drop rows whose cleaned text is shorter than 20 characters.
n_before = len(df)
long_enough = df['cleaned_content'].str.len() >= 20
df = df.loc[long_enough]
print(f"   Removed {n_before - len(df):,} samples (too short)")

# Drop rows whose cleaned text repeats an earlier row.
n_before = len(df)
df = df.drop_duplicates(subset=['cleaned_content'])
print(f"   Removed {n_before - len(df):,} duplicates")

# Shuffle with a fixed seed and renumber the rows from 0.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"\n‚úÖ Final dataset: {len(df):,} samples")

In [None]:
# ==============================================================================
# CELL 7: Split into train/val/test
# ==============================================================================
print("=" * 80, "SPLITTING DATASET", "=" * 80, sep="\n")

# 70% train; the remaining 30% is halved into validation and test.
# Both splits are stratified on the label and use the same fixed seed.
train_df, temp_df = train_test_split(
    df, test_size=0.3, random_state=42, stratify=df['label']
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['label']
)

splits = [('Train', train_df), ('Val', val_df), ('Test', test_df)]

print(f"\n‚úÖ Split completed:")
for split_name, split_df in splits:
    # "<name>:" is left-aligned to 6 characters so the counts line up.
    print(f"   {split_name + ':':<6} {len(split_df):,} samples")

# Class balance per split (label 1 = fake).
for split_name, split_df in splits:
    fake_pct = (split_df['label'] == 1).sum() / len(split_df) * 100
    print(f"   {split_name:5} - Fake: {fake_pct:5.2f}%, Real: {100-fake_pct:5.2f}%")

In [None]:
# ==============================================================================
# CELL 8: Load PhoBERT
# ==============================================================================
print("=" * 80, "LOADING PHOBERT", "=" * 80, sep="\n")

# Checkpoint name and the token length every input is padded/truncated to.
MODEL_NAME = "vinai/phobert-base"
MAX_LENGTH = 256

print(f"\nü§ñ Loading {MODEL_NAME}...")

# Tokenizer matching the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"‚úÖ Tokenizer loaded (vocab size: {tokenizer.vocab_size:,})")

# Two-class sequence classifier on top of the checkpoint, moved to the
# selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    problem_type="single_label_classification",
).to(device)
n_params = sum(p.numel() for p in model.parameters())
print("‚úÖ Model loaded")
print(f"   Parameters: {n_params:,}")

In [None]:
# ==============================================================================
# CELL 9: Prepare the datasets
# ==============================================================================
# Section banner: title framed by two 80-character rules.
print("=" * 80, "PREPARING DATASETS", "=" * 80, sep="\n")

def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch.

    Every sequence is truncated and padded to exactly ``MAX_LENGTH`` tokens
    using the notebook-level ``tokenizer``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        Whatever the tokenizer returns for that batch.
    """
    encode_kwargs = dict(
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    return tokenizer(examples["text"], **encode_kwargs)

def _frame_to_dataset(frame):
    """Wrap a split's cleaned text and labels in a `datasets.Dataset`."""
    return Dataset.from_dict({
        "text": frame["cleaned_content"].tolist(),
        "label": frame["label"].tolist()
    })


# Raw (untokenized) datasets, one per split.
train_data = _frame_to_dataset(train_df)
val_data = _frame_to_dataset(val_df)
test_data = _frame_to_dataset(test_df)

# Tokenize each split in batches.
print("\nüî§ Tokenizing...")
train_dataset = train_data.map(tokenize_function, batched=True)
val_dataset = val_data.map(tokenize_function, batched=True)
test_dataset = test_data.map(tokenize_function, batched=True)

# Expose only the model inputs and the label, as torch tensors.
for split_dataset in (train_dataset, val_dataset, test_dataset):
    split_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

print("‚úÖ Datasets ready")

In [None]:
# ==============================================================================
# CELL 10: Training Arguments
# ==============================================================================
# Step-based schedules are capped by the size of the run. With the previous
# fixed values (eval/save every 500 steps, 500 warmup steps) a training run
# shorter than 500 optimizer steps would never evaluate, never write a
# checkpoint, never trigger early stopping or best-model reloading, and would
# end while the learning rate was still warming up. For large datasets the caps
# leave the original values (500 / 500 / 100) unchanged.
TRAIN_BATCH_SIZE = 16
NUM_EPOCHS = 5

# Ceil division: optimizer steps in one pass over the training set.
# NOTE(review): assumes a single device and no gradient accumulation.
steps_per_epoch = max(1, -(-len(train_dataset) // TRAIN_BATCH_SIZE))
total_steps = steps_per_epoch * NUM_EPOCHS

eval_save_steps = min(500, steps_per_epoch)   # evaluate at least once per epoch
warmup_steps = min(500, total_steps // 10)    # warm up for at most 10% of the run
logging_steps = min(100, eval_save_steps)

training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=warmup_steps,
    eval_strategy="steps",
    eval_steps=eval_save_steps,
    save_strategy="steps",
    # Kept equal to eval_steps so every checkpoint has a matching evaluation,
    # which best-model reloading relies on.
    save_steps=eval_save_steps,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir=f"{MODEL_OUTPUT_DIR}/logs",
    logging_steps=logging_steps,
    report_to="none",
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,
    seed=42,
    push_to_hub=False,
)

print("‚öôÔ∏è  Training configuration:")
print(f"   Epochs: {training_args.num_train_epochs}")
print(f"   Batch size: {training_args.per_device_train_batch_size}")
print(f"   Learning rate: {training_args.learning_rate}")
print(f"   FP16: {training_args.fp16}")
print(f"   Eval/save every: {eval_save_steps} steps")
print(f"   Warmup steps: {warmup_steps}")

In [None]:
# ==============================================================================
# CELL 11: Metrics Function
# ==============================================================================
def compute_metrics(eval_pred):
    """Turn raw model outputs into classification metrics.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; the predicted class is
            the argmax of ``predictions`` along axis 1.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision/recall/F1 use ``average='binary'``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    prec, rec, f_score, _support = precision_recall_fscore_support(
        gold, predicted, average='binary'
    )

    return dict(
        accuracy=accuracy_score(gold, predicted),
        precision=prec,
        recall=rec,
        f1=f_score,
    )

print("‚úÖ Metrics function defined")

In [None]:
# ==============================================================================
# CELL 12: Initialize Trainer
# ==============================================================================
# Early-stopping callback with a patience of 3.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

# Bundle model, arguments, train/validation data and the metric function.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

print("‚úÖ Trainer initialized")
print("\nüöÄ Starting training...\n")


In [None]:
# ==============================================================================
# CELL 13: Train the model
# ==============================================================================
train_result = trainer.train()
train_metrics = train_result.metrics

# Report the runtime and loss recorded in the training metrics.
print("\n‚úÖ Training completed!")
print(f"   Training time: {train_metrics['train_runtime']:.2f}s")
print(f"   Training loss: {train_metrics['train_loss']:.4f}")

In [None]:
# ==============================================================================
# CELL 14: Evaluate
# ==============================================================================
def _print_eval_report(title, results):
    """Print accuracy/precision/recall/F1 from a `trainer.evaluate` result dict."""
    print(f"\n‚úÖ {title} Results:")
    rows = (
        ("Accuracy:  ", "accuracy"),
        ("Precision: ", "precision"),
        ("Recall:    ", "recall"),
        ("F1 Score:  ", "f1"),
    )
    for row_label, metric in rows:
        print(f"   {row_label}{results['eval_' + metric]:.4f}")


# Validation split first, then the held-out test split.
print("\nüìä Evaluating on validation set...")
val_results = trainer.evaluate(val_dataset)
_print_eval_report("Validation", val_results)

print("\nüìä Evaluating on test set...")
test_results = trainer.evaluate(test_dataset)
_print_eval_report("Test", test_results)

In [None]:
# ==============================================================================
# CELL 15: Confusion Matrix
# ==============================================================================
# Tick labels indexed by label id (0 = real, 1 = fake, as assigned when the
# data was loaded).
class_names = ['Real', 'Fake']

predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = test_df['label'].values

# Pin the label order: without `labels`, a class missing from both y_true and
# y_pred shrinks the matrix below 2x2 and the tick labels no longer line up.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names,
            yticklabels=class_names)
plt.title('Confusion Matrix - PhoBERT Fine-tuned')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
# Save before show(): the figure is written to the model output directory.
plt.savefig(f'{MODEL_OUTPUT_DIR}/confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Confusion matrix saved")

In [None]:
# ==============================================================================
# CELL 16: Save Model
# ==============================================================================
import json

print("üíæ Saving model...")

# Write the model and tokenizer files into the output directory.
model.save_pretrained(MODEL_OUTPUT_DIR)
tokenizer.save_pretrained(MODEL_OUTPUT_DIR)


def _metrics_as_floats(eval_output):
    """Pick the four reported metrics out of an evaluate() dict as plain floats."""
    return {
        metric: float(eval_output[f'eval_{metric}'])
        for metric in ("accuracy", "precision", "recall", "f1")
    }


# Summary of the run, stored next to the model as JSON.
results = {
    "model_name": MODEL_NAME,
    "max_length": MAX_LENGTH,
    "training_time": train_result.metrics['train_runtime'],
    "validation_results": _metrics_as_floats(val_results),
    "test_results": _metrics_as_floats(test_results),
}

results_path = f'{MODEL_OUTPUT_DIR}/training_results.json'
with open(results_path, 'w', encoding='utf-8') as fh:
    json.dump(results, fh, indent=2, ensure_ascii=False)

print(f"\n‚úÖ Model saved to: {MODEL_OUTPUT_DIR}")

In [None]:
# ==============================================================================
# CELL 17: Copy the model to a Drive folder for download
# ==============================================================================
print("="*80)
print("COPY MODEL TO GOOGLE DRIVE")
print("="*80)

import shutil
easy_path = '/content/drive/MyDrive/PhoBERT_Model_Download'

# Replace any previous copy so the folder mirrors the current output directory.
print(f"\nüì¶ Copying model to: {easy_path}")
if os.path.exists(easy_path):
    shutil.rmtree(easy_path)
shutil.copytree(MODEL_OUTPUT_DIR, easy_path)

# The weights file name depends on how save_pretrained serialised the model:
# "model.safetensors" (safetensors) or "pytorch_model.bin" (legacy pickle).
# Looking up a hard-coded "pytorch_model.bin" raised FileNotFoundError when
# only the safetensors file was written, so detect whichever one exists.
weight_file_candidates = ("model.safetensors", "pytorch_model.bin")
weights_name = next(
    (name for name in weight_file_candidates
     if os.path.isfile(os.path.join(easy_path, name))),
    None,
)

print("\n‚úÖ Model copied to Google Drive!")
print(f"\nüìÅ Location: My Drive/PhoBERT_Model_Download/")
print(f"\nüìã Important files:")
if weights_name is None:
    print(f"   ‚ùå model weights not found (expected one of: {', '.join(weight_file_candidates)})")
else:
    weights_mb = os.path.getsize(os.path.join(easy_path, weights_name)) / 1e6
    print(f"   ‚úÖ {weights_name} ({weights_mb:.1f} MB)")
print(f"   ‚úÖ config.json")
print(f"   ‚úÖ vocab.txt")
print(f"   ‚úÖ training_results.json")

print("\nüí° Download v·ªÅ m√°y:")
print("   1. M·ªü Google Drive (web ho·∫∑c desktop)")
print("   2. V√†o My Drive/PhoBERT_Model_Download/")
print("   3. Right-click ‚Üí Download")
print("   4. Copy v√†o: models/PhoBERT_VFND/")
print("="*80)


In [None]:
# ==============================================================================
# CELL 18: Test with sample sentences
# ==============================================================================
def predict_text(text):
    """Classify one piece of text with the fine-tuned model.

    The text is tokenized to ``MAX_LENGTH`` tokens, run through the
    notebook-level ``model`` on ``device`` without gradient tracking, and the
    logits are turned into probabilities with a softmax.

    Args:
        text: The text to classify.

    Returns:
        Dict with ``label`` ("Fake" when class 1 wins, otherwise "Real"),
        ``confidence`` (probability of the winning class) and
        ``probabilities`` (``real`` = class 0, ``fake`` = class 1).
    """
    model.eval()
    encoded = tokenizer(
        text,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    batch = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        # Single input, so take row 0 of the softmaxed logits.
        probs = torch.softmax(model(**batch).logits, dim=1)[0]
        pred_class = torch.argmax(probs).item()

    verdict = "Real" if pred_class != 1 else "Fake"
    return {
        "label": verdict,
        "confidence": float(probs[pred_class]),
        "probabilities": {
            "real": float(probs[0]),
            "fake": float(probs[1])
        }
    }

# Hand-written sample headlines used as a quick sanity check of the model.
test_texts = [
    "Ch√≠nh ph·ªß c√¥ng b·ªë ch√≠nh s√°ch m·ªõi v·ªÅ gi√°o d·ª•c ƒë·∫°i h·ªçc",
    "Ph√°t hi·ªán thu·ªëc ch·ªØa ung th∆∞ ch·ªâ trong 3 ng√†y, b√°c sƒ© kh√¥ng mu·ªën b·∫°n bi·∫øt!",
    "Ng√¢n h√†ng Nh√† n∆∞·ªõc ƒëi·ªÅu ch·ªânh l√£i su·∫•t c∆° b·∫£n xu·ªëng 4.5%",
    "U·ªëng n∆∞·ªõc chanh m·ªói s√°ng gi√∫p gi·∫£m 10kg trong 1 tu·∫ßn!",
]

print("\nüß™ Testing with sample texts:\n")
for idx, sample in enumerate(test_texts, start=1):
    outcome = predict_text(sample)
    class_probs = outcome['probabilities']
    # Show at most the first 70 characters of the sample.
    print(f"{idx}. {sample[:70]}...")
    print(f"   Prediction: {outcome['label']}")
    print(f"   Confidence: {outcome['confidence']:.2%}")
    print(f"   Real: {class_probs['real']:.2%} | Fake: {class_probs['fake']:.2%}")
    print()

In [None]:
# ==============================================================================
# CELL 19: Summary
# ==============================================================================
banner = "=" * 80

# Closing banner.
print("\n" + banner)
print("üéâ FINE-TUNING HO√ÄN T·∫§T!")
print(banner)

# Where the model was written and the headline test-set metrics.
print(f"\nüìÅ Model ƒë√£ ƒë∆∞·ª£c l∆∞u t·∫°i: {MODEL_OUTPUT_DIR}")
print("\nüìä K·∫øt qu·∫£ Test Set:")
print(f"   Accuracy:  {test_results['eval_accuracy']:.2%}")
print(f"   F1 Score:  {test_results['eval_f1']:.4f}")

# Numbered follow-up steps for deploying the downloaded model.
print("\nüí° B∆∞·ªõc ti·∫øp theo:")
next_steps = [
    "Download model t·ª´ Google Drive",
    "Copy v√†o th∆∞ m·ª•c models/PhoBERT_VFND/",
    "Update api/.env: PHOBERT_MODEL_PATH=../models/PhoBERT_VFND",
    "Restart API server",
]
for step_no, step in enumerate(next_steps, start=1):
    print(f"   {step_no}. {step}")
print(banner)
