In [None]:
# ==============================================================================
# CELL 1: Setup Environment
# ==============================================================================
# Clone repo
!git clone https://github.com/Vietchemistryyy/Fake_News_Detection_BERT.git
%cd Fake_News_Detection_BERT

# Install packages
!pip install -q transformers accelerate datasets torch scikit-learn

# Imports
import os, sys, warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# Silence library warnings for the remainder of the notebook.
warnings.filterwarnings('ignore')

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Environment summary, framed by an 80-character rule.
_rule = "=" * 80
print(_rule)
print(f"üñ•Ô∏è  Device: {device}")
if device.type == "cuda":
    # Only query the device name when a CUDA device was actually selected.
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
print("‚úÖ Environment ready!")
print(_rule)

In [None]:
# ==============================================================================
# CELL 2: Mount Drive & Load Dataset
# ==============================================================================
from google.colab import drive
drive.mount('/content/drive')

# Model output directory on Drive. Created from Python (not a shell call) so the
# path is never subject to shell word-splitting or interpolation.
MODEL_OUTPUT_DIR = '/content/drive/MyDrive/PhoBERT_Model'
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)

# Load dataset
DATA_DIR = 'data/DataPhoBERT'
train_df = pd.read_csv(f'{DATA_DIR}/train_df.csv')
val_df = pd.read_csv(f'{DATA_DIR}/val_df.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test_df.csv')

# Canonical column name -> alternative names accepted in the CSV files.
_COLUMN_ALIASES = {
    'text': ['cmt_col', 'comment', 'content'],
    'label': ['labels', 'label_id'],
}

for _split_name, df in {'train': train_df, 'val': val_df, 'test': test_df}.items():
    # Auto-fix columns: rename the first alias found to the canonical name and
    # fail right here, with a readable message, when none is present (instead of
    # a bare KeyError several lines or cells later).
    for _target, _aliases in _COLUMN_ALIASES.items():
        if _target in df.columns:
            continue
        _found = next((col for col in _aliases if col in df.columns), None)
        if _found is None:
            raise KeyError(
                f"{_split_name}_df.csv has no '{_target}' column "
                f"(also tried {_aliases}); available columns: {list(df.columns)}"
            )
        df.rename(columns={_found: _target}, inplace=True)

    # Rows with a missing text or label cannot be tokenized / trained on; drop
    # them (and say so) rather than letting them reach the tokenizer.
    _rows_before = len(df)
    df.dropna(subset=['text', 'label'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    if len(df) != _rows_before:
        print(f"   Dropped {_rows_before - len(df):,} rows with missing text/label from {_split_name}")
    df['text'] = df['text'].astype(str)

    # Map labels to binary: 0 stays 0, every other class becomes 1.
    if df['label'].max() > 1:
        df['label'] = df['label'] != 0
    # Integer class ids are required for single-label classification.
    df['label'] = df['label'].astype(int)

print(f"‚úÖ Dataset loaded:")
print(f"   Train: {len(train_df):,}")
print(f"   Val:   {len(val_df):,}")
print(f"   Test:  {len(test_df):,}")

In [None]:
# ==============================================================================
# CELL 3: Load PhoBERT & Tokenize
# ==============================================================================
# Checkpoint to fine-tune, and the fixed token length used for every example.
MODEL_NAME = "vinai/phobert-base"
MAX_LENGTH = 256

print(f"ü§ñ Loading {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Sequence-classification model with two output labels, moved onto `device`.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)

# Total number of elements across all model parameters.
_parameter_count = sum(p.numel() for p in model.parameters())
print(f"‚úÖ Model loaded ({_parameter_count:,} parameters)")

# Tokenize
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch with the notebook's tokenizer.

    Each sequence is truncated and padded to exactly ``MAX_LENGTH`` tokens.

    Args:
        examples: Mapping with a ``"text"`` entry, as supplied by ``Dataset.map``.

    Returns:
        The tokenizer's output for that text.
    """
    encode_options = dict(padding="max_length", truncation=True, max_length=MAX_LENGTH)
    return tokenizer(examples["text"], **encode_options)

# Wrap the text/label columns of each split in a `datasets.Dataset`.
train_data, val_data, test_data = (
    Dataset.from_dict({"text": frame["text"].tolist(), "label": frame["label"].tolist()})
    for frame in (train_df, val_df, test_df)
)

# Tokenize every split in batched mode.
train_dataset, val_dataset, test_dataset = (
    raw_split.map(tokenize_function, batched=True)
    for raw_split in (train_data, val_data, test_data)
)

# Expose only these columns, formatted as torch tensors.
_MODEL_COLUMNS = ("input_ids", "attention_mask", "label")
for _tokenized_split in (train_dataset, val_dataset, test_dataset):
    _tokenized_split.set_format("torch", columns=list(_MODEL_COLUMNS))

print("‚úÖ Datasets tokenized")

In [None]:
# ==============================================================================
# CELL 4: Training Configuration
# ==============================================================================
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(predictions, labels)``. ``predictions`` holds
            per-class scores and is reduced to class ids with ``argmax`` over
            axis 1.

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``;
        the last three are computed with ``average='binary'``.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted, average='binary'
    )
    return {
        'accuracy': accuracy_score(labels, predicted),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Hyper-parameters grouped by concern; expanded into TrainingArguments below.
_optimization = dict(
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=500,
)
# Evaluate and checkpoint on the same 500-step cadence, keep at most three
# checkpoints, and reload the checkpoint with the best "f1" when training ends.
_checkpointing = dict(
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)
# Mixed precision is enabled only when CUDA is available.
_runtime = dict(
    logging_steps=100,
    fp16=torch.cuda.is_available(),
    seed=42,
)

training_args = TrainingArguments(
    output_dir=MODEL_OUTPUT_DIR,
    **_optimization,
    **_checkpointing,
    **_runtime,
)

# Early stopping is configured with a patience of 3.
_early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[_early_stopping],
)

print("‚úÖ Trainer initialized")
print(f"‚öôÔ∏è  Epochs: {training_args.num_train_epochs}, Batch: {training_args.per_device_train_batch_size}, LR: {training_args.learning_rate}")

In [None]:
# ==============================================================================
# CELL 5: TRAIN! 🚀
# ==============================================================================
print("\nüöÄ Starting training...\n")
train_result = trainer.train()

print(f"\n‚úÖ Training completed!")
print(f"   Time: {train_result.metrics['train_runtime']:.2f}s")
print(f"   Loss: {train_result.metrics['train_loss']:.4f}")

In [None]:
# ==============================================================================
# CELL 6: Evaluate & Confusion Matrix
# ==============================================================================
# Validation
val_results = trainer.evaluate(val_dataset)
print("\nüìä Validation Results:")
print(f"   Accuracy:  {val_results['eval_accuracy']:.4f}")
print(f"   F1 Score:  {val_results['eval_f1']:.4f}")

# Test
# A single predict() pass returns the logits, the label ids and the metrics, so
# the test set is not run through the model twice (evaluate + predict).
# metric_key_prefix="eval" keeps the `eval_*` metric keys that this cell and the
# following cells read from `test_results`.
predictions = trainer.predict(test_dataset, metric_key_prefix="eval")
test_results = predictions.metrics
print("\nüìä Test Results:")
print(f"   Accuracy:  {test_results['eval_accuracy']:.4f}")
print(f"   Precision: {test_results['eval_precision']:.4f}")
print(f"   Recall:    {test_results['eval_recall']:.4f}")
print(f"   F1 Score:  {test_results['eval_f1']:.4f}")

# Confusion Matrix
y_pred = np.argmax(predictions.predictions, axis=1)
# Labels as seen by the trainer, so they are aligned with y_pred by construction.
y_true = predictions.label_ids
# Pin both classes so the matrix stays 2x2 (and matches the two tick labels)
# even when only one class occurs in y_true / y_pred.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Real', 'Fake'],
    yticklabels=['Real', 'Fake'],
    ax=ax,
)
ax.set_title('Confusion Matrix - PhoBERT')
ax.set_ylabel('True')
ax.set_xlabel('Predicted')
fig.savefig(f'{MODEL_OUTPUT_DIR}/confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# ==============================================================================
# CELL 7: Save Model
# ==============================================================================
print("üíæ Saving model...")
model.save_pretrained(MODEL_OUTPUT_DIR)
tokenizer.save_pretrained(MODEL_OUTPUT_DIR)

# Save results
import json
results = {
    "model": MODEL_NAME,
    "test_accuracy": float(test_results['eval_accuracy']),
    "test_f1": float(test_results['eval_f1']),
    "training_time": train_result.metrics['train_runtime']
}
with open(f'{MODEL_OUTPUT_DIR}/results.json', 'w') as f:
    json.dump(results, f, indent=2)

print(f"\n‚úÖ Model saved to: {MODEL_OUTPUT_DIR}")
print(f"\nüìÅ Files in Drive:")
!ls -lh {MODEL_OUTPUT_DIR}

In [None]:
# ==============================================================================
# CELL 8: Test & Summary
# ==============================================================================
# Test function
def predict(text):
    """Classify ``text`` as "Fake" or "Real" using the notebook's ``model``.

    The model is switched to eval mode, the text is tokenized (truncated and
    padded to ``MAX_LENGTH``), and inference runs with gradients disabled.

    Args:
        text: Input handed to the tokenizer.

    Returns:
        dict with ``"label"`` ("Fake" when the arg-max class is 1, otherwise
        "Real") and ``"confidence"`` (the softmax probability of that class,
        as a Python float).
    """
    model.eval()
    encoded = tokenizer(
        text,
        max_length=MAX_LENGTH,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    # Move every tensor produced by the tokenizer onto the chosen device.
    batch = {name: tensor.to(device) for name, tensor in encoded.items()}
    with torch.no_grad():
        logits = model(**batch).logits
        class_probs = torch.softmax(logits, dim=1)[0]
        winner = torch.argmax(class_probs).item()
    verdict = "Fake" if winner == 1 else "Real"
    return {"label": verdict, "confidence": float(class_probs[winner])}

# Test samples
tests = [
    "Ch√≠nh ph·ªß c√¥ng b·ªë ch√≠nh s√°ch m·ªõi v·ªÅ gi√°o d·ª•c",
    "Ph√°t hi·ªán thu·ªëc ch·ªØa ung th∆∞ ch·ªâ trong 3 ng√†y!",
    "Ng√¢n h√†ng ƒëi·ªÅu ch·ªânh l√£i su·∫•t c∆° b·∫£n"
]

# Run each sample through predict() and show label + confidence.
print("\nüß™ Testing:")
for position, sample in enumerate(tests, start=1):
    outcome = predict(sample)
    print(f"{position}. {sample[:50]}...")
    print(f"   ‚Üí {outcome['label']} ({outcome['confidence']:.2%})")

# Final summary, framed by an 80-character rule.
_rule = "=" * 80
print("\n" + _rule)
print("üéâ HO√ÄN T·∫§T!")
print(_rule)
print(f"\nüìä K·∫øt qu·∫£:")
print(f"   Accuracy: {test_results['eval_accuracy']:.2%}")
print(f"   F1 Score: {test_results['eval_f1']:.4f}")
print(f"\nüìÅ Model location: {MODEL_OUTPUT_DIR}")
print(f"\nüí° Next steps:")
for _step in (
    "   1. Download model t·ª´ Google Drive",
    "   2. Copy v√†o: models/PhoBERT/",
    "   3. Update api/.env: PHOBERT_MODEL_PATH=../models/PhoBERT",
    "   4. Restart API",
):
    print(_step)
print(_rule)
