In [None]:
# CELL 1: Environment Test & Setup
# ============================================================================
# Verifies, before any expensive work starts, that (a) the third-party
# transformers stack and (b) the project's own `src` modules can be imported.
# The two groups are checked separately because only (a) can be repaired by
# `pip install`; a failing `src` import is a code/version problem.
print("="*80)
print("ENVIRONMENT SETUP & VERIFICATION")
print("="*80)

import subprocess
import sys

# Make the parent directory importable so `src.*` resolves. Guarded so that
# re-running this cell does not keep appending duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

# --- Third-party dependencies ------------------------------------------------
try:
    import transformers
    print(f"✅ Transformers version: {transformers.__version__}")

    from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
    print("✅ Core transformers imports successful")

    # Smoke-test tokenizer loading with a small public checkpoint
    test_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    print("✅ Tokenizer test successful")

except ImportError as e:
    # A missing/broken third-party package: installing is a plausible remedy.
    print(f"❌ Import error: {e}")
    print("\n💡 Installing required packages...")
    # sys.executable targets the interpreter this kernel is running on.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q',
                          'transformers', 'accelerate', 'datasets'])
    print("✅ Packages installed. Please restart kernel and run again.")
    raise

# --- Project modules ---------------------------------------------------------
try:
    from src.config import ModelConfig, DataConfig
    from src.train import TRANSFORMERS_AVAILABLE, EARLY_STOPPING_AVAILABLE

except ImportError as e:
    # Do NOT pip-install here: the failure comes from importing the local
    # `src` package (e.g. a name it no longer exports), which reinstalling
    # transformers cannot fix.
    print(f"❌ Project import error: {e}")
    print("\n💡 The failure is in the local `src` package (or something it imports). "
          "Fix that import and re-run this cell; no packages were installed.")
    raise

print(f"✅ TRANSFORMERS_AVAILABLE: {TRANSFORMERS_AVAILABLE}")
print(f"✅ EARLY_STOPPING_AVAILABLE: {EARLY_STOPPING_AVAILABLE}")
print("\n🎉 Environment ready for training!")


TESTING ENVIRONMENT SETUP
✅ Path setup successful
✅ Transformers version: 4.39.3



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "d:\Fake_News_Detection_BERT\venv\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "d:\Fake_News_Detection_BERT\venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "d:\Fake_News_Detection_BERT\venv\Lib\site-packages\ipykernel\kernelapp.py", line 736, in start
    self.io_loop.start()
  File "d:\Fake_News_D

✅ Core transformers imports successful




✅ Tokenizer test successful
Transformers 4.39.3 loaded successfully
✅ Config import successful
❌ Environment setup failed: cannot import name 'EARLY_STOPPING_AVAILABLE' from 'src.train' (d:\Fake_News_Detection_BERT\notebooks\..\src\train.py)
💡 Installing transformers...
✅ Transformers installed. Please restart kernel.


# Notebook 04: BERT Model Training

## 🎯 Objective
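Fine-tune a BERT-family transformer (the checkpoint named by `ModelConfig.MODEL_NAME`; the output folders and labels in this notebook assume RoBERTa) for fake news detection and compare its performance with the baseline model.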

## 📋 What we'll do:
1. **Load preprocessed data** from notebook 02
2. **Prepare PyTorch datasets** for BERT training
3. **Fine-tune BERT model** using Hugging Face Transformers
4. **Evaluate performance** on train/val/test sets
5. **Compare with baseline** model performance
6. **Save model** and results

---


In [None]:
# CELL 2: Complete Imports
# ============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import warnings
warnings.filterwarnings('ignore')

from transformers import AutoTokenizer

# Import all required modules
from src.config import (
    DataConfig, ModelConfig, TrainingConfig, 
    PROCESSED_DATA_DIR, METRICS_DIR, VISUALIZATIONS_DIR, MODELS_DIR
)
from src.dataset import create_dataset_from_dataframe
from src.train import BertTrainer, save_training_results
from src.evaluate import (
    compute_extended_metrics,
    plot_confusion_matrix, 
    plot_roc_curve,
    compare_models,
    save_evaluation_results
)
from src.utils import save_json

# Plot appearance and compute-device selection
plt.style.use('default')
sns.set_palette("husl")
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Banner summarising what the rest of the notebook will run with
print("="*80, "IMPORTS COMPLETED", "="*80, sep="\n")
print(f"🖥️  Device: {device}")
print(f"🤖 Model: {ModelConfig.MODEL_NAME}")
print(f"📊 Ready to train!")
print("="*80)

✅ Imports successful!
🖥️  Device: cpu

🔍 Testing transformers availability...
✅ Transformers imports successful!
✅ Tokenizer test successful!
📊 Ready to train BERT model!


In [None]:
# CELL 3: Load Data
# ============================================================================
# Reads the train/val/test CSV splits from the paths configured in DataConfig
# and prints their sizes and class balance.
print("="*80, "LOADING PREPROCESSED DATA", "="*80, sep="\n")

# One CSV per split
train_df = pd.read_csv(DataConfig.TRAIN_PATH)
val_df = pd.read_csv(DataConfig.VAL_PATH)
test_df = pd.read_csv(DataConfig.TEST_PATH)

# Row counts per split and overall
n_train, n_val, n_test = len(train_df), len(val_df), len(test_df)
print(f"\n📊 Data loaded successfully:")
print(f"   Train: {n_train:>7,} samples")
print(f"   Val:   {n_val:>7,} samples")
print(f"   Test:  {n_test:>7,} samples")
print(f"   Total: {(n_train + n_val + n_test):>7,} samples")

# Share of rows with label == 1 (reported as "Fake") in each split
print(f"\n📈 Label distribution:")
for split_name, split_df in (('Train', train_df), ('Val', val_df), ('Test', test_df)):
    fake_pct = (split_df['label'] == 1).sum() / len(split_df) * 100
    real_pct = 100 - fake_pct
    print(f"   {split_name:5} - Fake: {fake_pct:5.2f}%, Real: {real_pct:5.2f}%")


LOADING DATA AND PREPARING TOKENIZER

📊 Data loaded successfully!
   Train set: 95,244 samples
   Val set:   20,409 samples
   Test set:  20,410 samples

🔤 Loading tokenizer: roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]


📋 Tokenizer info:
   Vocab size: 50,265
   Max length: 256
   Special tokens: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}

🧪 Sample tokenization:
   Original text length: 85 chars
   Tokenized length: 26 tokens
   Sample tokens: [0, 642, 718, 5992, 32, 6749, 154, 31, 821, 7043]...

✅ Import verification:
   MODELS_DIR: d:\Fake_News_Detection_BERT\notebooks\..\models
   METRICS_DIR: d:\Fake_News_Detection_BERT\notebooks\..\results\metrics
   VISUALIZATIONS_DIR: d:\Fake_News_Detection_BERT\notebooks\..\results\visualizations


In [None]:
# CELL 4: Load Tokenizer
# ============================================================================
# Loads the tokenizer matching ModelConfig.MODEL_NAME, reports its settings,
# and encodes the first training row as a quick sanity check.
print("="*80, "LOADING TOKENIZER", "="*80, sep="\n")

tokenizer = AutoTokenizer.from_pretrained(ModelConfig.MODEL_NAME)

print(f"\n🔤 Tokenizer: {ModelConfig.MODEL_NAME}")
print(f"   Vocab size: {tokenizer.vocab_size:,}")
print(f"   Max length: {ModelConfig.MAX_LENGTH}")
print(f"   Padding: {ModelConfig.PADDING}")
print(f"   Truncation: {ModelConfig.TRUNCATION}")

# Encode the first cleaned article, truncated to the configured max length
sample_text = train_df['cleaned_content'].iloc[0]
sample_tokens = tokenizer.encode(
    sample_text,
    truncation=True,
    max_length=ModelConfig.MAX_LENGTH,
)

n_chars, n_tokens = len(sample_text), len(sample_tokens)
print(f"\n🧪 Tokenization test:")
print(f"   Input length: {n_chars} chars")
print(f"   Output length: {n_tokens} tokens")
print(f"   First 10 tokens: {sample_tokens[:10]}")


INFO:src.dataset:Dataset initialized with 95244 samples
INFO:src.dataset:Max length: 256
INFO:src.dataset:Dataset initialized with 20409 samples
INFO:src.dataset:Max length: 256
INFO:src.dataset:Dataset initialized with 20410 samples
INFO:src.dataset:Max length: 256


CREATING PYTORCH DATASETS

📊 Datasets created:
   Train dataset: 95244 samples
   Val dataset:   20409 samples
   Test dataset:  20410 samples

🧪 Sample batch info:
   Input IDs shape: torch.Size([1, 256])
   Attention mask shape: torch.Size([1, 256])
   Labels: 1

📝 Sample decoded text (first 200 chars):
   pilots are resigning from german air force – they don ’ t want to fight against russia....


In [None]:
# CELL 5: Create PyTorch Datasets
# ============================================================================
# Wraps each split in a dataset via create_dataset_from_dataframe and inspects
# the first training item. Any failure is reported and then re-raised.
print("="*80, "CREATING PYTORCH DATASETS", "="*80, sep="\n")

try:
    # Build one dataset per split, all sharing the same tokenizer
    train_dataset = create_dataset_from_dataframe(train_df, tokenizer)
    val_dataset = create_dataset_from_dataframe(val_df, tokenizer)
    test_dataset = create_dataset_from_dataframe(test_df, tokenizer)

    print(f"\n✅ Datasets created successfully:")
    print(f"   Train: {len(train_dataset):,} samples")
    print(f"   Val:   {len(val_dataset):,} samples")
    print(f"   Test:  {len(test_dataset):,} samples")

    # Peek at the first item to confirm the expected keys are present
    sample = train_dataset[0]
    ids, mask, label = sample['input_ids'], sample['attention_mask'], sample['labels']
    print(f"\n🔍 Dataset structure:")
    print(f"   input_ids shape: {ids.shape}")
    print(f"   attention_mask shape: {mask.shape}")
    print(f"   labels: {label.item()}")

except Exception as e:
    # Surface a readable message, then let the original error propagate
    print(f"❌ Error creating datasets: {e}")
    raise

TRAINING BERT MODEL
🔍 Checking transformers availability...
❌ Transformers import failed: cannot import name 'HF_DATASETS_DISABLE_PROGRESS_BARS' from 'datasets.config' (d:\Fake_News_Detection_BERT\.venv\Lib\site-packages\datasets\config.py)
💡 Please install transformers: pip install transformers


NameError: name 'TRANSFORMERS_AVAILABLE' is not defined

In [None]:
# CELL 6: Initialize Trainer
# ============================================================================
# Prepares the model output folder and constructs the BertTrainer wrapper.
print("="*80, "INITIALIZING TRAINER", "="*80, sep="\n")

# Folder where the trainer is told to write its output
output_dir = MODELS_DIR / "roberta"
output_dir.mkdir(parents=True, exist_ok=True)

# BertTrainer receives the checkpoint name and the output folder as a string
trainer_kwargs = {
    "model_name": ModelConfig.MODEL_NAME,
    "output_dir": str(output_dir),
}
bert_trainer = BertTrainer(**trainer_kwargs)

print(f"✅ Trainer initialized:")
print(f"   Model: {ModelConfig.MODEL_NAME}")
print(f"   Output: {output_dir}")


In [None]:
# CELL 7: Train Model
# ============================================================================
# Prints the hyperparameters, runs bert_trainer.train on the train/val
# datasets, and summarises the returned training time and eval metrics.
print("="*80, "TRAINING MODEL", "="*80, sep="\n")

# FP16 is only reported as active when it is configured AND CUDA is present
fp16_active = TrainingConfig.USE_FP16 and torch.cuda.is_available()

print(f"\n⚙️  Training Configuration:")
print(f"   Epochs: {ModelConfig.NUM_EPOCHS}")
print(f"   Batch size: {ModelConfig.BATCH_SIZE}")
print(f"   Learning rate: {ModelConfig.LEARNING_RATE}")
print(f"   Warmup steps: {ModelConfig.WARMUP_STEPS}")
print(f"   Weight decay: {ModelConfig.WEIGHT_DECAY}")
print(f"   Device: {device}")
print(f"   FP16: {fp16_active}")

# Hyperparameters forwarded to the trainer, all sourced from ModelConfig
hyperparams = {
    "num_epochs": ModelConfig.NUM_EPOCHS,
    "batch_size": ModelConfig.BATCH_SIZE,
    "learning_rate": ModelConfig.LEARNING_RATE,
    "warmup_steps": ModelConfig.WARMUP_STEPS,
    "weight_decay": ModelConfig.WEIGHT_DECAY,
}

print(f"\n🚀 Starting training...\n")

try:
    train_results = bert_trainer.train(
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        **hyperparams,
    )

    print("\n✅ Training completed successfully!")

except Exception as e:
    # Report, then re-raise so the notebook stops at the failing cell
    print(f"\n❌ Training failed: {e}")
    raise

# Summary of the run
elapsed_s = train_results['training_time']
print("\n" + "="*80)
print("TRAINING SUMMARY")
print("="*80)
print(f"⏱️  Training time: {elapsed_s:.2f}s ({elapsed_s/60:.2f}m)")
print(f"\n📊 Final validation metrics:")
for metric_name, metric_value in train_results['eval_metrics'].items():
    # Only plain numeric entries can be formatted with .4f
    if not isinstance(metric_value, (int, float)):
        continue
    print(f"   {metric_name:20}: {metric_value:.4f}")

In [None]:
# CELL 8: Evaluate on Test Set
# ============================================================================
# Runs bert_trainer.evaluate on the held-out test dataset and prints every
# numeric entry of the returned metrics mapping.
print("="*80, "TEST SET EVALUATION", "="*80, sep="\n")

try:
    test_results = bert_trainer.evaluate(test_dataset)

    print("\n✅ Test evaluation completed!")
    print(f"\n📊 Test metrics:")
    for metric_name, metric_value in test_results.items():
        # Skip non-numeric entries; they cannot be rendered with .4f
        if not isinstance(metric_value, (int, float)):
            continue
        print(f"   {metric_name:20}: {metric_value:.4f}")

except Exception as e:
    # Report, then re-raise so later cells do not run on missing results
    print(f"❌ Evaluation failed: {e}")
    raise

In [None]:
# CELL 9: Get Detailed Predictions
# ============================================================================
# Runs inference on the test dataset and derives, for later cells:
#   y_test_pred  - predicted class index per sample (argmax over the scores)
#   y_test_proba - per-class probabilities (softmax over the scores)
#   y_test_true  - ground-truth labels read from test_df['label']
print("="*80)
print("GENERATING PREDICTIONS")
print("="*80)

try:
    # Get predictions
    predictions = bert_trainer.trainer.predict(test_dataset)

    # Extract predictions and probabilities
    logits = predictions.predictions
    y_test_pred = np.argmax(logits, axis=1)
    y_test_proba = torch.softmax(torch.tensor(logits), dim=1).numpy()
    y_test_true = test_df['label'].values

    # The labels come from the dataframe, not from the dataset that was
    # scored, so make sure both describe the same number of samples before
    # any metric is computed from them.
    if len(y_test_true) != len(y_test_pred):
        raise ValueError(
            f"Label/prediction length mismatch: {len(y_test_true)} labels in test_df "
            f"vs {len(y_test_pred)} predictions from test_dataset"
        )

    print(f"\n✅ Predictions generated:")
    print(f"   Samples: {len(y_test_true):,}")
    print(f"   Predictions shape: {y_test_pred.shape}")
    print(f"   Probabilities shape: {y_test_proba.shape}")

    # Class counts for the predictions and for the ground truth, printed with
    # the same layout (0 is shown as "Real", anything else as "Fake").
    for title, values in (("Prediction distribution", y_test_pred),
                          ("True label distribution", y_test_true)):
        unique, counts = np.unique(values, return_counts=True)
        print(f"\n📊 {title}:")
        for label, count in zip(unique, counts):
            label_name = "Real" if label == 0 else "Fake"
            pct = count / len(values) * 100
            print(f"   {label_name}: {count:>6,} ({pct:5.2f}%)")

except Exception as e:
    # Report, then re-raise: downstream cells need all three arrays
    print(f"❌ Failed to generate predictions: {e}")
    raise


In [None]:
# CELL 10: Compute Extended Metrics
# ============================================================================
# Feeds the test labels, predictions and probabilities to
# compute_extended_metrics, tags the result with the model name, and prints
# the headline numbers plus a per-class classification report.
print("="*80, "COMPUTING EXTENDED METRICS", "="*80, sep="\n")

try:
    roberta_evaluation = compute_extended_metrics(
        y_true=y_test_true,
        y_pred=y_test_pred,
        y_proba=y_test_proba,
    )

    # Tag the results so comparison plots can label this model
    roberta_evaluation['model_name'] = f"RoBERTa ({ModelConfig.MODEL_NAME})"

    print("\n✅ Extended metrics computed!")
    print(f"\n📊 Complete evaluation results:")
    print(f"   Accuracy:  {roberta_evaluation['accuracy']:.4f}")
    print(f"   Precision: {roberta_evaluation['precision']:.4f}")
    print(f"   Recall:    {roberta_evaluation['recall']:.4f}")
    print(f"   F1-score:  {roberta_evaluation['f1']:.4f}")
    # These two keys are printed only when the metrics dict contains them
    if 'roc_auc' in roberta_evaluation:
        print(f"   ROC-AUC:   {roberta_evaluation['roc_auc']:.4f}")
    if 'average_precision' in roberta_evaluation:
        print(f"   Avg Precision: {roberta_evaluation['average_precision']:.4f}")

    # Per-class precision/recall/F1 table from scikit-learn
    from sklearn.metrics import classification_report
    print(f"\n📋 Classification Report:")
    report_text = classification_report(
        y_test_true,
        y_test_pred,
        target_names=['Real', 'Fake'],
        digits=4,
    )
    print(report_text)

except Exception as e:
    # Report, then re-raise: later cells depend on roberta_evaluation
    print(f"❌ Failed to compute metrics: {e}")
    raise


In [None]:
# CELL 11: Create Visualizations
# ============================================================================
# Saves a confusion matrix and a ROC curve for the test predictions under
# VISUALIZATIONS_DIR/roberta. Plotting is best-effort: a failure is printed
# but does not stop the notebook.
print("="*80, "CREATING VISUALIZATIONS", "="*80, sep="\n")

# Destination folder and files
viz_dir = VISUALIZATIONS_DIR / 'roberta'
viz_dir.mkdir(parents=True, exist_ok=True)

try:
    # Confusion matrix
    print("\n📊 Creating confusion matrix...")
    cm_path = viz_dir / "confusion_matrix.png"
    plot_confusion_matrix(
        y_true=y_test_true,
        y_pred=y_test_pred,
        model_name=f"RoBERTa ({ModelConfig.MODEL_NAME})",
        save_path=cm_path,
    )
    print(f"   ✅ Saved: {cm_path}")

    # ROC curve
    print("\n📊 Creating ROC curve...")
    roc_path = viz_dir / "roc_curve.png"
    plot_roc_curve(
        y_true=y_test_true,
        y_proba=y_test_proba,
        model_name=f"RoBERTa ({ModelConfig.MODEL_NAME})",
        save_path=roc_path,
    )
    print(f"   ✅ Saved: {roc_path}")

    print("\n✅ All visualizations created!")

except Exception as e:
    # Deliberately not re-raised: figures are optional for the later cells
    print(f"❌ Visualization failed: {e}")


In [None]:
# CELL 12: Compare with Baseline
# ============================================================================
# Loads the baseline model's saved metrics (if present), plots a side-by-side
# comparison, and prints the per-metric change from baseline to RoBERTa.
# Sets comparison_df to None whenever the comparison could not be produced,
# so later cells can test for that.
print("="*80)
print("MODEL COMPARISON")
print("="*80)

import json

try:
    # Try to load baseline results
    baseline_path = METRICS_DIR / "baseline_evaluation_results.json"

    if baseline_path.exists():
        with open(baseline_path, 'r') as f:
            baseline_evaluation = json.load(f)

        print("✅ Baseline results loaded")

        # Compare models
        print("\n📊 Creating comparison visualization...")
        comparison_df = compare_models(
            [baseline_evaluation, roberta_evaluation],
            save_path=viz_dir / "model_comparison.png"
        )

        # Metrics to report; ROC-AUC only when both models provide it
        metrics_to_compare = ['accuracy', 'precision', 'recall', 'f1']
        if 'roc_auc' in baseline_evaluation and 'roc_auc' in roberta_evaluation:
            metrics_to_compare.append('roc_auc')

        print(f"\n🚀 Performance Improvements (RoBERTa vs Baseline):")
        for metric in metrics_to_compare:
            if metric in baseline_evaluation and metric in roberta_evaluation:
                baseline_val = baseline_evaluation[metric]
                roberta_val = roberta_evaluation[metric]
                improvement = roberta_val - baseline_val
                # Relative change; reported as 0 when the baseline is not positive
                pct_improvement = (improvement / baseline_val) * 100 if baseline_val > 0 else 0

                # The '+' format flag prints the real sign, so a regression
                # shows as "-0.0123" instead of the misleading "+-0.0123".
                print(f"   {metric.capitalize():15}: {baseline_val:.4f} → {roberta_val:.4f} "
                      f"({improvement:+.4f}, {pct_improvement:+.2f}%)")

    else:
        print("⚠️  Baseline results not found")
        print(f"   Looking for: {baseline_path}")
        print("   Skipping comparison")
        comparison_df = None

except Exception as e:
    # Comparison is optional: report the problem and carry on without it
    print(f"⚠️  Comparison failed: {e}")
    comparison_df = None

In [None]:
# CELL 13: Save All Results
# ============================================================================
# Writes the evaluation metrics, training results and raw test predictions
# as JSON under METRICS_DIR/roberta, then asks the trainer to save the model.
print("="*80, "SAVING RESULTS", "="*80, sep="\n")

# Destination folder for all JSON outputs
metrics_dir = METRICS_DIR / 'roberta'
metrics_dir.mkdir(parents=True, exist_ok=True)

try:
    # 1. Evaluation metrics
    eval_path = metrics_dir / "evaluation_results.json"
    save_evaluation_results(roberta_evaluation, str(eval_path))
    print(f"✅ Evaluation: {eval_path}")

    # 2. Training results
    train_path = metrics_dir / "training_results.json"
    save_training_results(train_results, str(train_path))
    print(f"✅ Training:   {train_path}")

    # 3. Raw predictions, converted to plain lists/scalars for JSON
    is_correct = (y_test_true == y_test_pred)
    predictions_data = {
        'y_true': y_test_true.tolist(),
        'y_pred': y_test_pred.tolist(),
        'y_proba': y_test_proba.tolist(),
        'model_name': f"RoBERTa ({ModelConfig.MODEL_NAME})",
        'num_samples': len(y_test_true),
        'num_correct': int(is_correct.sum()),
        'accuracy': float(is_correct.mean()),
    }
    pred_path = metrics_dir / "test_predictions.json"
    save_json(predictions_data, pred_path)
    print(f"✅ Predictions: {pred_path}")

    # 4. Model, saved through the trainer wrapper
    bert_trainer.save_model()
    print(f"✅ Model:      {output_dir}")

    print("\n✅ All results saved successfully!")

except Exception as e:
    # Report, then re-raise so a partial save is not mistaken for success
    print(f"❌ Failed to save results: {e}")
    raise


In [None]:
# CELL 14: Final Summary
# ============================================================================
# Prints a recap of the run: completed steps, final test metrics, where the
# outputs were written, and (when the baseline comparison from cell 12 is
# available) the change in accuracy and F1 relative to the baseline.
print("\n" + "="*80)
print("🎉 ROBERTA MODEL TRAINING COMPLETE!")
print("="*80)

print("\n✅ Completed Tasks:")
print("   ✓ Loaded and prepared data")
print("   ✓ Created PyTorch datasets")
print("   ✓ Fine-tuned RoBERTa model")
print("   ✓ Evaluated on test set")
print("   ✓ Generated predictions")
print("   ✓ Computed extended metrics")
print("   ✓ Created visualizations")
if comparison_df is not None:
    print("   ✓ Compared with baseline")
print("   ✓ Saved all results")

print(f"\n📊 Final Model Performance:")
print(f"   Model:     {ModelConfig.MODEL_NAME}")
print(f"   Device:    {device}")
print(f"   Accuracy:  {roberta_evaluation['accuracy']:.4f}")
print(f"   Precision: {roberta_evaluation['precision']:.4f}")
print(f"   Recall:    {roberta_evaluation['recall']:.4f}")
print(f"   F1-score:  {roberta_evaluation['f1']:.4f}")
if 'roc_auc' in roberta_evaluation:
    print(f"   ROC-AUC:   {roberta_evaluation['roc_auc']:.4f}")
print(f"   Training:  {train_results['training_time']:.2f}s")

print(f"\n📁 Output Locations:")
print(f"   Model:         {output_dir}")
print(f"   Metrics:       {metrics_dir}")
print(f"   Visualizations: {viz_dir}")

# baseline_evaluation is only read when comparison_df is set, i.e. when the
# comparison cell produced a result.
if comparison_df is not None:
    print(f"\n🚀 Best Performance Gains:")
    for metric in ['accuracy', 'f1']:
        if metric in roberta_evaluation and metric in baseline_evaluation:
            improvement = roberta_evaluation[metric] - baseline_evaluation[metric]
            # '+' format flag prints the true sign (no "+-0.0123" on a
            # regression). The second figure is the absolute difference x100,
            # i.e. percentage points, so it is labelled "pp" rather than "%".
            print(f"   {metric.capitalize()}: {improvement:+.4f} ({improvement*100:+.2f} pp)")

print("\n🎯 Next Steps:")
print("   → Deploy model to production")
print("   → Build FastAPI endpoint")
print("   → Create frontend interface")
print("   → Monitor model performance")

print("\n" + "="*80)
print("🎊 PROJECT READY FOR DEPLOYMENT!")
print("="*80)