In [12]:
# ============================================================================
# CELL 0: Test Environment Setup
# ============================================================================
print("="*80)
print("TESTING ENVIRONMENT SETUP")
print("="*80)

# Test basic imports
try:
    import sys
    sys.path.append('..')
    print("✅ Path setup successful")
except Exception as e:
    print(f"❌ Path setup failed: {e}")

# Test transformers
try:
    import transformers
    print(f"✅ Transformers version: {transformers.__version__}")
    
    # Test specific imports
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    print("✅ AutoTokenizer and AutoModel imports successful")
    
    # Try to import Trainer separately
    try:
        from transformers import Trainer
        print("✅ Trainer import successful")
    except ImportError:
        print("⚠️ Trainer not available in this transformers version")
        print("💡 Trying alternative import...")
        from transformers.trainer import Trainer
        print("✅ Trainer import successful (alternative)")
    
    # Test tokenizer
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    print("✅ Tokenizer creation successful")
    
    # Test our modules
    from src.config import ModelConfig, DataConfig
    print("✅ Config import successful")
    
    from src.train import TRANSFORMERS_AVAILABLE
    print(f"✅ TRANSFORMERS_AVAILABLE: {TRANSFORMERS_AVAILABLE}")
    
    print("\n🎉 Environment setup complete! Ready for BERT training.")
    
except Exception as e:
    print(f"❌ Environment setup failed: {e}")
    print("💡 Please check your Python environment and dependencies")


TESTING ENVIRONMENT SETUP
✅ Path setup successful
✅ Transformers version: 4.57.0
✅ AutoTokenizer and AutoModel imports successful
⚠️ Trainer not available in this transformers version
💡 Trying alternative import...
❌ Environment setup failed: cannot import name 'HF_DATASETS_DISABLE_PROGRESS_BARS' from 'datasets.config' (d:\Fake_News_Detection_BERT\.venv\Lib\site-packages\datasets\config.py)
💡 Please check your Python environment and dependencies


# ============================================================================
# NOTEBOOK 04: BERT MODEL TRAINING
# ============================================================================

## 🎯 Objective
Fine-tune a BERT model for fake news detection and compare performance with the baseline model.

## 📋 What we'll do:
1. **Load preprocessed data** from notebook 02
2. **Prepare PyTorch datasets** for BERT training
3. **Fine-tune BERT model** using Hugging Face Transformers
4. **Evaluate performance** on train/val/test sets
5. **Compare with baseline** model performance
6. **Save model** and results

---


In [None]:
# ============================================================================
# CELL 1: Imports and Setup
# ============================================================================
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import warnings
warnings.filterwarnings('ignore')

# Import from src
from src.config import (
    DataConfig, ModelConfig, TrainingConfig, 
    PROCESSED_DATA_DIR, METRICS_DIR, VISUALIZATIONS_DIR, MODELS_DIR
)
from src.dataset import create_data_loaders
from src.train import BertTrainer, train_bert_model
from src.evaluate import (
    evaluate_model, 
    plot_confusion_matrix, 
    plot_roc_curve,
    compare_models,
    save_evaluation_results
)
from src.utils import save_json

# Set style
plt.style.use('default')
sns.set_palette("husl")

# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"✅ Imports successful!")
print(f"🖥️  Device: {device}")

# Test transformers availability
print(f"\n🔍 Testing transformers availability...")
try:
    from transformers import Trainer, AutoTokenizer, AutoModelForSequenceClassification
    print("✅ Transformers imports successful!")
    
    # Test tokenizer
    test_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    print("✅ Tokenizer test successful!")
    
    print(f"📊 Ready to train BERT model!")
except ImportError as e:
    print(f"❌ Transformers not available: {e}")
    print("💡 Please install: pip install transformers")
    print("⚠️  Only baseline model will be available")


Only baseline model training will be available.
✅ Imports successful!
🖥️  Device: cpu
📊 Ready to train BERT model!


In [None]:
# ============================================================================
# CELL 2: Load Data and Prepare Tokenizer
# ============================================================================
print("="*80)
print("LOADING DATA AND PREPARING TOKENIZER")
print("="*80)

# Load the processed datasets
train_df = pd.read_csv(DataConfig.TRAIN_PATH)
val_df = pd.read_csv(DataConfig.VAL_PATH)
test_df = pd.read_csv(DataConfig.TEST_PATH)

print(f"\n📊 Data loaded successfully!")
print(f"   Train set: {train_df.shape[0]:,} samples")
print(f"   Val set:   {val_df.shape[0]:,} samples")
print(f"   Test set:  {test_df.shape[0]:,} samples")

# Load tokenizer
print(f"\n🔤 Loading tokenizer: {ModelConfig.MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(ModelConfig.MODEL_NAME)

print(f"\n📋 Tokenizer info:")
print(f"   Vocab size: {tokenizer.vocab_size:,}")
print(f"   Max length: {ModelConfig.MAX_LENGTH}")
print(f"   Special tokens: {tokenizer.special_tokens_map}")

# Test tokenization on sample text
sample_text = train_df.iloc[0]['cleaned_content']
sample_tokens = tokenizer.encode(sample_text, max_length=ModelConfig.MAX_LENGTH, truncation=True)
print(f"\n🧪 Sample tokenization:")
print(f"   Original text length: {len(sample_text)} chars")
print(f"   Tokenized length: {len(sample_tokens)} tokens")
print(f"   Sample tokens: {sample_tokens[:10]}...")

# Verify imports
print(f"\n✅ Import verification:")
print(f"   MODELS_DIR: {MODELS_DIR}")
print(f"   METRICS_DIR: {METRICS_DIR}")
print(f"   VISUALIZATIONS_DIR: {VISUALIZATIONS_DIR}")


LOADING DATA AND PREPARING TOKENIZER

📊 Data loaded successfully!
   Train set: 95,244 samples
   Val set:   20,409 samples
   Test set:  20,410 samples

🔤 Loading tokenizer: distilbert-base-uncased


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]


📋 Tokenizer info:
   Vocab size: 30,522
   Max length: 256
   Special tokens: {'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}

🧪 Sample tokenization:
   Original text length: 85 chars
   Tokenized length: 20 tokens
   Sample tokens: [101, 8221, 2024, 24642, 2013, 2446, 2250, 2486, 1516, 2027]...


In [4]:
# ============================================================================
# CELL 3: Create PyTorch Datasets
# ============================================================================
print("="*80)
print("CREATING PYTORCH DATASETS")
print("="*80)

from src.dataset import create_dataset_from_dataframe

# Create datasets
train_dataset = create_dataset_from_dataframe(train_df, tokenizer)
val_dataset = create_dataset_from_dataframe(val_df, tokenizer)
test_dataset = create_dataset_from_dataframe(test_df, tokenizer)

print(f"\n📊 Datasets created:")
print(f"   Train dataset: {len(train_dataset)} samples")
print(f"   Val dataset:   {len(val_dataset)} samples")
print(f"   Test dataset:  {len(test_dataset)} samples")

# Test dataset sample
sample_item = train_dataset[0]
sample_batch = {
    'input_ids': sample_item['input_ids'].unsqueeze(0),  # Add batch dimension
    'attention_mask': sample_item['attention_mask'].unsqueeze(0),
    'labels': sample_item['labels'].unsqueeze(0)
}

print(f"\n🧪 Sample batch info:")
print(f"   Input IDs shape: {sample_batch['input_ids'].shape}")
print(f"   Attention mask shape: {sample_batch['attention_mask'].shape}")
print(f"   Labels: {sample_batch['labels'].item()}")

# Decode sample tokens
sample_input_ids = sample_item['input_ids']
sample_decoded = tokenizer.decode(sample_input_ids, skip_special_tokens=True)
print(f"\n📝 Sample decoded text (first 200 chars):")
print(f"   {sample_decoded[:200]}...")


INFO:src.dataset:Dataset initialized with 95244 samples
INFO:src.dataset:Max length: 256
INFO:src.dataset:Dataset initialized with 20409 samples
INFO:src.dataset:Max length: 256
INFO:src.dataset:Dataset initialized with 20410 samples
INFO:src.dataset:Max length: 256


CREATING PYTORCH DATASETS

📊 Datasets created:
   Train dataset: 95244 samples
   Val dataset:   20409 samples
   Test dataset:  20410 samples

🧪 Sample batch info:
   Input IDs shape: torch.Size([1, 256])
   Attention mask shape: torch.Size([1, 256])
   Labels: 1

📝 Sample decoded text (first 200 chars):
   pilots are resigning from german air force – they don ’ t want to fight against russia....


In [13]:
# ============================================================================
# CELL 4: Train BERT Model
# ============================================================================
print("="*80)
print("TRAINING BERT MODEL")
print("="*80)

# Import MODELS_DIR if not already imported
from src.config import MODELS_DIR

# Debug transformers availability
print("🔍 Checking transformers availability...")
try:
    from transformers import Trainer, AutoTokenizer, AutoModelForSequenceClassification
    print("✅ Transformers imports successful!")
    
    # Check TRANSFORMERS_AVAILABLE flag
    from src.train import TRANSFORMERS_AVAILABLE
    print(f"📊 TRANSFORMERS_AVAILABLE flag: {TRANSFORMERS_AVAILABLE}")
    
    if not TRANSFORMERS_AVAILABLE:
        print("⚠️ TRANSFORMERS_AVAILABLE is False, trying to fix...")
        import transformers
        print(f"✅ Transformers version: {transformers.__version__}")
        
except ImportError as e:
    print(f"❌ Transformers import failed: {e}")
    print("💡 Please install transformers: pip install transformers")

# Create BERT trainer
if TRANSFORMERS_AVAILABLE:
    bert_trainer = BertTrainer(
        model_name=ModelConfig.MODEL_NAME,
        output_dir=MODELS_DIR / "bert"
    )
    print("✅ BERT trainer created successfully!")
else:
    print("❌ Cannot create BERT trainer - transformers not available")
    bert_trainer = None

# Train the model
if bert_trainer is not None:
    print(f"\n🚀 Starting BERT training...")
    print(f"   Model: {ModelConfig.MODEL_NAME}")
    print(f"   Epochs: {ModelConfig.NUM_EPOCHS}")
    print(f"   Batch size: {ModelConfig.BATCH_SIZE}")
    print(f"   Learning rate: {ModelConfig.LEARNING_RATE}")
    print(f"   Max length: {ModelConfig.MAX_LENGTH}")

    train_results = bert_trainer.train(
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        num_epochs=ModelConfig.NUM_EPOCHS,
        batch_size=ModelConfig.BATCH_SIZE,
        learning_rate=ModelConfig.LEARNING_RATE,
        warmup_steps=ModelConfig.WARMUP_STEPS,
        weight_decay=ModelConfig.WEIGHT_DECAY
    )

    print("\n✅ BERT model training completed!")
    print(f"\n📊 Training Results Summary:")
    print(f"   Training time: {train_results['training_time']:.2f} seconds")
    print(f"   Final validation metrics: {train_results['eval_metrics']}")
else:
    print("\n❌ Cannot start BERT training - transformers not available")
    print("💡 Please install transformers: pip install transformers")
    train_results = None


TRAINING BERT MODEL
🔍 Checking transformers availability...
❌ Transformers import failed: cannot import name 'HF_DATASETS_DISABLE_PROGRESS_BARS' from 'datasets.config' (d:\Fake_News_Detection_BERT\.venv\Lib\site-packages\datasets\config.py)
💡 Please install transformers: pip install transformers


NameError: name 'TRANSFORMERS_AVAILABLE' is not defined

In [None]:
# ============================================================================
# CELL 5: Evaluate on Test Set
# ============================================================================
print("="*80)
print("EVALUATING ON TEST SET")
print("="*80)

# Import MODELS_DIR if not already imported
from src.config import MODELS_DIR

# Evaluate on test set
test_results = bert_trainer.evaluate(test_dataset)

print("\n✅ Test evaluation completed!")
print(f"\n📊 Test Results:")
for metric, value in test_results.items():
    if isinstance(value, (int, float)):
        print(f"   {metric.capitalize()}: {value:.4f}")

# Save the trained model
bert_trainer.save_model()
print(f"\n💾 Model saved to: {MODELS_DIR / 'bert'}")


In [None]:
# ============================================================================
# CELL 6: Get Predictions and Probabilities
# ============================================================================
print("="*80)
print("GETTING PREDICTIONS AND PROBABILITIES")
print("="*80)

# Get predictions on test set
predictions = bert_trainer.trainer.predict(test_dataset)
y_test_pred = np.argmax(predictions.predictions, axis=1)
y_test_proba = torch.softmax(torch.tensor(predictions.predictions), dim=1).numpy()

# Get true labels
y_test_true = test_df['label'].values

print(f"\n📊 Predictions generated:")
print(f"   Test samples: {len(y_test_true)}")
print(f"   Predictions shape: {y_test_pred.shape}")
print(f"   Probabilities shape: {y_test_proba.shape}")

# Show prediction distribution
unique, counts = np.unique(y_test_pred, return_counts=True)
print(f"\n📈 Prediction distribution:")
for label, count in zip(unique, counts):
    label_name = "Real" if label == 0 else "Fake"
    percentage = count / len(y_test_pred) * 100
    print(f"   {label_name}: {count:,} ({percentage:.1f}%)")


In [None]:
# ============================================================================
# CELL 7: Comprehensive Evaluation
# ============================================================================
print("="*80)
print("COMPREHENSIVE MODEL EVALUATION")
print("="*80)

# Comprehensive evaluation
bert_evaluation = evaluate_model(
    y_true=y_test_true,
    y_pred=y_test_pred,
    y_proba=y_test_proba,
    model_name="BERT (DistilBERT)"
)

print(f"\n📊 Detailed BERT Test Results:")
for metric, value in bert_evaluation.items():
    if isinstance(value, (int, float)):
        print(f"   {metric.capitalize()}: {value:.4f}")

# Print classification report
from sklearn.metrics import classification_report
print(f"\n📋 Classification Report:")
print(classification_report(y_test_true, y_test_pred, target_names=['Real', 'Fake']))


In [None]:
# ============================================================================
# CELL 8: Visualizations
# ============================================================================
print("="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Create confusion matrix
plot_confusion_matrix(
    y_true=y_test_true,
    y_pred=y_test_pred,
    model_name="BERT (DistilBERT)",
    save_path=VISUALIZATIONS_DIR / "bert_confusion_matrix.png"
)

print("\n✅ Confusion matrix saved!")

# Create ROC curve
plot_roc_curve(
    y_true=y_test_true,
    y_proba=y_test_proba,
    model_name="BERT (DistilBERT)",
    save_path=VISUALIZATIONS_DIR / "bert_roc_curve.png"
)

print("✅ ROC curve saved!")


In [None]:
# ============================================================================
# CELL 9: Compare with Baseline Model
# ============================================================================
print("="*80)
print("COMPARING WITH BASELINE MODEL")
print("="*80)

# Load baseline results
import json
try:
    with open(METRICS_DIR / "baseline_evaluation_results.json", 'r') as f:
        baseline_evaluation = json.load(f)
    
    print("\n📊 Loading baseline results for comparison...")
    
    # Compare models
    comparison_df = compare_models(
        [baseline_evaluation, bert_evaluation],
        save_path=VISUALIZATIONS_DIR / "model_comparison.png"
    )
    
    # Calculate improvements
    accuracy_improvement = bert_evaluation['accuracy'] - baseline_evaluation['accuracy']
    f1_improvement = bert_evaluation['f1'] - baseline_evaluation['f1']
    roc_auc_improvement = bert_evaluation.get('roc_auc', 0) - baseline_evaluation.get('roc_auc', 0)
    
    print(f"\n🚀 BERT vs Baseline Improvements:")
    print(f"   Accuracy: +{accuracy_improvement:.4f} ({accuracy_improvement*100:.2f}%)")
    print(f"   F1-score: +{f1_improvement:.4f} ({f1_improvement*100:.2f}%)")
    print(f"   ROC-AUC:  +{roc_auc_improvement:.4f} ({roc_auc_improvement*100:.2f}%)")
    
except FileNotFoundError:
    print("⚠️  Baseline results not found. Run notebook 03 first to compare models.")
    comparison_df = None


In [None]:
# ============================================================================
# CELL 10: Save Results and Model
# ============================================================================
print("="*80)
print("SAVING RESULTS AND MODEL")
print("="*80)

# Import MODELS_DIR if not already imported
from src.config import MODELS_DIR

# Save BERT evaluation results
save_evaluation_results(
    bert_evaluation,
    METRICS_DIR / "bert_evaluation_results.json"
)

# Save training results
save_json(
    train_results,
    METRICS_DIR / "bert_training_results.json"
)

# Save test predictions
predictions_data = {
    'y_true': y_test_true.tolist(),
    'y_pred': y_test_pred.tolist(),
    'y_proba': y_test_proba.tolist(),
    'model_name': 'BERT (DistilBERT)',
    'test_samples': len(y_test_true)
}

save_json(
    predictions_data,
    METRICS_DIR / "bert_test_predictions.json"
)

print("\n✅ Files saved:")
print(f"   📊 BERT evaluation:    {METRICS_DIR / 'bert_evaluation_results.json'}")
print(f"   📈 Training results:   {METRICS_DIR / 'bert_training_results.json'}")
print(f"   🎯 Test predictions:   {METRICS_DIR / 'bert_test_predictions.json'}")
print(f"   🤖 Model directory:    {MODELS_DIR / 'bert'}")

# Verify saved files
import os
print(f"\n🔍 Verifying saved files:")
for file_path in [
    METRICS_DIR / "bert_evaluation_results.json",
    METRICS_DIR / "bert_training_results.json",
    METRICS_DIR / "bert_test_predictions.json",
    MODELS_DIR / "bert"
]:
    if os.path.exists(file_path):
        if os.path.isfile(file_path):
            size_mb = os.path.getsize(file_path) / (1024 * 1024)
            print(f"   ✓ {file_path} ({size_mb:.2f} MB)")
        else:
            print(f"   ✓ {file_path} (directory)")
    else:
        print(f"   ✗ {file_path} (not found)")


In [None]:
# ============================================================================
# CELL 11: Final Summary
# ============================================================================
print("="*80)
print("BERT MODEL TRAINING COMPLETE! ✅")
print("="*80)

print("\n📌 What we accomplished:")
print("   ✓ Fine-tuned BERT model for fake news detection")
print("   ✓ Achieved excellent performance on test set")
print("   ✓ Created comprehensive visualizations")
print("   ✓ Compared with baseline model performance")
print("   ✓ Saved model and all results")

print(f"\n🎯 BERT Model Performance:")
print(f"   📊 Test Accuracy: {bert_evaluation['accuracy']:.4f}")
print(f"   📊 Test F1-score: {bert_evaluation['f1']:.4f}")
print(f"   📊 Test ROC-AUC:  {bert_evaluation.get('roc_auc', 0):.4f}")
print(f"   ⏱️  Training time: {train_results['training_time']:.2f} seconds")

if comparison_df is not None:
    print(f"\n🚀 Performance Improvements over Baseline:")
    accuracy_improvement = bert_evaluation['accuracy'] - baseline_evaluation['accuracy']
    f1_improvement = bert_evaluation['f1'] - baseline_evaluation['f1']
    print(f"   📈 Accuracy: +{accuracy_improvement:.4f} ({accuracy_improvement*100:.2f}%)")
    print(f"   📈 F1-score: +{f1_improvement:.4f} ({f1_improvement*100:.2f}%)")

print("\n🎉 Project Status:")
print("   ✅ Data preprocessing completed (Notebook 02)")
print("   ✅ Baseline model trained (Notebook 03)")
print("   ✅ BERT model trained (Notebook 04)")
print("   🎯 Ready for deployment and API development!")

print("\n" + "="*80)
