In [None]:
# ============================================================================
# CELL -3: Strip emojis from prints (Kaggle/Colab safe)
# ============================================================================
import builtins, re

_emoji_pattern = re.compile(
    "[\U0001F600-\U0001F64F]"  # emoticons
    "|[\U0001F300-\U0001F5FF]"  # symbols & pictographs
    "|[\U0001F680-\U0001F6FF]"  # transport & map symbols
    "|[\U0001F1E0-\U0001F1FF]"  # flags
    "|[\u2600-\u26FF]"          # misc symbols
    "|[\u2700-\u27BF]"          # dingbats
    , flags=re.UNICODE
)

_orig_print = builtins.print

def _clean_text(obj):
    try:
        s = str(obj)
        s = _emoji_pattern.sub('', s)
        return s
    except Exception:
        return str(obj)

def print(*args, **kwargs):
    cleaned = [_clean_text(a) for a in args]
    return _orig_print(*cleaned, **kwargs)



In [None]:
# ============================================================================
# CELL -2: Kaggle/Colab Dependency Setup
# ============================================================================
import sys
import subprocess

packages = [
    'transformers',
    'datasets',
    'accelerate',
    'scikit-learn',
    'pandas',
    'matplotlib',
    'seaborn'
]

def pip_install(pkgs):
    for p in pkgs:
        try:
            __import__(p.replace('-', '_'))
        except Exception:
            print(f"Installing {p}...")
            subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', p])

# Try requirements.txt first if present, else fallback to individual installs
try:
    import os
    if os.path.exists('requirements.txt'):
        print('Installing from requirements.txt ...')
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', '-r', 'requirements.txt'])
    else:
        pip_install(packages)
except Exception as e:
    print(f"requirements.txt install failed: {e}; installing core packages...")
    pip_install(packages)

print('✅ Dependencies ready')



In [1]:
# ============================================================================
# CELL 0: Test Environment Setup
# ============================================================================
print("="*80)
print("TESTING ENVIRONMENT SETUP")
print("="*80)

# Test basic imports
try:
    import sys
    sys.path.append('..')
    print("✅ Path setup successful")
except Exception as e:
    print(f"❌ Path setup failed: {e}")

# Test transformers
try:
    import transformers
    print(f"✅ Transformers version: {transformers.__version__}")
    
    # Test specific imports
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    print("✅ AutoTokenizer and AutoModel imports successful")
    
    # Try to import Trainer separately
    try:
        from transformers import Trainer
        print("✅ Trainer import successful")
    except ImportError:
        print("⚠️ Trainer not available in this transformers version")
        print("💡 Trying alternative import...")
        from transformers.trainer import Trainer
        print("✅ Trainer import successful (alternative)")
    
    # Test tokenizer
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    print("✅ Tokenizer creation successful")
    
    # Test our modules
    from src.config import ModelConfig, DataConfig
    print("✅ Config import successful")
    
    from src.train import TRANSFORMERS_AVAILABLE
    print(f"✅ TRANSFORMERS_AVAILABLE: {TRANSFORMERS_AVAILABLE}")
    
    print("\n🎉 Environment setup complete! Ready for BERT training.")
    
except Exception as e:
    print(f"❌ Environment setup failed: {e}")
    print("💡 Please check your Python environment and dependencies")


TESTING ENVIRONMENT SETUP
✅ Path setup successful
✅ Transformers version: 4.39.3
✅ AutoTokenizer and AutoModel imports successful



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "d:\Fake_News_Detection_BERT\venv\Lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "d:\Fake_News_Detection_BERT\venv\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "d:\Fake_News_Detection_BERT\venv\Lib\site-packages\ipykernel\kernelapp.py", line 736, in start
    self.io_loop.start()
  File "d:\Fake_News_D

✅ Trainer import successful




✅ Tokenizer creation successful
✅ Transformers 4.39.3 loaded successfully
✅ Config import successful
✅ TRANSFORMERS_AVAILABLE: True

🎉 Environment setup complete! Ready for BERT training.


In [None]:
# ============================================================================
# CELL -0: Ensure Project Directories
# ============================================================================
from src.config import create_directories
create_directories()



# ============================================================================
# NOTEBOOK 04: BERT MODEL TRAINING
# ============================================================================

## 🎯 Objective
Fine-tune a BERT model for fake news detection and compare performance with the baseline model.

## 📋 What we'll do:
1. **Load preprocessed data** from notebook 02
2. **Prepare PyTorch datasets** for BERT training
3. **Fine-tune BERT model** using Hugging Face Transformers
4. **Evaluate performance** on train/val/test sets
5. **Compare with baseline** model performance
6. **Save model** and results

---


In [2]:
# ============================================================================
# CELL 1: Imports and Setup
# ============================================================================
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import warnings
warnings.filterwarnings('ignore')

# Import from src
from src.config import (
    DataConfig, ModelConfig, TrainingConfig, 
    PROCESSED_DATA_DIR, METRICS_DIR, VISUALIZATIONS_DIR, MODELS_DIR
)
from src.dataset import create_data_loaders
from src.train import BertTrainer, train_bert_model
from src.evaluate import (
    evaluate_model, 
    plot_confusion_matrix, 
    plot_roc_curve,
    compare_models,
    save_evaluation_results
)
from src.utils import save_json

# Set style
plt.style.use('default')
sns.set_palette("husl")

# Check device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"✅ Imports successful!")
print(f"🖥️  Device: {device}")

# Test transformers availability
print(f"\n🔍 Testing transformers availability...")
try:
    from transformers import Trainer, AutoTokenizer, AutoModelForSequenceClassification
    print("✅ Transformers imports successful!")
    
    # Test tokenizer
    test_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
    print("✅ Tokenizer test successful!")
    
    print(f"📊 Ready to train BERT model!")
except ImportError as e:
    print(f"❌ Transformers not available: {e}")
    print("💡 Please install: pip install transformers")
    print("⚠️  Only baseline model will be available")


✅ Imports successful!
🖥️  Device: cpu

🔍 Testing transformers availability...
✅ Transformers imports successful!
✅ Tokenizer test successful!
📊 Ready to train BERT model!


In [3]:
# ============================================================================
# CELL 2: Load Data and Prepare Tokenizer
# ============================================================================
print("="*80)
print("LOADING DATA AND PREPARING TOKENIZER")
print("="*80)

# Load the processed datasets
train_df = pd.read_csv(DataConfig.TRAIN_PATH)
val_df = pd.read_csv(DataConfig.VAL_PATH)
test_df = pd.read_csv(DataConfig.TEST_PATH)

print(f"\n📊 Data loaded successfully!")
print(f"   Train set: {train_df.shape[0]:,} samples")
print(f"   Val set:   {val_df.shape[0]:,} samples")
print(f"   Test set:  {test_df.shape[0]:,} samples")

# Load tokenizer
print(f"\n🔤 Loading tokenizer: {ModelConfig.MODEL_NAME}")
tokenizer = AutoTokenizer.from_pretrained(ModelConfig.MODEL_NAME)

print(f"\n📋 Tokenizer info:")
print(f"   Vocab size: {tokenizer.vocab_size:,}")
print(f"   Max length: {ModelConfig.MAX_LENGTH}")
print(f"   Special tokens: {tokenizer.special_tokens_map}")

# Test tokenization on sample text
sample_text = train_df.iloc[0]['cleaned_content']
sample_tokens = tokenizer.encode(sample_text, max_length=ModelConfig.MAX_LENGTH, truncation=True)
print(f"\n🧪 Sample tokenization:")
print(f"   Original text length: {len(sample_text)} chars")
print(f"   Tokenized length: {len(sample_tokens)} tokens")
print(f"   Sample tokens: {sample_tokens[:10]}...")

# Verify imports
print(f"\n✅ Import verification:")
print(f"   MODELS_DIR: {MODELS_DIR}")
print(f"   METRICS_DIR: {METRICS_DIR}")
print(f"   VISUALIZATIONS_DIR: {VISUALIZATIONS_DIR}")


LOADING DATA AND PREPARING TOKENIZER

📊 Data loaded successfully!
   Train set: 95,244 samples
   Val set:   20,409 samples
   Test set:  20,410 samples

🔤 Loading tokenizer: roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]


📋 Tokenizer info:
   Vocab size: 50,265
   Max length: 256
   Special tokens: {'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}

🧪 Sample tokenization:
   Original text length: 85 chars
   Tokenized length: 26 tokens
   Sample tokens: [0, 642, 718, 5992, 32, 6749, 154, 31, 821, 7043]...

✅ Import verification:
   MODELS_DIR: d:\Fake_News_Detection_BERT\notebooks\..\models
   METRICS_DIR: d:\Fake_News_Detection_BERT\notebooks\..\results\metrics
   VISUALIZATIONS_DIR: d:\Fake_News_Detection_BERT\notebooks\..\results\visualizations


In [4]:
# ============================================================================
# CELL 3: Create PyTorch Datasets
# ============================================================================
print("="*80)
print("CREATING PYTORCH DATASETS")
print("="*80)

from src.dataset import create_dataset_from_dataframe

# Create datasets
train_dataset = create_dataset_from_dataframe(train_df, tokenizer)
val_dataset = create_dataset_from_dataframe(val_df, tokenizer)
test_dataset = create_dataset_from_dataframe(test_df, tokenizer)

print(f"\n📊 Datasets created:")
print(f"   Train dataset: {len(train_dataset)} samples")
print(f"   Val dataset:   {len(val_dataset)} samples")
print(f"   Test dataset:  {len(test_dataset)} samples")

# Test dataset sample
sample_item = train_dataset[0]
sample_batch = {
    'input_ids': sample_item['input_ids'].unsqueeze(0),  # Add batch dimension
    'attention_mask': sample_item['attention_mask'].unsqueeze(0),
    'labels': sample_item['labels'].unsqueeze(0)
}

print(f"\n🧪 Sample batch info:")
print(f"   Input IDs shape: {sample_batch['input_ids'].shape}")
print(f"   Attention mask shape: {sample_batch['attention_mask'].shape}")
print(f"   Labels: {sample_batch['labels'].item()}")

# Decode sample tokens
sample_input_ids = sample_item['input_ids']
sample_decoded = tokenizer.decode(sample_input_ids, skip_special_tokens=True)
print(f"\n📝 Sample decoded text (first 200 chars):")
print(f"   {sample_decoded[:200]}...")


INFO:src.dataset:Dataset initialized with 95244 samples
INFO:src.dataset:Max length: 256
INFO:src.dataset:Dataset initialized with 20409 samples
INFO:src.dataset:Max length: 256
INFO:src.dataset:Dataset initialized with 20410 samples
INFO:src.dataset:Max length: 256


CREATING PYTORCH DATASETS

📊 Datasets created:
   Train dataset: 95244 samples
   Val dataset:   20409 samples
   Test dataset:  20410 samples

🧪 Sample batch info:
   Input IDs shape: torch.Size([1, 256])
   Attention mask shape: torch.Size([1, 256])
   Labels: 1

📝 Sample decoded text (first 200 chars):
   pilots are resigning from german air force – they don ’ t want to fight against russia....


In [None]:
# ============================================================================
# CELL 4: Train BERT Model
# ============================================================================
print("="*80)
print("TRAINING BERT MODEL")
print("="*80)

# Import MODELS_DIR if not already imported
from src.config import MODELS_DIR

# Debug transformers availability
print("🔍 Checking transformers availability...")
try:
    from transformers import Trainer, AutoTokenizer, AutoModelForSequenceClassification
    print("✅ Transformers imports successful!")
    
    # Check TRANSFORMERS_AVAILABLE flag
    from src.train import TRANSFORMERS_AVAILABLE
    print(f"📊 TRANSFORMERS_AVAILABLE flag: {TRANSFORMERS_AVAILABLE}")
    
    if not TRANSFORMERS_AVAILABLE:
        print("⚠️ TRANSFORMERS_AVAILABLE is False, trying to fix...")
        import transformers
        print(f"✅ Transformers version: {transformers.__version__}")
        
except ImportError as e:
    print(f"❌ Transformers import failed: {e}")
    print("💡 Please install transformers: pip install transformers")

# Create BERT trainer
if TRANSFORMERS_AVAILABLE:
    bert_trainer = BertTrainer(
        model_name=ModelConfig.MODEL_NAME,
        output_dir=MODELS_DIR / "roberta"
    )
    print("✅ RoBERTa trainer created successfully!")
else:
    print("❌ Cannot create BERT trainer - transformers not available")
    bert_trainer = None

# Train the model
if bert_trainer is not None:
    print(f"\n🚀 Starting RoBERTa training...")
    print(f"   Model: {ModelConfig.MODEL_NAME}")
    print(f"   Epochs: {ModelConfig.NUM_EPOCHS}")
    print(f"   Batch size: {ModelConfig.BATCH_SIZE}")
    print(f"   Learning rate: {ModelConfig.LEARNING_RATE}")
    print(f"   Max length: {ModelConfig.MAX_LENGTH}")

    train_results = bert_trainer.train(
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        num_epochs=ModelConfig.NUM_EPOCHS,
        batch_size=ModelConfig.BATCH_SIZE,
        learning_rate=ModelConfig.LEARNING_RATE,
        warmup_steps=ModelConfig.WARMUP_STEPS,
        weight_decay=ModelConfig.WEIGHT_DECAY
    )

    print("\n✅ RoBERTa model training completed!")
    print(f"\n📊 Training Results Summary:")
    print(f"   Training time: {train_results['training_time']:.2f} seconds")
    print(f"   Final validation metrics: {train_results['eval_metrics']}")
else:
    print("\n❌ Cannot start BERT training - transformers not available")
    print("💡 Please install transformers: pip install transformers")
    train_results = None


TRAINING BERT MODEL
🔍 Checking transformers availability...
❌ Transformers import failed: cannot import name 'HF_DATASETS_DISABLE_PROGRESS_BARS' from 'datasets.config' (d:\Fake_News_Detection_BERT\.venv\Lib\site-packages\datasets\config.py)
💡 Please install transformers: pip install transformers


NameError: name 'TRANSFORMERS_AVAILABLE' is not defined

In [None]:
# ============================================================================
# CELL 5: Evaluate on Test Set
# ============================================================================
print("="*80)
print("EVALUATING ON TEST SET")
print("="*80)

# Import MODELS_DIR if not already imported
from src.config import MODELS_DIR

# Evaluate on test set
test_results = bert_trainer.evaluate(test_dataset)

print("\n✅ Test evaluation completed!")
print(f"\n📊 Test Results (RoBERTa):")
for metric, value in test_results.items():
    if isinstance(value, (int, float)):
        print(f"   {metric.capitalize()}: {value:.4f}")

# Save the trained model
bert_trainer.save_model()
print(f"\n💾 Model saved to: {MODELS_DIR / 'roberta'}")


In [None]:
# ============================================================================
# CELL 6: Get Predictions and Probabilities
# ============================================================================
print("="*80)
print("GETTING PREDICTIONS AND PROBABILITIES")
print("="*80)

# Get predictions on test set
predictions = bert_trainer.trainer.predict(test_dataset)
y_test_pred = np.argmax(predictions.predictions, axis=1)
y_test_proba = torch.softmax(torch.tensor(predictions.predictions), dim=1).numpy()

# Get true labels
y_test_true = test_df['label'].values

print(f"\n📊 Predictions generated:")
print(f"   Test samples: {len(y_test_true)}")
print(f"   Predictions shape: {y_test_pred.shape}")
print(f"   Probabilities shape: {y_test_proba.shape}")

# Show prediction distribution
unique, counts = np.unique(y_test_pred, return_counts=True)
print(f"\n📈 Prediction distribution:")
for label, count in zip(unique, counts):
    label_name = "Real" if label == 0 else "Fake"
    percentage = count / len(y_test_pred) * 100
    print(f"   {label_name}: {count:,} ({percentage:.1f}%)")


In [None]:
# ============================================================================
# CELL 7: Comprehensive Evaluation
# ============================================================================
print("="*80)
print("COMPREHENSIVE MODEL EVALUATION")
print("="*80)

# Comprehensive evaluation
bert_evaluation = evaluate_model(
    y_true=y_test_true,
    y_pred=y_test_pred,
    y_proba=y_test_proba,
    model_name=f"RoBERTa ({ModelConfig.MODEL_NAME})"
)

print(f"\n📊 Detailed RoBERTa Test Results:")
for metric, value in bert_evaluation.items():
    if isinstance(value, (int, float)):
        print(f"   {metric.capitalize()}: {value:.4f}")

# Print classification report
from sklearn.metrics import classification_report
print(f"\n📋 Classification Report:")
print(classification_report(y_test_true, y_test_pred, target_names=['Real', 'Fake']))


In [None]:
# ============================================================================
# CELL 8: Visualizations
# ============================================================================
print("="*80)
print("CREATING VISUALIZATIONS")
print("="*80)

# Create confusion matrix
plot_confusion_matrix(
    y_true=y_test_true,
    y_pred=y_test_pred,
    model_name=f"RoBERTa ({ModelConfig.MODEL_NAME})",
    save_path=VISUALIZATIONS_DIR / "roberta_confusion_matrix.png"
)

print("\n✅ Confusion matrix saved!")

# Create ROC curve
plot_roc_curve(
    y_true=y_test_true,
    y_proba=y_test_proba,
    model_name=f"RoBERTa ({ModelConfig.MODEL_NAME})",
    save_path=VISUALIZATIONS_DIR / "roberta_roc_curve.png"
)

print("✅ ROC curve saved!")


In [None]:
# ============================================================================
# CELL 9: Compare with Baseline Model
# ============================================================================
print("="*80)
print("COMPARING WITH BASELINE MODEL")
print("="*80)

# Load baseline results
import json
try:
    with open(METRICS_DIR / "baseline_evaluation_results.json", 'r') as f:
        baseline_evaluation = json.load(f)
    
    print("\n📊 Loading baseline results for comparison...")
    
    # Compare models
    comparison_df = compare_models(
        [baseline_evaluation, bert_evaluation],
        save_path=VISUALIZATIONS_DIR / "model_comparison.png"
    )
    
    # Calculate improvements
    accuracy_improvement = bert_evaluation['accuracy'] - baseline_evaluation['accuracy']
    f1_improvement = bert_evaluation['f1'] - baseline_evaluation['f1']
    roc_auc_improvement = bert_evaluation.get('roc_auc', 0) - baseline_evaluation.get('roc_auc', 0)
    
    print(f"\n🚀 RoBERTa vs Baseline Improvements:")
    print(f"   Accuracy: +{accuracy_improvement:.4f} ({accuracy_improvement*100:.2f}%)")
    print(f"   F1-score: +{f1_improvement:.4f} ({f1_improvement*100:.2f}%)")
    print(f"   ROC-AUC:  +{roc_auc_improvement:.4f} ({roc_auc_improvement*100:.2f}%)")
    
except FileNotFoundError:
    print("⚠️  Baseline results not found. Run notebook 03 first to compare models.")
    comparison_df = None


In [None]:
# ============================================================================
# CELL 10: Save Results and Model
# ============================================================================
print("="*80)
print("SAVING RESULTS AND MODEL")
print("="*80)

# Import MODELS_DIR if not already imported
from src.config import MODELS_DIR

# Save RoBERTa evaluation results
save_evaluation_results(
    bert_evaluation,
    METRICS_DIR / "roberta_evaluation_results.json"
)

# Save training results
save_json(
    train_results,
    METRICS_DIR / "roberta_training_results.json"
)

# Save test predictions
predictions_data = {
    'y_true': y_test_true.tolist(),
    'y_pred': y_test_pred.tolist(),
    'y_proba': y_test_proba.tolist(),
    'model_name': f"RoBERTa ({ModelConfig.MODEL_NAME})",
    'test_samples': len(y_test_true)
}

save_json(
    predictions_data,
    METRICS_DIR / "roberta_test_predictions.json"
)

print("\n✅ Files saved:")
print("   📊 RoBERTa evaluation: " + str(METRICS_DIR / 'roberta_evaluation_results.json'))
print(f"   📈 Training results:   {METRICS_DIR / 'roberta_training_results.json'}")
print(f"   🎯 Test predictions:   {METRICS_DIR / 'roberta_test_predictions.json'}")
print(f"   🤖 Model directory:    {MODELS_DIR / 'roberta'}")

# Verify saved files
import os
print(f"\n🔍 Verifying saved files:")
for file_path in [
    METRICS_DIR / "roberta_evaluation_results.json",
    METRICS_DIR / "roberta_training_results.json",
    METRICS_DIR / "roberta_test_predictions.json",
    MODELS_DIR / "roberta"
]:
    if os.path.exists(file_path):
        if os.path.isfile(file_path):
            size_mb = os.path.getsize(file_path) / (1024 * 1024)
            print(f"   ✓ {file_path} ({size_mb:.2f} MB)")
        else:
            print(f"   ✓ {file_path} (directory)")
    else:
        print(f"   ✗ {file_path} (not found)")


In [None]:
# ============================================================================
# CELL 11: Final Summary
# ============================================================================
print("="*80)
print("BERT MODEL TRAINING COMPLETE! ✅")
print("="*80)

print("\n📌 What we accomplished:")
print("   ✓ Fine-tuned BERT model for fake news detection")
print("   ✓ Achieved excellent performance on test set")
print("   ✓ Created comprehensive visualizations")
print("   ✓ Compared with baseline model performance")
print("   ✓ Saved model and all results")

print(f"\n🎯 BERT Model Performance:")
print(f"   📊 Test Accuracy: {bert_evaluation['accuracy']:.4f}")
print(f"   📊 Test F1-score: {bert_evaluation['f1']:.4f}")
print(f"   📊 Test ROC-AUC:  {bert_evaluation.get('roc_auc', 0):.4f}")
print(f"   ⏱️  Training time: {train_results['training_time']:.2f} seconds")

if comparison_df is not None:
    print(f"\n🚀 Performance Improvements over Baseline:")
    accuracy_improvement = bert_evaluation['accuracy'] - baseline_evaluation['accuracy']
    f1_improvement = bert_evaluation['f1'] - baseline_evaluation['f1']
    print(f"   📈 Accuracy: +{accuracy_improvement:.4f} ({accuracy_improvement*100:.2f}%)")
    print(f"   📈 F1-score: +{f1_improvement:.4f} ({f1_improvement*100:.2f}%)")

print("\n🎉 Project Status:")
print("   ✅ Data preprocessing completed (Notebook 02)")
print("   ✅ Baseline model trained (Notebook 03)")
print("   ✅ BERT model trained (Notebook 04)")
print("   🎯 Ready for deployment and API development!")

print("\n" + "="*80)


In [None]:
# ============================================================================
# CELL 12: Evaluate on Your Own CSV (Real-world Data)
# ============================================================================
from pathlib import Path
import pandas as pd
import numpy as np

from src.config import PREDICTIONS_DIR, DataConfig
from src.evaluate import compute_extended_metrics, plot_confusion_matrix, plot_roc_curve

# How to use:
# 1) Kaggle: Add your CSV as an input dataset or use the file navigator to upload.
# 2) Colab: Set csv_path to a file in /content or use files.upload().
# 3) Local: Set csv_path to your local file path.

# Configure your CSV path here (string). Leave empty to use uploader on Colab.
csv_path = ""  # e.g., "/kaggle/input/mydata/eval.csv" or "/content/eval.csv"

# Try Colab uploader if no path supplied
if not csv_path:
    try:
        from google.colab import files  # type: ignore
        print("No csv_path set. Use the dialog to upload a CSV...")
        uploaded = files.upload()
        if uploaded:
            csv_path = list(uploaded.keys())[0]
            print(f"✅ Uploaded: {csv_path}")
    except Exception:
        pass

assert csv_path, "Please set csv_path to your CSV file or upload one."

# Load CSV
user_df = pd.read_csv(csv_path)
print(f"Loaded CSV with shape: {user_df.shape}")
print("Columns:", list(user_df.columns))

# Detect text column
text_col_candidates = [
    DataConfig.CLEANED_TEXT_COLUMN,
    DataConfig.TEXT_COLUMN,
    'text', 'content', 'clean_text', 'cleaned_text'
]
text_col = None
for c in text_col_candidates:
    if c in user_df.columns:
        text_col = c
        break
assert text_col is not None, f"Could not find a text column. Please include one of: {text_col_candidates}"

# Optional label column
label_col = DataConfig.LABEL_COLUMN if DataConfig.LABEL_COLUMN in user_df.columns else None

# Build a lightweight dataset for inference
from torch.utils.data import Dataset
import torch

class InferenceDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=ModelConfig.MAX_LENGTH):
        self.texts = list(map(str, texts))
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        encoding = tokenizer(
            self.texts[idx],
            truncation=ModelConfig.TRUNCATION,
            padding=ModelConfig.PADDING,
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

# Use tokenizer from training section
assert tokenizer is not None, "Tokenizer not available. Please run previous cells to load the model/tokenizer."

infer_ds = InferenceDataset(user_df[text_col].values, tokenizer)

# Predict
pred_out = bert_trainer.trainer.predict(infer_ds)
preds = np.argmax(pred_out.predictions, axis=1)
proba = torch.softmax(torch.tensor(pred_out.predictions), dim=1).numpy()

# Save predictions CSV
custom_pred_dir = Path(PREDICTIONS_DIR) / 'roberta'
custom_pred_dir.mkdir(parents=True, exist_ok=True)
custom_csv = custom_pred_dir / 'custom_predictions.csv'

save_cols = {
    'text': user_df[text_col].tolist(),
    'pred': preds.tolist(),
}
# Include probabilities
if proba.ndim == 2 and proba.shape[1] == 2:
    save_cols['proba_real'] = proba[:, 0].tolist()
    save_cols['proba_fake'] = proba[:, 1].tolist()

# Include labels if present
if label_col is not None:
    save_cols['label'] = user_df[label_col].tolist()

pd.DataFrame(save_cols).to_csv(custom_csv, index=False)
print(f"💾 Saved custom predictions to: {custom_csv}")

# If labels exist, compute extended metrics and plots
if label_col is not None:
    y_true = user_df[label_col].values
    y_pred = preds
    y_proba = proba
    ext = compute_extended_metrics(y_true, y_pred, y_proba)
    custom_metrics_json = Path(METRICS_DIR) / 'roberta' / 'custom_extended_metrics.json'
    custom_metrics_json.parent.mkdir(parents=True, exist_ok=True)
    save_evaluation_results(ext, custom_metrics_json)
    print(f"✅ Saved custom extended metrics to: {custom_metrics_json}")

    # Plots
    plot_confusion_matrix(y_true, y_pred, model_name=f"RoBERTa ({ModelConfig.MODEL_NAME})",
                          save_path=Path(VISUALIZATIONS_DIR) / 'roberta' / 'custom_confusion_matrix.png')
    if y_proba is not None:
        plot_roc_curve(y_true, y_proba, model_name=f"RoBERTa ({ModelConfig.MODEL_NAME})",
                       save_path=Path(VISUALIZATIONS_DIR) / 'roberta' / 'custom_roc_curve.png')
else:
    print("No label column found; generated predictions only.")

