# 🔍 Real-Time Fake News Detector - Experimentation Notebook

This notebook demonstrates how to use the fake news detection system and experiment with different approaches.

## Setup and Installation

First, make sure you have installed all dependencies:

In [None]:
# Install dependencies (uncomment and run this cell if packages are not installed).
# Tip: inside Jupyter, `%pip install ...` installs into the running kernel's environment.
# !pip install -r ../requirements.txt

In [None]:
# Import necessary libraries
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

warnings.filterwarnings('ignore')

# Add src directory to path
# NOTE: '../src' is a relative entry, so it is resolved against the kernel's
# current working directory when the imports below run.
sys.path.append('../src')

# Import our custom modules (these resolve through the path entry added above)
from preprocessing import TextPreprocessor, preprocess_dataset
from train_model import FakeNewsClassifier, create_sample_dataset
from predict import FakeNewsDetector, quick_predict

print("📦 All libraries imported successfully!")

## 1. Data Exploration and Preprocessing

In [None]:
# Build the sample dataset that the rest of the notebook works with
print("🔄 Creating sample dataset...")
df = create_sample_dataset()

# Summarize size and class balance before looking at individual rows
label_counts = df['label'].value_counts()
print(f"Dataset shape: {df.shape}")
print("\nLabel distribution:")
print(label_counts)

# Keep the preview as the cell's last expression so it renders as a table
print("\n📰 Sample articles:")
df.head()

In [None]:
# Visualize label distribution
labels = ['Fake', 'Real']
colors = ['#ff6b6b', '#4ecdc4']
# Sorting by index puts label 0 first, matching the 0=Fake / 1=Real naming used below
counts = df['label'].value_counts().sort_index()

# Pie chart: share of each class
fig_pie, ax_pie = plt.subplots(figsize=(8, 6))
ax_pie.pie(counts, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
ax_pie.set_title('Distribution of Fake vs Real News Articles')
ax_pie.axis('equal')
plt.show()

# Bar plot: absolute number of articles per class
fig_bar, ax_bar = plt.subplots(figsize=(8, 6))
sns.countplot(data=df, x='label', palette=colors, ax=ax_bar)
ax_bar.set_title('Count of Fake vs Real News Articles')
ax_bar.set_xlabel('Label (0=Fake, 1=Real)')
ax_bar.set_ylabel('Count')
ax_bar.set_xticks([0, 1])
ax_bar.set_xticklabels(['Fake', 'Real'])
plt.show()

In [None]:
# Explore text preprocessing
print("🔧 Testing text preprocessing...")

# Create preprocessor
preprocessor = TextPreprocessor(remove_stopwords=True, use_stemming=False)

# Test on sample text (contains HTML, shouting, a URL and an e-mail address)
sample_text = """<h1>BREAKING NEWS!!!</h1> 
Scientists have DISCOVERED that aliens are living among us!!! 
Visit https://fakenews.com for more details. Contact us at info@fake.com!!!"""

cleaned_text = preprocessor.clean_text(sample_text)

print("Original text:")
print(sample_text)
print("\nCleaned text:")
print(cleaned_text)

# Preprocess the entire dataset.
# A copy is passed because it is not visible from this notebook whether
# preprocess_dataset modifies its input; `df` must stay raw for the length
# comparison below and for the training cell, which reads df['text'] again.
df_processed = preprocess_dataset(df.copy(), 'text', preprocessor)

original_avg_len = df['text'].str.len().mean()
processed_avg_len = df_processed['text'].str.len().mean()

print("\n✅ Dataset preprocessed successfully!")
print(f"Original text length (avg): {original_avg_len:.1f} characters")
print(f"Processed text length (avg): {processed_avg_len:.1f} characters")

## 2. Model Training and Comparison

In [None]:
# Train different models and compare performance
# (train_test_split is imported in the notebook's import cell)
print("🤖 Training and comparing different models...")

# Prepare data as plain Python lists of raw texts and labels
X = df['text'].tolist()
y = df['label'].tolist()

# Stratified 70/30 split with a fixed seed so the comparison is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

# Trained classifiers and their test scores, both keyed by model type
models = {}
results = {}

for model_type in ['logistic', 'random_forest']:
    print(f"\n{'='*20} Training {model_type.upper()} {'='*20}")

    # Initialize and train
    classifier = FakeNewsClassifier(model_type=model_type, use_preprocessing=True)
    # NOTE(review): the held-out split is also handed to train(); confirm it is
    # only used for reporting there, otherwise the scores below are optimistic.
    classifier.train(X_train, y_train, X_test, y_test)

    # Store model
    models[model_type] = classifier

    # Evaluate. The previous extra predict() call was dropped: its result was
    # never used, so it only cost a redundant inference pass.
    # Assumes evaluate() returns the accuracy as a float (it is formatted with
    # ':.4f' below) - TODO confirm.
    accuracy = classifier.evaluate(X_test, y_test)
    results[model_type] = accuracy

print(f"\n{'='*50}")
print("📊 Model Comparison:")
for model_type, accuracy in results.items():
    print(f"{model_type.upper()}: {accuracy:.4f}")

In [None]:
# Visualize model performance comparison
fig, ax = plt.subplots(figsize=(10, 6))

model_names = list(results.keys())
accuracies = list(results.values())

bars = ax.bar(model_names, accuracies, color=['#3498db', '#e74c3c'], alpha=0.8)
ax.set_title('Model Performance Comparison', fontsize=16, fontweight='bold')
ax.set_xlabel('Model Type', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
# Leave headroom above 1.0: the value labels sit just above each bar and would
# otherwise be drawn outside the axes when a model scores close to 100%.
ax.set_ylim(0, 1.1)

# Add value labels on bars
for bar, accuracy in zip(bars, accuracies):
    ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
            f'{accuracy:.3f}', ha='center', va='bottom', fontweight='bold')

ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()

## 3. Detailed Model Analysis

In [None]:
# Pick the model with the highest recorded score and inspect it more closely
best_model_type = max(results, key=results.get)
best_model = models[best_model_type]
model_label = best_model_type.upper()

print(f"📈 Analyzing best model: {model_label}")
print(f"Best accuracy: {results[best_model_type]:.4f}")

# Hard predictions feed the confusion matrix; the probabilities are reused by
# the confidence analysis in the next cell
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)

# Display names for the two classes
target_names = ['Fake', 'Real']

# Confusion matrix as an annotated heatmap
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=target_names,
    yticklabels=target_names,
    ax=ax,
)
ax.set_title(f'Confusion Matrix - {model_label} Model')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
plt.show()

# Per-class precision / recall / F1
print("\n📋 Detailed Classification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
# Analyze prediction confidence distribution
# Confidence of a prediction = highest class probability in its row of y_prob
confidence_scores = np.asarray(y_prob).max(axis=1)

# Compare as arrays: y_test is a plain Python list, so `y_test == y_pred` is
# only elementwise when y_pred happens to be an ndarray; with two lists it
# collapses to a single bool and the per-sample split below would fail.
correct_predictions = np.asarray(y_test) == np.asarray(y_pred)
correct_confidence = confidence_scores[correct_predictions]
incorrect_confidence = confidence_scores[~correct_predictions]

fig, (ax_all, ax_split) = plt.subplots(1, 2, figsize=(12, 5))

# Confidence distribution
mean_confidence = np.mean(confidence_scores)
ax_all.hist(confidence_scores, bins=20, alpha=0.7, color='skyblue', edgecolor='black')
ax_all.set_title('Distribution of Prediction Confidence')
ax_all.set_xlabel('Confidence Score')
ax_all.set_ylabel('Frequency')
ax_all.axvline(mean_confidence, color='red', linestyle='--',
               label=f'Mean: {mean_confidence:.3f}')
ax_all.legend()
ax_all.grid(alpha=0.3)

# Confidence by prediction correctness
ax_split.hist(correct_confidence, bins=15, alpha=0.7, label='Correct Predictions', color='green')
ax_split.hist(incorrect_confidence, bins=15, alpha=0.7, label='Incorrect Predictions', color='red')
ax_split.set_title('Confidence by Prediction Correctness')
ax_split.set_xlabel('Confidence Score')
ax_split.set_ylabel('Frequency')
ax_split.legend()
ax_split.grid(alpha=0.3)

fig.tight_layout()
plt.show()


def _mean_or_na(values):
    """Format the mean of `values` to 3 decimals, or 'n/a' when there are none."""
    return f"{np.mean(values):.3f}" if len(values) else "n/a"


# Either group can be empty (e.g. a model with no mistakes on the test split);
# report 'n/a' in that case instead of a NaN mean.
print(f"Average confidence for correct predictions: {_mean_or_na(correct_confidence)}")
print(f"Average confidence for incorrect predictions: {_mean_or_na(incorrect_confidence)}")

## 4. Interactive Testing

In [None]:
# Persist the winning model so the prediction module can use it
model_dir = '../models'

print(f"💾 Saving the best model ({best_model_type})...")
best_model.save_model(model_dir)
print("✅ Model saved successfully!")

In [None]:
# Test with custom examples
def test_custom_news(text, model_type='logistic'):
    """Test a custom news article."""
    detector = FakeNewsDetector(model_type=model_type)
    if detector.is_loaded:
        result = detector.predict(text)
        
        # Format output
        prediction = result['prediction']
        confidence = result['confidence']
        emoji = "✅" if prediction == "REAL" else "❌"
        
        print(f"📰 Article: {text[:100]}{'...' if len(text) > 100 else ''}")
        print(f"{emoji} Prediction: {prediction} (Confidence: {confidence}%)")
        print(f"📊 Probabilities: Fake={result['probability_fake']:.3f}, Real={result['probability_real']:.3f}")
        print("-" * 80)
        
        return result
    else:
        print("❌ Model not loaded. Please train the model first.")
        return None

# Sample headlines to run through the detector
test_articles = [
    "Local university receives federal grant to study climate change impacts on agriculture",
    "Scientists discover aliens living on Mars and planning invasion of Earth next year",
    "Stock market closes up 3% following positive quarterly earnings reports from tech sector",
    "Miracle weight loss pill discovered that allows you to lose 50 pounds in one week",
    "City council approves new budget allocation for public transportation improvements",
    "Government admits to hiding cure for cancer for past 30 years to protect pharmaceutical profits",
]

separator = "=" * 80
print("🧪 Testing with sample articles:")
print(separator)

# Each call prints its own formatted verdict; the returned dicts are not needed here
for article in test_articles:
    test_custom_news(article, best_model_type)

In [None]:
# Interactive testing cell - replace the text below with your own article
custom_article = """
Breaking: Scientists at a major university have announced a breakthrough in renewable energy 
technology that could revolutionize how we power our homes and businesses. The new solar panel 
design increases efficiency by 40% while reducing manufacturing costs by 25%. The research team 
plans to begin commercial production within the next two years.
""".strip()

separator = "=" * 80
print("🔍 Testing your custom article:")
print(separator)

# Left as the last expression, so the returned result is also shown as cell output
test_custom_news(custom_article, best_model_type)

## 5. Feature Analysis (Advanced)

Let's analyze what features the model considers important for classification.

In [None]:
# Analyze most important features (for logistic regression)
if best_model_type == 'logistic':
    print("🔍 Analyzing most important features for classification...")

    # Get feature names and coefficients
    feature_names = best_model.vectorizer.get_feature_names_out()
    coefficients = best_model.model.coef_[0]

    # Rank every feature by absolute coefficient value, strongest first
    feature_importance = sorted(
        zip(feature_names, coefficients),
        key=lambda pair: abs(pair[1]),
        reverse=True,
    )

    # With labels 0=Fake / 1=Real, negative coefficients push towards FAKE and
    # positive ones towards REAL. Both lists keep the strongest-first ordering.
    fake_features = [(name, coef) for name, coef in feature_importance if coef < 0][:10]
    real_features = [(name, coef) for name, coef in feature_importance if coef > 0][:10]

    print("\n📰 Top words/phrases indicating FAKE news:")
    for i, (feature, coef) in enumerate(fake_features, 1):
        print(f"{i:2d}. {feature:<20} (coefficient: {coef:.4f})")

    print("\n✅ Top words/phrases indicating REAL news:")
    for i, (feature, coef) in enumerate(real_features, 1):
        print(f"{i:2d}. {feature:<20} (coefficient: {coef:.4f})")

    # Visualize top features
    fig, ax = plt.subplots(figsize=(12, 8))

    # Take the 5 strongest from each side. The lists are strongest-first, so
    # the head of each list is wanted; slicing the tail of fake_features would
    # plot its weakest entries instead.
    plot_features = fake_features[:5] + real_features[:5]
    feature_names_plot = [f[0] for f in plot_features]
    coefficients_plot = [f[1] for f in plot_features]
    # Separate name so the `colors` palette used by earlier plotting cells is not overwritten
    bar_colors = ['red' if c < 0 else 'green' for c in coefficients_plot]

    positions = range(len(feature_names_plot))
    ax.barh(positions, coefficients_plot, color=bar_colors, alpha=0.7)
    ax.set_yticks(positions)
    ax.set_yticklabels(feature_names_plot)
    ax.set_xlabel('Coefficient Value')
    ax.set_title('Most Important Features for Fake News Classification')
    ax.axvline(x=0, color='black', linestyle='-', alpha=0.3)
    ax.grid(axis='x', alpha=0.3)

    # Add legend (plain text in axes coordinates)
    ax.text(0.02, 0.98, 'Red: Indicates FAKE news', transform=ax.transAxes,
            verticalalignment='top', color='red', fontweight='bold')
    ax.text(0.02, 0.93, 'Green: Indicates REAL news', transform=ax.transAxes,
            verticalalignment='top', color='green', fontweight='bold')

    fig.tight_layout()
    plt.show()

else:
    print("Feature importance analysis is currently only available for logistic regression models.")

## 6. Ready for Deployment

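Your model is now ready for deployment! Keep in mind that the scores above come from the built-in sample dataset, so retrain on a real dataset (step 2 below) before relying on the results. Here's what you can do next: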

In [None]:
# Final recap: per-model scores, the winner, and suggested follow-up steps
print("🚀 Your Fake News Detector is ready for deployment!")
print("="*60)

print("\n📊 Model Performance Summary:")
for model_type, accuracy in results.items():
    print(f"  • {model_type.upper()}: {accuracy:.1%} accuracy")

best_accuracy = results[best_model_type]
print(f"\n🏆 Best Model: {best_model_type.upper()} ({best_accuracy:.1%} accuracy)")

# The static guidance is kept as data and emitted in one go; entries starting
# with "\n" open a new section with a blank line before it.
next_steps = [
    "\n🎯 Next Steps:",
    "1. 🌐 Start the Flask API:",
    "   cd ../api && python app.py",
    "   Then visit: http://localhost:5000",
    "\n2. 📚 Get a real dataset:",
    "   • Download from Kaggle: 'Fake News Dataset'",
    "   • Place CSV in ../data/ folder",
    "   • Retrain with: python ../src/train_model.py",
    "\n3. 🔧 Improve the model:",
    "   • Try different preprocessing options",
    "   • Experiment with BERT/RoBERTa models",
    "   • Add more features (source credibility, etc.)",
    "\n4. 🚀 Deploy to production:",
    "   • Containerize with Docker",
    "   • Deploy to cloud (AWS, GCP, Azure)",
    "   • Add monitoring and logging",
    "\n✨ Happy fake news detecting!",
]
print("\n".join(next_steps))