# Fake News Detection Dataset - Exploratory Data Analysis

This notebook performs comprehensive EDA on the ISOT Fake News Dataset to understand patterns that distinguish fake news from authentic journalism.

- **Dataset:** ISOT Fake News Dataset from Kaggle
- **Goal:** Identify linguistic and structural patterns for fake news classification

In [2]:
!pip install textstat



In [3]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from datetime import datetime
from textstat import flesch_reading_ease, flesch_kincaid_grade
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings

# Global plotting defaults shared by every figure in this notebook
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

## 1. Setup and Data Loading

In [None]:
# Read both halves of the ISOT dataset.
# The CSV files must first be downloaded from Kaggle into the News_dataset/ folder.
fake_df = pd.read_csv('News_dataset/Fake.csv')
true_df = pd.read_csv('News_dataset/True.csv')

# Tag every article with its class: 0 = Fake, 1 = True
fake_df['label'] = 0
true_df['label'] = 1

# Stack the two frames, then shuffle the rows reproducibly (fixed seed)
df = (
    pd.concat([fake_df, true_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

total_articles = len(df)
print("Dataset loaded successfully!")
print(f"Total articles: {total_articles:,}")
for class_name, class_df in (("Fake", fake_df), ("True", true_df)):
    print(f"{class_name} articles: {len(class_df):,} ({len(class_df)/total_articles*100:.1f}%)")
print(f"\nDataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

FileNotFoundError: [Errno 2] No such file or directory: 'Fake.csv'

## 2. Data Quality Assessment and Missing Data Patterns

In [None]:
# Data quality overview: missing values, duplicate rows, dtypes and summary statistics
missing_data = df.isnull().sum()
duplicates = df.duplicated().sum()

# Each entry is (section heading, items printed under that heading)
quality_report = (
    ("\n- Missing Values:", (missing_data, f"\nTotal missing values: {missing_data.sum()}")),
    ("\n- Duplicate Records:", (f"Duplicate rows: {duplicates}",)),
    ("\n- Data Types:", (df.dtypes,)),
    ("\n- Basic Statistics:", (df.describe(include='all'),)),
)
for heading, items in quality_report:
    print(heading)
    for item in items:
        print(item)

In [None]:
# Content quality check: flag empty or suspiciously short articles
print("--- Content Quality Check ---")

# Character counts of body and headline
for source_col, length_col in (('text', 'text_length'), ('title', 'title_length')):
    df[length_col] = df[source_col].str.len()

empty_text = df['text_length'].eq(0).sum()
short_text = df['text_length'].lt(50).sum()
empty_titles = df['title_length'].eq(0).sum()
print(f"Articles with empty text: {empty_text}")
print(f"Articles with very short text (<50 chars): {short_text}")
print(f"Articles with empty titles: {empty_titles}")

# Number of articles per subject
print("\nSubject Distribution:")
subject_counts = df['subject'].value_counts()
print(subject_counts)

# Peek at the raw date strings to see which formats occur
print("\nDate Format Sample:")
print(df['date'].head(10))

## 3. Descriptive Statistics and Feature Engineering

In [None]:
# Feature Engineering for Linguistic Analysis
def extract_linguistic_features(text):
    """Compute simple length, punctuation and capitalisation statistics for one string.

    Returns a ``pd.Series`` with eight values, in this order: character count,
    word count, '!' count, '?' count, uppercase-character count, uppercase
    ratio (per character), '!' ratio (per word), '?' ratio (per word).
    Missing (NaN/None) or empty input yields eight zeros.
    """
    # Nothing to measure: keep the output shape but zero every feature
    if pd.isna(text) or len(text) == 0:
        return pd.Series([0] * 8)

    n_chars = len(text)
    n_words = len(text.split())
    n_exclaim = text.count('!')
    n_question = text.count('?')
    n_upper = sum(map(str.isupper, text))

    def per_word(count):
        # A whitespace-only string has characters but no words; avoid dividing by zero
        return count / n_words if n_words > 0 else 0

    return pd.Series([
        n_chars, n_words, n_exclaim, n_question,
        n_upper, n_upper / n_chars, per_word(n_exclaim), per_word(n_question),
    ])

# Apply feature extraction
print("Extracting linguistic features...")

# The eight values returned by extract_linguistic_features, in order
feature_suffixes = [
    'length', 'word_count', 'exclamation_count', 'question_count',
    'caps_count', 'caps_ratio', 'exclamation_ratio', 'question_ratio'
]

# Extract features for text content
text_features = df['text'].apply(extract_linguistic_features)
text_features.columns = [f'text_{suffix}' for suffix in feature_suffixes]

# Extract features for titles
title_features = df['title'].apply(extract_linguistic_features)
title_features.columns = [f'title_{suffix}' for suffix in feature_suffixes]

# Combine features with original dataset.
# df may already carry text_length / title_length (added by the content quality
# check). Concatenating as-is would create duplicate column names, after which
# df_features['text_length'] returns a two-column DataFrame instead of a Series.
# Keep the freshly engineered versions and drop the older copies.
engineered_features = pd.concat([text_features, title_features], axis=1)
overlapping_cols = [col for col in engineered_features.columns if col in df.columns]
df_features = pd.concat([df.drop(columns=overlapping_cols), engineered_features], axis=1)

assert df_features.columns.is_unique, "df_features contains duplicate column names"

print("Feature extraction completed!")
print(f"New dataset shape: {df_features.shape}")

In [None]:
# Descriptive Statistics by Class

# Key features to analyze
key_features = [
    'text_length', 'text_word_count', 'title_length', 'title_word_count',
    'text_caps_ratio', 'title_caps_ratio', 'text_exclamation_ratio', 'title_exclamation_ratio'
]

# Use a view with unique column names. If a column name is repeated,
# selecting it yields a DataFrame and every "mean" below becomes a Series.
stats_frame = df_features.loc[:, ~df_features.columns.duplicated()]

# Group by label once and reuse the grouping for all statistics
features_by_class = stats_frame.groupby('label')[key_features]
stats_by_class = features_by_class.agg(['mean', 'std', 'median'])

print("\nStatistics by Class (0=Fake, 1=True):")
print(stats_by_class.round(3))

# Create comparison table (reindex so that a missing class shows up as NaN, not a KeyError)
class_means = features_by_class.mean().reindex([0, 1])
class_stds = features_by_class.std().reindex([0, 1])

comparison_data = [
    {
        'Feature': feature,
        'Fake_Mean': class_means.at[0, feature],
        'Fake_Std': class_stds.at[0, feature],
        'True_Mean': class_means.at[1, feature],
        'True_Std': class_stds.at[1, feature],
        'Difference': abs(class_means.at[0, feature] - class_means.at[1, feature]),
    }
    for feature in key_features
]

comparison_df = pd.DataFrame(comparison_data)
print("\nDetailed Feature Comparison:")
print(comparison_df.round(3))

## 4. Distribution Plots and Visualizations

In [None]:
# Distribution overview: class balance, text/title/word-count distributions and subject mix

def _draw_panel(ax, title, error_label, draw):
    """Render one subplot via ``draw(ax)``; on failure print the error and show a placeholder.

    Best effort by design: a problem in one panel must not blank the whole figure.
    """
    try:
        draw(ax)
        ax.set_title(title, fontweight='bold')
    except Exception as e:
        print(f"Error in {error_label}: {e}")
        ax.text(0.5, 0.5, f'{title}\n(Error in data)',
                ha='center', va='center', transform=ax.transAxes)


def _class_histograms(ax, fake_values, true_values, upper_limit, bins, xlabel, xlim=None):
    """Overlay density histograms of one feature for fake and true articles."""
    for values, name, color in ((fake_values, 'Fake News', 'red'),
                                (true_values, 'True News', 'blue')):
        values = values.dropna().to_numpy()
        # Drop the extreme tail so a few very long articles do not squash the bins
        ax.hist(values[values < upper_limit], bins=bins, alpha=0.7, density=True,
                label=name, color=color)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Density')
    ax.legend()
    if xlim is not None:
        ax.set_xlim(*xlim)


def _text_length_boxplot(ax, fake_values, true_values, upper_limit=10000):
    """Box plot of text length per class, ignoring articles longer than ``upper_limit``."""
    samples = []
    for values in (fake_values, true_values):
        values = values.dropna()
        samples.append(values[values <= upper_limit].to_numpy())
    bp = ax.boxplot(samples, patch_artist=True)
    # Label the ticks explicitly: boxplot's ``labels`` keyword is deprecated in recent Matplotlib
    ax.set_xticks([1, 2])
    ax.set_xticklabels(['Fake News', 'True News'])
    bp['boxes'][0].set_facecolor('#ff6b6b')
    bp['boxes'][1].set_facecolor('#4ecdc4')
    ax.set_ylabel('Text Length (characters)')


def _subject_bars(ax, subjects):
    """Bar chart of article counts per subject, with the count written above each bar."""
    subject_counts = subjects.value_counts()
    x_pos = list(range(len(subject_counts)))
    bars = ax.bar(x_pos, subject_counts.values, color='skyblue')
    ax.set_xlabel('Subject Category')
    ax.set_ylabel('Count')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(subject_counts.index, rotation=45, ha='right')
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{int(height)}', ha='center', va='bottom')


# Work on unique column names. With a repeated name (e.g. text_length present twice),
# column selection returns a DataFrame, which breaks the box plot and the summary below.
plot_df = df_features.loc[:, ~df_features.columns.duplicated()]

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Fake News Dataset - Distribution Analysis', fontsize=16, fontweight='bold')

print("Preparing data for plotting...")
fake_data = plot_df[plot_df['label'] == 0].copy()
true_data = plot_df[plot_df['label'] == 1].copy()

print(f"Fake data shape: {fake_data.shape}")
print(f"True data shape: {true_data.shape}")

# 1. Class Distribution
ax1 = axes[0, 0]
# value_counts() orders by frequency, so index by label to keep slice, name and colour aligned
class_counts = plot_df['label'].value_counts().reindex([0, 1], fill_value=0)
wedges, texts, autotexts = ax1.pie(class_counts.values, labels=['Fake News', 'True News'],
                                   autopct='%1.1f%%', colors=['#ff6b6b', '#4ecdc4'], startangle=90)
ax1.set_title('Class Distribution', fontweight='bold')

# 2. Text Length Distribution
_draw_panel(
    axes[0, 1], 'Text Length Distribution', 'text length plot',
    lambda ax: _class_histograms(
        ax, fake_data['text_length'], true_data['text_length'],
        upper_limit=15000, bins=50,
        xlabel='Article Text Length (characters)', xlim=(0, 10000)))

# 3. Title Length Distribution
_draw_panel(
    axes[0, 2], 'Title Length Distribution', 'title length plot',
    lambda ax: _class_histograms(
        ax, fake_data['title_length'], true_data['title_length'],
        upper_limit=500, bins=30,
        xlabel='Title Length (characters)'))

# 4. Text Length Box Plot
_draw_panel(
    axes[1, 0], 'Text Length Box Plot', 'box plot',
    lambda ax: _text_length_boxplot(ax, fake_data['text_length'], true_data['text_length']))

# 5. Subject Distribution
_draw_panel(
    axes[1, 1], 'Subject Distribution', 'subject plot',
    lambda ax: _subject_bars(ax, plot_df['subject']))

# 6. Word Count Distribution
_draw_panel(
    axes[1, 2], 'Word Count Distribution', 'word count plot',
    lambda ax: _class_histograms(
        ax, fake_data['text_word_count'], true_data['text_word_count'],
        upper_limit=3000, bins=50,
        xlabel='Word Count', xlim=(0, 2000)))

plt.tight_layout()
plt.show()

# Print insights (kept best effort, like the panels above)
try:
    print("=== KEY DISTRIBUTION INSIGHTS ===")
    # (printed name, column, number format) for each per-class average
    insight_specs = (
        ('text length', 'text_length', '.0f'),
        ('title length', 'title_length', '.1f'),
        ('word count', 'text_word_count', '.0f'),
    )
    # Compute every mean before printing so a failure does not leave a half-printed summary
    insight_rows = [
        (name, format(fake_data[column].mean(), spec), format(true_data[column].mean(), spec))
        for name, column, spec in insight_specs
    ]
    for name, fake_mean_text, true_mean_text in insight_rows:
        print(f"Average {name} - Fake: {fake_mean_text}, True: {true_mean_text}")

    print("\nData Quality Check:")
    print(f"Fake articles count: {len(fake_data)}")
    print(f"True articles count: {len(true_data)}")
    print(f"Total articles: {len(plot_df)}")

except Exception as e:
    print(f"Error in printing insights: {e}")
    print("Please check your data structure.")

## 5. Correlation Analysis and Heatmap

In [None]:
# Correlation Analysis
print("=== CORRELATION ANALYSIS ===")

# Target first, then each metric for the article text and for the title
correlation_features = ['label'] + [
    f'{part}_{metric}'
    for metric in ('length', 'word_count', 'caps_ratio', 'exclamation_ratio', 'question_ratio')
    for part in ('text', 'title')
]

# Pairwise correlations between the selected columns
corr_matrix = df_features.loc[:, correlation_features].corr()

# Hide the upper triangle (including the diagonal): it mirrors the lower one
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

plt.figure(figsize=(12, 10))
heatmap_options = dict(
    annot=True, fmt='.3f',
    cmap='RdBu_r', center=0,
    square=True, mask=mask,
    cbar_kws={'shrink': 0.8},
)
sns.heatmap(corr_matrix, **heatmap_options)
plt.title('Feature Correlation Matrix\n(Focus on relationship with label)', fontsize=14, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

# Correlation of every feature with the label, strongest first
target_correlations = (
    corr_matrix['label']
    .drop('label')
    .sort_values(key=abs, ascending=False)
)
print("\nFeature-Target Correlations (sorted by absolute value):")
for name, r in target_correlations.items():
    print(f"{name:<25}: {r:6.3f}")

# Features whose absolute correlation with the label exceeds 0.1
strong_predictors = target_correlations[target_correlations.abs() > 0.1]
print(f"\nStrong predictors (|correlation| > 0.1): {len(strong_predictors)}")
for name, r in strong_predictors.items():
    if r > 0:
        direction = "positively"
    else:
        direction = "negatively"
    print(f"  {name} is {direction} correlated with authenticity (r={r:.3f})")

## 6. Outlier Analysis and Anomaly Detection

In [None]:

# First, check for and fix duplicate columns
print("Checking for duplicate columns...")
print(f"DataFrame shape: {df_features.shape}")
print(f"Number of unique column names: {df_features.columns.nunique()}")
print(f"Total number of columns: {len(df_features.columns)}")

# True for every column whose name already appeared further to the left
is_repeated = df_features.columns.duplicated()
duplicate_cols = df_features.columns[is_repeated].tolist()
if duplicate_cols:
    print("-  Duplicate column names detected!")
    print("Duplicates:", duplicate_cols)

    # Keep the first occurrence of each name. The status message previously
    # contained a mis-encoded emoji; it is restored to the intended character.
    print("🔧 Fixing duplicate columns...")
    # .copy() gives an independent frame rather than a slice of the old one
    df_features = df_features.loc[:, ~is_repeated].copy()
    print("Fixed. New shape:", df_features.shape)
else:
    print("- No duplicate columns found")

# Two-sided percentile outlier detection (1st / 99th percentile by default)
def detect_outliers_percentile(data, feature, lower_percentile=1, upper_percentile=99):
    """Return the rows of ``data`` whose ``feature`` value lies outside a percentile band.

    The band runs from the ``lower_percentile`` to the ``upper_percentile``
    quantile of the column (percentiles given on a 0-100 scale).

    Returns a tuple ``(outliers, lower_bound, upper_bound)``.
    """
    column = data[feature]
    lower_bound, upper_bound = (
        column.quantile(pct / 100) for pct in (lower_percentile, upper_percentile)
    )
    # Strict comparisons: values equal to a bound are not outliers
    outside_band = column.lt(lower_bound) | column.gt(upper_bound)
    return data[outside_band], lower_bound, upper_bound

# One-sided variant: only unusually large values count as outliers
def detect_upper_outliers_percentile(data, feature, upper_percentile=99):
    """Return the rows of ``data`` whose ``feature`` value exceeds a percentile threshold.

    ``upper_percentile`` is given on a 0-100 scale.
    Returns a tuple ``(outliers, upper_bound)``.
    """
    column = data[feature]
    upper_bound = column.quantile(upper_percentile / 100)
    # Strictly greater: values equal to the threshold are kept as regular rows
    return data[column.gt(upper_bound)], upper_bound

# Features examined for outliers
outlier_features = ['text_length', 'title_length']

# Repeat the same summary at increasingly strict percentile cut-offs
percentile_thresholds = [95, 99, 99.5, 99.9]
print("\n=== MULTI-THRESHOLD OUTLIER ANALYSIS ===")

total_rows = len(df_features)
for threshold in percentile_thresholds:
    print(f"\n--- {threshold}th Percentile Outliers ---")
    outlier_summary = []

    for feature in outlier_features:
        # Length features: only unusually long values are flagged, so there is no lower bound
        outliers, upper_bound = detect_upper_outliers_percentile(df_features, feature, threshold)
        lower_bound = None

        n_outliers = len(outliers)
        fake_outliers = int((outliers['label'] == 0).sum())
        true_outliers = int((outliers['label'] == 1).sum())
        if n_outliers > 0:
            fake_rate = fake_outliers / n_outliers * 100
        else:
            fake_rate = 0

        outlier_summary.append(dict(
            Feature=feature,
            Total_Outliers=n_outliers,
            Fake_Outliers=fake_outliers,
            True_Outliers=true_outliers,
            Outlier_Percentage=n_outliers / total_rows * 100,
            Lower_Bound=lower_bound,
            Upper_Bound=upper_bound,
            Fake_Rate_in_Outliers=fake_rate,
        ))

    outlier_df = pd.DataFrame(outlier_summary)
    print(outlier_df.round(3))

# Detailed analysis using 99th percentile
print("\n=== DETAILED 99th PERCENTILE ANALYSIS ===")
outlier_summary_99 = []

for feature in outlier_features:
    feature_values = df_features[feature]

    # Get feature statistics
    feature_stats = {
        'mean': feature_values.mean(),
        'median': feature_values.median(),
        'std': feature_values.std(),
        'min': feature_values.min(),
        'max': feature_values.max(),
        '95th': feature_values.quantile(0.95),
        '99th': feature_values.quantile(0.99),
        '99.9th': feature_values.quantile(0.999)
    }

    # Detect 99th percentile outliers (upper outliers only for length features)
    outliers, upper_bound = detect_upper_outliers_percentile(df_features, feature, 99)
    lower_bound = None

    n_outliers = len(outliers)
    fake_outliers = len(outliers[outliers['label'] == 0])
    true_outliers = len(outliers[outliers['label'] == 1])

    # Class shares among the outliers. Guard the division: a feature can have no
    # value above its own 99th percentile (e.g. a constant column), in which
    # case the unguarded ratio raised ZeroDivisionError.
    fake_rate = fake_outliers / n_outliers * 100 if n_outliers > 0 else 0
    true_rate = true_outliers / n_outliers * 100 if n_outliers > 0 else 0

    print(f"\n--- {feature.replace('_', ' ').title()} Analysis ---")
    print(f"Statistics: Mean={feature_stats['mean']:.2f}, Median={feature_stats['median']:.2f}, Std={feature_stats['std']:.2f}")
    print(f"Percentiles: 95th={feature_stats['95th']:.2f}, 99th={feature_stats['99th']:.2f}, 99.9th={feature_stats['99.9th']:.2f}")
    print(f"99th Percentile Threshold: {upper_bound:.2f}")
    print(f"Total Outliers: {n_outliers} ({n_outliers/len(df_features)*100:.2f}%)")
    print(f"Fake News Outliers: {fake_outliers} ({fake_rate:.1f}% of outliers)")
    print(f"True News Outliers: {true_outliers} ({true_rate:.1f}% of outliers)")

    outlier_summary_99.append({
        'Feature': feature,
        'Total_Outliers': n_outliers,
        'Fake_Outliers': fake_outliers,
        'True_Outliers': true_outliers,
        'Outlier_Percentage': n_outliers / len(df_features) * 100,
        'Fake_Rate_in_Outliers': fake_rate,
        'Upper_Threshold': upper_bound,
        'Lower_Threshold': lower_bound
    })

# Visualize outliers with 99th percentile thresholds: one panel per feature
n_panels = len(outlier_features)
# squeeze=False + ravel() yields a 1-D array of axes even when there is a single feature
fig, axes = plt.subplots(1, n_panels, figsize=(8 * n_panels, 6), squeeze=False)
axes = axes.ravel()
fig.suptitle('99th Percentile Outlier Analysis - Box Plots by Class', fontsize=16, fontweight='bold')

# (tick label, label value, box colour) per class
class_styles = (('Fake News', 0, '#ff6b6b'), ('True News', 1, '#4ecdc4'))

for ax, feature in zip(axes, outlier_features):
    # Local names only: fake_data / true_data hold the per-class DataFrames from the
    # distribution cell and must not be overwritten with single-column Series here.
    # NaNs are dropped because boxplot does not ignore them.
    class_samples = [
        df_features.loc[df_features['label'] == label_value, feature].dropna().to_numpy()
        for _, label_value, _ in class_styles
    ]

    bp = ax.boxplot(class_samples, patch_artist=True)
    # Set tick labels explicitly: boxplot's ``labels`` keyword is deprecated in recent Matplotlib
    ax.set_xticks(list(range(1, len(class_styles) + 1)))
    ax.set_xticklabels([name for name, _, _ in class_styles])
    for box, (_, _, color) in zip(bp['boxes'], class_styles):
        box.set_facecolor(color)

    # Add 99th percentile line (computed over both classes together)
    percentile_99 = df_features[feature].quantile(0.99)
    ax.axhline(y=percentile_99, color='red', linestyle='--', linewidth=2, alpha=0.7,
               label=f'99th Percentile: {percentile_99:.2f}')

    ax.set_title(f'{feature.replace("_", " ").title()}', fontweight='bold')
    ax.set_ylabel('Value')
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Percentile ladder, minimum and maximum for each length feature
print("\n=== EXTREME VALUE ANALYSIS WITH PERCENTILE CONTEXT ===")
percentiles = [1, 5, 10, 25, 50, 75, 90, 95, 99, 99.5, 99.9]

for feature in outlier_features:
    feature_data = df_features[feature]

    print(f"\n{feature.replace('_', ' ').title()} Distribution:")
    # Evaluate all quantiles in one call, then pair them back up with their percentile
    quantile_values = feature_data.quantile([p / 100 for p in percentiles]).to_numpy()
    for p, quantile_value in zip(percentiles, quantile_values):
        print(f"  {p:5.1f}th percentile: {quantile_value:8.2f}")

    for bound_name, bound_value in (("Minimum", feature_data.min()), ("Maximum", feature_data.max())):
        print(f"  {bound_name}: {bound_value:8.2f}")

# Identify most extreme cases
print("\n=== MOST EXTREME CASES ===")
extreme_cases = []  # one summary dict per feature, filled in the loop below
top_n = 10          # how many of the largest values to inspect per feature

for feature in outlier_features:
    # Largest values only, since all features are length-based
    top_extreme = df_features.nlargest(top_n, feature)[['label', feature]]
    extreme_type = 'longest'

    # nlargest returns fewer than top_n rows on a small frame, so use the real count
    n_cases = len(top_extreme)
    fake_count = (top_extreme['label'] == 0).sum()
    true_count = (top_extreme['label'] == 1).sum()
    fake_share = fake_count / n_cases * 100 if n_cases > 0 else 0
    true_share = true_count / n_cases * 100 if n_cases > 0 else 0

    print(f"\nTop {top_n} {extreme_type} {feature.replace('_', ' ')}:")
    print(f"  Fake news: {fake_count}/{n_cases} ({fake_share:.1f}%)")
    print(f"  True news: {true_count}/{n_cases} ({true_share:.1f}%)")
    print(f"  Range: {top_extreme[feature].min():.2f} - {top_extreme[feature].max():.2f}")

    extreme_cases.append({
        'Feature': feature,
        'Cases': n_cases,
        'Fake_Count': int(fake_count),
        'True_Count': int(true_count),
        'Min_Value': top_extreme[feature].min(),
        'Max_Value': top_extreme[feature].max(),
    })

# Summary insights
print("\n=== KEY INSIGHTS ===")
outlier_df_99 = pd.DataFrame(outlier_summary_99)
print("99th Percentile Outlier Summary:")
print(outlier_df_99[['Feature', 'Total_Outliers', 'Outlier_Percentage', 'Fake_Rate_in_Outliers']].round(2))

# Classify each feature by the share of fake news among its outliers:
# above 60% -> fake-heavy, below 40% -> true-heavy, otherwise balanced.
print("\nPattern Analysis:")
for row in outlier_df_99.itertuples(index=False):
    feature = row.Feature
    fake_rate = row.Fake_Rate_in_Outliers
    if row.Total_Outliers == 0:
        # A rate of 0 here only means "nothing to measure", not "all outliers are true news"
        print(f"~  {feature}: No outliers at this threshold")
    elif fake_rate > 60:
        print(f"! {feature}: Fake news overrepresented in outliers ({fake_rate:.1f}%)")
    elif fake_rate < 40:
        # The prefix was mis-encoded in the original message; restored to the intended symbols
        print(f"ℹ✓  {feature}: True news overrepresented in outliers ({fake_rate:.1f}%)")
    else:
        print(f"~  {feature}: Balanced representation in outliers ({fake_rate:.1f}%)")

## 7. Summary of Key Findings

**TODO:** Summarize the key findings from the sections above.