# Data Exploration for Fake News Detection
## 7th Semester CSE Interim Project

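This notebook loads the sample dataset and performs an initial analysis: class balance, text-length distributions, a preprocessing check on one sample, and summary statistics.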

In [None]:
import sys
# Add ../src to the module search path. This must run before the
# `preprocessing` import below, which is presumably resolved from that
# directory (the path is relative to the notebook's working directory).
sys.path.append('../src')

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from preprocessing import TextPreprocessor, create_sample_dataset

# Set style: seaborn 'whitegrid' theme plus a (12, 6) default figure size.
# Figures created with an explicit figsize argument override this default.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

## 1. Load Dataset

In [None]:
# Build the sample DataFrame via the project helper, report its
# (rows, columns) shape, and preview the first rows as the cell output.
# NOTE(review): how create_sample_dataset uses the CSV path (reads it,
# writes it, or both) is not visible here — confirm in preprocessing.py.
sample_csv_path = '../data/sample_data.csv'
df = create_sample_dataset(sample_csv_path)
print(f"Dataset shape: {df.shape}")
df.head()

## 2. Class Distribution

In [None]:
# Plot class distribution
# value_counts() sorts by frequency, not by label, so the order is pinned
# explicitly: position 0 is label 0 (Real) and position 1 is label 1 (Fake).
# Without this, the bar names are swapped whenever Fake outnumbers Real.
# fill_value=0 keeps a class that is absent from the data as a zero count,
# so the bar chart and the label lookups below still work on one-class data.
class_counts = df['label'].value_counts().reindex([0, 1], fill_value=0)
plt.bar(['Real', 'Fake'], class_counts.values, color=['#2ecc71', '#e74c3c'])
plt.title('Class Distribution', fontsize=14, fontweight='bold')
plt.ylabel('Count')
plt.show()

# class_counts is indexed by label value, so [0] / [1] are label lookups.
print(f"Real news: {class_counts[0]} ({class_counts[0]/len(df)*100:.1f}%)")
print(f"Fake news: {class_counts[1]} ({class_counts[1]/len(df)*100:.1f}%)")

## 3. Text Length Analysis

In [None]:
# Derive two length features per row: character count and the number of
# whitespace-separated tokens in the 'text' column.
df['text_length'] = df['text'].apply(len)
df['word_count'] = df['text'].apply(lambda text: len(text.split()))

# Two side-by-side panels: overlaid histograms (left) and a box plot (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, box_ax = axes

# Label value -> (legend name, colour); 0 is Real, 1 is Fake.
class_styles = {0: ('Real', '#2ecc71'), 1: ('Fake', '#e74c3c')}
for class_id, (class_name, class_color) in class_styles.items():
    class_word_counts = df.loc[df['label'] == class_id, 'word_count']
    hist_ax.hist(class_word_counts, alpha=0.6, label=class_name, color=class_color)

hist_ax.set(
    xlabel='Word Count',
    ylabel='Frequency',
    title='Word Count Distribution',
)
hist_ax.legend()

# The axis text is set after drawing so it replaces whatever labels and
# title pandas puts on the grouped box plot.
df.boxplot(column='word_count', by='label', ax=box_ax)
box_ax.set(
    xlabel='Label (0=Real, 1=Fake)',
    ylabel='Word Count',
    title='Word Count by Class',
)

plt.tight_layout()
plt.show()

## 4. Text Preprocessing

In [None]:
# Run the project's text cleaner on the first row's text and print the raw
# and cleaned versions one after the other for a quick visual comparison.
preprocessor = TextPreprocessor()

sample_text = df['text'].iloc[0]
cleaned_text = preprocessor.clean_text(sample_text)

for heading, content in (("Original:", sample_text), ("\nCleaned:", cleaned_text)):
    print(heading)
    print(content)

## 5. Summary Statistics

In [None]:
# Print headline statistics for the dataset. Relies on the 'word_count'
# column already having been added to df.
word_counts = df['word_count']
divider = "=" * 50

print("Dataset Summary:")
print(divider)
print(f"Total samples: {len(df)}")
print(f"Average word count: {word_counts.mean():.1f}")
print(f"Median word count: {word_counts.median():.1f}")
print(f"Max word count: {word_counts.max()}")
print(f"Min word count: {word_counts.min()}")
print(divider)