# Fake News Detection
## EDA and Preprocessing Demo

### Step 1: Import Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import string

### Step 2: Load Data

In [None]:
# Read the two source CSVs into separate DataFrames
fake_data, true_data = map(pd.read_csv, ('datasets/Fake.csv', 'datasets/True.csv'))

print("Data loaded successfully!")
# Report (rows, columns) for each frame
for heading, frame in (("Fake news", fake_data), ("True news", true_data)):
    print(f"{heading}: {frame.shape}")

---
# Part 1: Exploratory Data Analysis (EDA)
---

### Step 3: View Data

In [None]:
# Preview the first five rows of the fake-news frame
print("Fake News Sample:")
fake_data.head(n=5)

In [None]:
# Preview the first five rows of the true-news frame
print("True News Sample:")
true_data.head(n=5)

### Step 4: Check Data Info

In [None]:
# Structural summary of the fake-news frame plus a total count of missing cells
print("Fake News Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
fake_data.info()
print("\nMissing values:", fake_data.isnull().sum().sum())

In [None]:
# Structural summary of the true-news frame plus a total count of missing cells
print("True News Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would emit a stray "None" line.
true_data.info()
print("\nMissing values:", true_data.isnull().sum().sum())

### Step 5: Visualize Class Distribution

In [None]:
# Bar chart comparing how many articles each frame contributes
categories = ['Fake News', 'True News']
counts = [len(fake_data), len(true_data)]

fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(categories, counts, color=['red', 'green'], alpha=0.7)
ax.set_title('Dataset Distribution', fontsize=14, fontweight='bold')
ax.set_ylabel('Number of Articles')
fig.tight_layout()
plt.show()

# Echo the plotted totals as text
n_fake, n_true = counts
print(f"Total Fake: {n_fake}")
print(f"Total True: {n_true}")

### Step 6: Subject Distribution

In [None]:
# Tally the fake-news articles per value of the 'subject' column
fake_subject_counts = fake_data['subject'].value_counts()
print("Fake News Subjects:")
print(fake_subject_counts)

In [None]:
# Tally the true-news articles per value of the 'subject' column
true_subject_counts = true_data['subject'].value_counts()
print("True News Subjects:")
print(true_subject_counts)

In [None]:
# Side-by-side bar charts of subject frequency, one panel per frame
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# Each entry: (target axes, source frame, bar colour, panel title)
panels = (
    (ax1, fake_data, 'red', 'Fake News Subjects'),
    (ax2, true_data, 'green', 'True News Subjects'),
)
for axis, frame, colour, heading in panels:
    frame['subject'].value_counts().plot(kind='bar', ax=axis, color=colour, alpha=0.7)
    axis.set_title(heading)
    axis.set_ylabel('Count')

plt.tight_layout()
plt.show()

### Step 7: Text Length Analysis

In [None]:
# Character count of every article, stored in a new 'length' column.
# The vectorised .str.len() accessor is used instead of .apply(len) so that a
# missing (NaN) text yields NaN rather than raising TypeError; the .mean()
# calls below skip NaN by default.
fake_data['length'] = fake_data['text'].str.len()
true_data['length'] = true_data['text'].str.len()

print("Average text length:")
print(f"Fake News: {fake_data['length'].mean():.0f} characters")
print(f"True News: {true_data['length'].mean():.0f} characters")

In [None]:
# Histogram of article lengths, both frames drawn on the same 30 bins
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(
    [fake_data['length'], true_data['length']],
    bins=30,
    label=['Fake', 'True'],
    color=['red', 'green'],
    alpha=0.6,
)
ax.set_title('Text Length Distribution')
ax.set_xlabel('Length (characters)')
ax.set_ylabel('Frequency')
ax.legend()
fig.tight_layout()
plt.show()

---
# Part 2: Data Preprocessing
---

### Step 8: Add Labels

In [None]:
# Tag every row with its target label: 0 marks fake, 1 marks true
for frame, label in ((fake_data, 0), (true_data, 1)):
    frame['class'] = label

print("Labels added!")
# Each frame should report exactly one distinct label
print("Fake class:", fake_data['class'].unique())
print("True class:", true_data['class'].unique())

### Step 9: Merge Datasets

In [None]:
# Stack the two labelled frames row-wise into a single dataset.
# ignore_index=True renumbers the rows 0..n-1; without it each source frame
# keeps its own 0-based labels, so the merged index would contain repeated
# labels and any label-based lookup could match more than one row.
data = pd.concat([fake_data, true_data], axis=0, ignore_index=True)

print(f"Merged dataset shape: {data.shape}")
print(f"Total articles: {len(data)}")

### Step 10: Keep Only Important Columns

In [None]:
# Drop every column except the article body and its label
keep_columns = ['text', 'class']
data = data[keep_columns]

print("Final columns:", list(data.columns))
data.head(n=5)

### Step 11: Shuffle Data

In [None]:
# Shuffle all rows (frac=1 samples the whole frame without replacement) and
# renumber them 0..n-1 afterwards.
# A fixed random_state makes the row order -- and therefore every later
# result that depends on it -- reproducible from one run to the next.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

print("Data shuffled!")
data.head()

### Step 12: Text Cleaning

In [None]:
# Cleaning function
# Patterns are compiled once, outside the function, because clean_text is
# applied to every row of the dataset.
_URL_RE = re.compile(r'https?://\S+|\bwww\.\S+')  # scheme URLs and bare www. links
_NON_LETTER_RE = re.compile(r'[^a-z\s]')  # input is already lowercased
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalise raw text into lowercase, letters-only, single-spaced words.

    Steps, in order: lowercase, strip URLs (``http://``, ``https://`` and
    bare ``www.`` links), delete every character that is not an ASCII letter
    or whitespace, collapse whitespace runs to one space, trim the ends.

    Args:
        text: The raw string to clean. Any non-string value (for example the
            float NaN pandas uses for a missing cell, or None) is treated as
            empty text instead of raising.

    Returns:
        The cleaned string; ``''`` when nothing is left or the input was not
        a string.
    """
    if not isinstance(text, str):
        # Missing cells arrive as float('nan') / None, which have no .lower()
        return ''
    text = text.lower()
    # URLs go first, while their punctuation is still there to delimit them
    text = _URL_RE.sub('', text)
    text = _NON_LETTER_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

# Quick demonstration: show one string before and after cleaning
sample = "Check this OUT!!! Visit https://example.com for more... #FakeNews123"
for tag, shown in (("Original:", sample), ("Cleaned:", clean_text(sample))):
    print(tag, shown)

### Step 13: Apply Cleaning

In [None]:
# Run clean_text over every article, replacing the 'text' column in place
print("Cleaning text...")
data['text'] = data['text'].apply(clean_text)
print("Done!")

# Show the first 150 characters of the first row.
# .iloc[0] selects by position, so this works whatever the index labels are;
# the label lookup data['text'][0] only works while a row labelled 0 exists
# and is unique.
print("\nCleaned sample:")
print(data['text'].iloc[0][:150])

### Step 14: Final Check

In [None]:
# Gather the summary figures first, then print the report in one go
class_counts = data['class'].value_counts()
n_missing = data.isnull().sum().sum()

print("=== Final Dataset ===")
print(f"Shape: {data.shape}")
print("\nClass counts:")
print(class_counts)
print(f"\nMissing values: {n_missing}")
print("\n✓ Dataset ready for modeling!")

---
## Summary

**EDA Done:**
- ✓ Checked data structure
- ✓ Visualized class distribution
- ✓ Analyzed subjects
- ✓ Compared text lengths

**Preprocessing Done:**
- ✓ Added class labels
- ✓ Merged datasets
- ✓ Removed extra columns
- ✓ Shuffled data
- ✓ Cleaned text

**Result:** Clean dataset ready for training!