# Exploratory Data Analysis (EDA)

This notebook performs a comprehensive exploratory data analysis of Reddit post data.

## Objectives
1. Understand data structure and types
2. Identify missing values and data quality issues
3. Analyze distributions of key variables
4. Explore relationships between variables
5. Generate insights and visualizations

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: matplotlib style sheet first, then the seaborn color palette
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# pandas display options, applied in the order listed
# (None for max_columns removes the column limit when printing frames)
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

print("Libraries imported successfully")

In [None]:
# Load data
# The ingested CSV is the preferred input; when it is absent, use the most
# recently modified CSV found in the output directory instead.
data_path = Path("../data/processed/ingested_data.csv")

if not data_path.exists():
    # Fallback to output directory
    output_path = Path("../data/output")
    csv_files = list(output_path.glob("*.csv"))
    if not csv_files:
        raise FileNotFoundError("No data files found")

    # Newest file wins, judged by filesystem modification time
    latest_file = max(csv_files, key=lambda p: p.stat().st_mtime)
    df = pd.read_csv(latest_file)
    print(f"Loaded {len(df)} rows from {latest_file.name}")
else:
    df = pd.read_csv(data_path)
    print(f"Loaded {len(df)} rows and {len(df.columns)} columns")

## 1. Data Overview

In [None]:
# Basic information: shape, column names, dtypes and deep memory footprint
banner = "=" * 50
print(banner)
print("DATA OVERVIEW")
print(banner)

print(f"\nShape: {df.shape}")
print(f"\nColumns: {list(df.columns)}")
print(f"\nData Types:\n{df.dtypes}")

# deep=True also counts the contents of object (e.g. string) columns
memory_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"\nMemory Usage: {memory_mb:.2f} MB")

In [None]:
# First few rows
# Kept as a bare trailing expression so the notebook shows the returned frame
# as the cell output.
df.head()

In [None]:
# Statistical summary
# include='all' asks describe() to summarize every column rather than only the
# numeric ones. Kept as a bare trailing expression so the notebook shows the
# result as the cell output.
df.describe(include='all')

## 2. Missing Values Analysis

In [None]:
# Missing values
# Build a per-column table of null counts and percentages, print the columns
# that have at least one missing value, and chart their missing percentages.

# Count nulls once and reuse the result for both derived columns.
missing_counts = df.isnull().sum()
missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing Count': missing_counts,
    'Missing Percentage': (missing_counts / len(df) * 100).round(2)
}).sort_values('Missing Count', ascending=False)

# Only columns that actually have gaps are reported and plotted.
columns_with_missing = missing_data[missing_data['Missing Count'] > 0]

print("Missing Values Analysis:")
print(columns_with_missing)

# Visualize missing values
if not columns_with_missing.empty:
    # DataFrame.plot opens its own figure unless it is handed an Axes, so a
    # separate plt.figure() call would be left behind as an empty figure.
    # Create the Axes explicitly and draw into it instead.
    fig, ax = plt.subplots(figsize=(10, 6))
    columns_with_missing.plot(
        x='Column', y='Missing Percentage', kind='barh', ax=ax
    )
    ax.set_title('Missing Values by Column')
    ax.set_xlabel('Missing Percentage (%)')
    plt.tight_layout()
    plt.show()

## 3. Subreddit Distribution

In [None]:
# Subreddit distribution
# Prints the number of posts per subreddit and shows the same counts as a bar
# chart (left) and a pie chart (right). Skipped when there is no 'subreddit'
# column.
if 'subreddit' in df.columns:
    subreddit_counts = df['subreddit'].value_counts()

    print("Subreddit Distribution:")
    print(subreddit_counts)

    # One row, two panels
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    bar_ax, pie_ax = axes

    # Left panel: absolute post counts
    subreddit_counts.plot.bar(ax=bar_ax, color='steelblue')
    bar_ax.set_title('Posts by Subreddit')
    bar_ax.set_xlabel('Subreddit')
    bar_ax.set_ylabel('Number of Posts')
    bar_ax.tick_params(axis='x', rotation=45)

    # Right panel: share of posts; the y-label is blanked out
    subreddit_counts.plot.pie(ax=pie_ax, autopct='%1.1f%%')
    pie_ax.set_title('Subreddit Distribution')
    pie_ax.set_ylabel('')

    plt.tight_layout()
    plt.show()

## 4. Score and Engagement Analysis

In [None]:
# Score distribution
# 2x2 figure: raw score histogram, log-scaled score histogram and, when a
# 'num_comments' column exists, the comment-count histogram and a
# score-vs-comments scatter. Summary statistics are printed afterwards.
if 'score' in df.columns:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Score distribution
    df['score'].hist(bins=50, ax=axes[0, 0], color='skyblue', edgecolor='black')
    axes[0, 0].set_title('Score Distribution')
    axes[0, 0].set_xlabel('Score')
    axes[0, 0].set_ylabel('Frequency')

    # Log scale
    # Negative scores are clipped to 0 before log1p, so they land in the zero
    # bin. The transform is vectorized, and missing scores stay NaN, so the
    # histogram skips them exactly like the raw-score histogram does instead
    # of counting them as zero-score posts.
    log_scores = np.log1p(df['score'].clip(lower=0))
    log_scores.hist(bins=50, ax=axes[0, 1], color='lightgreen', edgecolor='black')
    axes[0, 1].set_title('Score Distribution (Log Scale)')
    axes[0, 1].set_xlabel('Log(Score + 1)')
    axes[0, 1].set_ylabel('Frequency')

    # Comments distribution
    if 'num_comments' in df.columns:
        df['num_comments'].hist(bins=50, ax=axes[1, 0], color='coral', edgecolor='black')
        axes[1, 0].set_title('Number of Comments Distribution')
        axes[1, 0].set_xlabel('Number of Comments')
        axes[1, 0].set_ylabel('Frequency')

        # Score vs Comments
        axes[1, 1].scatter(df['num_comments'], df['score'], alpha=0.5, s=10)
        axes[1, 1].set_title('Score vs Number of Comments')
        axes[1, 1].set_xlabel('Number of Comments')
        axes[1, 1].set_ylabel('Score')
    else:
        # Nothing to draw in the bottom row; hide the panels rather than
        # showing two empty axes.
        axes[1, 0].set_visible(False)
        axes[1, 1].set_visible(False)

    plt.tight_layout()
    plt.show()

    # Summary statistics
    print("\nScore Statistics:")
    print(df['score'].describe())
    if 'num_comments' in df.columns:
        print("\nComments Statistics:")
        print(df['num_comments'].describe())

## 5. Temporal Analysis

In [None]:
# Convert created_utc to datetime if needed
# Adds created_date / created_hour / created_day_of_week columns to df and
# draws a 2x2 figure: posts per day, posts per hour, posts per weekday and,
# when a 'score' column exists, the average score per hour.
if 'created_utc' in df.columns:
    if not pd.api.types.is_datetime64_any_dtype(df['created_utc']):
        raw_created = df['created_utc']
        epoch_seconds = pd.to_numeric(raw_created, errors='coerce')
        if epoch_seconds.notna().any() or raw_created.isna().all():
            # At least one value is numeric (or the column is entirely
            # empty): interpret values as Unix epoch seconds; anything
            # non-numeric becomes NaT.
            df['created_utc'] = pd.to_datetime(epoch_seconds, unit='s', errors='coerce')
        else:
            # No value is numeric, e.g. datetimes that were written to CSV as
            # text. Parsing those with unit='s' would coerce every row to NaT,
            # so parse them as date strings instead.
            # NOTE(review): values are taken to be UTC (per the column name)
            # and made timezone-naive to match the epoch branch - confirm.
            df['created_utc'] = pd.to_datetime(
                raw_created, errors='coerce', utc=True
            ).dt.tz_localize(None)

    df['created_date'] = df['created_utc'].dt.date
    df['created_hour'] = df['created_utc'].dt.hour
    df['created_day_of_week'] = df['created_utc'].dt.day_name()

    # Temporal visualizations
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Posts over time
    daily_counts = df.groupby('created_date').size()
    daily_counts.plot(ax=axes[0, 0], color='steelblue')
    axes[0, 0].set_title('Posts Over Time')
    axes[0, 0].set_xlabel('Date')
    axes[0, 0].set_ylabel('Number of Posts')
    axes[0, 0].tick_params(axis='x', rotation=45)

    # Posts by hour
    hourly_counts = df['created_hour'].value_counts().sort_index()
    hourly_counts.plot(kind='bar', ax=axes[0, 1], color='lightcoral')
    axes[0, 1].set_title('Posts by Hour of Day')
    axes[0, 1].set_xlabel('Hour')
    axes[0, 1].set_ylabel('Number of Posts')

    # Posts by day of week
    # Reindex to calendar order; weekdays with no posts get an explicit 0
    # instead of NaN.
    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    day_counts = df['created_day_of_week'].value_counts().reindex(day_order, fill_value=0)
    day_counts.plot(kind='bar', ax=axes[1, 0], color='mediumseagreen')
    axes[1, 0].set_title('Posts by Day of Week')
    axes[1, 0].set_xlabel('Day of Week')
    axes[1, 0].set_ylabel('Number of Posts')
    axes[1, 0].tick_params(axis='x', rotation=45)

    # Average score by hour
    if 'score' in df.columns:
        avg_score_by_hour = df.groupby('created_hour')['score'].mean()
        avg_score_by_hour.plot(ax=axes[1, 1], color='gold', marker='o')
        axes[1, 1].set_title('Average Score by Hour')
        axes[1, 1].set_xlabel('Hour')
        axes[1, 1].set_ylabel('Average Score')
    else:
        # No score data: hide the unused panel rather than showing empty axes.
        axes[1, 1].set_visible(False)

    plt.tight_layout()
    plt.show()

## 6. Text Analysis

In [None]:
# Text length analysis
# Adds title_length (characters) and title_word_count (whitespace-separated
# tokens) columns to df, plots a histogram of each and prints their means.
# Skipped when there is no 'title' column.
if 'title' in df.columns:
    title_text = df['title']
    df['title_length'] = title_text.str.len()
    df['title_word_count'] = title_text.str.split().str.len()

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # (column, bins, color, panel title, x-axis label) for each panel, left to right
    histogram_panels = (
        ('title_length', 50, 'plum', 'Title Length Distribution', 'Character Count'),
        ('title_word_count', 30, 'khaki', 'Title Word Count Distribution', 'Word Count'),
    )
    for panel_ax, (column_name, n_bins, fill_color, panel_title, x_label) in zip(axes, histogram_panels):
        df[column_name].hist(bins=n_bins, ax=panel_ax, color=fill_color, edgecolor='black')
        panel_ax.set_title(panel_title)
        panel_ax.set_xlabel(x_label)
        panel_ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

    print("\nTitle Statistics:")
    print(f"Average length: {df['title_length'].mean():.2f} characters")
    print(f"Average word count: {df['title_word_count'].mean():.2f} words")

## 7. Correlation Analysis

In [None]:
# Correlation matrix
# Pairwise correlations across all numeric columns, shown as an annotated
# heatmap, followed by a listing of every pair with |r| above 0.5.
numeric_cols = df.select_dtypes(include=[np.number]).columns
if len(numeric_cols) > 1:
    correlation_matrix = df[numeric_cols].corr()

    heatmap_options = dict(
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, **heatmap_options)
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()

    # Print strong correlations
    print("\nStrong Correlations (|r| > 0.5):")
    labels = correlation_matrix.columns
    # Walk only the cells above the diagonal so each pair is reported once
    for i, first_label in enumerate(labels):
        for j in range(i + 1, len(labels)):
            corr_val = correlation_matrix.iloc[i, j]
            if abs(corr_val) > 0.5:
                print(f"{first_label} - {labels[j]}: {corr_val:.3f}")

## 8. Key Insights Summary

In [None]:
# Generate summary insights
# Prints headline figures for the dataset and writes the (possibly augmented)
# frame to ../data/processed/eda_data.csv. Every column-dependent line is
# guarded, matching the optional-column checks used by the analysis cells,
# so a missing column yields a note instead of a KeyError.
print("=" * 50)
print("EDA SUMMARY INSIGHTS")
print("=" * 50)

print(f"\n1. Dataset Size: {len(df)} posts, {len(df.columns)} features")

# created_date is only added by the temporal-analysis cell, and only when the
# data has a created_utc column.
if 'created_date' in df.columns:
    # Timestamps that failed to parse were coerced to missing values; drop
    # them so they do not take part in the min/max comparison.
    valid_dates = df['created_date'].dropna()
    print(f"\n2. Date Range: {valid_dates.min()} to {valid_dates.max()}")
else:
    print("\n2. Date Range: not available (no 'created_date' column)")

if 'subreddit' in df.columns:
    print(f"\n3. Subreddits: {df['subreddit'].nunique()} unique subreddits")
else:
    print("\n3. Subreddits: not available (no 'subreddit' column)")

if 'score' in df.columns:
    print(f"\n4. Score Range: {df['score'].min()} to {df['score'].max()} (mean: {df['score'].mean():.2f})")
if 'num_comments' in df.columns:
    print(f"\n5. Comments Range: {df['num_comments'].min()} to {df['num_comments'].max()} (mean: {df['num_comments'].mean():.2f})")
print(f"\n6. Missing Values: {df.isnull().sum().sum()} total missing values")

if 'id' in df.columns:
    print(f"\n7. Duplicate IDs: {df['id'].duplicated().sum()} duplicates")
else:
    print("\n7. Duplicate IDs: not available (no 'id' column)")

# Save processed data for next steps
output_path = Path("../data/processed/eda_data.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)
print(f"\n8. Processed data saved to: {output_path}")