# In [None]:
# 🧠 02_Exploratory_Data_Analysis.ipynb
# Author: Reckless_Babu
# Description: EDA for Mental Health Detection project using cleaned dataset.

# === Imports ===
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.probability import FreqDist
import os

# === Load Cleaned Dataset ===
# Relative path: resolved against the current working directory at run time.
data_path = "../data/processed/cleaned_data_nltk.csv"
df = pd.read_csv(data_path)

print(f"‚úÖ Data Loaded Successfully! Shape: {df.shape}")
print(df.head())

# === Basic Info ===
print("\nüìä Dataset Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

# Per-column count of missing values.
print("\nüîç Missing Values:")
print(df.isnull().sum())

# === Text Length Analysis ===
# Word count per document (whitespace tokenisation). Missing texts are treated
# as empty so they count as 0 words; converting NaN with str() would yield the
# literal "nan" and be counted as one word.
df['text_length'] = (
    df['clean_text'].fillna('').astype(str).str.split().str.len()
)

# Histogram of document lengths, measured in words.
plt.figure(figsize=(8,5))
plt.hist(df['text_length'], bins=30, edgecolor='black')
plt.title("üìù Text Length Distribution")
plt.xlabel("Number of Words")
plt.ylabel("Frequency")
plt.show()

# === Word Frequency ===
# Flatten the whole corpus into one list of whitespace-separated tokens.
# Missing texts are replaced with '' first: str.join raises TypeError as soon
# as it meets a non-string item such as a NaN float.
all_words = ' '.join(df['clean_text'].fillna('').astype(str)).split()
freq_dist = nltk.FreqDist(all_words)

# Convert the 20 most common (word, count) pairs to a DataFrame for plotting.
freq_df = pd.DataFrame(freq_dist.most_common(20), columns=['Word', 'Frequency'])

# Horizontal bar chart: one bar per word, bar length = frequency.
plt.figure(figsize=(10,5))
sns.barplot(x='Frequency', y='Word', data=freq_df)
plt.title("üî† Top 20 Most Frequent Words")
plt.show()

# === WordCloud Visualization ===
# Join every document into a single string for the word cloud. Missing texts
# are replaced with '' first: str.join raises TypeError on NaN floats.
corpus_text = ' '.join(df['clean_text'].fillna('').astype(str))
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(corpus_text)

# Render the generated cloud as an image, without axis ticks or frame.
plt.figure(figsize=(12,6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title("üå•Ô∏è Word Cloud of Cleaned Text")
plt.show()

# === Optional: Sentiment Label Counts (if dataset has 'label' column) ===
# The whole section is skipped when the dataset has no 'label' column.
if 'label' in df.columns:
    # value_counts() sorts by descending frequency, which fixes the bar order.
    label_order = df['label'].value_counts().index

    plt.figure(figsize=(8,5))
    # countplot draws on the current axes and returns them for labelling.
    label_ax = sns.countplot(x='label', data=df, order=label_order)
    label_ax.set_title("üí¨ Emotion / Sentiment Distribution")
    label_ax.set_xlabel("Label")
    label_ax.set_ylabel("Count")
    plt.show()

# === Common Bigrams (pairs of words) ===
from nltk import bigrams
from collections import Counter

# Build bigrams one document at a time. Pairing over the concatenated corpus
# would also pair the last word of each document with the first word of the
# next one, counting word pairs that occur in no text.
# Missing texts (NaN) are treated as empty and contribute no bigrams.
all_bigrams = []
for doc_text in df['clean_text'].fillna('').astype(str):
    doc_tokens = doc_text.split()
    if len(doc_tokens) < 2:
        # Fewer than two tokens: no word pair can be formed.
        continue
    all_bigrams.extend(bigrams(doc_tokens))

bigram_freq = Counter(all_bigrams)
bigram_df = pd.DataFrame(bigram_freq.most_common(15), columns=['Bigram', 'Count'])

plt.figure(figsize=(10,5))
# Each bigram is a (word, word) tuple; cast to str to use it as a bar label.
sns.barplot(y=bigram_df['Bigram'].astype(str), x=bigram_df['Count'])
plt.title("ü™∂ Top 15 Bigrams")
plt.xlabel("Count")
plt.ylabel("Word Pairs")
plt.show()

print("\n‚úÖ EDA Completed Successfully!")
