# Exploratory Data Analysis: Mental Health Problems

## Introduction

This notebook presents a comprehensive exploratory data analysis (EDA) of mental health survey data. The goal is to uncover insights about the factors that affect mental health and to identify patterns that can help in understanding and addressing mental health challenges.

### Objectives
1. Understand the distribution of mental health indicators across demographics
2. Identify correlations between lifestyle factors and mental health scores
3. Analyze the impact of work-related factors on mental well-being
4. Provide actionable insights for mental health interventions

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Set visualization styles
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
%matplotlib inline

print("Libraries loaded successfully!")

## 1. Data Loading and Initial Exploration

In [None]:
# Read the survey responses from the relative data directory
survey_csv = '../data/mental_health_survey.csv'
df = pd.read_csv(survey_csv)

# Report the (rows, columns) shape, leave a blank line, then preview the leading rows
print("Dataset Shape:", df.shape, end="\n\n")
print("First few rows of the dataset:")
df.head()

In [None]:
# Banner followed by the per-column dtype / non-null report
print("Dataset Information:", "=" * 50, sep="\n")
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) as produced by describe()
print("Statistical Summary:")
numeric_summary = df.describe()
numeric_summary

In [None]:
# Count missing cells per column and show only the columns that have any
print("Missing Values:")
missing = df.isna().sum()
missing.loc[missing.gt(0)]

In [None]:
# List every object-dtype column with its number of distinct values and their frequencies
categorical_cols = df.select_dtypes(include=['object']).columns
print("Unique values in categorical columns:", "=" * 50, sep="\n")
for col in categorical_cols:
    column_values = df[col]
    distinct_total = column_values.nunique()
    print(f"\n{col}: {distinct_total} unique values")
    print(column_values.value_counts())

## 2. Data Cleaning and Preprocessing

In [None]:
# Create a copy for analysis so the raw frame stays untouched
df_clean = df.copy()

# Handle 'N/A' values in work_life_balance.
# pandas.read_csv treats the text 'N/A' as a missing value by default, so these cells
# normally arrive as NaN rather than as the string 'N/A'. Cover both representations,
# otherwise the 'Unknown' category used by later plots never appears.
df_clean['work_life_balance'] = (
    df_clean['work_life_balance'].replace('N/A', 'Unknown').fillna('Unknown')
)

# Create age groups for analysis (pd.cut bins are right-inclusive: (0, 25], (25, 35], ...)
bins = [0, 25, 35, 45, 55, 100]
labels = ['18-25', '26-35', '36-45', '46-55', '55+']
df_clean['age_group'] = pd.cut(df_clean['age'], bins=bins, labels=labels)

# Create work hours categories; the (-1, 0] bin isolates respondents reporting 0 hours
df_clean['work_hours_category'] = pd.cut(
    df_clean['work_hours_per_week'],
    bins=[-1, 0, 35, 45, 55, 100],
    labels=['Not Working', 'Part-time', 'Standard', 'Overtime', 'Excessive']
)

print("Data preprocessing completed!")
print("\nAge Group Distribution:")
print(df_clean['age_group'].value_counts())

## 3. Demographic Analysis

In [None]:
# Demographic overview: a 2x2 grid with one panel per attribute
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
(age_ax, gender_ax), (education_ax, marital_ax) = axes

# Top-left: histogram of ages with a dashed marker at the mean
ages = df_clean['age']
mean_age = ages.mean()
age_ax.hist(ages, bins=15, edgecolor='black', color='steelblue', alpha=0.7)
age_ax.set_xlabel('Age', fontsize=12)
age_ax.set_ylabel('Frequency', fontsize=12)
age_ax.set_title('Age Distribution', fontsize=14, fontweight='bold')
age_ax.axvline(mean_age, color='red', linestyle='--', label=f'Mean: {mean_age:.1f}')
age_ax.legend()

# Top-right: gender shares as a pie with every wedge slightly pulled out
gender_counts = df_clean['gender'].value_counts()
gender_ax.pie(
    gender_counts.values,
    labels=gender_counts.index,
    autopct='%1.1f%%',
    colors=sns.color_palette('pastel'),
    explode=[0.05 for _ in gender_counts.index],
)
gender_ax.set_title('Gender Distribution', fontsize=14, fontweight='bold')

# Bottom-left: education levels in a fixed order; levels absent from the data are dropped
education_order = ["High School", "Associate's", "Bachelor's", "Master's", "MBA", "Doctorate"]
raw_education_counts = df_clean['education_level'].value_counts()
edu_counts = raw_education_counts.reindex(education_order).dropna()
education_ax.barh(edu_counts.index, edu_counts.values, color='teal', alpha=0.7)
education_ax.set_xlabel('Count', fontsize=12)
education_ax.set_title('Education Level Distribution', fontsize=14, fontweight='bold')

# Bottom-right: marital status counts, ordered by frequency
marital_counts = df_clean['marital_status'].value_counts()
marital_ax.bar(
    marital_counts.index,
    marital_counts.values,
    color='coral',
    alpha=0.7,
    edgecolor='black',
)
marital_ax.set_xlabel('Marital Status', fontsize=12)
marital_ax.set_ylabel('Count', fontsize=12)
marital_ax.set_title('Marital Status Distribution', fontsize=14, fontweight='bold')

# Persist the figure before showing it
plt.tight_layout()
plt.savefig('../data/demographic_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 4. Mental Health Indicators Analysis

In [None]:
# Histograms of the three mental health indicators, one panel each
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# (column, axis/title text, histogram bins, bar color, mean-line color)
score_panels = [
    ('overall_mental_health_score', 'Overall Mental Health Score', 20, 'purple', 'red'),
    ('anxiety_score', 'Anxiety Score', 10, 'orange', 'red'),
    ('depression_score', 'Depression Score', 10, 'indianred', 'blue'),
]

for panel_ax, (column, caption, n_bins, bar_color, mean_color) in zip(axes, score_panels):
    scores = df_clean[column]
    average = scores.mean()
    panel_ax.hist(scores, bins=n_bins, edgecolor='black', color=bar_color, alpha=0.7)
    panel_ax.set_xlabel(caption, fontsize=12)
    panel_ax.set_ylabel('Frequency', fontsize=12)
    panel_ax.set_title(f'{caption} Distribution', fontsize=14, fontweight='bold')
    # Dashed vertical line marks the mean; its value is shown in the legend
    panel_ax.axvline(average, color=mean_color, linestyle='--', label=f'Mean: {average:.1f}')
    panel_ax.legend()

plt.tight_layout()
plt.savefig('../data/mental_health_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

In [None]:
# Treatment seeking analysis: share of respondents who sought treatment (left) and
# average indicator scores split by treatment status (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Treatment seeking distribution.
# value_counts() orders answers by frequency, so wedge labels and colors are looked up
# from the answer itself rather than assigned by position (positional labels would be
# swapped whenever 'Yes' is the more common answer).
# NOTE(review): assumes answers are recorded as 'Yes'/'No' (other cells compare against
# 'Yes'); any other value is shown with its raw text and a gray wedge.
treatment_counts = df_clean['has_sought_treatment'].value_counts()
treatment_labels = {'No': 'Has Not Sought Treatment', 'Yes': 'Has Sought Treatment'}
treatment_colors = {'No': '#ff6b6b', 'Yes': '#4ecdc4'}
colors = [treatment_colors.get(answer, 'gray') for answer in treatment_counts.index]
axes[0].pie(treatment_counts.values,
            labels=[treatment_labels.get(answer, str(answer)) for answer in treatment_counts.index],
            autopct='%1.1f%%', colors=colors, explode=[0.02] * len(treatment_counts), startangle=90)
axes[0].set_title('Treatment Seeking Behavior', fontsize=14, fontweight='bold')

# Mental health scores by treatment seeking
treatment_stats = df_clean.groupby('has_sought_treatment').agg({
    'anxiety_score': 'mean',
    'depression_score': 'mean',
    'overall_mental_health_score': 'mean'
}).round(2)

x = np.arange(len(treatment_stats.index))
width = 0.25

# Three side-by-side bars per group; the overall score is divided by 10 so it shares
# an axis with the anxiety and depression scores
bars1 = axes[1].bar(x - width, treatment_stats['anxiety_score'], width, label='Anxiety Score', color='orange')
bars2 = axes[1].bar(x, treatment_stats['depression_score'], width, label='Depression Score', color='indianred')
bars3 = axes[1].bar(x + width, treatment_stats['overall_mental_health_score']/10, width,
                    label='Mental Health Score/10', color='purple')

axes[1].set_xlabel('Has Sought Treatment', fontsize=12)
axes[1].set_ylabel('Average Score', fontsize=12)
axes[1].set_title('Mental Health Indicators by Treatment Status', fontsize=14, fontweight='bold')
axes[1].set_xticks(x)
# Tick labels come from the grouped index so they always match the bars drawn above
axes[1].set_xticklabels([str(answer) for answer in treatment_stats.index])
axes[1].legend()

plt.tight_layout()
plt.savefig('../data/treatment_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nTreatment Seeking Statistics:")
print(treatment_stats)

## 5. Lifestyle Factors and Mental Health

In [None]:
# Lifestyle factors: average scores grouped by sleep hours and by activity days
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Group once per lifestyle factor and reuse the grouping for each indicator
by_sleep = df_clean.groupby('sleep_hours_per_day')
by_activity = df_clean.groupby('physical_activity_days_per_week')

sleep_mental = by_sleep['overall_mental_health_score'].mean()
activity_mental = by_activity['overall_mental_health_score'].mean()
sleep_anxiety = by_sleep['anxiety_score'].mean()
activity_depression = by_activity['depression_score'].mean()

# Top row: bar charts of the average overall mental health score
bar_panels = [
    (axes[0, 0], sleep_mental, 'navy',
     'Sleep Hours per Day', 'Sleep Hours vs Mental Health Score'),
    (axes[0, 1], activity_mental, 'green',
     'Physical Activity Days per Week', 'Physical Activity vs Mental Health Score'),
]
for panel_ax, averages, bar_color, x_caption, heading in bar_panels:
    panel_ax.bar(averages.index, averages.values, color=bar_color, alpha=0.7, edgecolor='black')
    panel_ax.set_xlabel(x_caption, fontsize=12)
    panel_ax.set_ylabel('Average Mental Health Score', fontsize=12)
    panel_ax.set_title(heading, fontsize=14, fontweight='bold')

# Bottom row: line charts with a shaded area underneath
line_panels = [
    (axes[1, 0], sleep_anxiety, 'o', 'orange',
     'Sleep Hours per Day', 'Average Anxiety Score', 'Sleep Hours vs Anxiety Score'),
    (axes[1, 1], activity_depression, 's', 'indianred',
     'Physical Activity Days per Week', 'Average Depression Score',
     'Physical Activity vs Depression Score'),
]
for panel_ax, averages, marker_shape, line_color, x_caption, y_caption, heading in line_panels:
    panel_ax.plot(averages.index, averages.values, marker=marker_shape, linewidth=2, color=line_color)
    panel_ax.fill_between(averages.index, averages.values, alpha=0.3, color=line_color)
    panel_ax.set_xlabel(x_caption, fontsize=12)
    panel_ax.set_ylabel(y_caption, fontsize=12)
    panel_ax.set_title(heading, fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('../data/lifestyle_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Work-Related Factors Analysis

In [None]:
# Work hours and stress level analysis: category counts (top row) and average overall
# mental health score by stress level and by work-life balance (bottom row)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Work hours category distribution, shown in a fixed logical order
work_hours_order = ['Not Working', 'Part-time', 'Standard', 'Overtime', 'Excessive']
work_hours_counts = df_clean['work_hours_category'].value_counts().reindex(work_hours_order)
axes[0, 0].bar(work_hours_counts.index, work_hours_counts.values, color='steelblue', alpha=0.7, edgecolor='black')
axes[0, 0].set_xlabel('Work Hours Category', fontsize=12)
axes[0, 0].set_ylabel('Count', fontsize=12)
axes[0, 0].set_title('Distribution of Work Hours Categories', fontsize=14, fontweight='bold')
axes[0, 0].tick_params(axis='x', rotation=45)

# Stress level distribution; levels outside Low/Medium/High are drawn in gray
stress_counts = df_clean['stress_level'].value_counts()
colors = {'Low': 'green', 'Medium': 'yellow', 'High': 'red'}
axes[0, 1].bar(stress_counts.index, stress_counts.values,
               color=[colors.get(x, 'gray') for x in stress_counts.index], alpha=0.7, edgecolor='black')
axes[0, 1].set_xlabel('Stress Level', fontsize=12)
axes[0, 1].set_ylabel('Count', fontsize=12)
axes[0, 1].set_title('Stress Level Distribution', fontsize=14, fontweight='bold')

# Mental health by stress level
stress_mental = df_clean.groupby('stress_level')['overall_mental_health_score'].mean()
stress_order = ['Low', 'Medium', 'High']
stress_mental = stress_mental.reindex(stress_order)
axes[1, 0].bar(stress_mental.index, stress_mental.values,
               color=[colors.get(x, 'gray') for x in stress_mental.index], alpha=0.7, edgecolor='black')
axes[1, 0].set_xlabel('Stress Level', fontsize=12)
axes[1, 0].set_ylabel('Average Mental Health Score', fontsize=12)
axes[1, 0].set_title('Mental Health Score by Stress Level', fontsize=14, fontweight='bold')

# Work-life balance vs mental health
wlb_order = ['Poor', 'Fair', 'Good', 'Excellent', 'Unknown']
wlb_mental = df_clean.groupby('work_life_balance')['overall_mental_health_score'].mean()
wlb_mental = wlb_mental.reindex(wlb_order).dropna()
# Colors are tied to the category name: after dropna() a positional slice of the
# palette would shift every color that follows a missing category.
wlb_palette = dict(zip(wlb_order, ['#ff6b6b', '#feca57', '#48dbfb', '#1dd1a1', '#c8d6e5']))
colors_wlb = [wlb_palette[level] for level in wlb_mental.index]
axes[1, 1].bar(wlb_mental.index, wlb_mental.values, color=colors_wlb, alpha=0.7, edgecolor='black')
axes[1, 1].set_xlabel('Work-Life Balance', fontsize=12)
axes[1, 1].set_ylabel('Average Mental Health Score', fontsize=12)
axes[1, 1].set_title('Mental Health Score by Work-Life Balance', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('../data/work_factors_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 7. Social Support and Mental Health

In [None]:
# Social support: how many respondents fall in each level, and how the indicators vary
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
distribution_ax, indicators_ax = axes

# Fixed ordering (worst to best) and one color per level
support_order = ['Poor', 'Fair', 'Good', 'Excellent']
colors_support = ['#ff6b6b', '#feca57', '#48dbfb', '#1dd1a1']

# Left panel: respondent count per support level
level_frequencies = df_clean['social_support'].value_counts()
support_counts = level_frequencies.reindex(support_order)
distribution_ax.bar(
    support_counts.index,
    support_counts.values,
    color=colors_support,
    alpha=0.7,
    edgecolor='black',
)
distribution_ax.set_xlabel('Social Support Level', fontsize=12)
distribution_ax.set_ylabel('Count', fontsize=12)
distribution_ax.set_title('Social Support Distribution', fontsize=14, fontweight='bold')

# Right panel: mean of each indicator per support level, in the same fixed order
indicator_columns = ['anxiety_score', 'depression_score', 'overall_mental_health_score']
support_stats = (
    df_clean.groupby('social_support')[indicator_columns]
    .mean()
    .reindex(support_order)
)

x = np.arange(len(support_order))
width = 0.25

# (horizontal shift, bar heights, legend label, color); the overall score is divided
# by 10 so all three series share one axis
grouped_series = [
    (-width, support_stats['anxiety_score'], 'Anxiety Score', 'orange'),
    (0, support_stats['depression_score'], 'Depression Score', 'indianred'),
    (width, support_stats['overall_mental_health_score'] / 10, 'Mental Health Score/10', 'purple'),
]
bars1, bars2, bars3 = [
    indicators_ax.bar(x + shift, heights, width, label=legend_text, color=bar_color)
    for shift, heights, legend_text, bar_color in grouped_series
]

indicators_ax.set_xlabel('Social Support Level', fontsize=12)
indicators_ax.set_ylabel('Average Score', fontsize=12)
indicators_ax.set_title('Mental Health Indicators by Social Support Level', fontsize=14, fontweight='bold')
indicators_ax.set_xticks(x)
indicators_ax.set_xticklabels(support_order)
indicators_ax.legend()

plt.tight_layout()
plt.savefig('../data/social_support_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

print("\nSocial Support Statistics:")
print(support_stats.round(2))

## 8. Correlation Analysis

In [None]:
# Columns that take part in the pairwise correlation analysis
numerical_cols = [
    'age',
    'work_hours_per_week',
    'sleep_hours_per_day',
    'physical_activity_days_per_week',
    'anxiety_score',
    'depression_score',
    'overall_mental_health_score',
]

# Pairwise correlations between the selected columns
numeric_view = df_clean[numerical_cols]
correlation_matrix = numeric_view.corr()

# Heatmap of the lower triangle only: the mask hides the diagonal and everything above it
plt.figure(figsize=(10, 8))
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    cmap='RdYlBu_r',
    center=0,
    square=True,
    linewidths=0.5,
    fmt='.2f',
    annot_kws={'size': 10},
)
plt.title('Correlation Matrix of Mental Health Factors', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('../data/correlation_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

# Correlation of every other column with the overall score, sorted ascending
print("\nCorrelations with Overall Mental Health Score:")
print("=" * 50)
target_column = 'overall_mental_health_score'
score_correlations = correlation_matrix[target_column]
mental_health_corr = score_correlations.drop(target_column).sort_values()
print(mental_health_corr)

## 9. Occupation-Based Analysis

In [None]:
# Per-occupation averages of the three indicators plus the number of respondents
occupation_stats = (
    df_clean.groupby('occupation')
    .agg(
        anxiety_score=('anxiety_score', 'mean'),
        depression_score=('depression_score', 'mean'),
        overall_mental_health_score=('overall_mental_health_score', 'mean'),
        count=('id', 'count'),
    )
    .round(2)
)

# The ten occupations at each end of the overall mental health score ranking
ranking_column = 'overall_mental_health_score'
lowest_mental_health = occupation_stats.nsmallest(10, ranking_column)
highest_mental_health = occupation_stats.nlargest(10, ranking_column)

# Columns shown in both printed tables
report_columns = ['overall_mental_health_score', 'anxiety_score', 'depression_score']

print("Occupations with LOWEST Mental Health Scores:")
print("=" * 60)
print(lowest_mental_health[report_columns])

print("\n\nOccupations with HIGHEST Mental Health Scores:")
print("=" * 60)
print(highest_mental_health[report_columns])

In [None]:
# Side-by-side horizontal bar charts of the lowest- and highest-scoring occupations
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# (table to plot, bar color, panel title)
occupation_panels = [
    (lowest_mental_health, 'indianred', 'Occupations with Lowest Mental Health Scores'),
    (highest_mental_health, 'seagreen', 'Occupations with Highest Mental Health Scores'),
]

for panel_ax, (ranked, bar_color, heading) in zip(axes, occupation_panels):
    panel_ax.barh(
        ranked.index,
        ranked['overall_mental_health_score'],
        color=bar_color,
        alpha=0.7,
        edgecolor='black',
    )
    panel_ax.set_xlabel('Mental Health Score', fontsize=12)
    panel_ax.set_title(heading, fontsize=14, fontweight='bold')
    # Same fixed 0-100 range on both panels so bar lengths are comparable
    panel_ax.set_xlim(0, 100)

plt.tight_layout()
plt.savefig('../data/occupation_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 10. Age Group Analysis

In [None]:
# Age group analysis: average indicators and treatment-seeking rate per age bracket.
# 'age_group' is a categorical column, and pandas is changing the default of `observed`
# for categorical groupers; pin observed=False so every age bracket keeps its row
# regardless of the installed pandas version.
age_stats = df_clean.groupby('age_group', observed=False).agg({
    'anxiety_score': 'mean',
    'depression_score': 'mean',
    'overall_mental_health_score': 'mean',
    # Share of respondents in the bracket answering 'Yes', as a percentage
    'has_sought_treatment': lambda x: (x == 'Yes').mean() * 100
}).round(2)
# Column order follows the aggregation dict above
age_stats.columns = ['Avg Anxiety', 'Avg Depression', 'Avg Mental Health', 'Treatment Rate %']

print("Mental Health Statistics by Age Group:")
print("="*60)
print(age_stats)

# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Mental health by age group
age_groups = age_stats.index.tolist()
x = np.arange(len(age_groups))
width = 0.25

# Three side-by-side bars per bracket; the overall score is divided by 10 so it shares
# an axis with the anxiety and depression scores
bars1 = axes[0].bar(x - width, age_stats['Avg Anxiety'], width, label='Anxiety', color='orange')
bars2 = axes[0].bar(x, age_stats['Avg Depression'], width, label='Depression', color='indianred')
bars3 = axes[0].bar(x + width, age_stats['Avg Mental Health']/10, width,
                    label='Mental Health/10', color='purple')

axes[0].set_xlabel('Age Group', fontsize=12)
axes[0].set_ylabel('Average Score', fontsize=12)
axes[0].set_title('Mental Health Indicators by Age Group', fontsize=14, fontweight='bold')
axes[0].set_xticks(x)
axes[0].set_xticklabels(age_groups)
axes[0].legend()

# Treatment seeking by age group
axes[1].bar(age_groups, age_stats['Treatment Rate %'], color='teal', alpha=0.7, edgecolor='black')
axes[1].set_xlabel('Age Group', fontsize=12)
axes[1].set_ylabel('Treatment Rate (%)', fontsize=12)
axes[1].set_title('Treatment Seeking Rate by Age Group', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('../data/age_group_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

## 11. Key Insights and Findings

In [None]:
# Summary statistics: print the headline numbers computed from df_clean and from the
# correlation matrix built in the correlation-analysis cell
print("="*70)
print("KEY INSIGHTS FROM MENTAL HEALTH EXPLORATORY DATA ANALYSIS")
print("="*70)

print("\n1. OVERALL MENTAL HEALTH STATISTICS:")
print("-"*50)
print(f"   Average Mental Health Score: {df_clean['overall_mental_health_score'].mean():.1f}/100")
print(f"   Average Anxiety Score: {df_clean['anxiety_score'].mean():.1f}/10")
print(f"   Average Depression Score: {df_clean['depression_score'].mean():.1f}/10")
print(f"   Treatment Seeking Rate: {(df_clean['has_sought_treatment'] == 'Yes').mean()*100:.1f}%")

print("\n2. LIFESTYLE FACTORS IMPACT:")
print("-"*50)
sleep_corr = correlation_matrix.loc['sleep_hours_per_day', 'overall_mental_health_score']
activity_corr = correlation_matrix.loc['physical_activity_days_per_week', 'overall_mental_health_score']
print(f"   Sleep-Mental Health Correlation: {sleep_corr:.3f}")
print(f"   Physical Activity-Mental Health Correlation: {activity_corr:.3f}")
# Only state the positive association when both correlations actually are positive;
# a NaN or non-positive value falls through to the cautious wording.
if sleep_corr > 0 and activity_corr > 0:
    print("   Higher sleep hours and physical activity are associated with better mental health.")
else:
    print("   Sleep and physical activity are not both positively associated with mental health in this data.")

print("\n3. WORK-RELATED FINDINGS:")
print("-"*50)
high_stress_score = df_clean[df_clean['stress_level'] == 'High']['overall_mental_health_score'].mean()
low_stress_score = df_clean[df_clean['stress_level'] == 'Low']['overall_mental_health_score'].mean()
print(f"   High Stress Level Avg Mental Health: {high_stress_score:.1f}")
print(f"   Low Stress Level Avg Mental Health: {low_stress_score:.1f}")
print(f"   Stress Impact: {low_stress_score - high_stress_score:.1f} point difference")

print("\n4. SOCIAL SUPPORT IMPORTANCE:")
print("-"*50)
excellent_support = df_clean[df_clean['social_support'] == 'Excellent']['overall_mental_health_score'].mean()
poor_support = df_clean[df_clean['social_support'] == 'Poor']['overall_mental_health_score'].mean()
print(f"   Excellent Social Support Avg Score: {excellent_support:.1f}")
print(f"   Poor Social Support Avg Score: {poor_support:.1f}")
print(f"   Social Support Impact: {excellent_support - poor_support:.1f} point difference")

print("\n5. HIGH-RISK GROUPS IDENTIFIED:")
print("-"*50)
# Name the three occupations with the lowest average score as found in the data,
# instead of printing a fixed list that may not match the survey.
lowest_occupations = (
    df_clean.groupby('occupation')['overall_mental_health_score'].mean().nsmallest(3).index
)
print("   - Individuals with high stress and poor work-life balance")
print(f"   - Workers in the lowest-scoring occupations ({', '.join(map(str, lowest_occupations))})")
print("   - Those with limited social support")
print("   - People with inadequate sleep (<6 hours)")

print("\n" + "="*70)

## 12. Recommendations

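Based on our exploratory data analysis, here are key recommendations for addressing mental health challenges. Note that the analysis is correlational: the patterns behind these recommendations are associations observed in the survey data, not proven causal effects.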

### For Individuals:
1. **Prioritize Sleep**: Aim for 7-8 hours of sleep daily, as sleep duration shows a strong correlation with better mental health.
2. **Regular Physical Activity**: Engage in physical activity 4-5 days per week to reduce anxiety and depression.
3. **Build Social Networks**: Cultivate strong social support systems for emotional resilience.
4. **Seek Help Early**: Those experiencing high stress or poor mental health should consider professional support.

### For Organizations:
1. **Work-Life Balance Programs**: Implement flexible working arrangements and reasonable workloads.
2. **Mental Health Resources**: Provide access to counseling and mental health support.
3. **Stress Management Training**: Offer workshops on stress management and coping strategies.
4. **Supportive Culture**: Foster an environment where mental health discussions are normalized.

### For Policy Makers:
1. **Mental Health Awareness**: Invest in public mental health awareness campaigns.
2. **Accessible Treatment**: Ensure mental health services are affordable and accessible.
3. **Workplace Regulations**: Establish guidelines for reasonable working hours and conditions.
4. **Early Intervention Programs**: Focus on preventive measures and early detection.

In [None]:
# Final summary visualization: a 2x3 dashboard with three small panels on the top row
# and one wide bar chart of key averages on the bottom row
fig = plt.figure(figsize=(14, 8))

# Create a summary dashboard
gs = fig.add_gridspec(2, 3, hspace=0.3, wspace=0.3)

# Mental Health Score Gauge (simplified as bar): colored part is the average,
# the gray remainder fills the bar up to 100
ax1 = fig.add_subplot(gs[0, 0])
avg_score = df_clean['overall_mental_health_score'].mean()
ax1.barh(['Mental Health\nScore'], [avg_score], color='purple', alpha=0.7)
ax1.barh(['Mental Health\nScore'], [100-avg_score], left=[avg_score], color='lightgray', alpha=0.5)
ax1.set_xlim(0, 100)
ax1.set_title(f'Average Score: {avg_score:.1f}/100', fontsize=12, fontweight='bold')

# Treatment Rate: percentage answering 'Yes' versus everyone else
ax2 = fig.add_subplot(gs[0, 1])
treatment_rate = (df_clean['has_sought_treatment'] == 'Yes').mean() * 100
ax2.pie([treatment_rate, 100-treatment_rate], labels=['Sought Treatment', 'No Treatment'],
        autopct='%1.1f%%', colors=['#4ecdc4', '#ff6b6b'], startangle=90)
ax2.set_title('Treatment Seeking Rate', fontsize=12, fontweight='bold')

# Stress Distribution
ax3 = fig.add_subplot(gs[0, 2])
stress_counts = df_clean['stress_level'].value_counts()
colors = {'Low': '#1dd1a1', 'Medium': '#feca57', 'High': '#ff6b6b'}
# Use .get with a gray fallback (as the work-factors cell does) so an unexpected
# stress level does not raise KeyError.
ax3.pie(stress_counts.values, labels=stress_counts.index,
        autopct='%1.1f%%', colors=[colors.get(x, 'gray') for x in stress_counts.index], startangle=90)
ax3.set_title('Stress Level Distribution', fontsize=12, fontweight='bold')

# Key Metrics
ax4 = fig.add_subplot(gs[1, :])
metrics = ['Avg Anxiety\nScore', 'Avg Depression\nScore', 'Avg Sleep\n(hours)',
           'Avg Physical Activity\n(days/week)', 'High Stress\n(%)']
values = [
    df_clean['anxiety_score'].mean(),
    df_clean['depression_score'].mean(),
    df_clean['sleep_hours_per_day'].mean(),
    df_clean['physical_activity_days_per_week'].mean(),
    (df_clean['stress_level'] == 'High').mean() * 100
]

colors_metrics = ['orange', 'indianred', 'navy', 'green', 'red']
bars = ax4.bar(metrics, values, color=colors_metrics, alpha=0.7, edgecolor='black')

# Add value labels on bars, centered horizontally and placed slightly above each bar
for bar, val in zip(bars, values):
    ax4.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.5,
             f'{val:.1f}', ha='center', va='bottom', fontsize=10, fontweight='bold')

ax4.set_ylabel('Value', fontsize=12)
ax4.set_title('Key Mental Health Metrics Summary', fontsize=14, fontweight='bold')

plt.suptitle('Mental Health EDA - Summary Dashboard', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
plt.savefig('../data/summary_dashboard.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n" + "="*70)
print("EXPLORATORY DATA ANALYSIS COMPLETED SUCCESSFULLY!")
print("="*70)
print("\nAll visualizations have been saved to the 'data' directory.")