In [2]:
# comprehensive_analysis_visualizations.py
# Comprehensive analysis of HDI patterns, data quality, and latest suicide rates

import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import warnings
# NOTE(review): this silences every warning category for the whole process,
# including deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

# Set style
# NOTE(review): these two calls configure matplotlib/seaborn only; every
# figure built by the functions visible in this file is a Plotly figure,
# so they do not change the exported charts.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

def load_data():
    """Read the cleaned dataset from its fixed relative path.

    A one-line summary (row count and number of distinct countries) is
    printed on success.

    Returns:
        The loaded DataFrame, or ``None`` when reading the CSV or building
        the summary raises any exception (the error text is printed).
    """
    csv_path = "../../Final/final_clean_dataset.csv"
    try:
        dataset = pd.read_csv(csv_path)
        n_rows = len(dataset)
        n_countries = dataset['Country Name'].nunique()
        print(f"✅ Data loaded: {n_rows} rows, {n_countries} countries")
    except Exception as err:
        # Best-effort load: callers check for None instead of handling errors.
        print(f"❌ Error loading data: {err}")
        return None
    return dataset

def create_hdi_time_series_analysis(df):
    """Build a 2x2 Plotly figure relating HDI to suicide rates for 2019-2024.

    Panels:
        (1, 1) HDI vs. suicide rate scatter, one trace per year, each with a
               dashed quadratic OLS fit (``Suicide_rate ~ HDI + HDI^2``).
        (1, 2) Mean HDI per continent and year.
        (2, 1) Mean suicide rate per HDI development tier and year.
        (2, 2) Pearson correlation between HDI and suicide rate per year;
               years with p < 0.05 are marked with a green star.

    Only years with more than 10 rows that have both ``HDI`` and
    ``Suicide_rate`` are used for panels (1, 1) and (2, 2).

    Args:
        df: DataFrame with the columns ``Year``, ``HDI``, ``Suicide_rate``,
            ``Country Name`` and ``continent``. It is not modified.

    Returns:
        The assembled Plotly figure.
    """
    # Copy the slice so the tier column added below never touches the caller's frame.
    df_time = df[df['Year'].between(2019, 2024)].copy()

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[
            'HDI vs Suicide Rate Over Time (2019-2024)',
            'HDI Growth Patterns by Continent',
            'Suicide Rate Trends by Development Level',
            'HDI-Suicide Correlation Evolution'
        ],
        specs=[[{"secondary_y": False}, {"secondary_y": False}],
               [{"secondary_y": False}, {"secondary_y": False}]],
        vertical_spacing=0.1,
        horizontal_spacing=0.1
    )

    colors = px.colors.qualitative.Set3
    years = sorted(df_time['Year'].unique())
    min_points = 10  # a year needs MORE than this many complete rows to be used

    # Rows with both HDI and suicide rate, per year; shared by panels 1 and 4.
    complete_by_year = {
        year: df_time[df_time['Year'] == year].dropna(subset=['HDI', 'Suicide_rate'])
        for year in years
    }

    # 1. Main scatterplot with quadratic fit
    for i, year in enumerate(years):
        year_data = complete_by_year[year]
        if len(year_data) <= min_points:
            continue

        year_color = colors[i % len(colors)]
        fig.add_trace(
            go.Scatter(
                x=year_data['HDI'],
                y=year_data['Suicide_rate'],
                mode='markers',
                name=f'{year}',
                marker=dict(
                    size=8,
                    color=year_color,
                    opacity=0.7,
                    line=dict(width=1, color='black')
                ),
                text=year_data['Country Name'],
                # The trace name is exposed to hover templates as
                # %{fullData.name}; a bare %{name} is not substituted.
                hovertemplate='<b>%{text}</b><br>HDI: %{x:.3f}<br>Suicide Rate: %{y:.2f}<br>Year: %{fullData.name}<extra></extra>',
                showlegend=True
            ),
            row=1, col=1
        )

        # The fit is best-effort: a failure skips only this year's curve,
        # but it is reported rather than silently discarded.
        try:
            fit_data = year_data.assign(HDI_sq=year_data['HDI'] ** 2)
            model = smf.ols('Suicide_rate ~ HDI + HDI_sq', data=fit_data).fit()
            hdi_range = np.linspace(year_data['HDI'].min(), year_data['HDI'].max(), 100)
            predictions = model.predict(
                pd.DataFrame({'HDI': hdi_range, 'HDI_sq': hdi_range ** 2})
            )
        except Exception as exc:
            print(f"⚠️ Quadratic fit skipped for {year}: {exc}")
            continue

        fig.add_trace(
            go.Scatter(
                x=hdi_range,
                y=predictions,
                mode='lines',
                name=f'Fit {year}',
                line=dict(color=year_color, width=3, dash='dash'),
                showlegend=False,
                hoverinfo='skip'
            ),
            row=1, col=1
        )

    fig.update_xaxes(title_text="Human Development Index (HDI)", row=1, col=1)
    fig.update_yaxes(title_text="Suicide Rate (per 100,000)", row=1, col=1)

    # 2. HDI Growth by Continent
    continent_growth = df_time.groupby(['Year', 'continent'])['HDI'].mean().reset_index()

    for continent in continent_growth['continent'].unique():
        continent_data = continent_growth[continent_growth['continent'] == continent]
        fig.add_trace(
            go.Scatter(
                x=continent_data['Year'],
                y=continent_data['HDI'],
                mode='lines+markers',
                name=continent,
                line=dict(width=3),
                marker=dict(size=8),
                showlegend=False,
                hovertemplate=f'<b>{continent}</b><br>Year: %{{x}}<br>Avg HDI: %{{y:.3f}}<extra></extra>'
            ),
            row=1, col=2
        )

    fig.update_xaxes(title_text="Year", row=1, col=2)
    fig.update_yaxes(title_text="Average HDI", row=1, col=2)

    # 3. Suicide Rate Trends by Development Level
    # Left-closed bins put a value exactly on a cutoff (0.550, 0.700, 0.800)
    # into the higher tier, matching the UNDP tier definitions; the open
    # upper edge keeps HDI == 1.0 inside "Very High".
    dev_labels = ['Low', 'Medium', 'High', 'Very High']
    df_time['Development_Level'] = pd.cut(
        df_time['HDI'],
        bins=[0, 0.55, 0.70, 0.80, np.inf],
        labels=dev_labels,
        right=False
    )

    # observed=True pins the result to tiers that actually occur, independent
    # of the pandas version's default for categorical groupers.
    dev_trends = (
        df_time.groupby(['Year', 'Development_Level'], observed=True)['Suicide_rate']
        .mean()
        .reset_index()
    )

    # Iterate in tier order so trace order (and colours) are deterministic.
    for level in dev_labels:
        level_data = dev_trends[dev_trends['Development_Level'] == level]
        if level_data.empty:
            continue
        fig.add_trace(
            go.Scatter(
                x=level_data['Year'],
                y=level_data['Suicide_rate'],
                mode='lines+markers',
                name=level,
                line=dict(width=3),
                marker=dict(size=8),
                showlegend=False,
                hovertemplate=f'<b>{level} Development</b><br>Year: %{{x}}<br>Suicide Rate: %{{y:.2f}}<extra></extra>'
            ),
            row=2, col=1
        )

    fig.update_xaxes(title_text="Year", row=2, col=1)
    fig.update_yaxes(title_text="Average Suicide Rate", row=2, col=1)

    # 4. Correlation Evolution
    correlations = []
    for year in years:
        year_data = complete_by_year[year]
        if len(year_data) > min_points:
            corr, p_value = pearsonr(year_data['HDI'], year_data['Suicide_rate'])
            correlations.append({'Year': year, 'Correlation': corr, 'P_Value': p_value})

    if correlations:
        corr_df = pd.DataFrame(correlations)
        fig.add_trace(
            go.Scatter(
                x=corr_df['Year'],
                y=corr_df['Correlation'],
                mode='lines+markers+text',
                line=dict(color='red', width=4),
                marker=dict(size=12, color='red'),
                text=corr_df['Correlation'].round(3),
                textposition="top center",
                name='HDI-Suicide Correlation',
                hovertemplate='Year: %{x}<br>Correlation: %{y:.3f}<br>P-value: %{customdata:.4f}<extra></extra>',
                customdata=corr_df['P_Value']
            ),
            row=2, col=2
        )

        # Star the years whose correlation is significant at the 5% level.
        significant = corr_df[corr_df['P_Value'] < 0.05]
        for sig_year, sig_corr in zip(significant['Year'], significant['Correlation']):
            fig.add_annotation(
                x=sig_year,
                y=sig_corr,
                text="★",
                showarrow=False,
                font=dict(color="green", size=16),
                row=2, col=2
            )

    fig.update_xaxes(title_text="Year", row=2, col=2)
    fig.update_yaxes(title_text="Correlation Coefficient", row=2, col=2)

    fig.update_layout(
        height=1000,
        width=1400,
        title_text="Comprehensive HDI and Suicide Rate Analysis (2019-2024)",
        showlegend=True,
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
    )

    return fig

def create_data_quality_analysis(df):
    """Build a 2x2 Plotly figure describing the data-quality flag.

    Panels:
        (1, 1) Donut chart of row counts per quality flag.
        (1, 2) Per-continent share (%) of rows carrying each flag.
        (2, 1) Row count per flag and year.
        (2, 2) Box plots of the raw ``HDI``, ``Suicide_rate`` and
               ``GDP_per_capita`` values, grouped by quality flag.

    Each flag keeps the same colour in the first three panels.

    Args:
        df: DataFrame with the columns ``Low_data_quality_flag``,
            ``continent``, ``Year``, ``HDI``, ``Suicide_rate`` and
            ``GDP_per_capita``. It is not modified.

    Returns:
        The assembled Plotly figure.
    """
    flag_col = 'Low_data_quality_flag'

    # Colours are keyed by label rather than by position: value_counts()
    # orders flags by frequency, so positional colours would swap whenever
    # the low-quality flag is the more common one.
    flag_colors = {
        '✅ Sufficient data': '#00cc96',       # green
        '⚠️ Low data reliability': '#ff7f0e',  # orange
    }
    fallback_color = '#7f7f7f'  # grey for any flag value not listed above

    quality_summary = df[flag_col].value_counts().reset_index()
    quality_summary.columns = ['Quality_Flag', 'Count']

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[
            'Data Quality Distribution',
            'Data Quality by Continent',
            'Data Quality Over Time',
            'Impact of Data Quality on Key Metrics'
        ],
        specs=[[{"type": "pie"}, {"type": "bar"}],
               [{"type": "scatter"}, {"type": "box"}]]
    )

    # 1. Data Quality Distribution (Pie chart)
    fig.add_trace(
        go.Pie(
            labels=quality_summary['Quality_Flag'],
            values=quality_summary['Count'],
            hole=0.4,
            textinfo='percent+label',
            hoverinfo='label+value+percent',
            marker=dict(
                colors=[flag_colors.get(flag, fallback_color)
                        for flag in quality_summary['Quality_Flag']]
            ),
            name="Data Quality"
        ),
        row=1, col=1
    )

    # 2. Data Quality by Continent (row-normalised to percentages)
    continent_quality = df.groupby(['continent', flag_col]).size().unstack(fill_value=0)
    continent_quality_pct = continent_quality.div(continent_quality.sum(axis=1), axis=0) * 100

    for quality_flag in continent_quality_pct.columns:
        fig.add_trace(
            go.Bar(
                x=continent_quality_pct.index,
                y=continent_quality_pct[quality_flag],
                name=quality_flag,
                marker=dict(color=flag_colors.get(quality_flag, fallback_color)),
                text=continent_quality_pct[quality_flag].round(1),
                textposition='auto',
                hovertemplate='<b>%{x}</b><br>%{y:.1f}% %{fullData.name}<extra></extra>'
            ),
            row=1, col=2
        )

    fig.update_xaxes(title_text="Continent", row=1, col=2)
    fig.update_yaxes(title_text="Percentage (%)", row=1, col=2)

    # 3. Data Quality Over Time
    time_quality = df.groupby(['Year', flag_col]).size().unstack(fill_value=0)

    for quality_flag in time_quality.columns:
        fig.add_trace(
            go.Scatter(
                x=time_quality.index,
                y=time_quality[quality_flag],
                mode='lines+markers',
                name=quality_flag,
                line=dict(width=3, color=flag_colors.get(quality_flag, fallback_color)),
                marker=dict(size=8),
                hovertemplate='Year: %{x}<br>Count: %{y}<br>%{fullData.name}<extra></extra>'
            ),
            row=2, col=1
        )

    fig.update_xaxes(title_text="Year", row=2, col=1)
    fig.update_yaxes(title_text="Number of Countries", row=2, col=1)

    # 4. Impact on Key Metrics
    # A box plot summarises a distribution, so each trace receives the raw
    # observations; feeding it one pre-computed mean per flag would draw a
    # degenerate single-point "box".
    # NOTE(review): the three metrics share one linear axis although their
    # magnitudes differ widely; consider separate axes if this panel matters.
    for metric in ['HDI', 'Suicide_rate', 'GDP_per_capita']:
        metric_rows = df[[flag_col, metric]].dropna()
        if metric_rows.empty:
            continue
        fig.add_trace(
            go.Box(
                y=metric_rows[metric],
                x=metric_rows[flag_col],
                name=metric,
                # Showing every observation next to grouped boxes would bury them.
                boxpoints='outliers',
                hovertemplate='<b>%{x}</b><br>Metric: %{fullData.name}<br>Value: %{y:.3f}<extra></extra>'
            ),
            row=2, col=2
        )

    fig.update_xaxes(title_text="Data Quality Flag", row=2, col=2)
    fig.update_yaxes(title_text="Metric Value", row=2, col=2)

    fig.update_layout(
        height=800,
        width=1200,
        title_text="Comprehensive Data Quality Analysis",
        showlegend=True,
        # Place the per-metric boxes side by side within each flag category
        # instead of drawing them on top of each other.
        boxmode='group',
        legend=dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)
    )

    return fig

def create_latest_suicide_analysis(df):
    """Build a 2x2 Plotly figure for the latest year that has suicide data.

    Panels:
        (1, 1) Suicide-rate box plots per income group.
        (1, 2) Suicide-rate box plots per continent.
        (2, 1) HDI vs. suicide rate coloured by data-quality flag, with a
               dashed quadratic trend line when more than 10 complete rows
               exist.
        (2, 2) Horizontal bar chart of the 20 highest suicide rates, with
               the highest at the top.

    Args:
        df: DataFrame with the columns ``Year``, ``Suicide_rate``, ``HDI``,
            ``Country Name``, ``continent``, ``income_group_auto`` and
            ``Low_data_quality_flag``. It is not modified.

    Returns:
        A ``(figure, latest_year)`` tuple, where ``latest_year`` is the
        largest ``Year`` that has at least one non-null ``Suicide_rate``.

    Raises:
        ValueError: If no row has a non-null ``Suicide_rate``.
    """
    years_with_data = df.loc[df['Suicide_rate'].notna(), 'Year']
    if years_with_data.empty:
        # Fail with an explicit message instead of max()'s generic
        # "empty sequence" error.
        raise ValueError("No rows with a non-null Suicide_rate; cannot determine the latest year")
    latest_year = years_with_data.max()

    print(f"📅 Latest year with suicide rate data: {latest_year}")

    latest_data = df[df['Year'] == latest_year].copy()

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=[
            f'Suicide Rate by Income Group ({latest_year})',
            f'Suicide Rate Distribution by Continent ({latest_year})',
            f'HDI vs Suicide Rate by Data Quality ({latest_year})',
            f'Top 20 Countries by Suicide Rate ({latest_year})'
        ],
        specs=[[{"type": "box"}, {"type": "box"}],
               [{"type": "scatter"}, {"type": "bar"}]]
    )

    # Styling shared by every suicide-rate box trace in panels 1 and 2.
    box_style = dict(
        boxpoints='all',
        jitter=0.3,
        pointpos=-1.8,
        marker=dict(size=6, opacity=0.6),
        line=dict(width=2),
        hovertemplate='<b>%{x}</b><br>Suicide Rate: %{y:.2f}<extra></extra>'
    )

    # 1. Suicide Rate by Income Group (fixed low-to-high ordering)
    income_order = ['Low', 'Lower-Middle', 'Upper-Middle', 'High']

    for income_group in income_order:
        group_rates = latest_data.loc[
            latest_data['income_group_auto'] == income_group, 'Suicide_rate'
        ].dropna()
        if group_rates.empty:
            continue
        fig.add_trace(go.Box(y=group_rates, name=income_group, **box_style), row=1, col=1)

    fig.update_xaxes(title_text="Income Group", row=1, col=1)
    fig.update_yaxes(title_text="Suicide Rate (per 100,000)", row=1, col=1)

    # 2. Suicide Rate by Continent
    for continent in latest_data['continent'].dropna().unique():
        continent_rates = latest_data.loc[
            latest_data['continent'] == continent, 'Suicide_rate'
        ].dropna()
        if continent_rates.empty:
            continue
        fig.add_trace(
            go.Box(y=continent_rates, name=continent, showlegend=False, **box_style),
            row=1, col=2
        )

    fig.update_xaxes(title_text="Continent", row=1, col=2)
    fig.update_yaxes(title_text="Suicide Rate (per 100,000)", row=1, col=2)

    # 3. HDI vs Suicide Rate by Data Quality
    quality_colors = {'✅ Sufficient data': 'green', '⚠️ Low data reliability': 'red'}
    complete_rows = latest_data.dropna(subset=['HDI', 'Suicide_rate'])

    for quality_flag in complete_rows['Low_data_quality_flag'].dropna().unique():
        quality_data = complete_rows[complete_rows['Low_data_quality_flag'] == quality_flag]
        fig.add_trace(
            go.Scatter(
                x=quality_data['HDI'],
                y=quality_data['Suicide_rate'],
                mode='markers',
                name=quality_flag,
                marker=dict(
                    size=10,
                    color=quality_colors.get(quality_flag, 'gray'),
                    opacity=0.7,
                    line=dict(width=1, color='black')
                ),
                text=quality_data['Country Name'],
                hovertemplate='<b>%{text}</b><br>HDI: %{x:.3f}<br>Suicide Rate: %{y:.2f}<br>Quality: %{fullData.name}<extra></extra>',
                showlegend=False
            ),
            row=2, col=1
        )

    # Quadratic trend over all complete rows, regardless of quality flag.
    if len(complete_rows) > 10:
        trend = np.poly1d(np.polyfit(complete_rows['HDI'], complete_rows['Suicide_rate'], 2))
        hdi_range = np.linspace(complete_rows['HDI'].min(), complete_rows['HDI'].max(), 100)
        fig.add_trace(
            go.Scatter(
                x=hdi_range,
                y=trend(hdi_range),
                mode='lines',
                name='Quadratic Trend',
                line=dict(color='black', width=3, dash='dash'),
                showlegend=False,
                hoverinfo='skip'
            ),
            row=2, col=1
        )

    fig.update_xaxes(title_text="Human Development Index (HDI)", row=2, col=1)
    fig.update_yaxes(title_text="Suicide Rate (per 100,000)", row=2, col=1)

    # 4. Top 20 Countries by Suicide Rate
    top_countries = latest_data.nlargest(20, 'Suicide_rate')[
        ['Country Name', 'Suicide_rate', 'income_group_auto', 'continent']
    ]

    fig.add_trace(
        go.Bar(
            x=top_countries['Suicide_rate'],
            y=top_countries['Country Name'],
            orientation='h',
            marker=dict(
                color=top_countries['Suicide_rate'],
                colorscale='Reds',
                showscale=True,
                colorbar=dict(title="Suicide Rate")
            ),
            text=top_countries['Suicide_rate'].round(2),
            textposition='auto',
            hovertemplate='<b>%{y}</b><br>Suicide Rate: %{x:.2f}<br>Income Group: %{customdata[0]}<br>Continent: %{customdata[1]}<extra></extra>',
            # Plain 2-D array: one [income group, continent] pair per bar.
            customdata=top_countries[['income_group_auto', 'continent']].to_numpy()
        ),
        row=2, col=2
    )

    fig.update_xaxes(title_text="Suicide Rate (per 100,000)", row=2, col=2)
    # A category axis is drawn bottom-up, so reverse it to put the highest
    # rate (first row of nlargest) at the top of the ranking.
    fig.update_yaxes(title_text="Country", autorange="reversed", row=2, col=2)

    fig.update_layout(
        height=1000,
        width=1400,
        title_text=f"Comprehensive Suicide Rate Analysis - Latest Data ({latest_year})",
        showlegend=True
    )

    return fig, latest_year

def create_detailed_summary_table(df):
    """Compute summary tables by income group, continent and data quality.

    The income-group and continent tables cover only the latest year that
    has at least one non-null ``Suicide_rate``; when the dataset holds no
    suicide data at all, the latest ``Year`` overall is used instead. The
    data-quality table covers every row. All tables are rounded to three
    decimals.

    Args:
        df: DataFrame with the columns ``Year``, ``Suicide_rate``, ``HDI``,
            ``GDP_per_capita``, ``income_group_auto``, ``continent`` and
            ``Low_data_quality_flag``. It is not modified.

    Returns:
        ``(income_summary, continent_summary, quality_summary, latest_year)``.
        In ``quality_summary`` the flag emoji are replaced by the ASCII
        markers ``[GOOD]`` and ``[LOW]``.
    """
    # Summarising the newest calendar year is useless if that year has no
    # suicide rates yet (every statistic would be NaN), so anchor on the
    # newest year that actually carries suicide data.
    has_rate = df['Suicide_rate'].notna()
    if has_rate.any():
        latest_year = df.loc[has_rate, 'Year'].max()
    else:
        latest_year = df['Year'].max()
    latest_data = df[df['Year'] == latest_year]

    # Same statistics for both latest-year groupings.
    latest_year_stats = {
        'Suicide_rate': ['mean', 'median', 'std', 'count'],
        'HDI': 'mean',
        'GDP_per_capita': 'median'
    }
    income_summary = latest_data.groupby('income_group_auto').agg(latest_year_stats).round(3)
    continent_summary = latest_data.groupby('continent').agg(latest_year_stats).round(3)

    # Swap the emoji for ASCII markers; regex=False makes the literal
    # replacement explicit instead of relying on the pandas default.
    ascii_flags = (
        df['Low_data_quality_flag']
        .str.replace('✅', '[GOOD]', regex=False)
        .str.replace('⚠️', '[LOW]', regex=False)
    )

    quality_summary = (
        df.assign(Low_data_quality_flag=ascii_flags)
        .groupby('Low_data_quality_flag')
        .agg({
            'Suicide_rate': ['mean', 'std', 'count'],
            'HDI': ['mean', 'std'],
            'GDP_per_capita': ['mean', 'std']
        })
        .round(3)
    )

    return income_summary, continent_summary, quality_summary, latest_year

def _export_figure(fig, stem):
    """Write *fig* to ``<stem>.png`` (2x scale) and ``<stem>.html`` in the working directory."""
    fig.write_image(f"{stem}.png", scale=2)
    fig.write_html(f"{stem}.html")


def main():
    """Run the whole analysis and write every output file.

    Loads the dataset, exports the three figures as PNG and HTML, saves the
    three summary tables as CSV and writes a plain-text report. All files
    go to the current working directory. Returns early, writing nothing,
    when the dataset cannot be loaded.
    """
    print("🚀 Starting comprehensive analysis...")

    df = load_data()
    if df is None:
        return

    print(f"📊 Dataset covers years: {df['Year'].min()} to {df['Year'].max()}")
    print(f"🌍 Countries: {df['Country Name'].nunique()}")
    print(f"📅 Total observations: {len(df)}")

    print("\n1. Creating HDI Time Series Analysis...")
    _export_figure(create_hdi_time_series_analysis(df), "hdi_time_series_analysis")

    print("2. Creating Data Quality Analysis...")
    _export_figure(create_data_quality_analysis(df), "data_quality_analysis")

    print("3. Creating Latest Suicide Rate Analysis...")
    fig_suicide, latest_year = create_latest_suicide_analysis(df)
    _export_figure(fig_suicide, "latest_suicide_analysis")

    print("4. Generating Summary Tables...")
    # The fourth return value is deliberately discarded: latest_year must
    # stay the "latest year with suicide data" from step 3, because that is
    # how the report and the final message label it.
    income_summary, continent_summary, quality_summary, _ = create_detailed_summary_table(df)

    income_summary.to_csv("income_group_summary.csv")
    continent_summary.to_csv("continent_summary.csv")
    quality_summary.to_csv("data_quality_summary.csv")

    # ASCII stand-ins for the emoji flags used in the text report.
    quality_mapping = {
        '✅ Sufficient data': '[GOOD] Sufficient data',
        '⚠️ Low data reliability': '[LOW] Low data reliability'
    }
    latest_data = df[df['Year'] == latest_year]

    # Explicit UTF-8 so the report does not depend on the platform's
    # default encoding.
    with open("analysis_report.txt", "w", encoding='utf-8') as f:
        f.write("COMPREHENSIVE ANALYSIS REPORT\n")
        f.write("=" * 50 + "\n\n")

        f.write("DATASET OVERVIEW:\n")
        f.write(f"- Time period: {df['Year'].min()} - {df['Year'].max()}\n")
        f.write(f"- Countries: {df['Country Name'].nunique()}\n")
        f.write(f"- Latest year with suicide data: {latest_year}\n")
        f.write(f"- Total observations: {len(df)}\n\n")

        f.write("DATA QUALITY SUMMARY:\n")
        for flag, count in df['Low_data_quality_flag'].value_counts().items():
            clean_flag = quality_mapping.get(flag, flag)
            f.write(f"- {clean_flag}: {count} observations ({count/len(df)*100:.1f}%)\n")
        f.write("\n")

        f.write("LATEST YEAR KEY STATISTICS:\n")
        f.write(f"- Global average suicide rate: {latest_data['Suicide_rate'].mean():.2f}\n")
        f.write(f"- Global average HDI: {latest_data['HDI'].mean():.3f}\n")
        f.write(f"- Global median GDP per capita: ${latest_data['GDP_per_capita'].median():,.0f}\n\n")

        f.write("INCOME GROUP ANALYSIS:\n")
        for income_group in ['Low', 'Lower-Middle', 'Upper-Middle', 'High']:
            group_data = latest_data[latest_data['income_group_auto'] == income_group]
            if len(group_data) > 0:
                f.write(f"- {income_group}: Suicide rate = {group_data['Suicide_rate'].mean():.2f}, "
                        f"HDI = {group_data['HDI'].mean():.3f}, "
                        f"Countries = {len(group_data)}\n")

    print("\n📁 Files created:")
    print("   - hdi_time_series_analysis.png/.html")
    print("   - data_quality_analysis.png/.html")
    print("   - latest_suicide_analysis.png/.html")
    print("   - income_group_summary.csv")
    print("   - continent_summary.csv")
    print("   - data_quality_summary.csv")
    print("   - analysis_report.txt")

    print(f"\n🎉 Analysis completed! Latest suicide data year: {latest_year}")

# Run the full analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

🚀 Starting comprehensive analysis...
✅ Data loaded: 985 rows, 205 countries
📊 Dataset covers years: 2019 to 2023
🌍 Countries: 205
📅 Total observations: 985

1. Creating HDI Time Series Analysis...
2. Creating Data Quality Analysis...
3. Creating Latest Suicide Rate Analysis...
📅 Latest year with suicide rate data: 2023
4. Generating Summary Tables...

📁 Files created:
   - hdi_time_series_analysis.png/.html
   - data_quality_analysis.png/.html
   - latest_suicide_analysis.png/.html
   - income_group_summary.csv
   - continent_summary.csv
   - data_quality_summary.csv
   - analysis_report.txt

🎉 Analysis completed! Latest suicide data year: 2023
