In [1]:
# comprehensive_analysis.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Statistical analysis
from scipy import stats
from scipy.stats import pearsonr, spearmanr
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("Libraries imported successfully!")

# =============================================================================
# DATA LOADING AND MERGING
# =============================================================================

def load_and_merge_data():
    """Load all datasets and merge them into a comprehensive panel dataset"""
    
    # Load the main dataset
    main_df = pd.read_csv('../Output/merged_clean_panel.csv')
    print(f"Main dataset loaded: {main_df.shape}")
    
    # Load HDI time series data
    hdi_ts = pd.read_csv('../Time-Series-Dataset/hdi-time-series-data.csv')
    print(f"HDI time series data loaded: {hdi_ts.shape}")
    
    # Load suicide rate time series data
    suicide_2019 = pd.read_csv('../Time-Series-Dataset/suicide-rate-by-country-2019.csv')
    suicide_2020 = pd.read_csv('../Time-Series-Dataset/suicide-rate-by-country-2020.csv')
    suicide_2021 = pd.read_csv('../Time-Series-Dataset/suicide-rate-by-country-2021.csv')
    print(f"Suicide rate data loaded: 2019({suicide_2019.shape}), 2020({suicide_2020.shape}), 2021({suicide_2021.shape})")
    
    # =============================================================================
    # DATA CLEANING AND PREPROCESSING
    # =============================================================================
    
    # Clean HDI time series data
    hdi_ts_clean = hdi_ts.rename(columns={
        'flagCode': 'ISO3',
        'country': 'Country Name',
        'HumanDevelopmentIndex_2023': 'HDI_2023',
        'HumanDevelopmentIndex_2022': 'HDI_2022',
        'HumanDevelopmentIndex_2021': 'HDI_2021',
        'HumanDevelopmentIndex_2020': 'HDI_2020',
        'HumanDevelopmentIndex_2019': 'HDI_2019',
        'HumanDevelopmentIndex_2010': 'HDI_2010'
    })
    
    # Melt HDI data to long format
    hdi_long = pd.melt(
        hdi_ts_clean, 
        id_vars=['ISO3', 'Country Name', 'HumanDevelopmentIndex_HDITierCurrent_txt_YearFree'],
        value_vars=['HDI_2010', 'HDI_2019', 'HDI_2020', 'HDI_2021', 'HDI_2022', 'HDI_2023'],
        var_name='Year',
        value_name='HDI'
    )
    
    # Extract year from column name
    hdi_long['Year'] = hdi_long['Year'].str.extract('(\d+)').astype(int)
    hdi_long = hdi_long.rename(columns={'HumanDevelopmentIndex_HDITierCurrent_txt_YearFree': 'HDI_Tier'})
    
    # Clean suicide rate data and merge into long format
    suicide_dfs = []
    
    # 2019 data
    suicide_2019_clean = suicide_2019.rename(columns={
        'flagCode': 'ISO3',
        'country': 'Country Name',
        'SuicideRateCountries_2019': 'Suicide_Rate',
        'SuicideRateMaleCountries_2019': 'Suicide_Rate_Male',
        'SuicideRateFemaleCountries_2019': 'Suicide_Rate_Female'
    })
    suicide_2019_clean['Year'] = 2019
    suicide_dfs.append(suicide_2019_clean[['ISO3', 'Country Name', 'Year', 'Suicide_Rate', 'Suicide_Rate_Male', 'Suicide_Rate_Female']])
    
    # 2020 data
    suicide_2020_clean = suicide_2020.rename(columns={
        'flagCode': 'ISO3',
        'country': 'Country Name',
        'SuicideRateCountries_2020': 'Suicide_Rate',
        'SuicideRateMaleCountries_2020': 'Suicide_Rate_Male',
        'SuicideRateFemaleCountries_2020': 'Suicide_Rate_Female'
    })
    suicide_2020_clean['Year'] = 2020
    suicide_dfs.append(suicide_2020_clean[['ISO3', 'Country Name', 'Year', 'Suicide_Rate', 'Suicide_Rate_Male', 'Suicide_Rate_Female']])
    
    # 2021 data
    suicide_2021_clean = suicide_2021.rename(columns={
        'flagCode': 'ISO3',
        'country': 'Country Name',
        'SuicideRateCountries_2021': 'Suicide_Rate',
        'SuicideRateMaleCountries_2021': 'Suicide_Rate_Male',
        'SuicideRateFemaleCountries_2021': 'Suicide_Rate_Female'
    })
    suicide_2021_clean['Year'] = 2021
    suicide_dfs.append(suicide_2021_clean[['ISO3', 'Country Name', 'Year', 'Suicide_Rate', 'Suicide_Rate_Male', 'Suicide_Rate_Female']])
    
    # Combine all suicide data
    suicide_long = pd.concat(suicide_dfs, ignore_index=True)
    
    # =============================================================================
    # MERGE ALL DATA
    # =============================================================================
    
    # Merge HDI and suicide data
    merged_ts = pd.merge(hdi_long, suicide_long, on=['ISO3', 'Country Name', 'Year'], how='outer')
    
    # Add GDP data from main dataset (using 2023 as base and creating estimates for other years)
    gdp_data = main_df[['ISO3', 'GDP_per_capita', 'income_group_auto', 'continent']].copy()
    
    # Create GDP estimates for different years (simplified approach)
    gdp_estimates = []
    for year in [2019, 2020, 2021, 2022, 2023]:
        temp_gdp = gdp_data.copy()
        temp_gdp['Year'] = year
        # Simple adjustment: assume 2% growth per year backwards from 2023
        growth_factor = 1.02 ** (2023 - year)
        temp_gdp['GDP_per_capita'] = temp_gdp['GDP_per_capita'] / growth_factor
        gdp_estimates.append(temp_gdp)
    
    gdp_panel = pd.concat(gdp_estimates, ignore_index=True)
    
    # Final merge
    final_panel = pd.merge(merged_ts, gdp_panel, on=['ISO3', 'Year'], how='left')
    
    # Fill missing continent and income group from main dataset
    country_metadata = main_df[['ISO3', 'continent', 'income_group_auto']].drop_duplicates()
    final_panel = pd.merge(final_panel, country_metadata, on='ISO3', how='left', suffixes=('', '_main'))
    
    # Use the main dataset values where available
    final_panel['continent'] = final_panel['continent_main'].combine_first(final_panel['continent'])
    final_panel['income_group_auto'] = final_panel['income_group_auto_main'].combine_first(final_panel['income_group_auto'])
    final_panel = final_panel.drop(['continent_main', 'income_group_auto_main'], axis=1)
    
    # =============================================================================
    # FEATURE ENGINEERING
    # =============================================================================
    
    # Create development stages based on HDI
    def categorize_hdi_tier(hdi):
        if hdi >= 0.8:
            return 'Very High'
        elif hdi >= 0.7:
            return 'High'
        elif hdi >= 0.55:
            return 'Medium'
        else:
            return 'Low'
    
    final_panel['HDI_Tier_Calculated'] = final_panel['HDI'].apply(categorize_hdi_tier)
    
    # Create HDI squared term for quadratic relationships
    final_panel['HDI_sq'] = final_panel['HDI'] ** 2
    
    # Log transform GDP
    final_panel['log_GDP_per_capita'] = np.log(final_panel['GDP_per_capita'].where(final_panel['GDP_per_capita'] > 0, np.nan))
    
    # Create gender disparity ratio
    final_panel['Gender_Disparity_Ratio'] = final_panel['Suicide_Rate_Male'] / final_panel['Suicide_Rate_Female']
    
    # Create HDI change metrics
    final_panel = final_panel.sort_values(['ISO3', 'Year'])
    final_panel['HDI_change_1yr'] = final_panel.groupby('ISO3')['HDI'].diff()
    final_panel['HDI_change_pct_1yr'] = final_panel.groupby('ISO3')['HDI'].pct_change() * 100
    
    # Suicide rate change metrics
    final_panel['Suicide_Rate_change_1yr'] = final_panel.groupby('ISO3')['Suicide_Rate'].diff()
    final_panel['Suicide_Rate_change_pct_1yr'] = final_panel.groupby('ISO3')['Suicide_Rate'].pct_change() * 100
    
    # Create development paradox indicator (High HDI but high suicide rate)
    final_panel['Development_Paradox'] = ((final_panel['HDI'] > 0.7) & (final_panel['Suicide_Rate'] > final_panel['Suicide_Rate'].median())).astype(int)
    
    # Economic stress indicator (High HDI but low GDP growth potential)
    final_panel['HDI_GDP_Ratio'] = final_panel['HDI'] / final_panel['GDP_per_capita'] * 1000
    
    print(f"Final panel dataset created: {final_panel.shape}")
    return final_panel, main_df

# Load and merge data
panel_df, original_df = load_and_merge_data()

# =============================================================================
# EXPLORATORY DATA ANALYSIS
# =============================================================================

def perform_eda(panel_df):
    """Perform comprehensive exploratory data analysis"""
    
    print("\n" + "="*50)
    print("EXPLORATORY DATA ANALYSIS")
    print("="*50)
    
    # Basic information
    print("\n1. DATASET OVERVIEW:")
    print(f"   - Total observations: {len(panel_df)}")
    print(f"   - Total countries: {panel_df['ISO3'].nunique()}")
    print(f"   - Years covered: {sorted(panel_df['Year'].unique())}")
    print(f"   - Columns: {list(panel_df.columns)}")
    
    # Missing values analysis
    print("\n2. MISSING VALUES ANALYSIS:")
    missing_data = panel_df.isnull().sum()
    missing_pct = (missing_data / len(panel_df)) * 100
    missing_df = pd.DataFrame({
        'Missing Count': missing_data,
        'Missing %': missing_pct
    }).sort_values('Missing Count', ascending=False)
    print(missing_df[missing_df['Missing Count'] > 0])
    
    # Descriptive statistics
    print("\n3. DESCRIPTIVE STATISTICS:")
    numeric_cols = ['HDI', 'Suicide_Rate', 'GDP_per_capita', 'Suicide_Rate_Male', 'Suicide_Rate_Female']
    desc_stats = panel_df[numeric_cols].describe()
    print(desc_stats)
    
    return desc_stats

desc_stats = perform_eda(panel_df)

# =============================================================================
# DATA VISUALIZATION
# =============================================================================

def create_comprehensive_visualizations(panel_df):
    """Create comprehensive visualizations for the analysis"""
    
    print("\n" + "="*50)
    print("CREATING VISUALIZATIONS")
    print("="*50)
    
    # 1. HDI Distribution Over Time
    fig1 = px.box(panel_df, x='Year', y='HDI', 
                  title='HDI Distribution Evolution (2019-2023)',
                  color='Year')
    fig1.show()
    
    # 2. Suicide Rate Distribution Over Time
    fig2 = px.box(panel_df, x='Year', y='Suicide_Rate',
                  title='Suicide Rate Distribution Evolution (2019-2021)',
                  color='Year')
    fig2.show()
    
    # 3. HDI vs Suicide Rate Scatter Plot (with time animation)
    fig3 = px.scatter(panel_df.dropna(subset=['HDI', 'Suicide_Rate']), 
                      x='HDI', y='Suicide_Rate',
                      color='continent',
                      size='GDP_per_capita',
                      hover_name='Country Name',
                      animation_frame='Year',
                      title='The Price of Progress: HDI vs Suicide Rate Over Time',
                      labels={'HDI': 'Human Development Index', 
                             'Suicide_Rate': 'Suicide Rate (per 100k)'},
                      size_max=20)
    fig3.show()
    
    # 4. Development Paradox Countries
    paradox_countries = panel_df[panel_df['Development_Paradox'] == 1]
    if len(paradox_countries) > 0:
        fig4 = px.scatter(paradox_countries, 
                          x='HDI', y='Suicide_Rate',
                          color='continent',
                          hover_name='Country Name',
                          size='GDP_per_capita',
                          title='Development Paradox: High HDI but High Suicide Rate Countries',
                          labels={'HDI': 'Human Development Index', 
                                 'Suicide_Rate': 'Suicide Rate (per 100k)'})
        fig4.show()
    
    # 5. Gender Disparity in Suicide Rates
    gender_data = panel_df[['Suicide_Rate_Male', 'Suicide_Rate_Female']].dropna()
    if len(gender_data) > 0:
        fig5 = go.Figure()
        fig5.add_trace(go.Box(y=gender_data['Suicide_Rate_Male'], name='Male Suicide Rate'))
        fig5.add_trace(go.Box(y=gender_data['Suicide_Rate_Female'], name='Female Suicide Rate'))
        fig5.update_layout(title='Gender Disparity in Suicide Rates',
                          yaxis_title='Suicide Rate (per 100k)')
        fig5.show()
    
    # 6. HDI Tier Analysis
    fig6 = px.box(panel_df, x='HDI_Tier_Calculated', y='Suicide_Rate',
                  color='HDI_Tier_Calculated',
                  title='Suicide Rates Across Different Development Tiers',
                  labels={'HDI_Tier_Calculated': 'Development Tier',
                         'Suicide_Rate': 'Suicide Rate (per 100k)'})
    fig6.show()
    
    # 7. Correlation Heatmap
    corr_matrix = panel_df[['HDI', 'Suicide_Rate', 'GDP_per_capita', 
                           'Suicide_Rate_Male', 'Suicide_Rate_Female']].corr()
    fig7 = px.imshow(corr_matrix, 
                     text_auto=True, 
                     color_continuous_scale='RdBu_r',
                     title='Correlation Matrix: Development vs Suicide Indicators',
                     aspect='auto')
    fig7.show()
    
    # 8. Time Series of Average HDI and Suicide Rates
    yearly_avg = panel_df.groupby('Year').agg({
        'HDI': 'mean',
        'Suicide_Rate': 'mean'
    }).reset_index()
    
    fig8 = make_subplots(specs=[[{"secondary_y": True}]])
    fig8.add_trace(
        go.Scatter(x=yearly_avg['Year'], y=yearly_avg['HDI'], name="Average HDI"),
        secondary_y=False,
    )
    fig8.add_trace(
        go.Scatter(x=yearly_avg['Year'], y=yearly_avg['Suicide_Rate'], name="Average Suicide Rate"),
        secondary_y=True,
    )
    fig8.update_layout(title_text="Global Trends: Average HDI vs Suicide Rate Over Time")
    fig8.update_xaxes(title_text="Year")
    fig8.update_yaxes(title_text="Average HDI", secondary_y=False)
    fig8.update_yaxes(title_text="Average Suicide Rate (per 100k)", secondary_y=True)
    fig8.show()
    
    # 9. Geographic Distribution (if ISO3 codes available)
    latest_data = panel_df[panel_df['Year'] == 2023].dropna(subset=['HDI', 'Suicide_Rate'])
    if len(latest_data) > 0:
        fig9 = px.choropleth(latest_data, 
                            locations="ISO3",
                            color="Suicide_Rate",
                            hover_name="Country Name",
                            hover_data=['HDI', 'GDP_per_capita'],
                            color_continuous_scale="Reds",
                            title="Global Suicide Rate Distribution (2023)")
        fig9.show()
    
    print("All visualizations created successfully!")

create_comprehensive_visualizations(panel_df)

# =============================================================================
# STATISTICAL ANALYSIS AND MODELING
# =============================================================================

def perform_statistical_analysis(panel_df):
    """Perform comprehensive statistical analysis and modeling"""
    
    print("\n" + "="*50)
    print("STATISTICAL ANALYSIS AND MODELING")
    print("="*50)
    
    # Use the most recent complete data for cross-sectional analysis
    recent_data = panel_df[panel_df['Year'] == 2023].dropna(subset=['HDI', 'Suicide_Rate', 'GDP_per_capita'])
    
    print(f"Sample size for statistical analysis: {len(recent_data)} countries")
    
    # 1. Correlation Analysis
    print("\n1. CORRELATION ANALYSIS:")
    correlations = {}
    for var in ['HDI', 'GDP_per_capita', 'log_GDP_per_capita']:
        if var in recent_data.columns:
            corr, p_value = pearsonr(recent_data[var].dropna(), recent_data['Suicide_Rate'].dropna())
            correlations[var] = (corr, p_value)
            print(f"   {var} vs Suicide_Rate: r = {corr:.3f}, p = {p_value:.4f}")
    
    # 2. Quadratic Relationship Analysis (The Core Hypothesis)
    print("\n2. QUADRATIC RELATIONSHIP ANALYSIS:")
    
    # Prepare data for modeling
    model_data = recent_data[['HDI', 'HDI_sq', 'Suicide_Rate', 'log_GDP_per_capita', 'continent']].dropna()
    
    if len(model_data) > 10:
        # Model 1: Simple quadratic
        X1 = sm.add_constant(model_data[['HDI', 'HDI_sq']])
        y = model_data['Suicide_Rate']
        model1 = sm.OLS(y, X1).fit()
        
        # Model 2: Quadratic with GDP control
        X2 = sm.add_constant(model_data[['HDI', 'HDI_sq', 'log_GDP_per_capita']])
        model2 = sm.OLS(y, X2).fit()
        
        # Model 3: With continent fixed effects
        X3 = sm.add_constant(model_data[['HDI', 'HDI_sq', 'log_GDP_per_capita']])
        model3 = sm.OLS(y, X3).fit()
        
        print("Model 1 (Quadratic HDI):")
        print(f"   R-squared: {model1.rsquared:.3f}")
        print(f"   HDI coefficient: {model1.params['HDI']:.3f}")
        print(f"   HDI² coefficient: {model1.params['HDI_sq']:.3f}")
        
        print("\nModel 2 (Quadratic HDI + GDP control):")
        print(f"   R-squared: {model2.rsquared:.3f}")
        print(f"   HDI coefficient: {model2.params['HDI']:.3f}")
        print(f"   HDI² coefficient: {model2.params['HDI_sq']:.3f}")
        
        # Calculate tipping point
        if model2.params['HDI_sq'] != 0:
            tipping_point = -model2.params['HDI'] / (2 * model2.params['HDI_sq'])
            print(f"\n   ESTIMATED TIPPING POINT: HDI = {tipping_point:.3f}")
            
            # Interpret tipping point
            if 0.3 <= tipping_point <= 1.0:
                print(f"   Interpretation: Below HDI {tipping_point:.3f}, development correlates with increasing suicide rates.")
                print(f"   Above HDI {tipping_point:.3f}, further development correlates with decreasing suicide rates.")
        
        # 3. Development Tier Analysis
        print("\n3. DEVELOPMENT TIER ANALYSIS:")
        tier_stats = recent_data.groupby('HDI_Tier_Calculated').agg({
            'Suicide_Rate': ['mean', 'median', 'std', 'count'],
            'HDI': 'mean',
            'GDP_per_capita': 'median'
        }).round(3)
        print(tier_stats)
        
        # 4. Random Forest Feature Importance
        print("\n4. MACHINE LEARNING ANALYSIS:")
        # Prepare data for Random Forest
        rf_data = recent_data[['HDI', 'GDP_per_capita', 'Suicide_Rate_Male', 'Suicide_Rate_Female', 'continent']].dropna()
        rf_data = pd.get_dummies(rf_data, columns=['continent'], drop_first=True)
        
        if len(rf_data) > 10:
            X_rf = rf_data.drop('Suicide_Rate_Male', axis=1)  # Using male suicide rate as target
            y_rf = rf_data['Suicide_Rate_Male']
            
            X_train, X_test, y_train, y_test = train_test_split(X_rf, y_rf, test_size=0.3, random_state=42)
            
            rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
            rf_model.fit(X_train, y_train)
            
            y_pred = rf_model.predict(X_test)
            rf_r2 = r2_score(y_test, y_pred)
            rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
            
            print(f"   Random Forest R²: {rf_r2:.3f}")
            print(f"   Random Forest RMSE: {rf_rmse:.3f}")
            
            # Feature importance
            feature_importance = pd.DataFrame({
                'feature': X_rf.columns,
                'importance': rf_model.feature_importances_
            }).sort_values('importance', ascending=False)
            
            print("\n   Feature Importance for Predicting Male Suicide Rates:")
            for _, row in feature_importance.iterrows():
                print(f"     {row['feature']}: {row['importance']:.3f}")
        
        return model2, tipping_point if 'tipping_point' in locals() else None
    
    else:
        print("Insufficient data for statistical modeling")
        return None, None

# Perform statistical analysis
best_model, tipping_point = perform_statistical_analysis(panel_df)

# =============================================================================
# DEEP DIVE: THE DEVELOPMENT PARADOX
# =============================================================================

def analyze_development_paradox(panel_df):
    """Deep dive analysis of countries experiencing the development paradox"""
    
    print("\n" + "="*50)
    print("DEEP DIVE: THE DEVELOPMENT PARADOX")
    print("="*50)
    
    recent_data = panel_df[panel_df['Year'] == 2023].dropna(subset=['HDI', 'Suicide_Rate', 'GDP_per_capita'])
    
    # Identify paradox countries (High HDI but High Suicide Rate)
    hdi_threshold = recent_data['HDI'].quantile(0.7)  # Top 30% HDI
    suicide_threshold = recent_data['Suicide_Rate'].quantile(0.7)  # Top 30% suicide rate
    
    paradox_countries = recent_data[
        (recent_data['HDI'] >= hdi_threshold) & 
        (recent_data['Suicide_Rate'] >= suicide_threshold)
    ]
    
    non_paradox_high_hdi = recent_data[
        (recent_data['HDI'] >= hdi_threshold) & 
        (recent_data['Suicide_Rate'] < suicide_threshold)
    ]
    
    print(f"Development Paradox Countries (High HDI, High Suicide Rate): {len(paradox_countries)}")
    print(f"Non-Paradox High HDI Countries (High HDI, Low Suicide Rate): {len(non_paradox_high_hdi)}")
    
    if len(paradox_countries) > 0:
        print("\nDevelopment Paradox Countries:")
        for _, country in paradox_countries.nlargest(10, 'Suicide_Rate').iterrows():
            print(f"  - {country['Country Name']}: HDI={country['HDI']:.3f}, Suicide Rate={country['Suicide_Rate']:.1f}")
        
        # Compare paradox vs non-paradox countries
        print("\nComparison: Paradox vs Non-Paradox High HDI Countries")
        comparison = pd.DataFrame({
            'Metric': ['HDI', 'Suicide Rate', 'GDP per capita', 'Gender Disparity Ratio'],
            'Paradox Countries': [
                paradox_countries['HDI'].mean(),
                paradox_countries['Suicide_Rate'].mean(),
                paradox_countries['GDP_per_capita'].median(),
                (paradox_countries['Suicide_Rate_Male'] / paradox_countries['Suicide_Rate_Female']).mean()
            ],
            'Non-Paradox Countries': [
                non_paradox_high_hdi['HDI'].mean(),
                non_paradox_high_hdi['Suicide_Rate'].mean(),
                non_paradox_high_hdi['GDP_per_capita'].median(),
                (non_paradox_high_hdi['Suicide_Rate_Male'] / non_paradox_high_hdi['Suicide_Rate_Female']).mean()
            ]
        })
        print(comparison.round(3))
        
        # Statistical test for differences
        t_stat, p_value = stats.ttest_ind(
            paradox_countries['HDI'].dropna(), 
            non_paradox_high_hdi['HDI'].dropna()
        )
        print(f"\nT-test for HDI difference: t={t_stat:.3f}, p={p_value:.4f}")

analyze_development_paradox(panel_df)

# =============================================================================
# TIME SERIES ANALYSIS
# =============================================================================

def analyze_temporal_trends(panel_df):
    """Analyze how the relationship between HDI and suicide rates changes over time"""
    
    print("\n" + "="*50)
    print("TIME SERIES ANALYSIS")
    print("="*50)
    
    # Calculate yearly correlations
    yearly_correlations = []
    for year in sorted(panel_df['Year'].unique()):
        year_data = panel_df[panel_df['Year'] == year].dropna(subset=['HDI', 'Suicide_Rate'])
        if len(year_data) > 10:
            corr, p_value = pearsonr(year_data['HDI'], year_data['Suicide_Rate'])
            yearly_correlations.append({
                'Year': year,
                'Correlation': corr,
                'P_Value': p_value,
                'N_Countries': len(year_data)
            })
    
    correlation_df = pd.DataFrame(yearly_correlations)
    print("Yearly HDI-Suicide Rate Correlations:")
    print(correlation_df.round(4))
    
    # Plot correlation trend
    if len(correlation_df) > 1:
        fig = px.line(correlation_df, x='Year', y='Correlation',
                     title='Evolution of HDI-Suicide Rate Correlation Over Time',
                     markers=True)
        fig.add_hline(y=0, line_dash="dash", line_color="red")
        fig.show()
    
    # Analyze countries with significant changes
    country_changes = []
    for country in panel_df['ISO3'].unique():
        country_data = panel_df[panel_df['ISO3'] == country].dropna(subset=['HDI', 'Suicide_Rate'])
        if len(country_data) >= 2:
            hdi_change = country_data['HDI'].iloc[-1] - country_data['HDI'].iloc[0]
            suicide_change = country_data['Suicide_Rate'].iloc[-1] - country_data['Suicide_Rate'].iloc[0]
            country_changes.append({
                'ISO3': country,
                'Country_Name': country_data['Country Name'].iloc[0],
                'HDI_Change': hdi_change,
                'Suicide_Change': suicide_change,
                'Initial_HDI': country_data['HDI'].iloc[0],
                'Final_HDI': country_data['HDI'].iloc[-1]
            })
    
    changes_df = pd.DataFrame(country_changes)
    
    # Identify interesting patterns
    print("\nCountries with Notable Changes:")
    
    # Countries where HDI increased but suicide rates decreased (Positive development)
    positive_dev = changes_df[
        (changes_df['HDI_Change'] > 0.02) & 
        (changes_df['Suicide_Change'] < -1)
    ]
    if len(positive_dev) > 0:
        print("\nPositive Development (HDI↑, Suicide Rate↓):")
        for _, country in positive_dev.nlargest(5, 'HDI_Change').iterrows():
            print(f"  - {country['Country_Name']}: HDI Δ={country['HDI_Change']:.3f}, Suicide Δ={country['Suicide_Change']:.1f}")
    
    # Countries where HDI increased but suicide rates increased (The "Price" of progress)
    price_of_progress = changes_df[
        (changes_df['HDI_Change'] > 0.02) & 
        (changes_df['Suicide_Change'] > 1)
    ]
    if len(price_of_progress) > 0:
        print("\n'The Price of Progress' (HDI↑, Suicide Rate↑):")
        for _, country in price_of_progress.nlargest(5, 'Suicide_Change').iterrows():
            print(f"  - {country['Country_Name']}: HDI Δ={country['HDI_Change']:.3f}, Suicide Δ={country['Suicide_Change']:.1f}")

analyze_temporal_trends(panel_df)

# =============================================================================
# FINAL INSIGHTS AND CONCLUSIONS
# =============================================================================

def generate_final_insights(panel_df, best_model, tipping_point):
    """Generate final insights and conclusions"""
    
    print("\n" + "="*50)
    print("FINAL INSIGHTS AND CONCLUSIONS")
    print("="*50)
    
    recent_data = panel_df[panel_df['Year'] == 2023].dropna(subset=['HDI', 'Suicide_Rate', 'GDP_per_capita'])
    
    print("\n🔍 KEY FINDINGS:")
    
    # 1. Overall Relationship
    overall_corr, overall_p = pearsonr(recent_data['HDI'], recent_data['Suicide_Rate'])
    print(f"1. Overall HDI-Suicide Rate Correlation: r = {overall_corr:.3f} (p = {overall_p:.4f})")
    
    if overall_corr > 0:
        print("   → Generally, higher development associates with higher suicide rates")
    else:
        print("   → Generally, higher development associates with lower suicide rates")
    
    # 2. Quadratic Relationship Insight
    if best_model is not None and tipping_point is not None:
        print(f"2. Quadratic Relationship Confirmed:")
        print(f"   → Tipping point at HDI = {tipping_point:.3f}")
        print(f"   → Below this level: Development may increase suicide risk")
        print(f"   → Above this level: Further development may reduce suicide risk")
    
    # 3. Development Paradox
    high_hdi_count = len(recent_data[recent_data['HDI'] > 0.8])
    high_suicide_high_hdi = len(recent_data[(recent_data['HDI'] > 0.8) & (recent_data['Suicide_Rate'] > 10)])
    paradox_pct = (high_suicide_high_hdi / high_hdi_count) * 100
    
    print(f"3. Development Paradox Prevalence:")
    print(f"   → {paradox_pct:.1f}% of very high HDI countries have suicide rates > 10 per 100k")
    print(f"   → Suggests non-linear costs of development")
    
    # 4. Gender Dimensions
    male_median = recent_data['Suicide_Rate_Male'].median()
    female_median = recent_data['Suicide_Rate_Female'].median()
    gender_ratio = male_median / female_median
    
    print(f"4. Gender Dimension:")
    print(f"   → Male suicide rate median: {male_median:.1f} per 100k")
    print(f"   → Female suicide rate median: {female_median:.1f} per 100k")
    print(f"   → Gender ratio: {gender_ratio:.1f}x higher in males")
    
    # 5. Regional Patterns
    if 'continent' in recent_data.columns:
        regional_patterns = recent_data.groupby('continent').agg({
            'HDI': 'mean',
            'Suicide_Rate': 'mean'
        }).round(3)
        print(f"5. Regional Patterns:")
        for continent, row in regional_patterns.iterrows():
            print(f"   → {continent}: HDI={row['HDI']}, Suicide Rate={row['Suicide_Rate']:.1f}")
    
    print("\n💡 POLICY IMPLICATIONS:")
    print("   • Mental health infrastructure should keep pace with economic development")
    print("   • Targeted interventions needed during rapid development phases")
    print("   • Gender-specific mental health strategies are crucial")
    print("   • Development policies should consider psychological well-being alongside economic indicators")
    
    print("\n📈 RESEARCH RECOMMENDATIONS:")
    print("   • Investigate mediating factors between development and mental health")
    print("   • Study successful countries that achieved development without mental health costs")
    print("   • Examine cultural and social factors that mitigate development paradox")

generate_final_insights(panel_df, best_model, tipping_point)

# =============================================================================
# EXPORT FINAL DATASET
# =============================================================================

def export_final_dataset(panel_df):
    """Export the final cleaned and enriched dataset"""
    
    # Create a comprehensive summary dataset
    summary_columns = [
        'ISO3', 'Country Name', 'Year', 'continent', 'income_group_auto',
        'HDI', 'HDI_Tier_Calculated', 'Suicide_Rate', 
        'Suicide_Rate_Male', 'Suicide_Rate_Female', 'Gender_Disparity_Ratio',
        'GDP_per_capita', 'log_GDP_per_capita', 'HDI_sq',
        'HDI_change_1yr', 'Suicide_Rate_change_1yr', 'Development_Paradox'
    ]
    
    final_export = panel_df[summary_columns].copy()
    
    # Save to CSV
    final_export.to_csv('comprehensive_development_mental_health_panel.csv', index=False)
    print(f"\n✅ Final dataset exported: comprehensive_development_mental_health_panel.csv")
    print(f"   - Contains {len(final_export)} observations")
    print(f"   - {final_export['ISO3'].nunique()} countries")
    print(f"   - Years: {sorted(final_export['Year'].unique())}")

export_final_dataset(panel_df)

print("\n" + "="*70)
print("ANALYSIS COMPLETE! Check the generated visualizations and insights above.")
print("="*70)

  hdi_long['Year'] = hdi_long['Year'].str.extract('(\d+)').astype(int)


Libraries imported successfully!
Main dataset loaded: (193, 16)
HDI time series data loaded: (192, 9)
Suicide rate data loaded: 2019((182, 5)), 2020((203, 5)), 2021((203, 5))
Final panel dataset created: (1179, 21)

EXPLORATORY DATA ANALYSIS

1. DATASET OVERVIEW:
   - Total observations: 1179
   - Total countries: 204
   - Years covered: [np.int64(2010), np.int64(2019), np.int64(2020), np.int64(2021), np.int64(2022), np.int64(2023)]
   - Columns: ['ISO3', 'Country Name', 'HDI_Tier', 'Year', 'HDI', 'Suicide_Rate', 'Suicide_Rate_Male', 'Suicide_Rate_Female', 'GDP_per_capita', 'income_group_auto', 'continent', 'HDI_Tier_Calculated', 'HDI_sq', 'log_GDP_per_capita', 'Gender_Disparity_Ratio', 'HDI_change_1yr', 'HDI_change_pct_1yr', 'Suicide_Rate_change_1yr', 'Suicide_Rate_change_pct_1yr', 'Development_Paradox', 'HDI_GDP_Ratio']

2. MISSING VALUES ANALYSIS:
                             Missing Count   Missing %
HDI_GDP_Ratio                         1179  100.000000
log_GDP_per_capita         

ValueError: 
    Invalid element(s) received for the 'size' property of scatter.marker
        Invalid elements include: [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]

    The 'size' property is a number and may be specified as:
      - An int or float in the interval [0, inf]
      - A tuple, list, or one-dimensional numpy array of the above