In [1]:
# Full modeling pipeline for "The Price of Progress" dataset
# This cell loads the cleaned CSV, runs a set of models, diagnostic tests, and saves outputs (text + plots).
# It produces:
# - baseline OLS, quadratic OLS, OLS with region fixed effects, OLS on high-quality subset
# - robust SE versions, VIF, heteroskedasticity test, Cook's distance, residual plots, QQ-plot
# - bootstrap CI for tipping point (optional, with n_boot default 1000 but reduced for speed)
# All outputs saved under /mnt/data/model_output
import os, warnings, math
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_breuschpagan
from scipy.stats import pearsonr
from statsmodels.regression.quantile_regression import QuantReg

warnings.filterwarnings("ignore")
sns.set(style="whitegrid", font_scale=1.05)

OUT_DIR = Path("./Output/Model")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Load data
CSV = Path("./Output/merged_clean_panel.csv")
if CSV.exists():
    df = pd.read_csv(CSV)
else:
    raise FileNotFoundError("Cleaned CSV not found at ./Output/merged_clean_panel.csv. Please place it there and rerun.")

# Ensure numeric types
for c in ['HDI_2023','HDI_sq','GDP_per_capita','log_GDP_per_capita','Suicide_rate']:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors='coerce')

# Create a working dataframe for modeling: drop rows missing key vars
model_vars = ['Suicide_rate','HDI_2023','HDI_sq','log_GDP_per_capita','ISO3','continent','income_group_auto','GDP_per_capita']
df_model = df.copy().dropna(subset=['Suicide_rate','HDI_2023','HDI_sq'])

# Add an indicator for high data quality
df_model['high_data_quality'] = df_model['Low_data_quality_flag'].astype(str).str.contains('Sufficient', na=False)

# Utility: save text
def save_text(text, filename):
    with open(OUT_DIR / filename, "w", encoding="utf-8") as f:
        f.write(text)

# 1) Baseline OLS: Suicide ~ HDI (linear)
formula_base = "Suicide_rate ~ HDI_2023"
res_base = smf.ols(formula=formula_base, data=df_model).fit(cov_type='HC1')

# 2) Quadratic model: Suicide ~ HDI + HDI^2 + log_GDP
formula_quad = "Suicide_rate ~ HDI_2023 + HDI_sq + log_GDP_per_capita"
res_quad = smf.ols(formula=formula_quad, data=df_model).fit(cov_type='HC1')

# 3) Add region fixed effects and income_group
# Use continent and income_group_auto as categorical controls
# Drop rows with missing continent if any
df_region = df_model.dropna(subset=['continent'])
formula_region = "Suicide_rate ~ HDI_2023 + HDI_sq + log_GDP_per_capita + C(continent) + C(income_group_auto)"
res_region = smf.ols(formula=formula_region, data=df_region).fit(cov_type='HC1')

# 4) High-quality subset model (only sufficient data)
df_high = df_model[df_model['high_data_quality']==True].copy()
formula_high = formula_quad  # same specification on high-quality subset
res_high = smf.ols(formula=formula_high, data=df_high).fit(cov_type='HC1')

# 5) Quantile regression (median) as robustness
try:
    qr = QuantReg(df_model['Suicide_rate'], sm.add_constant(df_model[['HDI_2023','HDI_sq','log_GDP_per_capita']].fillna(0)))
    res_qr = qr.fit(q=0.5)
except Exception as e:
    res_qr = None

# Save model summaries to text
save_text(res_base.summary().as_text(), "model_base_summary.txt")
save_text(res_quad.summary().as_text(), "model_quad_summary.txt")
save_text(res_region.summary().as_text(), "model_region_summary.txt")
save_text(res_high.summary().as_text(), "model_high_quality_summary.txt")
if res_qr is not None:
    save_text(res_qr.summary().as_text(), "model_quantile_median_summary.txt")

# 6) VIF calculation (on predictors in quad model)
def compute_vif(df, features):
    X = df[features].dropna()
    X = sm.add_constant(X)
    vif_data = []
    for i, var in enumerate(X.columns):
        if var == 'const':
            continue
        vif = variance_inflation_factor(X.values, i)
        vif_data.append((var, vif))
    return pd.DataFrame(vif_data, columns=['variable','VIF'])

vif_df = compute_vif(df_model, ['HDI_2023','HDI_sq','log_GDP_per_capita'])
vif_df.to_csv(OUT_DIR / "vif_quad.csv", index=False)

# 7) Heteroskedasticity test (Breusch-Pagan) on quad model residuals
bp_test = het_breuschpagan(res_quad.resid, res_quad.model.exog)
bp_result_txt = f"Breusch-Pagan test Lagrange multiplier stat: {bp_test[0]:.4f}, p-value: {bp_test[1]:.4g}\n"
save_text(bp_result_txt, "breusch_pagan_quad.txt")

# 8) Influence diagnostics (Cook's distance) -- plot top influencers
influence = res_quad.get_influence()
cooks = influence.cooks_distance[0]
df_model['cooks_d'] = cooks
top_cooks = df_model.sort_values('cooks_d', ascending=False).head(10)[['Country Name','cooks_d']]
top_cooks.to_csv(OUT_DIR / "top_cooks.csv", index=False)

# Save cooks plot
plt.figure(figsize=(10,4))
plt.stem(np.arange(len(cooks)), cooks, markerfmt=",", use_line_collection=True)
plt.title("Cook's distance for quadratic model (all observations)")
plt.xlabel("Observation index")
plt.ylabel("Cook's distance")
plt.tight_layout()
plt.savefig(OUT_DIR / "cooks_distance.png", dpi=150)
plt.close()

# 9) Residuals vs Fitted plot
fitted = res_quad.fittedvalues
resid = res_quad.resid
plt.figure(figsize=(8,5))
plt.scatter(fitted, resid, alpha=0.7)
plt.axhline(0, color='red', linestyle='--')
plt.xlabel("Fitted values")
plt.ylabel("Residuals")
plt.title("Residuals vs Fitted (Quadratic model)")
plt.tight_layout()
plt.savefig(OUT_DIR / "residuals_vs_fitted_quad.png", dpi=150)
plt.close()

# 10) QQ-plot of residuals
sm.qqplot(resid, line='s')
plt.title("QQ-plot of residuals (Quadratic model)")
plt.tight_layout()
plt.savefig(OUT_DIR / "qqplot_residuals_quad.png", dpi=150)
plt.close()

# 11) Partial effect plot: predicted Suicide_rate over HDI (holding log_GDP at mean)
hdimin = df_model['HDI_2023'].min()
hdimax = df_model['HDI_2023'].max()
hdix = np.linspace(hdimin, hdimax, 200)
mean_loggdp = df_model['log_GDP_per_capita'].mean()
# compute predicted using res_quad.params
params = res_quad.params
yhat = params.get('Intercept',0) + params.get('HDI_2023',0)*hdix + params.get('HDI_sq',0)*(hdix**2) + params.get('log_GDP_per_capita',0)*mean_loggdp
plt.figure(figsize=(8,6))
plt.scatter(df_model['HDI_2023'], df_model['Suicide_rate'], alpha=0.5, label='Observed')
plt.plot(hdix, yhat, color='red', linewidth=2, label='Quadratic predicted (mean log GDP)')
# tipping point
tipping = None
if ('HDI_2023' in params) and ('HDI_sq' in params) and params['HDI_sq']!=0:
    tipping = -params['HDI_2023']/(2*params['HDI_sq'])
    if np.isfinite(tipping):
        plt.axvline(tipping, color='blue', linestyle='--', label=f'Tipping â‰ˆ {tipping:.3f}')
plt.xlabel("HDI_2023")
plt.ylabel("Suicide_rate")
plt.title("Predicted Suicide Rate by HDI (Quadratic model)")
plt.legend()
plt.tight_layout()
plt.savefig(OUT_DIR / "predicted_hdi_quad.png", dpi=150)
plt.close()

# 12) Bootstrap CI for tipping point (optional, moderate bootstrap reps)
def bootstrap_tipping(df, n_boot=500, seed=123):
    np.random.seed(seed)
    tpoints = []
    X = df[['HDI_2023','HDI_sq','log_GDP_per_capita']].copy()
    y = df['Suicide_rate'].copy()
    data = pd.concat([y, X], axis=1).dropna()
    n = len(data)
    if n < 30:
        return None  # not enough for reliable bootstrap
    for i in range(n_boot):
        sample = data.sample(n, replace=True)
        try:
            mod = smf.ols("Suicide_rate ~ HDI_2023 + HDI_sq + log_GDP_per_capita", data=sample).fit()
            b1 = mod.params.get('HDI_2023', np.nan)
            b2 = mod.params.get('HDI_sq', np.nan)
            if b2!=0 and np.isfinite(b1) and np.isfinite(b2):
                tpoints.append(-b1/(2*b2))
        except Exception:
            continue
    if len(tpoints)==0:
        return None
    arr = np.array(tpoints)
    return {'median': np.nanmedian(arr), '2.5%': np.nanpercentile(arr,2.5), '97.5%': np.nanpercentile(arr,97.5)}

bs_result = bootstrap_tipping(df_model, n_boot=500)
if bs_result is not None:
    save_text(str(bs_result), "bootstrap_tipping_point.txt")

# 13) Robustness: drop top 2 Cook influential observations and refit quad
influential_idx = np.argsort(cooks)[-2:]
df_noinfl = df_model.drop(df_model.index[influential_idx])
res_quad_noinfl = smf.ols(formula_quad, data=df_noinfl).fit(cov_type='HC1')
save_text(res_quad_noinfl.summary().as_text(), "model_quad_no_influentials.txt")

# 14) Save key outputs overview
overview = []
overview.append("Modeling outputs overview\n")
overview.append(f"Observations used (quad model): {len(df_model)}\n")
overview.append("Files saved in /mnt/data/model_output:\n")
for p in sorted(OUT_DIR.iterdir()):
    overview.append(str(p))
save_text("\n".join(overview), "model_outputs_overview.txt")

# 15) Print concise results for user here
print("=== Modeling Results (concise) ===\n")
print("Quadratic model summary (first five rows):\n")
print(res_quad.summary().tables[1])
print("\nTipping point estimate:", tipping)
if bs_result is not None:
    print("Bootstrap tipping CI:", bs_result)

print("\nTop 5 countries by Cook's distance saved to top_cooks.csv")
print("All outputs saved to:", OUT_DIR)

# Also display VIF and top cooks in notebook UI
display_vif = compute_vif(df_model, ['HDI_2023','HDI_sq','log_GDP_per_capita'])
display_vif_path = OUT_DIR / "vif_for_display.csv"
display_vif.to_csv(display_vif_path, index=False)
display_vif



TypeError: stem() got an unexpected keyword argument 'use_line_collection'

<Figure size 1000x400 with 0 Axes>