In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
import statsmodels.formula.api as smf
import plotly.express as px

# Directory that receives every artifact written by this run. It is created
# up front so the later savefig()/open() calls do not fail on a missing folder.
OUT_DIR = Path("./Output/EDA")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Input dataset read by this run.
CSV_PATH = Path("./Output/merged_clean_panel.csv")

# Fail fast with an explicit error: everything below needs `df` and
# `source_note`, so continuing without the file would only surface later as a
# confusing NameError.
if not CSV_PATH.exists():
    raise FileNotFoundError(
        f"Input dataset not found: {CSV_PATH.resolve()} "
        "(expected merged_clean_panel.csv in ./Output)."
    )

df = pd.read_csv(CSV_PATH)
# One-line provenance note that is written into the text summary.
source_note = f"Loaded {CSV_PATH}"

# Coerce the analysis columns to numeric dtype. Values that cannot be parsed
# become NaN (errors="coerce"); columns absent from the frame are skipped.
numeric_cols = [
    'GDP_per_capita',
    'Suicide_rate',
    'HDI_2023',
    'HDI_sq',
    'log_GDP_per_capita',
    'coverage_frac',
]
for col_name in (name for name in numeric_cols if name in df.columns):
    df[col_name] = pd.to_numeric(df[col_name], errors='coerce')

# Dataset size figures, reused in the text summary and in the final result.
n_rows = len(df)
n_countries = df['Country Name'].nunique()

# 1) Start the textual summary: title, data source, size, and column list.
# The entries are joined with "\n" when the summary is written to disk.
summary_lines = [
    "EDA Report - The Price of Progress (automated)\n",
    source_note + "\n",
    f"Total rows: {n_rows}\nUnique countries: {n_countries}\n",
    "Columns: " + ", ".join(list(df.columns)) + "\n\n",
]

# Descriptive statistics and pairwise correlations for the main variables,
# both rounded to three decimals before being rendered into the summary.
main_vars = ['HDI_2023', 'GDP_per_capita', 'Suicide_rate', 'log_GDP_per_capita']
main_frame = df[main_vars]

desc = main_frame.describe().round(3)
corr = main_frame.corr().round(3)

summary_lines += [
    "Descriptive statistics (main variables):\n",
    desc.to_string() + "\n\n",
    "Correlation matrix:\n",
    corr.to_string() + "\n\n",
]

# Pearson r values
# Each correlation is computed on the rows where BOTH variables are present.
# Dropping NaNs column by column and re-aligning by index afterwards raises a
# KeyError as soon as Suicide_rate is missing on a row where the other
# variable is not, so the jointly complete rows are selected once and reused
# for both the size guard and the test itself.
pearson_lines = []

hdi_pairs = df[['HDI_2023', 'Suicide_rate']].dropna()
if len(hdi_pairs) >= 3:
    r_hdi, p_hdi = pearsonr(hdi_pairs['HDI_2023'], hdi_pairs['Suicide_rate'])
    pearson_lines.append(f"Pearson HDI vs Suicide_rate: r = {r_hdi:.3f}, p = {p_hdi:.4f}")
else:
    pearson_lines.append("Not enough points for Pearson HDI vs Suicide_rate")

gdp_pairs = df[['log_GDP_per_capita', 'Suicide_rate']].dropna()
if len(gdp_pairs) >= 3:
    r_gdp, p_gdp = pearsonr(gdp_pairs['log_GDP_per_capita'], gdp_pairs['Suicide_rate'])
    pearson_lines.append(f"Pearson log(GDP) vs Suicide_rate: r = {r_gdp:.3f}, p = {p_gdp:.4f}")
else:
    pearson_lines.append("Not enough points for Pearson log(GDP) vs Suicide_rate")

summary_lines.append("\n".join(pearson_lines) + "\n\n")

# Write the accumulated summary entries to disk, one "\n" between entries.
summary_path = OUT_DIR / "EDA_summary.txt"
summary_path.write_text("\n".join(summary_lines), encoding="utf-8")

# Preview of the first 50 rows.
# NOTE(review): a bare expression that is not the last statement of a notebook
# cell is not rendered, so this preview presumably shows nothing - confirm
# whether an explicit display(...) was intended. `display_df_name` is not
# referenced again in the visible code.
display_df_name = "Cleaned dataset sample"
df.head(50)

# 2) Correlation heatmap (matplotlib)
# Renders the unrounded correlation matrix of the main variables as an image
# with a colorbar and saves it as a PNG.
heatmap_path = OUT_DIR / "heatmap_correlation.png"
corr_matrix = df[['HDI_2023', 'GDP_per_capita', 'Suicide_rate', 'log_GDP_per_capita']].corr()

heatmap_fig, heatmap_ax = plt.subplots(figsize=(6, 5))
im = heatmap_ax.imshow(corr_matrix, interpolation='nearest', aspect='auto')
heatmap_fig.colorbar(im, ax=heatmap_ax, fraction=0.046, pad=0.04)

# One tick per variable on each axis; x labels are slanted to avoid overlap.
heatmap_ax.set_xticks(range(len(corr_matrix.columns)))
heatmap_ax.set_xticklabels(corr_matrix.columns, rotation=45, ha='right')
heatmap_ax.set_yticks(range(len(corr_matrix.index)))
heatmap_ax.set_yticklabels(corr_matrix.index)

heatmap_ax.set_title("Correlation matrix")
heatmap_fig.tight_layout()
heatmap_fig.savefig(heatmap_path, dpi=150)
plt.close(heatmap_fig)

# 3) Scatter of HDI vs Suicide_rate with a global quadratic trend line
scatter_hdi_path = OUT_DIR / "scatter_hdi_suicide.png"
hdi_fig, hdi_ax = plt.subplots(figsize=(8, 6))
hdi_ax.scatter(df['HDI_2023'], df['Suicide_rate'], alpha=0.8)

# The trend is a degree-2 np.polyfit over the rows where both values are
# present; it is drawn only when at least 3 such rows exist, otherwise the
# plot shows the points alone.
hdi_pairs_plot = df[['HDI_2023', 'Suicide_rate']].dropna()
if len(hdi_pairs_plot) >= 3:
    hdi_trend = np.poly1d(
        np.polyfit(hdi_pairs_plot['HDI_2023'], hdi_pairs_plot['Suicide_rate'], 2)
    )
    hdi_grid = np.linspace(
        hdi_pairs_plot['HDI_2023'].min(), hdi_pairs_plot['HDI_2023'].max(), 200
    )
    hdi_ax.plot(hdi_grid, hdi_trend(hdi_grid), linewidth=2)

hdi_ax.set_xlabel("HDI_2023")
hdi_ax.set_ylabel("Suicide_rate (per 100k)")
hdi_ax.set_title("HDI vs Suicide Rate (quadratic fit shown if available)")
hdi_ax.grid(True)
hdi_fig.tight_layout()
hdi_fig.savefig(scatter_hdi_path, dpi=150)
plt.close(hdi_fig)

# 4) Scatter of log GDP per capita vs Suicide_rate with a linear trend line
scatter_gdp_path = OUT_DIR / "scatter_loggdp_suicide.png"
gdp_fig, gdp_ax = plt.subplots(figsize=(8, 6))
gdp_ax.scatter(df['log_GDP_per_capita'], df['Suicide_rate'], alpha=0.8)

# Degree-1 np.polyfit over the rows where both values are present, drawn only
# when at least 3 such rows exist.
gdp_pairs_plot = df[['log_GDP_per_capita', 'Suicide_rate']].dropna()
if len(gdp_pairs_plot) >= 3:
    gdp_trend = np.poly1d(
        np.polyfit(gdp_pairs_plot['log_GDP_per_capita'], gdp_pairs_plot['Suicide_rate'], 1)
    )
    gdp_grid = np.linspace(
        gdp_pairs_plot['log_GDP_per_capita'].min(),
        gdp_pairs_plot['log_GDP_per_capita'].max(),
        200,
    )
    gdp_ax.plot(gdp_grid, gdp_trend(gdp_grid), linewidth=2)

gdp_ax.set_xlabel("log_GDP_per_capita")
gdp_ax.set_ylabel("Suicide_rate (per 100k)")
gdp_ax.set_title("Log GDP per Capita vs Suicide Rate (linear fit shown if available)")
gdp_ax.grid(True)
gdp_fig.tight_layout()
gdp_fig.savefig(scatter_gdp_path, dpi=150)
plt.close(gdp_fig)

# 5) Boxplot Suicide rate by income_group_auto (matplotlib)
boxplot_path = OUT_DIR / "boxplot_incomegroup_suicide.png"
plt.figure(figsize=(8,5))
groups = ['Low','Lower-Middle','Upper-Middle','High']

# Keep the fixed low-to-high order but only for categories that actually occur
# in the data. The plotted arrays are derived from `labels`, so data and tick
# labels cannot drift out of step.
income_values = df['income_group_auto'].values
labels = [g for g in groups if g in income_values]
data_to_plot = [
    df.loc[df['income_group_auto'] == g, 'Suicide_rate'].dropna().values
    for g in labels
]

# Tick labels are set through xticks rather than the boxplot keyword: the
# `labels=` keyword was renamed to `tick_labels=` in Matplotlib 3.9 and the
# old name is scheduled for removal, while xticks works on every version.
# Boxes are drawn at positions 1..N by default.
plt.boxplot(data_to_plot, showmeans=True)
plt.xticks(range(1, len(labels) + 1), labels)
plt.xlabel("Income group (auto)")
plt.ylabel("Suicide rate (per 100k)")
plt.title("Suicide Rate by Income Group (boxplot)")
plt.tight_layout()
plt.savefig(boxplot_path, dpi=150)
plt.close()

# 6) Quadratic regression with statsmodels and tipping point, save summary
# Fits Suicide_rate ~ HDI_2023 + HDI_sq + log_GDP_per_capita by OLS on the
# complete rows, writes the model summary to text, and plots the fitted curve.
# `tipping` (read by the final result dict) stays None unless the model was
# fitted and plotted without error.
model_summary_path = OUT_DIR / "model_summary.txt"
quad_plot_path = OUT_DIR / "quadratic_fit.png"
tipping = None
try:
    model_df = df.dropna(subset=['HDI_2023','HDI_sq','Suicide_rate','log_GDP_per_capita'])
    if len(model_df) >= 10:
        model = smf.ols("Suicide_rate ~ HDI_2023 + HDI_sq + log_GDP_per_capita", data=model_df).fit()
        with open(model_summary_path, "w", encoding="utf-8") as f:
            f.write(model.summary().as_text())

        # Tipping point = stationary point of b1*HDI + b2*HDI^2, i.e. -b1/(2*b2).
        # Undefined when the quadratic coefficient is exactly zero.
        b1 = model.params['HDI_2023']
        b2 = model.params['HDI_sq']
        if b2 != 0:
            tipping = -b1 / (2*b2)

        # Predicted curve over the observed HDI range, holding log GDP at its
        # sample mean.
        # NOTE(review): assumes HDI_sq is the square of HDI_2023 - confirm
        # against the data preparation step.
        xp = np.linspace(df['HDI_2023'].min(), df['HDI_2023'].max(), 200)
        mean_log_gdp = model_df['log_GDP_per_capita'].mean()
        y_pred = (
            model.params['Intercept']
            + b1 * xp
            + b2 * (xp**2)
            + model.params['log_GDP_per_capita'] * mean_log_gdp
        )

        quad_fig = plt.figure(figsize=(8,6))
        try:
            plt.scatter(df['HDI_2023'], df['Suicide_rate'], alpha=0.6)
            plt.plot(xp, y_pred, linewidth=2, label='Quadratic fit')
            if tipping is not None:
                plt.axvline(tipping, color='red', linestyle='--', label=f"Tipping ~ {tipping:.3f}")
            plt.xlabel("HDI_2023")
            plt.ylabel("Suicide_rate (per 100k)")
            plt.title("Quadratic model fit (HDI & Suicide Rate)")
            plt.legend()
            plt.tight_layout()
            plt.savefig(quad_plot_path, dpi=150)
        finally:
            # Always release the figure, even when drawing or saving raises.
            plt.close(quad_fig)
    else:
        with open(model_summary_path, "w", encoding="utf-8") as f:
            f.write("Not enough observations to run quadratic model (need >=10 rows).")
except Exception as e:
    # Best-effort step: record the failure in the summary file and carry on
    # with the rest of the report instead of aborting the whole run.
    tipping = None
    with open(model_summary_path, "w", encoding="utf-8") as f:
        f.write("Model run failed: " + str(e))

# 7) Choropleth map saved as HTML (Plotly)
# Best-effort step: any failure is recorded in choropleth_error.txt inside
# OUT_DIR rather than stopping the run.
choropleth_path = OUT_DIR / "choropleth_suicide.html"
choropleth_options = {
    "locations": "ISO3",
    "locationmode": "ISO-3",
    "color": "Suicide_rate",
    "hover_name": "Country Name",
    "hover_data": ["HDI_2023", "GDP_per_capita", "income_group_auto"],
    "color_continuous_scale": "Reds",
    "title": "Global Suicide Rate (avg)",
}
try:
    fig = px.choropleth(df, **choropleth_options)
    fig.update_layout(width=1000, height=600, title_x=0.5)
    fig.write_html(str(choropleth_path))
except Exception as e:
    (OUT_DIR / "choropleth_error.txt").write_text(
        "Choropleth failed: " + str(e), encoding="utf-8"
    )

# 8) Save a short narrative of the outputs, then list the files in OUT_DIR
# The Notes line reports the real output directory (OUT_DIR) instead of a
# hard-coded path that this script never writes to.
narrative = [
    "Automated EDA run outputs:",
    f"- Summary text: {summary_path}",
    f"- Correlation heatmap: {heatmap_path}",
    f"- Scatter HDI vs Suicide: {scatter_hdi_path}",
    f"- Scatter log GDP vs Suicide: {scatter_gdp_path}",
    f"- Boxplot by income group: {boxplot_path}",
    f"- Quadratic model summary: {model_summary_path}",
    f"- Quadratic fit plot: {quad_plot_path}",
    f"- Choropleth interactive HTML: {choropleth_path}",
    "",
    "Notes:",
    f"- Images and model summary are saved in {OUT_DIR}.",
    "- Choropleth is saved as an interactive HTML (open in browser)."
]
narrative_path = OUT_DIR / "EDA_outputs_list.txt"
with open(narrative_path, "w", encoding="utf-8") as f:
    f.write("\n".join(narrative))

# Take the directory listing only after the narrative file has been written,
# so the listing includes it; sorted so the order does not depend on the
# filesystem. The listing covers everything currently in OUT_DIR.
files_created = sorted(str(p) for p in OUT_DIR.iterdir())

# Programmatic summary of the run: artifact paths as strings, followed by the
# directory listing, dataset size figures and the estimated tipping point.
artifact_paths = (
    ("summary_txt", summary_path),
    ("heatmap_png", heatmap_path),
    ("scatter_hdi_png", scatter_hdi_path),
    ("scatter_gdp_png", scatter_gdp_path),
    ("boxplot_png", boxplot_path),
    ("quadratic_plot_png", quad_plot_path),
    ("model_summary_txt", model_summary_path),
    ("choropleth_html", choropleth_path),
    ("narrative_txt", narrative_path),
)
result = {key: str(path) for key, path in artifact_paths}
result["files_created"] = files_created
result["n_countries"] = n_countries
result["n_rows"] = n_rows
result["tipping_point"] = tipping

# Last expression of the cell, so the notebook renders the dict.
result




The 'labels' parameter of boxplot() has been renamed 'tick_labels' since Matplotlib 3.9; support for the old name will be dropped in 3.11.



{'summary_txt': 'Output\\EDA\\EDA_summary.txt',
 'heatmap_png': 'Output\\EDA\\heatmap_correlation.png',
 'scatter_hdi_png': 'Output\\EDA\\scatter_hdi_suicide.png',
 'scatter_gdp_png': 'Output\\EDA\\scatter_loggdp_suicide.png',
 'boxplot_png': 'Output\\EDA\\boxplot_incomegroup_suicide.png',
 'quadratic_plot_png': 'Output\\EDA\\quadratic_fit.png',
 'model_summary_txt': 'Output\\EDA\\model_summary.txt',
 'choropleth_html': 'Output\\EDA\\choropleth_suicide.html',
 'narrative_txt': 'Output\\EDA\\EDA_outputs_list.txt',
 'files_created': ['Output\\EDA\\boxplot_incomegroup_suicide.png',
  'Output\\EDA\\choropleth_suicide.html',
  'Output\\EDA\\EDA_summary.txt',
  'Output\\EDA\\heatmap_correlation.png',
  'Output\\EDA\\model_summary.txt',
  'Output\\EDA\\quadratic_fit.png',
  'Output\\EDA\\scatter_hdi_suicide.png',
  'Output\\EDA\\scatter_loggdp_suicide.png'],
 'n_countries': 193,
 'n_rows': 193,
 'tipping_point': np.float64(0.5475447136631615)}