In [31]:
import re
from typing import List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

In [32]:
def _smart_to_numeric(s: pd.Series, thresh = 0.9) -> pd.Series:
    parsed = pd.to_numeric(s, errors="coerce")
    return parsed if parsed.notna().mean() >= thresh else s

In [33]:
def load_and_clean(path: str) -> pd.DataFrame:
    """Read a CSV file and return it with tidied headers and numeric-looking columns converted.

    Column names are stripped of surrounding whitespace and any internal run
    of whitespace is collapsed to a single space. Every column that is not
    already numeric is then passed through ``_smart_to_numeric`` with a 0.9
    threshold.
    """
    frame = pd.read_csv(path)

    whitespace_run = r"\s+"
    frame.columns = [re.sub(whitespace_run, " ", name.strip()) for name in frame.columns]

    for name in frame.columns:
        if pd.api.types.is_numeric_dtype(frame[name]):
            continue  # already numeric, nothing to convert
        frame[name] = _smart_to_numeric(frame[name], thresh=0.9)

    return frame

In [34]:
def numeric_candidates(df: pd.DataFrame, exclude=()):
    """List the numeric column names of ``df``, skipping any name in ``exclude``.

    ``exclude`` may be any iterable of column names; a falsy value (``None``,
    empty tuple/list) means nothing is excluded. Column order follows ``df``.
    """
    excluded = set(exclude) if exclude else set()
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    kept = []
    for col in numeric_cols:
        if col not in excluded:
            kept.append(col)
    return kept

In [35]:
def pearson_corr(df: pd.DataFrame, target: str, exogs: list, top_k: int | None = None) -> pd.DataFrame:
    """
    Correlación de Pearson (r) entre exógenas y target.
    Devuelve DataFrame ordenado por |r| con columnas: variable, r, abs_r.
    Si top_k no es None, retorna solo ese top.
    """
    rows = []
    for x in exogs:
        if x == target or x not in df.columns:
            continue
        sub = df[[x, target]].dropna()
        if sub.empty or sub[x].nunique() < 2 or sub[target].nunique() < 2:
            continue
        r = np.corrcoef(sub[x], sub[target])[0, 1]
        if np.isfinite(r):
            rows.append({"variable": x, "r": float(r), "abs_r": float(abs(r))})

    corr_df = (pd.DataFrame(rows)
               .sort_values("abs_r", ascending=False)
               .reset_index(drop=True))

    if top_k is not None:
        corr_df = corr_df.head(top_k)

    return corr_df[["variable", "r", "abs_r"]]

In [36]:
def pearson_table(df: pd.DataFrame, target: str, exogs: list) -> pd.DataFrame:
    """Pearson r table (original exogenous variables vs. target).

    For each name in ``exogs``, rows with a missing value in either the
    variable or the target are dropped before computing r. Names equal to
    ``target``, absent from ``df``, without complete rows, or with fewer than
    two distinct values on either side are skipped.

    Returns a DataFrame with columns ``variable``, ``r``, ``abs_r`` sorted by
    ``abs_r`` descending; the frame is empty (with the same columns) when no
    variable qualifies.
    """
    out_cols = ["variable", "r", "abs_r"]
    rows = []
    for x in exogs:
        if x == target or x not in df.columns:
            continue
        sub = df[[x, target]].dropna()
        if sub.empty or sub[x].nunique() < 2 or sub[target].nunique() < 2:
            continue
        r = np.corrcoef(sub[x], sub[target])[0, 1]
        rows.append({"variable": x, "r": r, "abs_r": abs(r)})
    # Explicit columns keep sort_values from raising KeyError when `rows` is empty.
    corr_df = (pd.DataFrame(rows, columns=out_cols)
               .sort_values("abs_r", ascending=False)
               .reset_index(drop=True))
    return corr_df[out_cols]

In [37]:
def scatter_with_fit(df: pd.DataFrame, x: str, y: str, r: float):
    """Show a scatter plot of ``y`` against ``x`` with a degree-1 least-squares line.

    Rows where either column is missing are dropped first; if nothing is left
    the function returns without plotting. ``r`` is only used for the title,
    where it is printed with three decimals.
    """
    pairs = df[[x, y]].dropna()
    if pairs.empty:
        return

    x_vals, y_vals = pairs[x], pairs[y]
    slope, intercept = np.polyfit(x_vals, y_vals, 1)
    grid = np.linspace(x_vals.min(), x_vals.max(), 100)

    fig, ax = plt.subplots(figsize=(6, 4.5))
    ax.scatter(x_vals, y_vals, alpha=0.5)
    ax.plot(grid, slope * grid + intercept)
    ax.set_title(f"{y} vs {x}  (r = {r:.3f})")
    ax.set_xlabel(x)
    ax.set_ylabel(y)
    fig.tight_layout()
    plt.show()

In [38]:
def _fit_poly_model(
    df: pd.DataFrame,
    target: str,
    X_cols: List[str],
    *,
    degree: int = 2,
    interaction_only: bool = False,
    test_size: float = 0.2,
    random_state: int = 42,
    do_prints: bool = True,
    do_plots: bool = False
):
    if not X_cols:
        raise ValueError("No hay exógenas válidas para el modelo.")

    data = df[[target] + X_cols].dropna(subset=[target])
    X, y = data[X_cols], data[target]

    model = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="median")),
        ("poly", PolynomialFeatures(degree=degree, include_bias=False, interaction_only=interaction_only)),
        ("lr", LinearRegression())
    ])

    Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=test_size, random_state=random_state)
    model.fit(Xtr, ytr)

    r2_tr = r2_score(ytr, model.predict(Xtr))
    r2_te = r2_score(yte, model.predict(Xte))

    poly = model.named_steps["poly"]
    lr = model.named_steps["lr"]
    feat_names = poly.get_feature_names_out(X_cols)
    coefs = pd.Series(lr.coef_, index=feat_names).sort_values(key=np.abs, ascending=False)
    kshow = min(15, len(coefs))

    # Correlaciones r solo sobre las X usadas
    corr_df = pearson_corr(df, target, X_cols)

    if do_prints:
        print("\n" + "="*74)
        print(f"Objetivo: {target}")
        print(f"Modelo  : Regresión Polinómica (grado={degree}, interaction_only={interaction_only})")
        print(f"Exógenas ({len(X_cols)}): {', '.join(X_cols[:12])}{' ...' if len(X_cols)>12 else ''}")
        print(f"R² (train): {r2_tr:.3f} | R² (test): {r2_te:.3f}")

        print("\nCorrelación de Pearson (exógenas originales vs objetivo): r ∈ [-1, 1]")
        try:
            display(corr_df.style.format({"r": "{:.3f}", "abs_r": "{:.3f}"}))
        except:
            print(corr_df.head(10))

        print("\nTop términos polinómicos por |coef| (NO son correlaciones):")
        try:
            display(coefs.head(kshow).to_frame("coef").style.format(precision=6))
        except:
            print(coefs.head(kshow))

        if do_plots and not corr_df.empty:
            top2 = list(zip(corr_df["variable"], corr_df["r"]))[:2]
            for x, r in top2:
                scatter_with_fit(df, x, target, r)

    return {
        "target": target,
        "r2_train": r2_tr,
        "r2_test": r2_te,
        "exog_used": X_cols,
        "corr_table": corr_df,
        "top_poly_coefs": coefs.head(kshow),
        "model": model
    }

In [39]:
def play_regression_batch(
    csv_path: str,
    targets: List[str],                 # one or two Y columns
    exog_manual= None,  # manual X set (the same for every Y)
    auto_k = 8,          # top-k by correlation (per Y)
    drops = None,
    degree: int = 2,
    interaction_only: bool = False,
    test_size: float = 0.2,
    random_state: int = 42,
    do_prints: bool = True,
    do_plots: bool = False
):
    """
    Run several regression analyses over one CSV file.

    For each target in ``targets``:
      (A) AUTO-K: if ``auto_k`` is truthy, pick the ``auto_k`` numeric columns
          with the largest |r| against that target and fit a model.
      (B) MANUAL: if ``exog_manual`` is truthy, fit a model with that common
          set, restricted to numeric, non-dropped columns other than the target.
          Names that do not qualify are silently left out.

    Parameters
    ----------
    csv_path : str
        File passed to ``load_and_clean``.
    targets : List[str]
        Target column names; each must exist in the loaded data.
    exog_manual : list or None
        Manually chosen predictor names shared by all targets.
    auto_k : int
        Number of predictors selected by |r|; a falsy value disables mode (A).
    drops : list or None
        Columns never used as predictors.
    degree, interaction_only, test_size, random_state, do_prints, do_plots
        Forwarded to ``_fit_poly_model``.

    Returns
    -------
    dict
        ``summary``: one row per run with columns ``target``, ``mode``, ``k``,
        ``n_exog``, ``r2_train``, ``r2_test`` (empty, with those columns, when
        no run was executed). ``details``: mapping from run id to the
        ``_fit_poly_model`` result (AUTO-K runs also carry ``corr_all``).

    Raises
    ------
    AssertionError
        If a target is not a column of the CSV.
    ValueError
        Propagated from ``_fit_poly_model`` when a run ends up with no
        predictors.
    """
    df = load_and_clean(csv_path)
    for t in targets:
        assert t in df.columns, f"'{t}' no está en el CSV."

    drops = drops or []
    # Global numeric candidates (drops excluded; the target is removed per run).
    cand_global = numeric_candidates(df, exclude=drops)

    # Options shared by every fit.
    fit_kwargs = dict(
        degree=degree, interaction_only=interaction_only,
        test_size=test_size, random_state=random_state,
        do_prints=do_prints, do_plots=do_plots
    )

    results = []
    details = {}

    for target in targets:
        # --- AUTO-K ---
        if auto_k:
            # Valid candidates for this Y (numeric, not Y itself, not dropped).
            cand = [c for c in cand_global if c != target]
            corr_all = pearson_corr(df, target, cand)
            X_auto = corr_all.head(auto_k)["variable"].tolist()
            run_id = f"{target}__AUTO_K={auto_k}"
            if do_prints:
                print(f"\n>>> {run_id}: {len(X_auto)} exógenas seleccionadas por |r|")
            res_auto = _fit_poly_model(df, target, X_auto, **fit_kwargs)
            results.append({
                "target": target, "mode": "auto_k", "k": auto_k,
                "n_exog": len(X_auto),
                "r2_train": res_auto["r2_train"], "r2_test": res_auto["r2_test"]
            })
            details[run_id] = {**res_auto, "corr_all": corr_all}

        # --- MANUAL (only when requested) ---
        if exog_manual:
            # Keep only manual names that are numeric candidates and not the target.
            exog_valid = [c for c in exog_manual if c in df.columns and c in cand_global and c != target]
            run_id = f"{target}__MANUAL"
            if do_prints:
                print(f"\n>>> {run_id}: {len(exog_valid)} exógenas (set común)")
            res_manual = _fit_poly_model(df, target, exog_valid, **fit_kwargs)
            results.append({
                "target": target, "mode": "manual", "k": None,
                "n_exog": len(exog_valid),
                "r2_train": res_manual["r2_train"], "r2_test": res_manual["r2_test"]
            })
            details[run_id] = res_manual

    # Comparative summary. The explicit columns keep sort_values from raising
    # KeyError when no run was executed (e.g. empty `targets`, or auto_k=0
    # with no manual set).
    summary_cols = ["target", "mode", "k", "n_exog", "r2_train", "r2_test"]
    summary = (pd.DataFrame(results, columns=summary_cols)
               .sort_values(["target", "mode"])
               .reset_index(drop=True))
    if do_prints:
        print("\n" + "="*74)
        print("RESUMEN COMPARATIVO")
        # Rich display is best-effort; fall back to plain text without
        # swallowing KeyboardInterrupt/SystemExit.
        try:
            display(summary.style.format({"r2_train": "{:.3f}", "r2_test": "{:.3f}"}))
        except Exception:
            print(summary)

    return {
        "summary": summary,
        "details": details
    }

In [40]:
# Inputs for the batch run, named so they are easy to spot and change.
# NOTE: CSV_PATH is an absolute path on one specific machine; point it at a
# local copy of the file before re-running this cell elsewhere.
CSV_PATH = r"C:\Users\xavir\OneDrive\Escritorio\Life Expectancy Data (1).csv"
TARGETS = ["Life expectancy", "Adult Mortality"]
MANUAL_EXOG = ["Schooling", "Income composition of resources", "HIV/AIDS", "BMI", "Diphtheria"]

# Two runs per target: top-8 predictors by |r|, and the shared manual set.
out = play_regression_batch(
    csv_path=CSV_PATH,
    targets=TARGETS,
    exog_manual=MANUAL_EXOG,
    auto_k=8,
    degree=2,
    interaction_only=False,
    do_prints=True,
    do_plots=False,
)



>>> Life expectancy__AUTO_K=8: 8 exógenas seleccionadas por |r|

Objetivo: Life expectancy
Modelo  : Regresión Polinómica (grado=2, interaction_only=False)
Exógenas (8): Schooling, Income composition of resources, Adult Mortality, BMI, HIV/AIDS, Diphtheria, thinness 1-19 years, thinness 5-9 years
R² (train): 0.897 | R² (test): 0.895

Correlación de Pearson (exógenas originales vs objetivo): r ∈ [-1, 1]


Unnamed: 0,variable,r,abs_r
0,Schooling,0.752,0.752
1,Income composition of resources,0.725,0.725
2,Adult Mortality,-0.696,0.696
3,BMI,0.568,0.568
4,HIV/AIDS,-0.557,0.557
5,Diphtheria,0.479,0.479
6,thinness 1-19 years,-0.477,0.477
7,thinness 5-9 years,-0.472,0.472



Top términos polinómicos por |coef| (NO son correlaciones):


Unnamed: 0,coef
Income composition of resources^2,35.01939
Income composition of resources,5.562228
Schooling Income composition of resources,-2.212874
HIV/AIDS,-1.311518
Income composition of resources HIV/AIDS,-0.509043
thinness 5-9 years,-0.405361
BMI,0.310425
Schooling,0.245782
Schooling^2,0.081177
Income composition of resources thinness 1-19 years,0.072814



>>> Life expectancy__MANUAL: 5 exógenas (set común)

Objetivo: Life expectancy
Modelo  : Regresión Polinómica (grado=2, interaction_only=False)
Exógenas (5): Schooling, Income composition of resources, HIV/AIDS, BMI, Diphtheria
R² (train): 0.842 | R² (test): 0.839

Correlación de Pearson (exógenas originales vs objetivo): r ∈ [-1, 1]


Unnamed: 0,variable,r,abs_r
0,Schooling,0.752,0.752
1,Income composition of resources,0.725,0.725
2,BMI,0.568,0.568
3,HIV/AIDS,-0.557,0.557
4,Diphtheria,0.479,0.479



Top términos polinómicos por |coef| (NO son correlaciones):


Unnamed: 0,coef
Income composition of resources^2,49.979238
Income composition of resources,-3.805237
Schooling Income composition of resources,-2.731612
HIV/AIDS,-1.327729
Income composition of resources HIV/AIDS,-0.856606
Schooling,-0.364331
BMI,0.210348
Schooling^2,0.105619
Schooling HIV/AIDS,0.073035
Income composition of resources Diphtheria,0.069188



>>> Adult Mortality__AUTO_K=8: 8 exógenas seleccionadas por |r|

Objetivo: Adult Mortality
Modelo  : Regresión Polinómica (grado=2, interaction_only=False)
Exógenas (8): Life expectancy, HIV/AIDS, Income composition of resources, Schooling, BMI, thinness 5-9 years, thinness 1-19 years, GDP
R² (train): 0.545 | R² (test): 0.559

Correlación de Pearson (exógenas originales vs objetivo): r ∈ [-1, 1]


Unnamed: 0,variable,r,abs_r
0,Life expectancy,-0.696,0.696
1,HIV/AIDS,0.524,0.524
2,Income composition of resources,-0.458,0.458
3,Schooling,-0.455,0.455
4,BMI,-0.387,0.387
5,thinness 5-9 years,0.308,0.308
6,thinness 1-19 years,0.303,0.303
7,GDP,-0.296,0.296



Top términos polinómicos por |coef| (NO son correlaciones):


Unnamed: 0,coef
Income composition of resources,322.118241
Income composition of resources^2,-67.700767
Life expectancy,-19.005379
Income composition of resources Schooling,15.791421
Income composition of resources thinness 1-19 years,-11.533792
Income composition of resources thinness 5-9 years,10.654992
Life expectancy Income composition of resources,-6.966644
thinness 1-19 years,-6.896014
BMI,6.220514
HIV/AIDS,3.523054



>>> Adult Mortality__MANUAL: 5 exógenas (set común)

Objetivo: Adult Mortality
Modelo  : Regresión Polinómica (grado=2, interaction_only=False)
Exógenas (5): Schooling, Income composition of resources, HIV/AIDS, BMI, Diphtheria
R² (train): 0.472 | R² (test): 0.511

Correlación de Pearson (exógenas originales vs objetivo): r ∈ [-1, 1]


Unnamed: 0,variable,r,abs_r
0,HIV/AIDS,0.524,0.524
1,Income composition of resources,-0.458,0.458
2,Schooling,-0.455,0.455
3,BMI,-0.387,0.387
4,Diphtheria,-0.275,0.275



Top términos polinómicos por |coef| (NO son correlaciones):


Unnamed: 0,coef
Income composition of resources^2,-445.781495
Income composition of resources,62.088583
Schooling Income composition of resources,30.149539
Income composition of resources HIV/AIDS,8.602472
Income composition of resources Diphtheria,-1.667741
Schooling,-1.379885
Schooling HIV/AIDS,0.889236
Diphtheria,0.7787
Schooling^2,-0.745983
Income composition of resources BMI,-0.655128



RESUMEN COMPARATIVO


Unnamed: 0,target,mode,k,n_exog,r2_train,r2_test
0,Adult Mortality,auto_k,8.0,8,0.545,0.559
1,Adult Mortality,manual,,5,0.472,0.511
2,Life expectancy,auto_k,8.0,8,0.897,0.895
3,Life expectancy,manual,,5,0.842,0.839
