In [8]:
from pathlib import Path
import numpy as np, pandas as pd
from itertools import product

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score, confusion_matrix
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

# Modelos
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
# --- CELDA 0: RUTAS ROBUSTAS ---
from pathlib import Path

# Busca la raíz del proyecto (para poder ejecutar desde /notebooks)
def find_root(start=None, max_up=6):
    p = Path.cwd() if start is None else Path(start)
    for _ in range(max_up):
        if (p / "data" / "raw").exists():
            return p
        p = p.parent
    raise FileNotFoundError("No se encontró la carpeta 'data/raw'.")

ROOT      = find_root()
DATA_RAW  = ROOT / "data" / "raw"
DATA_PROC = ROOT / "data" / "processed"
REPORTS   = ROOT / "reports" / "tables"
for d in (DATA_PROC, REPORTS): d.mkdir(parents=True, exist_ok=True)

print("Rutas configuradas correctamente ✅")

print("ROOT     →", ROOT)
print("DATA_RAW →", DATA_RAW)
print("DATA_PROC→", DATA_PROC)



Rutas configuradas correctamente ✅
ROOT     → c:\Users\Gerson\Downloads\PISA-ML
DATA_RAW → c:\Users\Gerson\Downloads\PISA-ML\data\raw
DATA_PROC→ c:\Users\Gerson\Downloads\PISA-ML\data\processed


In [9]:
# --- CELDA 1: CARGA (X, y) DESDE data/processed ---

import pandas as pd

X_PATH = DATA_PROC / "peru_X_full__part1.xlsx"
Y_PATH = DATA_PROC / "y_3niveles.xlsx"

assert X_PATH.exists(), f"No encontré {X_PATH}"
assert Y_PATH.exists(), f"No encontré {Y_PATH}"

# 1) Cargar X
X = pd.read_excel(X_PATH, engine="openpyxl")

# 2) Cargar y desde MATH_LEVEL_3CAT (low/mid/high)
ydf  = pd.read_excel(Y_PATH, engine="openpyxl")
cols = {c.upper(): c for c in ydf.columns}
assert "MATH_LEVEL_3CAT" in cols, f"No veo 'MATH_LEVEL_3CAT' en columnas: {list(ydf.columns)}"
coly = cols["MATH_LEVEL_3CAT"]

y_raw = ydf[coly].astype(str).str.strip().str.lower()
y_map = {"low": 0, "med": 1, "high": 2}
y_all = y_raw.map(y_map)

# 3) Reportar etiquetas inválidas (si las hubiera) y filtrar
bad = y_all.isna()
if bad.any():
    rep = pd.DataFrame({"raw": ydf[coly], "normalizado": y_raw, "mapped": y_all})[bad]
    rep.to_excel(REPORTS / "y_labels_invalidas.xlsx", index=False)
    print(f"Aviso: {bad.sum()} fila(s) con etiquetas fuera de {{low, med, high}}. "
          f"Detalle en reports/tables/y_labels_invalidas.xlsx")

# 4) Alinear X e y (misma longitud y eliminar inválidas)
nmin = min(len(X), len(y_all))
mask_keep = (~bad).iloc[:nmin]
X = X.iloc[:nmin].loc[mask_keep].reset_index(drop=True)
y = y_all.iloc[:nmin].loc[mask_keep].reset_index(drop=True)

print("CARGA OK ✅")
print("X:", X.shape, "| y:", y.shape)
print("Distribución y (0=low,1=med,2=high):", y.value_counts().sort_index().to_dict())


CARGA OK ✅
X: (6968, 183) | y: (6968,)
Distribución y (0=low,1=med,2=high): {0: 6076, 1: 866, 2: 26}


In [10]:
# --- CELDA 2: DEFINICIÓN DE DUELOS BINARIOS ---
# y ∈ {0=Low, 1=Medium, 2=High}

DUELS = {
    "Low–High":   {"keep": {0, 2}, "pos": 2},  # positivo = High
    "Medium–High":{"keep": {1, 2}, "pos": 2},  # positivo = High
    "Low–Medium": {"keep": {0, 1}, "pos": 1},  # positivo = Medium
}

def make_binary_task(X, y, duel_name):
    """Devuelve subconjunto (Xb, yb) para el duelo con yb binario (1=clase positiva)."""
    cfg = DUELS[duel_name]
    mask = y.isin(cfg["keep"])
    Xb = X.loc[mask].reset_index(drop=True)
    yb = y.loc[mask].reset_index(drop=True)
    yb = (yb == cfg["pos"]).astype(int)
    return Xb, yb

for name in DUELS:
    Xb, yb = make_binary_task(X, y, name)
    print(f"{name:12s} → X={Xb.shape}, positivos={int(yb.sum())}, negativos={len(yb)-int(yb.sum())}")


Low–High     → X=(6102, 183), positivos=26, negativos=6076
Medium–High  → X=(892, 183), positivos=26, negativos=866
Low–Medium   → X=(6942, 183), positivos=866, negativos=6076


In [11]:
# --- CELDA 3: MÉTRICAS Y SCORING (REFIT=AUC; SP post-hoc) ---

import numpy as np
from sklearn.metrics import (
    accuracy_score, recall_score, precision_score, f1_score,
    roc_auc_score, confusion_matrix, make_scorer
)

def specificity_score(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()
    return tn / (tn + fp) if (tn + fp) else 0.0

SPEC_SCORER = make_scorer(specificity_score)

def compute_metrics(y_true, y_pred, y_proba=None):
    return {
        "ACC": accuracy_score(y_true, y_pred),
        "RC":  recall_score(y_true, y_pred, zero_division=0),
        "PR":  precision_score(y_true, y_pred, zero_division=0),
        "F1S": f1_score(y_true, y_pred, zero_division=0),
        "SP":  specificity_score(y_true, y_pred),
        "AUC": roc_auc_score(y_true, y_proba) if y_proba is not None else np.nan,
    }

SCORING = {
    "accuracy": "accuracy",
    "recall":   "recall",
    "precision":"precision",
    "f1":       "f1",
    "roc_auc":  "roc_auc",
    "specificity": SPEC_SCORER,   # << añade SP al CV

}
REFIT = "roc_auc"
RNG = 42


In [12]:
# --- CELDA 4: MODELOS Y GRID DE HIPERPARÁMETROS ---

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

MODELS = {
    "LR":   LogisticRegression(max_iter=1000, solver="liblinear", random_state=RNG),
    "SVM":  SVC(probability=True, gamma="scale", random_state=RNG),
    "DT":   DecisionTreeClassifier(random_state=RNG),
    "RF":   RandomForestClassifier(n_jobs=-1, random_state=RNG),
    "GB":   GradientBoostingClassifier(random_state=RNG),
    "XGB":  XGBClassifier(n_jobs=-1, random_state=RNG, eval_metric="logloss", use_label_encoder=False),
    "LGBM": LGBMClassifier(n_jobs=-1, random_state=RNG),
    "MLP":  MLPClassifier(max_iter=500, random_state=RNG),
}

GRIDS = {
    "LR":  {"clf__penalty": ["l2","l1"], "clf__C": [1,10,100]},
    "SVM": {"clf__kernel": ["linear","rbf"], "clf__C": [1,10,100]},
    "DT":  {"clf__criterion": ["gini","entropy"], "clf__max_depth": [None,5,10]},
    "RF":  {"clf__n_estimators": [50,100,200], "clf__max_depth": [None,5]},
    "GB":  {"clf__n_estimators": [100,200,300], "clf__learning_rate": [0.1,0.01]},
    "XGB": {"clf__n_estimators": [100,200,300], "clf__learning_rate": [0.1,0.01]},
    "LGBM":{"clf__n_estimators": [100,200], "clf__learning_rate": [0.1,0.01]},
    "MLP": {"clf__hidden_layer_sizes": [(100,), (100,50,25)], "clf__activation": ["relu","tanh","logistic"]},
}


In [13]:
# --- CELDA 5: CV(5) EN TRAIN CON UNDERSAMPLING + TEST FINAL ---

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from sklearn.base import clone
import pandas as pd

def fit_eval_binary(model, grid, Xb, yb, test_size=0.2, rng=RNG, label_name="duelo"):
    # 1) Split estratificado (test NO entra al CV)
    X_tr, X_te, y_tr, y_te = train_test_split(
        Xb, yb, test_size=test_size, stratify=yb, random_state=rng
    )

    # 2) Pipeline con undersampling dentro del CV
    pipe = ImbPipeline(steps=[
        ("under", RandomUnderSampler(random_state=rng)),
        ("clf", clone(model))
    ])

    # 3) GridSearchCV (5 folds) SOLO en TRAIN
    cv5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=rng)
    gs = GridSearchCV(
        estimator=pipe,
        param_grid=grid,
        scoring=SCORING,
        refit=REFIT,
        cv=cv5,
        n_jobs=-1,
        verbose=0
    )
    gs.fit(X_tr, y_tr)

    # 4) Métricas CV (mean ± std) del mejor set
    idx = gs.best_index_
    cv_mean = {m: gs.cv_results_[f"mean_test_{m}"][idx] for m in SCORING}
    cv_std  = {m+"_sd": gs.cv_results_[f"std_test_{m}"][idx] for m in SCORING}

    # 5) Test final (umbral por defecto del clasificador)
    best = gs.best_estimator_
    if hasattr(best, "predict_proba"):
        proba = best.predict_proba(X_te)[:,1]
    elif hasattr(best, "decision_function"):
        from sklearn.preprocessing import MinMaxScaler
        proba = MinMaxScaler().fit_transform(best.decision_function(X_te).reshape(-1,1)).ravel()
    else:
        proba = None

    yhat = best.predict(X_te)
    test = compute_metrics(y_te, yhat, proba)

    # 6) Resumen
    fila_cv = {f"CV_{m}": f"{cv_mean[m]:.4f} ± {cv_std[m+'_sd']:.4f}" for m in SCORING}
    fila_te = {f"TEST_{k}": f"{v:.4f}" for k,v in test.items()}
    resumen = {"duelo": label_name, "best_params": gs.best_params_} | fila_cv | fila_te

    return resumen, gs


In [14]:
# --- CELDA 6: EJECUTAR POR DUELO (CV mean ± sd, con SP) ---

from pathlib import Path
import pandas as pd
import json

REPORTS.mkdir(parents=True, exist_ok=True)

def run_duel_cv(duel_name, overwrite=False):
    """Ejecuta TODOS los MODELS para un duelo.
       Guarda solo métricas de CV (media ± sd), al estilo de la Tabla 3 del paper.
       Archivos por modelo:
         - __gridcv.xlsx   (cv_results_ completo)
         - __best.json     (mejores hiperparámetros + métricas CV con nombres bonitos)
       NOTA: No guarda .pkl y omite LGBM.
    """
    Xb, yb = make_binary_task(X, y, duel_name)
    print(f"\n=== {duel_name} ===  X={Xb.shape}  pos={int(yb.sum())}  neg={len(yb)-int(yb.sum())}")

    rows = []

    # mapa de nombres "internos" -> "bonitos" (paper)
    PRETTY = {
        "accuracy":  "ACC",
        "recall":    "RC",
        "precision": "PR",
        "f1":        "F1S",
        "specificity":"SP",   # <- SP desde tu SPEC_SCORER
        "roc_auc":   "AUC",
    }

    for model_key, model in MODELS.items():
        # omitir LGBM
        if model_key == "LGBM":
            print(f"  > {model_key} (omitido por configuración)")
            continue

        base = REPORTS / f"{duel_name}__{model_key}"
        grid_path = base.with_name(base.name + "__gridcv.xlsx")
        best_path = base.with_name(base.name + "__best.json")

        if (not overwrite) and best_path.exists():
            print(f"  > {model_key} (saltando, ya existe)")
            best = json.loads(best_path.read_text(encoding="utf-8"))
        else:
            print(f"  > {model_key} ...")
            resumen, gs = fit_eval_binary(model, GRIDS[model_key], Xb, yb,
                                          label_name=f"{duel_name} · {model_key}")

            # guarda el grid completo
            pd.DataFrame(gs.cv_results_).to_excel(grid_path, index=False)

            # limpia parámetros del pipeline (quita prefijo clf__)
            def _clean_params(d):
                out = {}
                for k, v in d.items():
                    k = k.replace("clf__", "")
                    if hasattr(v, "item"):  # numpy scalar
                        v = v.item()
                    out[k] = v
                return out

            idx = gs.best_index_

            # extrae métricas mean ± sd del CV con nombres bonitos
            cv_metrics = {}
            for m in SCORING:  # usa tus keys: accuracy, recall, precision, f1, specificity, roc_auc
                mean = gs.cv_results_[f"mean_test_{m}"][idx]
                sd   = gs.cv_results_[f"std_test_{m}"][idx]
                pretty = PRETTY[m]
                cv_metrics[pretty] = f"{mean:.4f} ± {sd:.4f}"

            best = {
                "Duelo": duel_name,
                "Modelo": model_key,
                "params": _clean_params(gs.best_params_),
                **cv_metrics
            }

            # guarda solo JSON (sin .pkl)
            best_path.write_text(json.dumps(best, ensure_ascii=False, indent=2), encoding="utf-8")

        # fila resumen (para el Excel del duelo) con nombres bonitos
        row = {"Duelo": duel_name, "Modelo": model_key}
        for m in ("ACC","RC","F1S","PR","SP","AUC"):
            row[m] = best.get(m, "—")
        rows.append(row)

    df = pd.DataFrame(rows).sort_values(["Duelo","Modelo"]).reset_index(drop=True)
    out_tab = REPORTS / f"{duel_name}__tabla_CV.xlsx"
    df.to_excel(out_tab, index=False)
    print("✅ Guardado resumen de CV en:", out_tab)
    return df


In [None]:
import IPython
app = IPython.Application.instance()
app.kernel.do_shutdown(restart=True)


{'status': 'ok', 'restart': True}

: 

In [15]:
run_duel_cv("Low–High")



=== Low–High ===  X=(6102, 183)  pos=26  neg=6076
  > LR ...
  > SVM ...
  > DT ...
  > RF ...
  > GB ...
  > XGB ...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  > LGBM (omitido por configuración)
  > MLP ...
✅ Guardado resumen de CV en: c:\Users\Gerson\Downloads\PISA-ML\reports\tables\Low–High__tabla_CV.xlsx


Unnamed: 0,Duelo,Modelo,ACC,RC,F1S,PR,SP,AUC
0,Low–High,DT,0.6626 ± 0.0499,0.7200 ± 0.2315,0.0177 ± 0.0038,0.0090 ± 0.0019,0.6623 ± 0.0507,0.6912 ± 0.1024
1,Low–High,GB,0.7042 ± 0.0730,0.9000 ± 0.1225,0.0268 ± 0.0066,0.0136 ± 0.0034,0.7033 ± 0.0735,0.8656 ± 0.0330
2,Low–High,LR,0.7947 ± 0.0463,0.7600 ± 0.0200,0.0316 ± 0.0032,0.0161 ± 0.0017,0.7949 ± 0.0466,0.8705 ± 0.0290
3,Low–High,MLP,0.8138 ± 0.0265,0.8500 ± 0.1225,0.0381 ± 0.0057,0.0195 ± 0.0029,0.8136 ± 0.0269,0.9089 ± 0.0451
4,Low–High,RF,0.7835 ± 0.0370,0.8500 ± 0.1225,0.0333 ± 0.0060,0.0170 ± 0.0031,0.7831 ± 0.0375,0.9050 ± 0.0436
5,Low–High,SVM,0.6724 ± 0.0842,0.9500 ± 0.1000,0.0253 ± 0.0039,0.0128 ± 0.0020,0.6712 ± 0.0850,0.8884 ± 0.0156
6,Low–High,XGB,0.7775 ± 0.0468,0.8500 ± 0.1225,0.0323 ± 0.0043,0.0165 ± 0.0022,0.7772 ± 0.0474,0.8496 ± 0.0513


In [16]:
run_duel_cv("Medium–High")



=== Medium–High ===  X=(892, 183)  pos=26  neg=866
  > LR ...
  > SVM ...
  > DT ...
  > RF ...
  > GB ...
  > XGB ...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  > LGBM (omitido por configuración)
  > MLP ...
✅ Guardado resumen de CV en: c:\Users\Gerson\Downloads\PISA-ML\reports\tables\Medium–High__tabla_CV.xlsx


Unnamed: 0,Duelo,Modelo,ACC,RC,F1S,PR,SP,AUC
0,Medium–High,DT,0.4236 ± 0.0383,0.7200 ± 0.1691,0.0678 ± 0.0116,0.0356 ± 0.0061,0.4149 ± 0.0427,0.5674 ± 0.0699
1,Medium–High,GB,0.4586 ± 0.0305,0.6700 ± 0.1887,0.0675 ± 0.0174,0.0356 ± 0.0092,0.4524 ± 0.0321,0.5207 ± 0.0954
2,Medium–High,LR,0.4712 ± 0.0536,0.5200 ± 0.2768,0.0550 ± 0.0291,0.0291 ± 0.0154,0.4696 ± 0.0539,0.4908 ± 0.1142
3,Medium–High,MLP,0.5064 ± 0.0688,0.6200 ± 0.1122,0.0694 ± 0.0131,0.0368 ± 0.0073,0.5030 ± 0.0729,0.5125 ± 0.0889
4,Medium–High,RF,0.4669 ± 0.0648,0.5700 ± 0.1860,0.0607 ± 0.0234,0.0322 ± 0.0127,0.4639 ± 0.0661,0.4607 ± 0.1053
5,Medium–High,SVM,0.4994 ± 0.0715,0.5800 ± 0.2135,0.0633 ± 0.0185,0.0336 ± 0.0097,0.4972 ± 0.0750,0.4612 ± 0.1018
6,Medium–High,XGB,0.5118 ± 0.0530,0.4200 ± 0.2135,0.0487 ± 0.0275,0.0259 ± 0.0147,0.5144 ± 0.0558,0.4458 ± 0.0663


In [17]:
run_duel_cv("Low–Medium")



=== Low–Medium ===  X=(6942, 183)  pos=866  neg=6076
  > LR ...
  > SVM ...
  > DT ...
  > RF ...
  > GB ...
  > XGB ...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


  > LGBM (omitido por configuración)
  > MLP ...
✅ Guardado resumen de CV en: c:\Users\Gerson\Downloads\PISA-ML\reports\tables\Low–Medium__tabla_CV.xlsx


Unnamed: 0,Duelo,Modelo,ACC,RC,F1S,PR,SP,AUC
0,Low–Medium,DT,0.6395 ± 0.0548,0.7560 ± 0.0834,0.3450 ± 0.0133,0.2255 ± 0.0187,0.6228 ± 0.0740,0.7177 ± 0.0248
1,Low–Medium,GB,0.7414 ± 0.0074,0.7980 ± 0.0281,0.4351 ± 0.0149,0.2991 ± 0.0104,0.7333 ± 0.0057,0.8438 ± 0.0113
2,Low–Medium,LR,0.7677 ± 0.0076,0.7937 ± 0.0306,0.4602 ± 0.0111,0.3242 ± 0.0085,0.7640 ± 0.0105,0.8551 ± 0.0115
3,Low–Medium,MLP,0.7502 ± 0.0095,0.7807 ± 0.0213,0.4384 ± 0.0117,0.3048 ± 0.0095,0.7459 ± 0.0108,0.8363 ± 0.0129
4,Low–Medium,RF,0.7286 ± 0.0136,0.8008 ± 0.0332,0.4243 ± 0.0168,0.2888 ± 0.0132,0.7183 ± 0.0158,0.8399 ± 0.0125
5,Low–Medium,SVM,0.7396 ± 0.0084,0.8182 ± 0.0415,0.4394 ± 0.0146,0.3005 ± 0.0095,0.7284 ± 0.0115,0.8537 ± 0.0138
6,Low–Medium,XGB,0.7405 ± 0.0046,0.7807 ± 0.0283,0.4288 ± 0.0106,0.2956 ± 0.0068,0.7348 ± 0.0062,0.8411 ± 0.0117


In [18]:
# --- CELDA 7A: TABLA 1 — Mejores hiperparámetros por duelo–modelo ---

import json
from pathlib import Path
import pandas as pd

def build_bestparams_table(REPORTS):
    rows = []
    for p in Path(REPORTS).glob("*__best.json"):
        data = json.loads(p.read_text(encoding="utf-8"))

        # duelo/modelo desde JSON o desde el nombre del archivo
        duel = data.get("Duelo")
        model = data.get("Modelo")
        if duel is None or model is None:
            parts = p.stem.split("__")
            duel = parts[0] if len(parts) > 0 else "?"
            model = parts[1] if len(parts) > 1 else "?"

        # AUC en formato mean ± sd ya viene como string bonito
        auc = data.get("AUC", "—")

        params = data.get("params", {})
        rows.append({"Duelo": duel, "Modelo": model, "AUC_CV": auc, **params})

    if not rows:
        raise RuntimeError("No se encontraron archivos __best.json en REPORTS.")

    df = pd.DataFrame(rows).sort_values(["Duelo","Modelo"]).reset_index(drop=True)
    out = REPORTS / "TABLA1__MEJORES_HIPERPARAMETROS.xlsx"
    df.to_excel(out, index=False)
    print("✅ Tabla 1 guardada en:", out)
    display(df)
    return df

tabla_hiper = build_bestparams_table(REPORTS)


✅ Tabla 1 guardada en: c:\Users\Gerson\Downloads\PISA-ML\reports\tables\TABLA1__MEJORES_HIPERPARAMETROS.xlsx


Unnamed: 0,Duelo,Modelo,AUC_CV,criterion,max_depth,learning_rate,n_estimators,C,penalty,activation,hidden_layer_sizes,kernel
0,Low–High,DT,0.6912 ± 0.1024,gini,,,,,,,,
1,Low–High,GB,0.8656 ± 0.0330,,,0.01,300.0,,,,,
2,Low–High,LR,0.8705 ± 0.0290,,,,,1.0,l2,,,
3,Low–High,MLP,0.9089 ± 0.0451,,,,,,,relu,[100],
4,Low–High,RF,0.9050 ± 0.0436,,,,100.0,,,,,
5,Low–High,SVM,0.8884 ± 0.0156,,,,,1.0,,,,rbf
6,Low–High,XGB,0.8496 ± 0.0513,,,0.1,200.0,,,,,
7,Low–Medium,DT,0.7177 ± 0.0248,entropy,5.0,,,,,,,
8,Low–Medium,GB,0.8438 ± 0.0113,,,0.1,100.0,,,,,
9,Low–Medium,LR,0.8551 ± 0.0115,,,,,1.0,l2,,,


In [39]:
# --- CELDA 7B: TABLA 2 — Comparativo de performance (CV mean ± sd) ---

import pandas as pd

def build_performance_tables(REPORTS, duels):
    result = {}
    for duel in duels:
        path = REPORTS / f"{duel}__tabla_CV.xlsx"
        if not path.exists():
            print(f"Aviso: no encontré {path.name}")
            continue
        df = pd.read_excel(path)
        # Asegura el orden de columnas como en el paper
        cols = ["Modelo","ACC","RC","F1S","PR","SP","AUC"]
        cols = [c for c in cols if c in df.columns]
        df = df[["Modelo"] + [c for c in cols if c != "Modelo"]]
        result[duel] = df
        print(f"✅ {duel}: {df.shape[0]} modelos")

    out_path = REPORTS / "TABLA2__PERFORMANCE_MODELOS.xlsx"
    with pd.ExcelWriter(out_path) as xl:
        for duel, df in result.items():
            df.to_excel(xl, sheet_name=duel.replace("–","-"), index=False)
    print("✅ Tabla 2 guardada en:", out_path)
    return result

tables_perf = build_performance_tables(REPORTS, DUELS.keys())


✅ Low–High: 7 modelos
✅ Medium–High: 7 modelos
✅ Low–Medium: 7 modelos
✅ Tabla 2 guardada en: c:\Users\Gerson\Downloads\PISA-ML\reports\tables\TABLA2__PERFORMANCE_MODELOS.xlsx


In [40]:
# --- CELDA 7C: TABLA 3 — Consolidada tipo paper (tres duelos juntos) ---

import pandas as pd

def build_table3(REPORTS, duels):
    dfs = []
    for duel in duels:
        path = REPORTS / f"{duel}__tabla_CV.xlsx"
        if not path.exists():
            print(f"Aviso: {path.name} no encontrado, se omite.")
            continue
        df = pd.read_excel(path)
        df["Duelo"] = duel
        # orden de columnas amigable
        cols = ["Duelo","Modelo","ACC","RC","F1S","PR","SP","AUC"]
        cols_exist = [c for c in cols if c in df.columns or c=="Duelo"]
        df = df[[c for c in cols if c in cols_exist]]
        dfs.append(df)

    if not dfs:
        raise RuntimeError("No se encontraron archivos __tabla_CV.xlsx para los duelos.")

    final = pd.concat(dfs, ignore_index=True)

    out = REPORTS / "TABLA3__COMPARATIVA_PAPER.xlsx"
    with pd.ExcelWriter(out) as xl:
        for duel in duels:
            sub = final[final["Duelo"] == duel].drop(columns="Duelo", errors="ignore")
            sub.to_excel(xl, sheet_name=duel.replace("–","-"), index=False)
    print("✅ Tabla 3 tipo paper guardada en:", out)
    display(final.head())
    return final

tabla3 = build_table3(REPORTS, DUELS.keys())


✅ Tabla 3 tipo paper guardada en: c:\Users\Gerson\Downloads\PISA-ML\reports\tables\TABLA3__COMPARATIVA_PAPER.xlsx


Unnamed: 0,Duelo,Modelo,ACC,RC,F1S,PR,SP,AUC
0,Low–High,DT,0.6626 ± 0.0499,0.7200 ± 0.2315,0.0177 ± 0.0038,0.0090 ± 0.0019,0.6623 ± 0.0507,0.6912 ± 0.1024
1,Low–High,GB,0.7042 ± 0.0730,0.9000 ± 0.1225,0.0268 ± 0.0066,0.0136 ± 0.0034,0.7033 ± 0.0735,0.8656 ± 0.0330
2,Low–High,LR,0.7947 ± 0.0463,0.7600 ± 0.0200,0.0316 ± 0.0032,0.0161 ± 0.0017,0.7949 ± 0.0466,0.8705 ± 0.0290
3,Low–High,MLP,0.8138 ± 0.0265,0.8500 ± 0.1225,0.0381 ± 0.0057,0.0195 ± 0.0029,0.8136 ± 0.0269,0.9089 ± 0.0451
4,Low–High,RF,0.7835 ± 0.0370,0.8500 ± 0.1225,0.0333 ± 0.0060,0.0170 ± 0.0031,0.7831 ± 0.0375,0.9050 ± 0.0436


In [23]:
# --- CELDA 7S: STACKING (Meta-Model LR) + GRID del meta-estimador ---

from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone

def build_stacking_estimator():
    bases = [
        ("LR",  clone(MODELS["LR"])),
        ("SVM", clone(MODELS["SVM"])),
        ("DT",  clone(MODELS["DT"])),
        ("RF",  clone(MODELS["RF"])),
        ("GB",  clone(MODELS["GB"])),
        ("XGB", clone(MODELS["XGB"])),
        # ("LGBM", clone(MODELS["LGBM"])),  # lo omitimos, consistente con tu flujo
        ("MLP", clone(MODELS["MLP"])),
    ]
    meta = LogisticRegression(max_iter=1000, random_state=RNG)
    stack = StackingClassifier(
        estimators=bases,
        final_estimator=meta,
        stack_method="predict_proba",
        cv=5,
        n_jobs=-1
    )
    return stack

STACK_GRID = {
    "stack__final_estimator__solver":  ["liblinear", "saga"],
    "stack__final_estimator__penalty": ["l1", "l2"],
    "stack__final_estimator__C":       [0.1, 1, 10, 100],
}


In [41]:
# --- CELDA 8: CONFUSION MATRICES (Tabla 10 tipo paper, sin .pkl) ---

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.metrics import confusion_matrix
from sklearn.base import clone
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

CONF_DIR = REPORTS / "conf_matrices"
CONF_DIR.mkdir(parents=True, exist_ok=True)

def rebuild_pipeline(model_key, params_dict):
    """
    Replica el pipeline usado en CV:
      under = RandomUnderSampler(random_state=RNG)
      clf   = MODELS[model_key]
    y le inyecta los mejores hiperparámetros en clf (usando prefijo clf__).
    """
    base_clf = clone(MODELS[model_key])
    pipe = ImbPipeline(steps=[
        ("under", RandomUnderSampler(random_state=RNG)),
        ("clf", base_clf)
    ])
    # los __best.json tienen params 'limpios' (sin 'clf__'), así que les ponemos el prefijo:
    prefixed = {f"clf__{k}": v for k, v in params_dict.items()}
    pipe.set_params(**prefixed)
    return pipe

def plot_conf_matrix(duel_name, model_key, model, Xb, yb):
    """
    Entrena el pipeline en TODO el subset del duelo y genera la matriz de confusión.
    Guarda PNG y retorna TN/FP/FN/TP.
    """
    model.fit(Xb, yb)
    y_pred = model.predict(Xb)
    cm = confusion_matrix(yb, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    fig, ax = plt.subplots(figsize=(3,3))
    im = ax.imshow(cm, cmap="Blues")
    for (i, j), v in np.ndenumerate(cm):
        ax.text(j, i, str(v), ha="center", va="center",
                color=("white" if im.norm(v) > 0.5 else "black"), fontsize=11)
    ax.set_xticks([0,1]); ax.set_xticklabels(["Pred Neg", "Pred Pos"])
    ax.set_yticks([0,1]); ax.set_yticklabels(["True Neg", "True Pos"])
    ax.set_xlabel("Predicted Class"); ax.set_ylabel("True Class")
    ax.set_title(f"{duel_name} — {model_key}", fontsize=10)
    fig.tight_layout()

    out = CONF_DIR / f"{duel_name}__{model_key}_conf.png"
    fig.savefig(out, dpi=200)
    plt.close(fig)
    return {"Duelo": duel_name, "Modelo": model_key, "TN": tn, "FP": fp, "FN": fn, "TP": tp}

def build_confusion_table(REPORTS, duels):
    rows = []
    for duel in duels:
        Xb, yb = make_binary_task(X, y, duel)
        # lee cada mejor.json del duelo
        for p in Path(REPORTS).glob(f"{duel}__*__best.json"):
            data = json.loads(p.read_text(encoding="utf-8"))
            model_key = data["Modelo"]
            if model_key == "LGBM":     # lo estamos omitiendo en todo el flujo
                continue
            params = data["params"]     # sin 'clf__' (ya los limpiamos antes)

            pipe = rebuild_pipeline(model_key, params)
            print(f"Generando matriz → {duel} · {model_key}")
            rows.append(plot_conf_matrix(duel, model_key, pipe, Xb, yb))

    df = pd.DataFrame(rows).sort_values(["Duelo","Modelo"]).reset_index(drop=True)
    out = REPORTS / "TABLA10__CONFUSION_MATRICES.xlsx"
    df.to_excel(out, index=False)
    print("✅ Tabla 10 exportada y figuras guardadas en:", CONF_DIR)
    return df

tabla10 = build_confusion_table(REPORTS, DUELS.keys())
display(tabla10.head())


Generando matriz → Low–High · DT
Generando matriz → Low–High · GB
Generando matriz → Low–High · LR
Generando matriz → Low–High · MLP
Generando matriz → Low–High · RF
Generando matriz → Low–High · SVM
Generando matriz → Low–High · XGB


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Generando matriz → Medium–High · DT
Generando matriz → Medium–High · GB
Generando matriz → Medium–High · LR
Generando matriz → Medium–High · MLP
Generando matriz → Medium–High · RF
Generando matriz → Medium–High · SVM
Generando matriz → Medium–High · XGB


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Generando matriz → Low–Medium · DT
Generando matriz → Low–Medium · GB
Generando matriz → Low–Medium · LR
Generando matriz → Low–Medium · MLP
Generando matriz → Low–Medium · RF
Generando matriz → Low–Medium · SVM
Generando matriz → Low–Medium · XGB


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


✅ Tabla 10 exportada y figuras guardadas en: c:\Users\Gerson\Downloads\PISA-ML\reports\tables\conf_matrices


Unnamed: 0,Duelo,Modelo,TN,FP,FN,TP
0,Low–High,DT,4390,1686,0,26
1,Low–High,GB,4282,1794,0,26
2,Low–High,LR,4411,1665,0,26
3,Low–High,MLP,4470,1606,0,26
4,Low–High,RF,4351,1725,0,26


In [24]:
# --- CELDA 8S: GridSearchCV del STACKING por duelo (CV mean ± sd, sin .pkl) ---

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
import pandas as pd, json

def run_stacking_cv(duel_name, overwrite=False):
    Xb, yb = make_binary_task(X, y, duel_name)
    print(f"\n=== STACKING — {duel_name} ===  X={Xb.shape}")

    base = REPORTS / f"{duel_name}__STACK"
    grid_xlsx = base.with_name(base.name + "__stack_gridcv.xlsx")
    best_json = base.with_name(base.name + "__stack_best.json")

    if (not overwrite) and best_json.exists():
        print("  > (saltando, ya existe)")
        data = json.loads(best_json.read_text(encoding="utf-8"))
    else:
        stack = build_stacking_estimator()
        pipe = ImbPipeline([("under", RandomUnderSampler(random_state=RNG)),
                            ("stack", stack)])

        cv5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=RNG)
        gs = GridSearchCV(
            estimator=pipe,
            param_grid=STACK_GRID,
            scoring=SCORING,
            refit=REFIT,  # "roc_auc"
            cv=cv5,
            n_jobs=-1,
            verbose=0
        )
        gs.fit(Xb, yb)
        pd.DataFrame(gs.cv_results_).to_excel(grid_xlsx, index=False)

        idx = gs.best_index_
        # empaquetar métricas mean±sd
        metrics = {}
        for m in SCORING:  # accuracy, recall, precision, f1, specificity, roc_auc
            metrics[m.upper()] = f"{gs.cv_results_[f'mean_test_{m}'][idx]:.4f} ± {gs.cv_results_[f'std_test_{m}'][idx]:.4f}"

        data = {
            "Duelo": duel_name,
            "Modelo": "STACK",  # etiqueta para diferenciar
            "params_final_estimator": {
                "solver":  gs.best_params_["stack__final_estimator__solver"],
                "penalty": gs.best_params_["stack__final_estimator__penalty"],
                "C":       float(gs.best_params_["stack__final_estimator__C"]),
            },
            **metrics
        }
        best_json.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")

    # export resumido por duelo
    row = {"Modelo": "STACK",
           "ACC": data.get("ACCURACY","—"),
           "RC":  data.get("RECALL","—"),
           "F1S": data.get("F1","—"),
           "PR":  data.get("PRECISION","—"),
           "SP":  data.get("SPECIFICITY","—"),
           "AUC": data.get("ROC_AUC", data.get("AUC","—"))}
    out_duel = REPORTS / f"{duel_name}__tabla_CV.xlsx"
    if out_duel.exists():
        df = pd.read_excel(out_duel)
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    else:
        df = pd.DataFrame([row])
    df = df[["Modelo","ACC","RC","F1S","PR","SP","AUC"]]
    df.to_excel(out_duel, index=False)
    print("  ✅ Apéndice de STACK a:", out_duel)
    return data

# Ejecutar stacking para todos los duelos
stack_rows = []
for duel in DUELS.keys():
    stack_rows.append(run_stacking_cv(duel, overwrite=False))

# Opcional: export resumen global de stacking
pd.DataFrame(stack_rows).to_excel(REPORTS / "STACKING__RESUMEN_GLOBAL.xlsx", index=False)
print("✅ Resumen global de STACKING exportado.")



=== STACKING — Low–High ===  X=(6102, 183)
  ✅ Apéndice de STACK a: c:\Users\Gerson\Downloads\PISA-ML\reports\tables\Low–High__tabla_CV.xlsx

=== STACKING — Medium–High ===  X=(892, 183)
  ✅ Apéndice de STACK a: c:\Users\Gerson\Downloads\PISA-ML\reports\tables\Medium–High__tabla_CV.xlsx

=== STACKING — Low–Medium ===  X=(6942, 183)
  ✅ Apéndice de STACK a: c:\Users\Gerson\Downloads\PISA-ML\reports\tables\Low–Medium__tabla_CV.xlsx
✅ Resumen global de STACKING exportado.


In [25]:
# --- CELDA 8SB: Rehacer TABLA 2 (comparativa) incluyendo STACK ---

import pandas as pd

def rebuild_table2_with_stack(REPORTS, duels):
    result = {}
    for duel in duels:
        path = REPORTS / f"{duel}__tabla_CV.xlsx"
        if not path.exists():
            print(f"Aviso: no encontré {path.name}")
            continue
        df = pd.read_excel(path)
        cols = ["Modelo","ACC","RC","F1S","PR","SP","AUC"]
        df = df[[c for c in cols if c in df.columns]]
        result[duel] = df
        print(f"✅ {duel}: {df.shape[0]} modelos (incluye STACK si ya corrió)")

    out_path = REPORTS / "TABLA2__PERFORMANCE_MODELOS.xlsx"
    with pd.ExcelWriter(out_path) as xl:
        for duel, df in result.items():
            df.to_excel(xl, sheet_name=duel.replace("–","-"), index=False)
    print("✅ Tabla 2 actualizada en:", out_path)
    return result

tables_perf = rebuild_table2_with_stack(REPORTS, DUELS.keys())


✅ Low–High: 8 modelos (incluye STACK si ya corrió)
✅ Medium–High: 8 modelos (incluye STACK si ya corrió)
✅ Low–Medium: 8 modelos (incluye STACK si ya corrió)
✅ Tabla 2 actualizada en: c:\Users\Gerson\Downloads\PISA-ML\reports\tables\TABLA2__PERFORMANCE_MODELOS.xlsx


In [26]:
# --- CELDA 9S: Matrices de confusión del STACK (Tabla 10 style, sin .pkl) ---

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler

CONF_DIR.mkdir(parents=True, exist_ok=True)

def rebuild_stack_from_best(duel_name):
    """Reconstruye el stacking con los mejores hiperparámetros del meta desde __stack_best.json."""
    p = REPORTS / f"{duel_name}__STACK__stack_best.json"
    data = json.loads(p.read_text(encoding="utf-8"))
    pars = data["params_final_estimator"]  # solver/penalty/C

    stack = build_stacking_estimator()
    stack.set_params(
        **{
            "final_estimator__solver":  pars["solver"],
            "final_estimator__penalty": pars["penalty"],
            "final_estimator__C":       pars["C"],
        }
    )
    pipe = ImbPipeline([("under", RandomUnderSampler(random_state=RNG)),
                        ("stack", stack)])
    return pipe

def plot_stack_conf(duel_name):
    Xb, yb = make_binary_task(X, y, duel_name)
    pipe = rebuild_stack_from_best(duel_name)
    pipe.fit(Xb, yb)
    y_pred = pipe.predict(Xb)
    cm = confusion_matrix(yb, y_pred, labels=[0,1])
    tn, fp, fn, tp = cm.ravel()

    fig, ax = plt.subplots(figsize=(3,3))
    im = ax.imshow(cm, cmap="Purples")
    for (i, j), v in np.ndenumerate(cm):
        ax.text(j, i, str(v), ha="center", va="center",
                color=("white" if im.norm(v) > 0.5 else "black"), fontsize=11)
    ax.set_xticks([0,1]); ax.set_xticklabels(["Pred Neg", "Pred Pos"])
    ax.set_yticks([0,1]); ax.set_yticklabels(["True Neg", "True Pos"])
    ax.set_xlabel("Predicted Class"); ax.set_ylabel("True Class")
    ax.set_title(f"{duel_name} — STACK", fontsize=10)
    fig.tight_layout()
    out = CONF_DIR / f"{duel_name}__STACK_conf.png"
    fig.savefig(out, dpi=200)
    plt.close(fig)
    return {"Duelo": duel_name, "Modelo": "STACK", "TN": tn, "FP": fp, "FN": fn, "TP": tp}

rows = [plot_stack_conf(d) for d in DUELS.keys()]
df_stack_conf = pd.DataFrame(rows).sort_values(["Duelo","Modelo"]).reset_index(drop=True)

# Añadir/combinar con tu TABLA10 si quieres todo junto
out_all = REPORTS / "TABLA10__CONFUSION_MATRICES__CON_STACK.xlsx"
if (REPORTS / "TABLA10__CONFUSION_MATRICES.xlsx").exists():
    base = pd.read_excel(REPORTS / "TABLA10__CONFUSION_MATRICES.xlsx")
    final = pd.concat([base, df_stack_conf], ignore_index=True)
else:
    final = df_stack_conf
final.to_excel(out_all, index=False)
print("✅ Matrices de confusión del STACK guardadas y combinadas en:", out_all)
display(df_stack_conf)


✅ Matrices de confusión del STACK guardadas y combinadas en: c:\Users\Gerson\Downloads\PISA-ML\reports\tables\TABLA10__CONFUSION_MATRICES__CON_STACK.xlsx


Unnamed: 0,Duelo,Modelo,TN,FP,FN,TP
0,Low–High,STACK,4575,1501,0,26
1,Low–Medium,STACK,4805,1271,83,783
2,Medium–High,STACK,866,0,26,0


In [43]:
# --- CELDA 8S-BEST (solo STACK): Mejores hiperparámetros del meta por duelo ---

import json
import pandas as pd
from pathlib import Path

def build_stackparams_table(REPORTS):
    rows = []
    for p in Path(REPORTS).glob("*__STACK__stack_best.json"):
        data = json.loads(p.read_text(encoding="utf-8"))
        duel = data.get("Duelo", p.stem.split("__")[0])
        pars = data.get("params_final_estimator", {})
        auc  = data.get("ROC_AUC", data.get("AUC", "—"))  # mean±sd string

        rows.append({
            "Duelo": duel,
            "Modelo": "STACK",
            "AUC_CV": auc,
            "solver":  pars.get("solver", "—"),
            "penalty": pars.get("penalty", "—"),
            "C":       pars.get("C", "—"),
        })

    if not rows:
        raise RuntimeError("No se encontraron __stack_best.json en REPORTS.")

    df = pd.DataFrame(rows).sort_values(["Duelo","Modelo"]).reset_index(drop=True)
    out = REPORTS / "TABLA1_STACK__MEJORES_HIPERPARAMETROS.xlsx"
    df.to_excel(out, index=False)
    print("✅ Tabla STACK (mejores hiperparámetros) guardada en:", out)
    display(df)
    return df

tabla1_stack = build_stackparams_table(REPORTS)


✅ Tabla STACK (mejores hiperparámetros) guardada en: c:\Users\Gerson\Downloads\PISA-ML\reports\tables\TABLA1_STACK__MEJORES_HIPERPARAMETROS.xlsx


Unnamed: 0,Duelo,Modelo,AUC_CV,solver,penalty,C
0,Low–High,STACK,0.8906 ± 0.0413,saga,l1,1.0
1,Low–Medium,STACK,0.8631 ± 0.0149,saga,l2,1.0
2,Medium–High,STACK,0.5000 ± 0.0000,liblinear,l1,0.1
