## Generación de modelos ##


In [None]:
# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Standard-library modules used to switch directory and extend the import path.
import os
import sys

# Root folder of the TFM project inside the mounted drive.
TFM_PATH = '/content/drive/My Drive/TFM'

# Work from the project folder so relative paths resolve against it.
os.chdir(TFM_PATH)
print(f"Current working directory changed to: {os.getcwd()}")

# Make modules stored in the project folder importable, without duplicating the entry.
if TFM_PATH in sys.path:
    print(f"'{TFM_PATH}' is already in Python system path.")
else:
    sys.path.append(TFM_PATH)
    print(f"'{TFM_PATH}' added to Python system path.")

In [2]:
# ml_pipeline_htn.py
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from dataclasses import dataclass
from typing import Dict, Any, Optional, Tuple, List

from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    roc_auc_score, accuracy_score, f1_score, precision_score, recall_score,
    confusion_matrix, make_scorer
)
from sklearn.base import clone

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.inspection import permutation_importance

# XGBoost (opcional)
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except Exception:
    HAS_XGB = False

# Statsmodels para p-values en logística (opcional)
try:
    import statsmodels.api as sm
    HAS_SM = True
except Exception:
    HAS_SM = False

# SHAP (opcional, para explicabilidad “si es posible”)
try:
    import shap
    HAS_SHAP = True
except Exception:
    HAS_SHAP = False


In [3]:
# Variables
# NOTE(review): TARGET and SEP are not referenced by any code visible in this file;
# load_dataset_tsv uses its own defaults for the target name and the separator.
TARGET = "Hypertension"                  # <- change this if your target column has a different name

# Folder (inside TFM_PATH) that holds the input files; the file names below are appended to it.
source_path = TFM_PATH + '/sources/'
SEP = "\t"

# File names of the input TSV datasets (joined with source_path where DATASETS is built).
dataset_covariables = "selected_participants_imputed_reduced_standardized.tsv"
dataset_covariables_analysis = "selected_participants_imputed_analysis_standardized.tsv"
dataset_modelos = "prs_zscore_modelos_representativos_target.tsv"
dataset_cov_mod = "selected_participants_modelos_reduced_standardized.tsv"
dataset_cov_mod_analysis = "selected_participants_modelos_analysis_standardized.tsv"

In [4]:
# =========================================================
# Utilidades de carga / limpieza
# =========================================================
def _maybe_convert_comma_decimal(series: pd.Series, min_numeric_ratio: float = 0.98) -> pd.Series:
    """Convierte strings tipo '0,123' -> float 0.123 si casi todo es numérico."""
    s = series.astype(str).str.replace(",", ".", regex=False)
    num = pd.to_numeric(s, errors="coerce")
    if num.notna().mean() >= min_numeric_ratio:
        return num
    return series


def load_dataset_tsv(
    path: str,
    target_candidates: Tuple[str, ...] = ("Hypertension", "target", "HTN"),
    drop_cols: Tuple[str, ...] = ("Participant", "Participant_ID", "participant_id"),
    sep: str = "\t",
) -> Tuple[pd.DataFrame, pd.Series]:
    """Load a delimited dataset and split it into features and target.

    Parameters
    ----------
    path : file to read.
    target_candidates : column names tried, in order, as the target; the first
        one present in the file is used.
    drop_cols : identifier columns removed from the features when present.
    sep : field separator passed to ``pandas.read_csv`` (tab by default).

    Returns
    -------
    (X, y) : the feature frame (ids and target removed, comma-decimal text
        columns converted to numbers where applicable) and the target cast
        to ``int``.

    Raises
    ------
    ValueError : if none of ``target_candidates`` is a column of the file.
    """
    df = pd.read_csv(path, sep=sep)

    target_col = next((c for c in target_candidates if c in df.columns), None)
    if target_col is None:
        raise ValueError(f"No se encontró target en {path}. Probé: {target_candidates}")

    # Identifier columns actually present; the target is never treated as an id.
    id_cols = [c for c in drop_cols if c in df.columns and c != target_col]

    y = df[target_col].astype(int)
    X = df.drop(columns=[*id_cols, target_col])

    # Text columns may hold numbers written with a decimal comma.
    # Both legacy object columns and dedicated string dtypes are checked.
    for col in X.columns:
        column = X[col]
        if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            X[col] = _maybe_convert_comma_decimal(column)

    return X, y


# =========================================================
# Preprocesado: numéricas + categóricas
# =========================================================
def make_preprocessor(
    X: pd.DataFrame,
    scale_numeric: bool = True,
    impute_numeric: str = "median",
    impute_categorical: str = "most_frequent",
    onehot_min_frequency: Optional[float] = None,  # None => plain one-hot encoding
) -> ColumnTransformer:
    """Build an (unfitted) ColumnTransformer for the columns of ``X``.

    Numeric and boolean columns are imputed with ``impute_numeric`` and, when
    ``scale_numeric`` is true, standardised. Every other column is imputed with
    ``impute_categorical`` and one-hot encoded; categories unseen at fit time
    are ignored at transform time. ``onehot_min_frequency`` is forwarded to the
    encoder only when it is not None.
    """
    numeric_features = X.select_dtypes(include=["number", "bool"]).columns.tolist()
    categorical_features = [col for col in X.columns if col not in numeric_features]

    # Numeric branch: imputation, optionally followed by scaling.
    numeric_steps = [("imputer", SimpleImputer(strategy=impute_numeric))]
    if scale_numeric:
        numeric_steps.append(("scaler", StandardScaler()))
    numeric_pipeline = Pipeline(numeric_steps)

    # Categorical branch: imputation followed by one-hot encoding.
    encoder_kwargs = {"handle_unknown": "ignore"}
    if onehot_min_frequency is not None:
        # Only useful when there are rare categories.
        encoder_kwargs["min_frequency"] = onehot_min_frequency
    categorical_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy=impute_categorical)),
        ("onehot", OneHotEncoder(**encoder_kwargs)),
    ])

    return ColumnTransformer(
        transformers=[
            ("num", numeric_pipeline, numeric_features),
            ("cat", categorical_pipeline, categorical_features),
        ],
        remainder="drop",
        verbose_feature_names_out=True,
    )


In [5]:
@dataclass
class CVResult:
    """Result of a cross-validation run, as returned by ``evaluate_with_cv``."""
    # One row per fold: fold number, metrics and confusion-matrix counts.
    folds: pd.DataFrame
    # Mean / standard deviation of each metric plus confusion-matrix totals over all folds.
    summary: pd.Series


# =========================================================
# Métricas CV
# =========================================================
def evaluate_with_cv(
    base_model,
    X: pd.DataFrame,
    y: pd.Series,
    preprocess: ColumnTransformer,
    n_splits: int = 5,
    random_state: int = 42,
    threshold: float = 0.5,
) -> CVResult:
    """Evaluate ``base_model`` with stratified, shuffled K-fold cross-validation.

    For every fold a fresh clone of BOTH the preprocessor and the model is
    fitted on the training split. ``Pipeline`` does not clone its steps, so
    without the explicit clone the caller's ``preprocess`` would be refit on
    every fold and left fitted on the last training split.

    Parameters
    ----------
    base_model : unfitted classifier exposing ``predict_proba`` once fitted.
    X, y : features and binary target (labels 0 / 1).
    preprocess : preprocessing template; it is cloned, never fitted in place.
    n_splits, random_state : forwarded to ``StratifiedKFold``.
    threshold : positive-class probability at or above which a sample is
        predicted as class 1.

    Returns
    -------
    CVResult with per-fold metrics (AUC, accuracy, f1, precision, recall and
    confusion-matrix counts) and their mean / sample std (ddof=1) plus the
    confusion-matrix totals.

    Raises
    ------
    ValueError : if the fitted model has no ``predict_proba``.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    metric_names = ("AUC", "accuracy", "f1", "precision", "recall")
    count_names = ("TN", "FP", "FN", "TP")
    rows: List[Dict[str, Any]] = []

    for fold, (tr, te) in enumerate(skf.split(X, y), start=1):
        pipe = Pipeline([
            ("preprocess", clone(preprocess)),
            ("model", clone(base_model)),
        ])
        pipe.fit(X.iloc[tr], y.iloc[tr])

        # AUC needs probabilities (e.g. SVC must be built with probability=True).
        if not hasattr(pipe.named_steps["model"], "predict_proba"):
            raise ValueError("El modelo no tiene predict_proba. Configura probability=True si aplica.")

        proba = pipe.predict_proba(X.iloc[te])[:, 1]
        pred = (proba >= threshold).astype(int)
        y_te = y.iloc[te]

        # labels=[0, 1] keeps the 2x2 shape even if a class is never predicted.
        tn, fp, fn, tp = confusion_matrix(y_te, pred, labels=[0, 1]).ravel()

        rows.append({
            "fold": fold,
            "AUC": roc_auc_score(y_te, proba),
            "accuracy": accuracy_score(y_te, pred),
            "f1": f1_score(y_te, pred, zero_division=0),  # f-measure
            "precision": precision_score(y_te, pred, zero_division=0),
            "recall": recall_score(y_te, pred, zero_division=0),
            "TN": tn, "FP": fp, "FN": fn, "TP": tp
        })

    folds = pd.DataFrame(rows)

    # Same keys, in the same order, as before: <metric>_mean / <metric>_std, then totals.
    summary_values: Dict[str, Any] = {}
    for metric in metric_names:
        summary_values[f"{metric}_mean"] = folds[metric].mean()
        summary_values[f"{metric}_std"] = folds[metric].std(ddof=1)
    for count in count_names:
        summary_values[f"{count}_total"] = folds[count].sum()

    return CVResult(folds=folds, summary=pd.Series(summary_values))


In [6]:
# =========================================================
# Importancia / pesos / p-values
# =========================================================
def get_feature_names(preprocess: ColumnTransformer) -> np.ndarray:
    """Return the output feature names reported by ``preprocess.get_feature_names_out()``."""
    return preprocess.get_feature_names_out()

def linear_weights_from_fitted_pipeline(fitted_pipe: Pipeline) -> pd.DataFrame:
    """Rank the coefficients of the fitted "model" step by absolute value.

    Returns a frame with columns ``feature``, ``coef``, ``abs_coef`` and
    ``odds_ratio`` (``exp(coef)``), sorted by ``abs_coef`` in descending order.
    Feature names come from the fitted "preprocess" step.
    """
    steps = fitted_pipe.named_steps
    transformer, estimator = steps["preprocess"], steps["model"]

    table = pd.DataFrame({
        "feature": get_feature_names(transformer),
        "coef": estimator.coef_.ravel(),
    })
    table["abs_coef"] = table["coef"].abs()
    ranked = table.sort_values("abs_coef", ascending=False).reset_index(drop=True)

    # With standardised inputs the odds ratio is per standard deviation;
    # it remains useful as a ranking either way.
    ranked["odds_ratio"] = np.exp(ranked["coef"])
    return ranked

def tree_native_importance_from_fitted_pipeline(fitted_pipe: Pipeline) -> Optional[pd.DataFrame]:
    """Rank the ``feature_importances_`` of the fitted "model" step.

    Returns None when the model does not expose ``feature_importances_``;
    otherwise a frame with ``feature``, ``importance`` and ``abs_importance``,
    sorted by ``abs_importance`` in descending order.
    """
    steps = fitted_pipe.named_steps
    transformer, estimator = steps["preprocess"], steps["model"]

    if hasattr(estimator, "feature_importances_"):
        ranking = pd.DataFrame({
            "feature": get_feature_names(transformer),
            "importance": estimator.feature_importances_,
        })
        ranking["abs_importance"] = ranking["importance"].abs()
        ordered = ranking.sort_values("abs_importance", ascending=False)
        return ordered.reset_index(drop=True)

    return None

def permutation_importance_from_fitted_pipeline(
    fitted_pipe: Pipeline,
    X_val: pd.DataFrame,
    y_val: pd.Series,
    n_repeats: int = 50,
    random_state: int = 42,
    scoring: str = "roc_auc"
) -> pd.DataFrame:
    """Compute permutation importance of the ORIGINAL columns of ``X_val``.

    The whole pipeline is scored, so the columns permuted are the raw input
    columns (before preprocessing). Returns a frame with ``feature``,
    ``perm_importance_mean`` and ``perm_importance_std``, sorted by the mean
    in descending order.
    """
    # Raw column names: the pipeline is perturbed at its input, not after one-hot.
    original_columns = X_val.columns.tolist()

    outcome = permutation_importance(
        fitted_pipe,
        X_val,
        y_val,
        scoring=scoring,
        n_repeats=n_repeats,
        random_state=random_state,
        n_jobs=1,
    )

    table = pd.DataFrame({
        "feature": original_columns,
        "perm_importance_mean": outcome.importances_mean,
        "perm_importance_std": outcome.importances_std,
    })
    ordered = table.sort_values("perm_importance_mean", ascending=False)
    return ordered.reset_index(drop=True)

def logistic_pvalues_statsmodels(
    preprocess: ColumnTransformer,
    X: pd.DataFrame,
    y: pd.Series,
    fit_intercept: bool = True,
) -> Optional[pd.DataFrame]:
    """
    P-values for logistic regression using statsmodels.

    A clone of ``preprocess`` is fitted on (X, y), near-constant transformed
    columns are removed, and an (unpenalised) ``sm.Logit`` model is fitted on
    the result, optionally with an added constant column.

    Note: this is meant for NON-regularised logistic regression (or for
    non-standard regularisation). With L1 (LASSO) there are no direct
    classical p-values.

    Returns a frame with ``feature``, ``coef``, ``std_err``, ``z``, ``p_value``
    and ``abs_coef``, sorted by ascending p-value. Returns None when
    statsmodels is unavailable or when the model cannot be fitted (a warning
    is printed in the latter case).
    """
    if not HAS_SM:
        return None

    # Clone and fit the preprocessor to ensure correct feature names after transform
    fitted_preprocess = clone(preprocess)
    Xt = fitted_preprocess.fit_transform(X, y)
    # Sparse output (exposes ``toarray``) is densified for the NumPy operations below.
    if hasattr(Xt, "toarray"):
        Xt = Xt.toarray()

    # Get feature names from the fitted preprocessor
    feature_names_after_transform = list(fitted_preprocess.get_feature_names_out())

    # --- Check for and remove constant features (zero variance) ---
    std_devs = np.std(Xt, axis=0)
    constant_features_indices = np.where(std_devs < 1e-9)[0] # Using a small threshold for floating point precision

    if len(constant_features_indices) > 0:
        removed_features_names = [feature_names_after_transform[i] for i in constant_features_indices]
        print(f"Advertencia: Se encontraron y eliminaron {len(removed_features_names)} características constantes o casi constantes (STD < 1e-9) antes de calcular p-values con Statsmodels: {', '.join(removed_features_names)}")
        Xt = np.delete(Xt, constant_features_indices, axis=1)
        # Update feature names list to reflect removed columns
        remaining_feature_names = [name for i, name in enumerate(feature_names_after_transform) if i not in constant_features_indices]
    else:
        remaining_feature_names = feature_names_after_transform

    if fit_intercept:
        try:
            # has_constant="add" always appends the intercept column, named "const" below.
            Xt = sm.add_constant(Xt, has_constant="add")
            feat = ["const"] + remaining_feature_names
        except ValueError as e:
            print(f"Advertencia: No se pudo añadir una constante al modelo Statsmodels: {e}. Retornando None para p-values.")
            return None
    else:
        feat = remaining_feature_names

    try:
        # Check if Xt is empty after removing features
        if Xt.shape[1] == 0:
            print("Advertencia: No quedan características después de la eliminación de constantes. No se puede ajustar el modelo Statsmodels. Retornando None para p-values.")
            return None

        model = sm.Logit(y.values, Xt)
        res = model.fit(disp=False)
    except np.linalg.LinAlgError as e:
        print(f"Advertencia: LinAlgError al calcular p-values con Statsmodels. La matriz de diseño es singular, lo que indica problemas de multicolinealidad. Detalles: {e}. Retornando None para p-values.")
        return None
    except Exception as e:
        # Best-effort: any other fitting failure is reported and turned into None.
        print(f"Advertencia: Error inesperado al calcular p-values con Statsmodels: {e}. Retornando None para p-values.")
        return None

    # Ensure feat matches the number of parameters in res.params
    if len(feat) != len(res.params):
        print(f"Advertencia: El número de nombres de características ({len(feat)}) no coincide con el número de parámetros del modelo ({len(res.params)}). Esto puede indicar un problema. Se intenta emparejar los nombres de forma heurística, pero podría ser incorrecto.")
        # Fallback to generic names if mismatch, to prevent further errors
        feat = [f"param_{i}" for i in range(len(res.params))]

    out = pd.DataFrame({
        "feature": feat,
        "coef": res.params,
        "std_err": res.bse,
        "z": res.tvalues,
        "p_value": res.pvalues
    })
    out["abs_coef"] = np.abs(out["coef"])
    # Most significant features first.
    out = out.sort_values("p_value", ascending=True).reset_index(drop=True)
    return out


def try_shap_kernel_explainer(
    fitted_pipe: Pipeline,
    X_background: pd.DataFrame,
    X_explain: pd.DataFrame,
    nsamples: int = 200
) -> Optional[pd.DataFrame]:
    """
    Model-agnostic SHAP importance via ``shap.KernelExplainer``.

    KernelExplainer is expensive, so this is meant for small datasets (the
    original note mentions N=122). SHAP values are computed on the ORIGINAL
    columns of ``X_background`` and summarised as the mean absolute value per
    feature, sorted in descending order. Returns None when SHAP is not installed.
    """
    if not HAS_SHAP:
        return None

    column_names = X_background.columns

    def positive_class_probability(raw_rows):
        # SHAP passes plain arrays; the pipeline expects named columns.
        frame = pd.DataFrame(raw_rows, columns=column_names)
        return fitted_pipe.predict_proba(frame)[:, 1]

    # SHAP works on NumPy arrays.
    background_array = X_background.values
    explain_array = X_explain.values

    explainer = shap.KernelExplainer(positive_class_probability, background_array)
    contributions = explainer.shap_values(explain_array, nsamples=nsamples)  # (n_samples, n_features)

    ranking = pd.DataFrame({
        "feature": X_background.columns,
        "mean_abs_shap": np.mean(np.abs(contributions), axis=0),
    })
    return ranking.sort_values("mean_abs_shap", ascending=False).reset_index(drop=True)

In [7]:
# =========================================================
# Modelos (factory) parametrizables
# =========================================================
def build_models(config: Dict[str, Dict[str, Any]]) -> Dict[str, Any]:
    """
    Instantiate the estimators described by ``config``.

    Parameters
    ----------
    config : dict ``{model_name: {constructor params...}}``. Only the model
        names listed in the registry below are recognised; other keys are
        ignored. For ``"bootstrap_df"`` the optional ``"base_tree_params"``
        entry configures the DecisionTreeClassifier used as base estimator and
        the remaining entries go to BaggingClassifier.

    Returns
    -------
    dict ``{model_name: unfitted estimator}``, in registry order.

    Raises
    ------
    RuntimeError : if an XGBoost model is requested but xgboost is not installed.

    Notes
    -----
    * Every model is built from ITS OWN config entry ("mlp_d3a" used to be
      built from the "mlp_d2" parameters).
    * ``config`` is never modified: parameters are copied before use, so the
      same config can be passed to this function repeatedly.
    """
    # (config key, estimator family), in the order models are registered.
    known_models = (
        ("logreg", "logistic"),
        ("logreg_d1a", "logistic"),
        ("logreg_l3", "logistic"),
        ("logreg_l4", "logistic"),
        ("logreg_l1", "logistic"),
        ("rf", "random_forest"),
        ("rf_d1a", "random_forest"),
        ("rf_d3", "random_forest"),
        ("xgb", "xgboost"),
        ("xgb_d1a", "xgboost"),
        ("xgb_d2", "xgboost"),
        ("xgb_d3", "xgboost"),
        ("svm_rbf", "svm"),          # needs probability=True for AUC
        ("mlp", "mlp"),
        ("mlp_d2", "mlp"),
        ("mlp_d3a", "mlp"),
        ("bootstrap_df", "bagged_trees"),
    )

    models: Dict[str, Any] = {}

    for name, family in known_models:
        if name not in config:
            continue

        # Shallow copy so the caller's dict is left untouched.
        params = dict(config[name])

        if family == "logistic":
            models[name] = LogisticRegression(**params)
        elif family == "random_forest":
            models[name] = RandomForestClassifier(**params)
        elif family == "xgboost":
            if not HAS_XGB:
                raise RuntimeError("xgboost no está instalado. Instala con: pip install xgboost")
            models[name] = XGBClassifier(**params)
        elif family == "svm":
            models[name] = SVC(**params)
        elif family == "mlp":
            models[name] = MLPClassifier(**params)
        else:
            # Bootstrap decision forest: bagging of decision trees.
            base_tree_params = params.pop("base_tree_params", {})
            base_tree = DecisionTreeClassifier(**base_tree_params)
            models[name] = BaggingClassifier(estimator=base_tree, **params)

    return models


In [8]:
# =========================================================
# Ejecución por dataset + reporte
# =========================================================
@dataclass
class DatasetSpec:
    """Description of one dataset and the models to evaluate on it (input of ``run_experiment``)."""
    # Label stored in the results under "dataset".
    name: str
    # Location of the file passed to load_dataset_tsv.
    path: str
    # Forwarded to make_preprocessor: whether numeric columns are standardised.
    scale_numeric: bool
    # Names of the models to evaluate; each must be a model built from the model config.
    models_to_run: List[str]

def run_experiment(
    spec: DatasetSpec,
    model_config: Dict[str, Dict[str, Any]],
    n_splits: int = 5,
    random_state: int = 42,
    threshold: float = 0.5,
    permutation_repeats: int = 50,
    top_k: int = 20,
) -> Dict[str, Any]:
    """Run cross-validation and importance analyses for every model of a dataset.

    For each model in ``spec.models_to_run`` the function:
      1. evaluates it with ``evaluate_with_cv``;
      2. refits it on the WHOLE dataset to extract coefficients, statsmodels
         p-values (logistic regression with no or L2 penalty only), native
         feature importances, permutation importance and, for the MLP / SVM
         models, SHAP KernelExplainer importance.

    Parameters
    ----------
    spec : dataset description (path, scaling flag, models to run).
    model_config : ``{model_name: params}`` passed to ``build_models``.
    n_splits, random_state, threshold : forwarded to ``evaluate_with_cv``.
    permutation_repeats : repeats for permutation importance.
    top_k : number of rows kept from every importance table.

    Returns
    -------
    dict with the dataset name, sample counts and, under ``"models"``, one
    entry per model holding the CV tables and the (possibly None) importance
    tables.

    Raises
    ------
    ValueError : if a requested model is not available. This is checked before
        any model is trained, so a typo does not waste earlier training runs.
    """
    X, y = load_dataset_tsv(spec.path)
    preprocess = make_preprocessor(X, scale_numeric=spec.scale_numeric)
    models = build_models(model_config)

    # Fail fast on unknown model names.
    for mname in spec.models_to_run:
        if mname not in models:
            raise ValueError(f"Modelo '{mname}' no está en model_config.")

    results = {
        "dataset": spec.name,
        "n": int(len(y)),
        "pos": int((y == 1).sum()),
        "neg": int((y == 0).sum()),
        "models": {}
    }

    banner = "======================================================================================="

    # CV + importances
    for mname in spec.models_to_run:
        print()
        print(banner)
        print(f"     Conjunto de datos[{spec.name}] - Modelo de a ejecutar [{mname}]")
        print(banner)
        print()

        base_model = models[mname]
        cv_res = evaluate_with_cv(
            base_model, X, y, preprocess,
            n_splits=n_splits,
            random_state=random_state,
            threshold=threshold
        )

        # Final fit on the WHOLE dataset to extract weights / importances.
        fitted_pipe = Pipeline([("preprocess", preprocess), ("model", clone(base_model))])
        fitted_pipe.fit(X, y)
        fitted_model = fitted_pipe.named_steps["model"]

        weights = None
        pvals = None
        shap_imp_raw = None

        # Linear models
        if hasattr(fitted_model, "coef_"):
            weights = linear_weights_from_fitted_pipeline(fitted_pipe).head(top_k)

            # p-values only for logistic regression with no penalty or L2 penalty.
            # statsmodels fits an unpenalised model, so for L2 this is not the
            # exact same estimator, but it is the standard route for inference.
            if isinstance(fitted_model, LogisticRegression):
                pen = getattr(fitted_model, "penalty", None)
                if HAS_SM and pen in (None, "none", "l2"):
                    fit_intercept = getattr(fitted_model, "fit_intercept", True)
                    pvals = logistic_pvalues_statsmodels(preprocess, X, y, fit_intercept=fit_intercept)
                    if pvals is not None:
                        pvals = pvals.head(top_k)

        # Trees / ensembles exposing feature_importances_
        native_imp = tree_native_importance_from_fitted_pipeline(fitted_pipe)
        if native_imp is not None:
            native_imp = native_imp.head(top_k)

        # Permutation importance (works for any model). It is computed on the
        # same data the model was fitted on, so it is a general ranking, not a
        # held-out estimate.
        perm_imp = permutation_importance_from_fitted_pipeline(
            fitted_pipe, X, y,
            n_repeats=permutation_repeats,
            random_state=random_state
        ).head(top_k)

        # Optional SHAP KernelExplainer (costly; useful for NN / SVM).
        # SHAP is computed on the ORIGINAL columns of X (before one-hot) on
        # purpose, for human-readable interpretation.
        if HAS_SHAP and mname in ("mlp", "mlp_d2", "mlp_d3a", "svm_rbf"):
            # Small background / explanation samples to keep the cost bounded.
            # NOTE(review): both samples use the same seed, so they contain the same rows.
            bg = X.sample(min(30, len(X)), random_state=random_state)
            ex = X.sample(min(30, len(X)), random_state=random_state)
            try:
                shap_imp_raw = try_shap_kernel_explainer(fitted_pipe, bg, ex, nsamples=200)
                if shap_imp_raw is not None:
                    shap_imp_raw = shap_imp_raw.head(top_k)
            except Exception as exc:
                # SHAP is best-effort: report the failure instead of hiding it.
                print(f"Advertencia: no se pudo calcular SHAP para '{mname}': {exc}")
                shap_imp_raw = None

        results["models"][mname] = {
            "cv_folds": cv_res.folds,
            "cv_summary": cv_res.summary,
            "top_weights_or_coefs": weights,
            "top_pvalues": pvals,
            "top_native_importance": native_imp,
            "top_permutation_importance": perm_imp,
            "top_shap_kernel_importance_original_X": shap_imp_raw,
        }

    return results


In [None]:
# =========================================================
# CONFIGURACIÓN PARA TUS 3 DATASETS
# =========================================================
if __name__ == "__main__":
    # ---- The five datasets: (label, file name, models to run) ----
    # Every dataset is read from source_path and has its numeric columns scaled.
    _dataset_table = [
        ("D1_clinico_demografico", dataset_covariables,
         ["logreg", "rf", "xgb", "mlp"]),
        ("D1_clinico_demografico_analisis", dataset_covariables_analysis,
         ["logreg_d1a", "rf_d1a", "xgb_d1a", "mlp_d2"]),
        ("D2_prs_34modelos", dataset_modelos,
         ["logreg_l1", "svm_rbf", "xgb_d2", "mlp_d2"]),
        ("D3_combinado_clinico_prs", dataset_cov_mod,
         ["logreg_l3", "rf_d3", "xgb_d3", "mlp_d2"]),
        ("D3_combinado_clinico_prs_analisis", dataset_cov_mod_analysis,
         ["logreg_l4", "rf", "xgb", "mlp_d3a"]),
    ]
    DATASETS = [
        DatasetSpec(
            name=label,
            path=source_path + file_name,
            scale_numeric=True,
            models_to_run=model_names,
        )
        for label, file_name, model_names in _dataset_table
    ]

    # ---- Configurable model hyper-parameters (adjust as needed) ----
    # Note: SVC must be created with probability=True so that AUC can be computed.
    # The bracketed lists in trailing comments look like the candidate values
    # considered for each hyper-parameter — TODO confirm.
    MODEL_CONFIG = {
        # L2-penalised logistic regression
        "logreg": dict(                 # reduced clinical-demographic dataset
            penalty="l2",
            C=0.01,                     # [0.01, 0.03, 0.1, 0.3, 1, 3, 10]   # Default 1.0
            solver="liblinear",
            max_iter=5000,
            fit_intercept=True,       # [True, False]
            random_state=42
        ),
        "logreg_d1a": dict(                 # clinical-demographic dataset, literature-based ("analysis") variant
            penalty="elasticnet",
            C=0.3,                     # [0.01, 0.03, 0.1, 0.3, 1, 3, 10]   # Default 1.0
            l1_ratio = 0.5,
            solver="saga",
            max_iter=5000,
            fit_intercept=True,       # [True, False]
            random_state=42
        ),
        # L1 logistic regression (automatic feature selection) – very useful for PRS
        "logreg_l1": dict(            # PRS dataset
            penalty="l1",
            C=1,
            solver="saga",
            max_iter=5000,
            fit_intercept=True,
            tol=1e-4,
            random_state=42
        ),
        # Elastic-net logistic regression
        "logreg_l3": dict(
            penalty="elasticnet",       # combined clinical + PRS dataset
            l1_ratio = 0.1,
            C=0.01,                     # [0.01, 0.03, 0.1, 0.3, 1, 3, 10]   # Default 1.0
            solver="saga",
            max_iter=5000,
            fit_intercept=True,       # [True, False]
            random_state=42
        ),
        # Elastic-net logistic regression
        "logreg_l4": dict(
            penalty="elasticnet",     # combined clinical + PRS dataset, literature-based ("analysis") variant
            l1_ratio = 0.3,
            C=1,                     # [0.01, 0.03, 0.1, 0.3, 1, 3, 10]   # Default 1.0
            solver="saga",
            max_iter=5000,
            fit_intercept=True,       # [True, False]
            random_state=42
        ),

        "rf": dict(                 # reduced clinical-demographic dataset
            n_estimators=300,
            max_depth=5,
            min_samples_leaf=5,
            random_state=42,
            n_jobs=-1
        ),
        "rf_d1a": dict(                  # clinical-demographic dataset, literature-based ("analysis") variant
            n_estimators=800,
            max_depth=3,
            min_samples_leaf=2,
            random_state=42,
            n_jobs=-1
        ),
        "rf_d3": dict(                  # clinical-demographic + PRS models dataset
            n_estimators=500,
            max_depth=4,
            min_samples_leaf=5,
            random_state=42,
            n_jobs=-1
        ),

        "xgb": dict(              # reduced clinical-demographic dataset
            n_estimators=500,     #[200, 500, 1200]
            learning_rate=0.1,     #[0.01, 0.05, 0.1]
            max_depth=4,            # [2, 3, 4]
            subsample=0.8,          #  [0.6, 0.8, 1.0]
            colsample_bytree=0.8,     #[0.5, 0.8, 1.0]
            reg_lambda=10,           # [1, 5, 10, 20]
            reg_alpha = 1,       #[0, 0.5, 1, 3]
            min_child_weight=10,          # [1, 5, 10, 20]
            objective="binary:logistic",
            eval_metric="auc",
            random_state=42,
            n_jobs=-1,
            tree_method="hist",
            verbosity=0
        ),
        "xgb_d1a": dict(            # clinical-demographic dataset, literature-based ("analysis") variant
            n_estimators=800,     #[200, 500, 1200]
            learning_rate=0.1,     #[0.01, 0.05, 0.1]
            max_depth=4,            # [2, 3, 4]
            subsample=0.9,          #  [0.6, 0.8, 1.0]
            colsample_bytree=0.9,     #[0.5, 0.8, 1.0]
            reg_lambda=10,           # [1, 5, 10, 20]
            reg_alpha = 0.5,       #[0, 0.5, 1, 3]
            min_child_weight=10,          # [1, 5, 10, 20]
            objective="binary:logistic",
            eval_metric="auc",
            random_state=42,
            n_jobs=-1,
            tree_method="hist",
            verbosity=0
        ),

        "xgb_d2": dict(           # PRS dataset
            n_estimators=500,     #[200, 500, 1200]
            learning_rate=0.1,     #[0.01, 0.05, 0.1]
            max_depth=4,            # [2, 3, 4]
            subsample=0.8,          #  [0.6, 0.8, 1.0]
            colsample_bytree=0.8,     #[0.5, 0.8, 1.0]
            reg_lambda=20,           # [1, 5, 10, 20]
            reg_alpha = 3,       #[0, 0.5, 1, 3]
            min_child_weight=10,          # [1, 5, 10, 20]
            objective="binary:logistic",
            eval_metric="auc",
            random_state=42,
            n_jobs=-1,
            tree_method="hist",
            verbosity=0
        ),

        "xgb_d3": dict(            # run on D3_combinado_clinico_prs in DATASETS (original note said: clinical-demographic, literature-based)
            n_estimators=300,     #[200, 500, 1200]
            learning_rate=0.05,     #[0.01, 0.05, 0.1]
            max_depth=3,            # [2, 3, 4]
            subsample=0.8,          #  [0.6, 0.8, 1.0]
            colsample_bytree=0.9,     #[0.5, 0.8, 1.0]
            reg_lambda=1,           # [1, 5, 10, 20]
            reg_alpha = 1,       #[0, 0.5, 1, 3]
            min_child_weight=10,          # [1, 5, 10, 20]
            objective="binary:logistic",
            eval_metric="auc",
            random_state=42,
            n_jobs=-1,
            tree_method="hist",
            verbosity=0
        ),

        "svm_rbf": dict(        # PRS dataset
            C=1,                # C: [0.1, 1, 10, 100]
            gamma=0.01,        # ["scale", 0.01, 0.1, 1]
            kernel="rbf",
            probability=True,
            random_state=42
        ),

        "mlp": dict(            # reduced clinical-demographic dataset (original note also mentions the literature-based variant)
            hidden_layer_sizes=(64,32),      # [(16,), (32,), (32, 16), (64, 32)],
            activation="relu",
            solver="adam",
            alpha=1e-3,             # [1e-5, 1e-4, 1e-3, 1e-2]
            batch_size=32,          # [8, 16, 32]
            learning_rate="adaptive",
            learning_rate_init=1e-3,    # [1e-4, 1e-3, 5e-3]
            max_iter=2000,
            early_stopping=True,
            validation_fraction=0.2,
            n_iter_no_change=30,
            random_state=42
        ),

        "mlp_d2": dict(           # PRS dataset; DATASETS also runs it on the D1 analysis and D3 combined datasets
            hidden_layer_sizes=(64,32),      # [(16,), (32,), (32, 16), (64, 32)],
            activation="relu",
            solver="adam",
            alpha=1e-4,             # [1e-5, 1e-4, 1e-3, 1e-2]
            batch_size=32,          # [8, 16, 32]
            learning_rate="adaptive",
            learning_rate_init=1e-3,    # [1e-4, 1e-3, 5e-3]
            max_iter=2000,
            early_stopping=True,
            validation_fraction=0.2,
            n_iter_no_change=30,
            random_state=42
        ),

        "mlp_d3a": dict(           # run on D3_combinado_clinico_prs_analisis in DATASETS (original note said: PRS)
            hidden_layer_sizes=(32,16),      # [(16,), (32,), (32, 16), (64, 32)],
            activation="relu",
            solver="adam",
            alpha=1e-5,             # [1e-5, 1e-4, 1e-3, 1e-2]
            batch_size=16,          # [8, 16, 32]
            learning_rate="adaptive",
            learning_rate_init=1e-4,    # [1e-4, 1e-3, 5e-3]
            max_iter=2000,
            early_stopping=True,
            validation_fraction=0.2,
            n_iter_no_change=30,
            random_state=42
        ),
    }


    # ---- Run every dataset and print its report ----
    # Optional tables, in print order: (result key, heading printed above the table).
    _report_sections = [
        ("top_weights_or_coefs",
         "\nTop coeficientes (peso/odds-ratio):"),
        ("top_pvalues",
         "\nTop p-values (statsmodels; válido sobre todo para logística no-L1):"),
        ("top_native_importance",
         "\nTop importancia nativa (feature_importances_):"),
        ("top_permutation_importance",
         "\nTop permutation importance (AUC):"),
        ("top_shap_kernel_importance_original_X",
         "\nTop SHAP (KernelExplainer) sobre variables originales (útil en MLP/SVM):"),
    ]

    all_results = []
    for ds in DATASETS:
        res = run_experiment(
            spec=ds,
            model_config=MODEL_CONFIG,
            n_splits=5,
            random_state=42,
            threshold=0.5,
            permutation_repeats=50,
            top_k=20,
        )
        all_results.append(res)

        _rule = "=" * 90
        print("\n" + _rule)
        print(f"DATASET: {res['dataset']} | N={res['n']} | pos={res['pos']} | neg={res['neg']}")
        print(_rule)

        for mname, out in res["models"].items():
            print(f"\n--- Modelo: {mname} ---")

            # Cross-validation tables are always present.
            print("\nCV summary:")
            print(out["cv_summary"].round(4).to_string())

            print("\nCV folds:")
            print(out["cv_folds"].round(4).to_string(index=False))

            # Importance tables are printed only when they were computed.
            for _key, _heading in _report_sections:
                _table = out[_key]
                if _table is None:
                    continue
                print(_heading)
                print(_table.round(6).to_string(index=False))

    # (Optional) Persist every result table to disk, one TSV per dataset / model / table.
    import os  # local import keeps this block runnable on its own

    mod_res_path = TFM_PATH + "/modelos_results/"
    # to_csv does not create missing folders; without this the run would fail
    # here, after all models have already been trained.
    os.makedirs(mod_res_path, exist_ok=True)

    # Optional tables: (result key, file-name suffix). Written only when not None.
    _optional_tables = (
        ("top_weights_or_coefs", "top_coefs"),
        ("top_pvalues", "top_pvalues"),
        ("top_native_importance", "top_native_importance"),
        ("top_permutation_importance", "top_perm_importance"),
        ("top_shap_kernel_importance_original_X", "top_shap_kernel"),
    )

    for res in all_results:
        for mname, out in res["models"].items():
            _prefix = mod_res_path + f"{res['dataset']}__{mname}__"

            out["cv_folds"].to_csv(_prefix + "cv_folds.tsv", sep="\t", index=False)
            # The summary is a Series: keep its index (metric names) in the file.
            out["cv_summary"].to_frame("value").to_csv(_prefix + "cv_summary.tsv", sep="\t")

            for _key, _suffix in _optional_tables:
                if out[_key] is not None:
                    out[_key].to_csv(_prefix + f"{_suffix}.tsv", sep="\t", index=False)

In [None]:
import pandas as pd

# One record per (dataset, model): the CV summary values plus a combined label.
summary_data = [
    {
        **model_results['cv_summary'].to_dict(),
        'Dataset_Model': f"{res['dataset']} - {mname}",
    }
    for res in all_results
    for mname, model_results in res['models'].items()
]

# Index the table by the combined label and round the values for display.
full_cv_summary_df = (
    pd.DataFrame(summary_data)
    .set_index('Dataset_Model')
    .round(4)
)

# Show the table in the notebook.
display(full_cv_summary_df)