# Hackathon IA Aplicada – Predicción de Riesgo de Hipertensión

Este notebook sigue las directrices de la rúbrica del hackathon para entrenar y evaluar un modelo de riesgo de hipertensión usando datos NHANES 2017–2020. Se mantienen las secciones y markdowns originales, y solo se modifican las celdas de código para incorporar el split temporal, anti-fuga, calibración de probabilidades y fairness.

## Configuración general – Hackathon IA Aplicada (NHANES)

En esta sección se carga el dataset mergeado del ciclo 2017–2020 y se realiza una comprobación básica de las columnas clave.

In [None]:

# ==============================
# LOAD THE MERGED DATASET
# ==============================
from pathlib import Path

import numpy as np
import pandas as pd

np.random.seed(42)  # seed NumPy's global RNG for reproducibility

MERGED = Path('data/processed/nhanes_2017_2020_clean.csv')
# Explicit raise instead of `assert`: assertions are stripped under `python -O`,
# which would let a missing file surface later as a less helpful read error.
if not MERGED.exists():
    raise FileNotFoundError(f"No existe {MERGED}. Ejecuta el merge previo.")

df = pd.read_csv(MERGED)
print(f"✅ Dataset mergeado cargado: {MERGED} – shape {df.shape}")

# Mini sanity check: the respondent id and survey-cycle columns must exist.
expected_cols = {'SEQN', 'SDDSRVYR'}
missing = expected_cols - set(df.columns)
if missing:
    raise ValueError(f"Faltan columnas requeridas: {missing}")

# Best-effort numeric conversion of text columns. A column is converted only
# when every value parses; otherwise it is left untouched. This reproduces the
# deprecated `pd.to_numeric(..., errors='ignore')` behaviour explicitly.
for c in df.columns:
    if df[c].dtype == 'object':
        try:
            df[c] = pd.to_numeric(df[c])
        except (ValueError, TypeError):
            # Genuinely non-numeric text column: keep it as-is.
            continue

# Preview (rendered by the notebook as the cell output)
df.head()


: 

## PREPROCESAMIENTO Y CONSTRUCCIÓN DE VARIABLES

Esta sección prepara las features tabulares a partir de los módulos demográficos, antropométricos y de laboratorio, evitando fuga de información. También crea las etiquetas de tensión (multiclase y binaria) utilizando únicamente las columnas de presión oscilométrica (BPXO*).

In [None]:
# ==========================================================
# PREPROCESSING (anti-leakage) and TABULAR FEATURES — REFACTORED
# ==========================================================
import numpy as np
import pandas as pd

np.random.seed(42)  # reproducibility

# ------------------------------
# 1) Raw-column -> friendly-name maps (one per source module)
# ------------------------------
cols_demo = dict(
    RIDAGEYR='edad',
    RIAGENDR='sexo',
    RIDRETH1='etnia',
    DMDEDUC2='educacion',
    INDFMPIR='ratio_ingreso_pobreza',
)
cols_bmx = dict(
    BMXWT='peso_kg',
    BMXHT='altura_cm',
    BMXBMI='imc',
    BMXWAIST='cintura_cm',
)
cols_glu = dict(LAB_LBXGLU='glucosa_mgdl')
cols_hdl = dict(LAB_LBDHDD='hdl_mgdl')
cols_tri = dict(LAB_LBXTR='trigliceridos_mgdl', LAB_LBDLDL='ldl_mgdl')

# Oscillometric blood-pressure readings (used for the LABEL only):
# three systolic readings followed by three diastolic readings.
bpxo_candidates = [
    f'BPXO{kind}{reading}' for kind in ('SY', 'DI') for reading in (1, 2, 3)
]
available_bpxo = [name for name in bpxo_candidates if name in df.columns]

# ------------------------------
# 2) Column selection and working copy
# ------------------------------
use_cols = [
    'SEQN', 'SDDSRVYR',
    *cols_demo,
    *cols_bmx,
    *available_bpxo,
    *cols_glu,
    *cols_hdl,
    *cols_tri,
]

# Split the wanted columns into those present in `df` and those absent,
# preserving the order of `use_cols` in both lists.
present_cols, missing_cols = [], []
for col in use_cols:
    (present_cols if col in df.columns else missing_cols).append(col)

if missing_cols:
    print(f"‚ÑπÔ∏è Aviso: faltan columnas opcionales (omitidas): {missing_cols}")

# Independent copy of the selected columns, renamed to the friendly names.
work = (
    df[present_cols]
    .copy()
    .rename(columns={**cols_demo, **cols_bmx, **cols_glu, **cols_hdl, **cols_tri})
)

# ------------------------------
# 3) Basic filters and typing
# ------------------------------
# Adults only (18+); age is mandatory for everything that follows.
if 'edad' not in work.columns:
    raise ValueError("‚ùå No se encuentra la columna 'edad' (RIDAGEYR).")
work = work.loc[work['edad'] >= 18].copy()

# Binary sex: raw code 1 -> 0 (male), raw code 2 -> 1 (female).
# Any other value becomes <NA> in the nullable Int64 column.
if 'sexo' in work.columns:
    sexo_num = pd.to_numeric(work['sexo'], errors='coerce')
    work['sexo'] = sexo_num.map({1: 0, 2: 1}).astype('Int64')

# Education: coerce to numeric, then blank out the special codes 7 and 9
# (presumably "refused" / "don't know" — confirm against the codebook).
if 'educacion' in work.columns:
    work['educacion'] = pd.to_numeric(work['educacion'], errors='coerce')
    work.loc[work['educacion'].isin([7, 9]), 'educacion'] = np.nan

# ------------------------------
# 4) Safe derived features
# ------------------------------
# Waist-to-height ratio; non-positive heights are turned into NaN first so
# the division can never produce inf.
if {'cintura_cm', 'altura_cm'} <= set(work.columns):
    altura_segura = work['altura_cm'].where(work['altura_cm'].gt(0))
    work['rel_cintura_altura'] = work['cintura_cm'].div(altura_segura)
else:
    work['rel_cintura_altura'] = np.nan

# Squared BMI (all-NaN column when BMI is unavailable).
if 'imc' in work.columns:
    work['imc_cuadratico'] = work['imc'].pow(2)
else:
    work['imc_cuadratico'] = np.nan

# ------------------------------
# 5) LABEL: blood-pressure class (built ONLY from BPXO readings to avoid leakage)
# ------------------------------
# Guard clause: without any oscillometric reading there is no label to build.
if not available_bpxo:
    raise ValueError("❌ No hay columnas BPXO disponibles para generar el label de HTA.")

sys_cols = [c for c in ['BPXOSY1', 'BPXOSY2', 'BPXOSY3'] if c in work.columns]
dia_cols = [c for c in ['BPXODI1', 'BPXODI2', 'BPXODI3'] if c in work.columns]

# Row-wise means; DataFrame.mean skips NaN, so partially missing readings
# still yield a value. A side with no columns at all becomes all-NaN.
s_mean = work[sys_cols].mean(axis=1) if sys_cols else pd.Series(np.nan, index=work.index)
d_mean = work[dia_cols].mean(axis=1) if dia_cols else pd.Series(np.nan, index=work.index)

# Class codes: 0=hypo, 1=normal, 2=HTA.
# np.select returns the value of the FIRST matching condition, so the HTA
# rule must come first. With the low-BP rule first, a reading such as 150/55
# (systolic >= 140 but diastolic < 60) was labelled 0 and therefore ended up
# with riesgo_hipertension == 0.
conds = [
    (s_mean >= 140) | (d_mean >= 90),
    (s_mean < 90) | (d_mean < 60),
]
vals = [2.0, 0.0]
tension = np.select(conds, vals, default=1.0).astype(float)
# Rows lacking either mean cannot be classified.
tension[(s_mean.isna() | d_mean.isna()).to_numpy()] = np.nan

# Extra derived features (guarded against missing source columns)
work['imc_x_edad'] = work.get('imc', np.nan) * work['edad']
if {'hdl_mgdl', 'ldl_mgdl'}.issubset(work.columns):
    ratio_hdl_ldl = work['hdl_mgdl'] / work['ldl_mgdl']
    # Assign the cleaned result back explicitly: an in-place replace() on
    # `work[...]` acts on a temporary object and is not guaranteed to update
    # `work` (it does not under pandas copy-on-write).
    work['ratio_hdl_ldl'] = ratio_hdl_ldl.replace([np.inf, -np.inf], np.nan)
else:
    work['ratio_hdl_ldl'] = np.nan

if 'trigliceridos_mgdl' in work.columns:
    work['trigliceridos_log'] = np.log1p(work['trigliceridos_mgdl'])
else:
    work['trigliceridos_log'] = np.nan

# Attach the multiclass label, drop unclassifiable rows, then derive the
# binary target (1 only for class 2 = HTA).
work['tension_clase'] = pd.Series(tension, index=work.index).astype('Float64')
work = work[work['tension_clase'].notna()].copy()
work['riesgo_hipertension'] = (work['tension_clase'] == 2).astype('Int64')

# ------------------------------
# 6) One-hot encoding of ethnicity
# ------------------------------
if 'etnia' not in work.columns:
    # Empty frame keeps the later `etnia_dummies.columns` lookup valid.
    etnia_dummies = pd.DataFrame(index=work.index)
else:
    etnia_dummies = pd.get_dummies(
        work['etnia'], prefix='etnia', drop_first=True, dtype=int
    )
    work = pd.concat([work.drop(columns=['etnia']), etnia_dummies], axis=1)

# ------------------------------
# 7) Leakage-free feature set
# ------------------------------
base_features = [
    'edad', 'sexo', 'educacion', 'ratio_ingreso_pobreza',
    'imc', 'cintura_cm', 'rel_cintura_altura',
    'glucosa_mgdl', 'hdl_mgdl', 'trigliceridos_mgdl', 'ldl_mgdl',
    'imc_cuadratico', 'imc_x_edad', 'ratio_hdl_ldl', 'trigliceridos_log',
]
# Keep only the candidates that actually exist in `work`.
feature_candidates = [
    name
    for name in [*base_features, *etnia_dummies.columns]
    if name in work.columns
]

# Explicit anti-leakage filter: no blood-pressure column may be a feature.
forbidden_prefixes = ('BPX', 'BPXO', 'BPXSY', 'BPXDI')
feature_candidates = [
    name for name in feature_candidates if not name.startswith(forbidden_prefixes)
]

if not feature_candidates:
    raise ValueError("‚ùå Sin features disponibles para modelar tras la limpieza.")

# ------------------------------
# 8) Drop rows with NaN in any feature
# ------------------------------
before = len(work)
work = work.dropna(subset=feature_candidates).copy()
after = len(work)
if before > after:
    print(f"‚ÑπÔ∏è Filas eliminadas por NaN en features: {before - after}")

# Report any feature that still contains NaN after the drop.
nan_pct = work[feature_candidates].isna().mean()
if nan_pct.gt(0).any():
    print("‚ö†Ô∏è A√∫n hay NaN en algunas columnas de features:")
    print(nan_pct.loc[nan_pct > 0].sort_values(ascending=False).head(10))

# ------------------------------
# 9) Final summary
# ------------------------------
print(f"‚úÖ Registros finales para modelado: {len(work)} | Features: {len(feature_candidates)}")
work.head(3)


## ANÁLISIS EXPLORATORIO DE VARIABLES

(Los gráficos exploratorios permanecen sin cambios para no afectar la rúbrica; puedes ejecutar análisis adicionales según sea necesario.)

In [None]:
# (EDA left unchanged)
# Placeholder cell: it intentionally executes nothing.
pass

## ENTRENAMIENTO Y EVALUACIÓN – MODELO BINARIO

En esta sección se realiza la validación temporal, entrenamiento, calibración y evaluación del modelo de riesgo de hipertensión siguiendo la rúbrica del hackathon.

In [None]:
# ==========================================================
# VALIDACIÓN TEMPORAL + K-FOLD CON SMOTE INTERNO + ENSEMBLE CALIBRADO FINAL
# ==========================================================

import numpy as np, pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    roc_auc_score, brier_score_loss, average_precision_score,
    f1_score, precision_score, recall_score, accuracy_score,
    confusion_matrix, classification_report, roc_curve, precision_recall_curve
)
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

import matplotlib.pyplot as plt, seaborn as sns, shap, joblib

# ==========================================================
# 1) TEMPORAL SPLIT (no global SMOTE) + FALLBACK
# ==========================================================
# Full feature matrix / target, kept for reference.
X_full = work[feature_candidates].copy().astype(float)
y_full = work['riesgo_hipertension'].astype(int)

if 'SDDSRVYR' in work.columns and work['SDDSRVYR'].nunique() > 1:
    # Train on every survey cycle except the latest; test on the latest one.
    # Rows whose cycle is NaN match neither list and are left out of both sets.
    cycles = sorted(work['SDDSRVYR'].dropna().unique().tolist())
    train_cycles, test_cycles = cycles[:-1], cycles[-1:]
    train_df = work[work['SDDSRVYR'].isin(train_cycles)].copy()
    test_df = work[work['SDDSRVYR'].isin(test_cycles)].copy()
else:
    print("⚠️ Split temporal no disponible/insuficiente. Usando split estratificado 80/20.")
    train_df, test_df = train_test_split(
        work, test_size=0.2, stratify=work['riesgo_hipertension'], random_state=42
    )

# Both partitions need both classes: training cannot learn from one class,
# and AUROC is undefined on a test set that is empty or single-class.
if (
    train_df['riesgo_hipertension'].nunique() < 2
    or test_df['riesgo_hipertension'].nunique() < 2
):
    print("⚠️ Train o test con una sola clase. Rehaciendo split estratificado 70/30.")
    train_df, test_df = train_test_split(
        work, test_size=0.3, stratify=work['riesgo_hipertension'], random_state=123
    )

X_train_full = train_df[feature_candidates].astype(float)
y_train_full = train_df['riesgo_hipertension'].astype(int)
X_test = test_df[feature_candidates].astype(float)
y_test = test_df['riesgo_hipertension'].astype(int)

print("🔹 Distribución Train:", Counter(y_train_full))
print("🔹 Distribución Test:", Counter(y_test))

# ==========================================================
# 2) PREPROCESSING
# ==========================================================
from sklearn.base import clone  # local import: gives each pipeline its own preprocessor

# Numeric branch: median imputation followed by standardisation.
num_tf = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Template preprocessor applied to every candidate feature; other columns are dropped.
pre = ColumnTransformer([('num', num_tf, feature_candidates)], remainder='drop')

# ==========================================================
# 3) BASE MODELS
# ==========================================================
log_reg = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    solver='saga',
    C=1.0,
    penalty='l2',
    random_state=42
)
rf = RandomForestClassifier(
    n_estimators=800,
    max_depth=14,
    min_samples_split=4,
    min_samples_leaf=2,
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=42
)

# Each pipeline receives its OWN clone of the preprocessor. Passing the same
# `pre` instance to both would make them share one fitted transformer, so
# fitting one pipeline would silently refit the other's preprocessing step.
# SMOTE sits inside the pipeline so resampling only sees the data passed to fit().
models = {
    'LogReg': ImbPipeline([
        ('pre', clone(pre)),
        ('smote', SMOTE(random_state=42, k_neighbors=3)),
        ('clf', log_reg),
    ]),
    'RandomForest': ImbPipeline([
        ('pre', clone(pre)),
        ('smote', SMOTE(random_state=42, k_neighbors=3)),
        ('clf', rf),
    ]),
}

# ==========================================================
# 4) K-FOLD + INNER SMOTE (no leakage)
# ==========================================================
# Number of folds: at most 5, at least 2, bounded by the minority-class size.
min_class = min(Counter(y_train_full).values())
n_splits = min(5, max(2, min_class))
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

results = []
# Out-of-fold (OOF) probabilities per model, aligned with y_train_full.
# They are reused below to pick the decision threshold without touching test.
oof_proba = {}
for name, model in models.items():
    print(f"\n🚀 Validando modelo: {name} ({n_splits}-fold)")
    aurocs, briers, f1s = [], [], []
    oof = np.full(len(y_train_full), np.nan)

    for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train_full, y_train_full), 1):
        X_tr, X_val = X_train_full.iloc[tr_idx], X_train_full.iloc[val_idx]
        y_tr, y_val = y_train_full.iloc[tr_idx], y_train_full.iloc[val_idx]

        model.fit(X_tr, y_tr)
        proba_val = model.predict_proba(X_val)[:, 1]
        preds = (proba_val >= 0.5).astype(int)
        oof[val_idx] = proba_val  # every row is validated exactly once

        auroc = roc_auc_score(y_val, proba_val)
        brier = brier_score_loss(y_val, proba_val)
        f1 = f1_score(y_val, preds)

        aurocs.append(auroc); briers.append(brier); f1s.append(f1)
        print(f"  Fold {fold}: AUROC={auroc:.3f} | Brier={brier:.3f} | F1={f1:.3f}")

    oof_proba[name] = oof
    results.append((name, np.mean(aurocs), np.std(aurocs), np.mean(briers), np.mean(f1s)))

# Best model = highest mean cross-validated AUROC (first one wins on ties).
best_name, mean_auc, std_auc, mean_brier, mean_f1 = max(results, key=lambda r: r[1])
print(f"\n🏁 Mejor modelo: {best_name} | AUROC_cv={mean_auc:.3f} ±{std_auc:.3f} | Brier_cv={mean_brier:.3f} | F1_cv={mean_f1:.3f}")

# ==========================================================
# 5) ENSEMBLE LOGREG + RF + CALIBRATION
# ==========================================================
# Fit every pipeline exactly once on the full training set; the best model is
# one of them, so it needs no separate refit.
for model in models.values():
    model.fit(X_train_full, y_train_full)
best_model = models[best_name]

proba_lr = models['LogReg'].predict_proba(X_test)[:, 1]
proba_rf = models['RandomForest'].predict_proba(X_test)[:, 1]
proba_ensemble = (proba_lr + proba_rf) / 2  # simple probability average

# Sigmoid calibration of the best single pipeline with an internal 3-fold CV.
calibrated = CalibratedClassifierCV(best_model, method='sigmoid', cv=3)
calibrated.fit(X_train_full, y_train_full)
proba_calibrated = calibrated.predict_proba(X_test)[:, 1]

# ==========================================================
# 6) TEST EVALUATION (ENSEMBLE)
# ==========================================================
auroc = roc_auc_score(y_test, proba_ensemble)
auprc = average_precision_score(y_test, proba_ensemble)
brier = brier_score_loss(y_test, proba_ensemble)

# Choose the F1-optimal threshold on the OOF ensemble probabilities of the
# TRAINING set. Tuning it on the test set and then scoring on that same set
# would report optimistically biased accuracy/precision/recall.
oof_ensemble = (oof_proba['LogReg'] + oof_proba['RandomForest']) / 2
ths = np.linspace(0, 1, 101)
f1s = [
    f1_score(y_train_full, (oof_ensemble >= th).astype(int), zero_division=0)
    for th in ths
]
best_th = ths[np.argmax(f1s)]
pred_opt = (proba_ensemble >= best_th).astype(int)

acc = accuracy_score(y_test, pred_opt)
prec = precision_score(y_test, pred_opt, zero_division=0)
rec = recall_score(y_test, pred_opt)

print("\n📊 MÉTRICAS TEST (ENSEMBLE):")
print(f"AUROC={auroc:.3f} | AUPRC={auprc:.3f} | Brier={brier:.3f} | Umbral óptimo(F1, OOF train)={best_th:.2f}")
print(f"Acc={acc:.3f} | Prec={prec:.3f} | Rec={rec:.3f}")
print("\n", classification_report(y_test, pred_opt, target_names=['NoHTA','HTA']))

# The calibrated model is the object persisted later, so report its test
# metrics as well.
print(
    f"Modelo calibrado ({best_name}) en test: "
    f"AUROC={roc_auc_score(y_test, proba_calibrated):.3f} | "
    f"Brier={brier_score_loss(y_test, proba_calibrated):.3f}"
)

# ==========================================================
# 7) CURVES
# ==========================================================
# --- ROC curve of the ensemble, with the chance diagonal for reference ---
fpr, tpr, _ = roc_curve(y_test, proba_ensemble)
plt.figure(figsize=(6, 6))
plt.plot(fpr, tpr, label=f"ROC Ensemble (AUC={auroc:.2f})")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('Falsos positivos')
plt.ylabel('Verdaderos positivos')
plt.legend()
plt.grid(alpha=0.3)
plt.title('Curva ROC ‚Äì Ensemble')
plt.show()

# --- Precision-recall curve of the ensemble ---
prec_c, rec_c, _ = precision_recall_curve(y_test, proba_ensemble)
plt.figure(figsize=(6, 6))
plt.plot(rec_c, prec_c)
plt.xlabel('Recall')
plt.ylabel('Precisi√≥n')
plt.title('Precision-Recall ‚Äì Ensemble')
plt.grid(alpha=0.3)
plt.show()

# --- Confusion matrix at the selected threshold ---
cm = confusion_matrix(y_test, pred_opt)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Matriz de confusi√≥n ‚Äì Ensemble (umbral √≥ptimo)')
plt.xlabel('Predicho')
plt.ylabel('Real')
plt.show()

# ==========================================================
# 8) SAVE MODEL
# ==========================================================
# parents=True keeps this working if the target is ever changed to a nested path.
models_dir = Path('models')
models_dir.mkdir(parents=True, exist_ok=True)
model_path = models_dir / 'ensemble_logreg_rf_calibrado.pkl'
# NOTE(review): the file name says "ensemble", but the object persisted is
# `calibrated` (the calibrated best single pipeline), not the LogReg+RF
# probability average evaluated above. File name kept unchanged so existing
# loaders keep working.
joblib.dump(calibrated, model_path)
print(f"💾 Modelo calibrado guardado en: {model_path}")
