# Preparación de datos (CRISP-DM) — Diabetes, Hypertension, Obesity
**Objetivo:** limpieza, EDA, feature engineering, codificación y pipelines listos para modelado probabilístico (predicción de probabilidad de Diabetes / Hypertension / Obesity).
Basado en el análisis previo (Fase de Entendimiento de los datos).


In [1]:
!pip install -q category_encoders

import os, joblib, json, re, math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

import category_encoders as ce
from imblearn.over_sampling import SMOTE

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.9/85.9 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h

# 1) Carga de archivos

In [3]:
# 1) Carga robusta de archivos
candidates = {
    "diabetes": [
        "./diabetes_data.csv"
    ],
    "hypertension": [
        "./hypertension_data.csv"
    ],
    "obesity": [
        "./obesity_level.csv"
    ]
}

def try_load(paths):
    for p in paths:
        if os.path.exists(p):
            print(f"Loading {p}")
            return pd.read_csv(p)
    return None

diabetes_df = try_load(candidates["diabetes"])
hypertension_df = try_load(candidates["hypertension"])
obesity_df = try_load(candidates["obesity"])

# fallback: pedir upload (solo si alguna es None)
if diabetes_df is None or hypertension_df is None or obesity_df is None:
    from google.colab import files
    print("Algunas tablas no se encontraron en las rutas esperadas. Puedes subir archivos ahora.")
    uploaded = files.upload()
    for fname in uploaded:
        print("Uploaded:", fname)
        if "diabetes" in fname.lower():
            diabetes_df = pd.read_csv(fname)
        elif "hyper" in fname.lower() or "hypertension" in fname.lower() or "hypert" in fname.lower():
            hypertension_df = pd.read_csv(fname)
        elif "obes" in fname.lower():
            obesity_df = pd.read_csv(fname)

# Report shapes
print("Shapes:")
print("Diabetes:", None if diabetes_df is None else diabetes_df.shape)
print("Hypertension:", None if hypertension_df is None else hypertension_df.shape)
print("Obesity:", None if obesity_df is None else obesity_df.shape)

# Guardar copias originales para referencia
if diabetes_df is not None:
    diabetes_raw = diabetes_df.copy()
if hypertension_df is not None:
    hypertension_raw = hypertension_df.copy()
if obesity_df is not None:
    obesity_raw = obesity_df.copy()


Loading ./diabetes_data.csv
Loading ./hypertension_data.csv
Loading ./obesity_level.csv
Shapes:
Diabetes: (70692, 18)
Hypertension: (26083, 14)
Obesity: (20758, 18)


# 3) Limpieza específica por dataset

In [4]:
def _find_age_col(df):
    """Devuelve el nombre de la columna que parece ser edad ('age' case-insensitive) o None."""
    if df is None:
        return None
    for c in df.columns:
        if c.lower() == "age":
            return c
    # buscar variantes (Age, AGE, age_years...)
    for c in df.columns:
        if "age" in c.lower():
            return c
    return None

USE_MIDPOINT = False    # True -> usar punto medio del rango; False -> edad aleatoria dentro del rango
DROP_DUPLICATES_SUBSET = None  # e.g. ['id', 'fecha'] para duplicados por subset; None -> duplicados exactos

# ---- Diabetes: carga y duplicados ----
if diabetes_df is not None:
    df = diabetes_df.copy()
    print("Diabetes: initial shape", df.shape)

    # 1) Duplicados exactos (o por subset)
    if DROP_DUPLICATES_SUBSET:
        ndup = df.duplicated(subset=DROP_DUPLICATES_SUBSET).sum()
        print(f"Duplicados por subset {DROP_DUPLICATES_SUBSET}:", ndup)
        if ndup > 0:
            df = df.drop_duplicates(subset=DROP_DUPLICATES_SUBSET).reset_index(drop=True)
            print("After drop duplicates (subset):", df.shape)
    else:
        ndup = df.duplicated().sum()
        print("Duplicados exactos:", ndup)
        if ndup > 0:
            df = df.drop_duplicates().reset_index(drop=True)
            print("After drop duplicates:", df.shape)

    # Guardar copia base para limpieza
    diabetes_df_clean = df.copy()
else:
    print("diabetes_df es None -> no hay datos para procesar")

# ---- EDA y Age mapping ----
if diabetes_df_clean is None:
    print("No hay dataframe limpio: se omite mapeo de Age")
else:
    # Helper para modo seguro
    def get_mode_safe(series):
        m = series.mode()
        return m.iloc[0] if not m.empty else None

    # Chequeos EDA básicos para 'Age'
    if "Age" in diabetes_df_clean.columns:
        print("Checando columna 'Age'...")
        s = diabetes_df_clean["Age"]

        n_null = s.isnull().sum()
        n_unique = s.nunique(dropna=True)
        sample_uniques = sorted(s.dropna().unique())[:30]
        print(f" nulos: {n_null}, únicos: {n_unique}, sample únicos (hasta 30): {sample_uniques}")

        # Detectar si parecen códigos 1..13 o edades reales
        # Regla heurística: si todos los valores no nulos están entre 1 y 13 (o mayoría) -> son códigos
        non_null = s.dropna()
        if non_null.empty:
            print(" Columna 'Age' sólo tiene NaNs. No se mapea.")
            is_code = False
        else:
            # normalizamos a enteros cuando el valor es efectivamente entero (ej 1.0)
            # pero no sobrescribimos aún la columna
            vals = non_null.map(lambda x: float(x) if pd.notna(x) else x)
            within_1_13 = vals.between(1, 13).mean()  # proporción en rango 1..13
            print(f" Proporción valores en [1,13]: {within_1_13:.2f}")

            # Umbral: si >= 0.7 asumimos códigos (ajusta según contexto)
            is_code = within_1_13 >= 0.7

        if is_code:
            print(" Detectado: 'Age' parece ser CÓDIGOS (1..13). Procediendo a mapear.")
            # Definir rangos (ajusta si tus rangos difieren)
            rangos = {
                1: (18, 24),
                2: (25, 29),
                3: (30, 34),
                4: (35, 39),
                5: (40, 44),
                6: (45, 49),
                7: (50, 54),
                8: (55, 59),
                9: (60, 64),
                10: (65, 69),
                11: (70, 74),
                12: (75, 79),
                13: (80, 100)
            }

            # Si se requiere reproducibilidad
            if RANDOM_SEED is not None:
                np.random.seed(RANDOM_SEED)

            def map_code_to_age(v):
                # manejar NaN
                if pd.isna(v):
                    return np.nan
                # intentar convertir a entero cercano
                try:
                    # si v viene como '1.0' o 1.0 -> int
                    vi = int(float(v))
                except Exception:
                    # si no convertible, devolver NaN para revisar manualmente
                    return np.nan
                if vi in rangos:
                    low, high = rangos[vi]
                    if USE_MIDPOINT:
                        return (low + high) // 2
                    else:
                        return int(np.random.randint(low, high + 1))
                else:
                    # si no encontrado en rangos, devolver NaN para revisión
                    return np.nan

            # Aplicar mapeo con .loc para evitar warnings
            diabetes_df_clean.loc[:, "Age_mapped"] = diabetes_df_clean["Age"].apply(map_code_to_age)

            # Reporte
            n_mapped = diabetes_df_clean["Age_mapped"].notna().sum()
            n_failed = diabetes_df_clean["Age_mapped"].isna().sum()
            print(f" Edad mapeada en {n_mapped} filas; {n_failed} filas quedan NaN tras el mapeo (incluye NaN previos).")
            print("Stats de Age_mapped:")
            print(diabetes_df_clean["Age_mapped"].describe())

            # Si deseas reemplazar la columna original:
            # diabetes_df_clean.loc[:, "Age"] = diabetes_df_clean["Age_mapped"]
            # diabetes_df_clean = diabetes_df_clean.drop(columns=["Age_mapped"])
        else:
            print(" 'Age' parece contener edades reales o mezcla. No se mapea; solo se convierte a numérico seguro.")
            # convertir a numérico forzando NaN donde no aplicable
            diabetes_df_clean.loc[:, "Age_numeric"] = pd.to_numeric(diabetes_df_clean["Age"], errors="coerce")
            print("Stats de Age_numeric:")
            print(diabetes_df_clean["Age_numeric"].describe())

    else:
        print("La columna 'Age' no existe en el dataset.")

    # ---- EDA adicional y resumen ----
    print("\nResumen de nulls por columna (primeras 20):")
    print(diabetes_df_clean.isnull().sum().sort_values(ascending=False).head(20))

    print("\nEjemplo de primeras filas (head):")
    print(diabetes_df_clean.head(10))

    # media dentro del rango, no random
    # comparar con la edad de los otros datasts

Diabetes: initial shape (70692, 18)
Duplicados exactos: 6672
After drop duplicates: (64020, 18)
Checando columna 'Age'...
 nulos: 0, únicos: 13, sample únicos (hasta 30): [np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0), np.float64(10.0), np.float64(11.0), np.float64(12.0), np.float64(13.0)]
 Proporción valores en [1,13]: 1.00
 Detectado: 'Age' parece ser CÓDIGOS (1..13). Procediendo a mapear.
 Edad mapeada en 64020 filas; 0 filas quedan NaN tras el mapeo (incluye NaN previos).
Stats de Age_mapped:
count    64020.000000
mean        60.670150
std         15.588718
min         18.000000
25%         51.000000
50%         62.000000
75%         70.000000
max        100.000000
Name: Age_mapped, dtype: float64

Resumen de nulls por columna (primeras 20):
Age                     0
Sex                     0
HighChol                0
CholCheck               0
BMI                     0
Smoker  

In [5]:
# eeliminar columna age
diabetes_df_clean = diabetes_df_clean.drop(columns=["Age"])

In [6]:
diabetes_df_clean.head()

Unnamed: 0,Sex,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes,Age_mapped
0,1.0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0,38
1,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0,79
2,1.0,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0,94
3,1.0,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0,72
4,0.0,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,59


In [7]:
# ---- Hypertension ----
if hypertension_df is not None:
    df = hypertension_df.copy()
    print("Hypertension: initial shape", df.shape)
    print("Missing total:", df.isnull().sum().sum())

    if "target" in df.columns:
        missing_target = df["target"].isnull().sum()
        print("target missing:", missing_target)

    hypertension_df_clean = df.copy()
else:
    print("hypertension_df is None -> no data to clean")

# Si no hay dataframe limpio, evitar ejecutar el resto
if hypertension_df_clean is None:
    # opcional: lanzar excepción o continuar según flujo
    print("No hypertension_df_clean available — saltando limpieza.")
else:
    # ---------- Helper: función para obtener moda segura ----------
    def get_mode_safe(series):
        m = series.mode()
        if m.empty:
            return None
        return m.iloc[0]

    # ------- sex: rellenar con moda (0/1 mapping) -------
    if "sex" in hypertension_df_clean.columns:
        before_missing = hypertension_df_clean["sex"].isnull().sum()
        mode_sex = get_mode_safe(hypertension_df_clean["sex"])
        if mode_sex is None:
            print("La columna 'sex' no tiene moda (todos NaN). No se rellenó.")
        else:
            # usar .loc para evitar SettingWithCopyWarning
            hypertension_df_clean.loc[:, "sex"] = hypertension_df_clean["sex"].fillna(mode_sex)
            after_missing = hypertension_df_clean["sex"].isnull().sum()
            print(f"'sex' - missing before: {before_missing}, after fill: {after_missing}, used mode: {mode_sex}")


    # ------- thal: 1..3 válidos; reemplazar 0 por NaN y llenar con moda -------
    if "thal" in hypertension_df_clean.columns:
        valid_thal = {1, 2, 3}
        # contar valores inválidos
        invalid_mask = ~hypertension_df_clean["thal"].isin(valid_thal)
        num_invalid = invalid_mask.sum()
        print(f"'thal' - encontrados {num_invalid} valores inválidos (no en {valid_thal})")

        # Reemplazar valores inválidos por NaN (incluye 0 y otros fuera de rango)
        hypertension_df_clean.loc[invalid_mask, "thal"] = np.nan

        mode_thal = get_mode_safe(hypertension_df_clean["thal"])
        if mode_thal is None:
            print("La columna 'thal' no tiene moda (todos NaN). No se rellenó.")
        else:
            hypertension_df_clean.loc[:, "thal"] = hypertension_df_clean["thal"].fillna(mode_thal)
            print(f"Filled missing 'thal' values with mode: {mode_thal}")
            print("Missing values after filling 'thal':", hypertension_df_clean["thal"].isnull().sum())
            print("Unique thal values:", sorted(hypertension_df_clean["thal"].dropna().unique()))

    # ------- restecg: valores esperados 0 y 1 (2 inválido) -------
    if "restecg" in hypertension_df_clean.columns:
        valid_restecg = {0, 1}
        invalid_mask = ~hypertension_df_clean["restecg"].isin(valid_restecg)
        num_invalid = invalid_mask.sum()
        print(f"'restecg' - encontrados {num_invalid} valores inválidos (no en {valid_restecg})")

        hypertension_df_clean.loc[invalid_mask, "restecg"] = np.nan

        mode_restecg = get_mode_safe(hypertension_df_clean["restecg"])
        if mode_restecg is None:
            print("La columna 'restecg' no tiene moda (todos NaN). No se rellenó.")
        else:
            hypertension_df_clean.loc[:, "restecg"] = hypertension_df_clean["restecg"].fillna(mode_restecg)
            print(f"Filled missing 'restecg' values with mode: {mode_restecg}")
            print("Missing values after filling 'restecg':", hypertension_df_clean["restecg"].isnull().sum())
            print("Unique restecg values:", sorted(hypertension_df_clean["restecg"].dropna().unique()))

# ------- mostrar resumen final -------
print("Final shape:", hypertension_df_clean.shape)
print(hypertension_df_clean.isnull().sum())


Hypertension: initial shape (26083, 14)
Missing total: 25
target missing: 0
'sex' - missing before: 25, after fill: 0, used mode: 0.0
'thal' - encontrados 154 valores inválidos (no en {1, 2, 3})
Filled missing 'thal' values with mode: 2.0
Missing values after filling 'thal': 0
Unique thal values: [np.float64(1.0), np.float64(2.0), np.float64(3.0)]
'restecg' - encontrados 352 valores inválidos (no en {0, 1})
Filled missing 'restecg' values with mode: 1.0
Missing values after filling 'restecg': 0
Unique restecg values: [np.float64(0.0), np.float64(1.0)]
Final shape: (26083, 14)
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [8]:
sm = SMOTE(random_state=42)
X = hypertension_df_clean.drop(columns=["target"])
y = hypertension_df_clean["target"]
X_res, y_res = sm.fit_resample(X, y)
hypertension_df_clean = pd.concat([X_res, y_res], axis=1)

In [9]:
hypertension_df_clean.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,57.0,1.0,3,145,233,1,0.0,150,0,2.3,0,0,1.0,1
1,64.0,0.0,2,130,250,0,1.0,187,0,3.5,0,0,2.0,1
2,52.0,1.0,1,130,204,0,0.0,172,0,1.4,2,0,2.0,1
3,56.0,0.0,1,120,236,0,1.0,178,0,0.8,2,0,2.0,1
4,66.0,0.0,0,120,354,0,1.0,163,1,0.6,2,0,2.0,1


In [10]:
# ---------- Parámetros ----------
DROP_ID_COL = True  # eliminar columna 'id' si existe
ONE_HOT_PREFIX = "MTRANS"  # prefijo para columnas one-hot

# ---------- Inicialización ----------
obesity_df_clean = None
mapping_report = {}  # para guardar cómo se mapearon columnas y casos no esperados

def normalize_str(s):
    if pd.isna(s):
        return np.nan
    s0 = str(s).strip().lower()
    s0 = s0.replace("-", "_").replace(" ", "_")
    # dejar solo a-z0-9_
    s0 = re.sub(r'[^a-z0-9_]', '', s0)
    return s0

# ---- Obesity ----
if obesity_df is not None:
    df = obesity_df.copy()
    print("Obesity: initial shape", df.shape)

    # eliminar id si existe
    if DROP_ID_COL and "id" in df.columns:
        df = df.drop(columns=["id"])
        print("Dropped 'id' column")

    # detectar columnas target posibles (sensible a nombres comunes)
    possible_targets = [c for c in df.columns
                        if ("obes" in c.lower()) or ("obeldad" in c.lower()) or c.lower().startswith("ob") or "0be1" in c.lower()]
    print("Possible target columns:", possible_targets)

    # eliminar duplicados exactos si los hay
    ndup = df.duplicated().sum()
    print("Duplicados exactos encontrados:", ndup)
    if ndup > 0:
        df = df.drop_duplicates().reset_index(drop=True)
        print("After drop duplicates:", df.shape)

    obesity_df_clean = df.copy()
else:
    print("obesity_df is None -> no data to process")

# Si no hay df limpio, salir
if obesity_df_clean is None:
    raise ValueError("No obesity_df_clean available, aborting cleaning.")

# ---------- Helpers ----------
def safe_map_series(series, mapping, colname):
    """
    Mapear series con dict mapping de forma case-insensitive.
    Devuelve (mapped_series, unmatched_values_set)
    """
    # crear mapa tolerant a mayúsculas, manejando keys que no son strings
    lower_map = {}
    for k, v in mapping.items():
        try:
            lower_map[str(k).lower()] = v
        except Exception:
            # Si no se puede convertir a string o lower, ignorar para mapeo case-insensitive
            pass


    def mapper(x):
        if pd.isna(x):
            return np.nan
        try:
            # Check for exact match first (handles non-string keys like int 0)
            if x in mapping:
                return mapping[x]
            # Then check for case-insensitive string match
            xl = str(x).strip().lower()
            if xl in lower_map:
                return lower_map[xl]
            return np.nan
        except Exception:
            return np.nan

    mapped = series.map(mapper)
    # Identify unmatched values - compare against original keys and lowercased keys
    all_mapped_keys_normalized = set(str(k).strip().lower() for k in mapping.keys())
    unmatched = set()
    for val in series.dropna().unique():
        val_str = str(val).strip()
        if val_str.lower() not in all_mapped_keys_normalized and val not in mapping.keys():
             unmatched.add(val)

    return mapped, unmatched

# ---------- Age: decimales -> entero más cercano ----------
if "Age" in obesity_df_clean.columns:
    obesity_df_clean.loc[:, "Age_numeric_temp"] = pd.to_numeric(obesity_df_clean["Age"], errors="coerce")
    n_before = obesity_df_clean["Age"].isnull().sum()
    # redondear .round() y convertir a Int64 (nullable) para preservar NaN
    obesity_df_clean.loc[:, "Age"] = obesity_df_clean["Age_numeric_temp"].round().astype("Int64")
    n_after = obesity_df_clean["Age"].isnull().sum()
    obesity_df_clean = obesity_df_clean.drop(columns=["Age_numeric_temp"])
    mapping_report["Age"] = {"action": "to_numeric_round_int", "n_null_before": int(n_before), "n_null_after": int(n_after)}
    print("Age mapped to int (Int64 nullable). Nulls before/after:", n_before, n_after)
    print(obesity_df_clean["Age"].describe())
else:
    print("No 'Age' column found.")

# ---------- Gender: One-hot (male / female / unknown) ----------
if "Gender" in obesity_df_clean.columns:
    df = obesity_df_clean.copy()
    # normalizar valor raw
    df["Gender_clean"] = df["Gender"].map(normalize_str)

    # mapear variantes comunes a 'male' / 'female'
    gender_aliases = {
        "male": "male", "m": "male", "man": "male", "male_": "male",
        "female": "female", "f": "female", "woman": "female", "female_": "female",
        # añade más aliases si detectas otros valores
    }
    # aplicar alias map case-insensitive
    df["Gender_std"] = df["Gender_clean"].map(lambda x: gender_aliases.get(x, np.nan) if pd.notna(x) else np.nan)

    # mantener unknown label para NaNs para que OneHot cree columna 'unknown'
    df["Gender_std"] = df["Gender_std"].fillna("unknown")

    # OneHot encode
    ohe_g = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    arr_g = ohe_g.fit_transform(df[["Gender_std"]])
    cols_g = ohe_g.get_feature_names_out(["Gender_std"])
    df_ohe_g = pd.DataFrame(arr_g, columns=cols_g, index=df.index)

    # Concat y eliminar columnas originales/temporales
    df = pd.concat([df, df_ohe_g], axis=1)
    df = df.drop(columns=["Gender", "Gender_clean", "Gender_std"], errors="ignore")

    # Update dataframe and mapping_report
    obesity_df_clean = df.copy()
    mapping_report = mapping_report if 'mapping_report' in globals() else {}
    mapping_report.setdefault("Gender", {})
    mapping_report["Gender"]["alias_map_used"] = gender_aliases
    mapping_report["Gender"]["ohe_categories"] = list(ohe_g.categories_[0])
    print("Gender one-hot columns added:", list(cols_g))
    print("Gender one-hot counts:")
    print(obesity_df_clean[cols_g].sum())
else:
    print("No 'Gender' column found in obesity_df_clean")

# ---------- CAEC: ordinal sometimes/frequently/always/0 -> 3,2,1,0 ----------
if "CAEC" in obesity_df_clean.columns:
    caec_map = {"Always": 3, "Frequently": 2, "Sometimes": 1, "0": 0, 0:0}
    mapped, unmatched = safe_map_series(obesity_df_clean["CAEC"], caec_map, "CAEC")
    obesity_df_clean.loc[:, "CAEC"] = mapped.astype("Int64")
    mapping_report["CAEC"] = {"action": "ordinal_map", "mapping": caec_map, "unmatched_examples": list(unmatched)[:10]}
    print("CAEC mapped. Unmatched examples (up to 10):", mapping_report["CAEC"]["unmatched_examples"])
    print(obesity_df_clean["CAEC"].value_counts(dropna=False))
else:
    print("No 'CAEC' column found.")

# ---------- CALC: ordinal sometimes/frequently/0 -> 2,1,0 ----------
if "CALC" in obesity_df_clean.columns:
    calc_map = {"Frequently": 2, "Sometimes": 1, "0": 0, 0:0}
    mapped, unmatched = safe_map_series(obesity_df_clean["CALC"], calc_map, "CALC")
    obesity_df_clean.loc[:, "CALC"] = mapped.astype("Int64")
    mapping_report["CALC"] = {"action": "ordinal_map", "mapping": calc_map, "unmatched_examples": list(unmatched)[:10]}
    print("CALC mapped. Unmatched examples (up to 10):", mapping_report["CALC"]["unmatched_examples"])
    print(obesity_df_clean["CALC"].value_counts(dropna=False))
else:
    print("No 'CALC' column found.")

# ---------- MTRANS: One-hot (transportation categories, non-ordinal) ----------
if "MTRANS" in obesity_df_clean.columns:
    df = obesity_df_clean.copy()

    # normalizar
    df["MTRANS_norm"] = df["MTRANS"].map(normalize_str)

    # expected raw mapping (raw variants -> canonical category)
    expected_raw = {
        "Automobile": "automobile",
        "Bike": "bike",
        "Motorbike": "motorbike",
        "Public_Transportation": "public_transportation",
        "Walking": "walking",
    }
    # normalize keys of expected_raw for lookup
    expected = {normalize_str(k): v for k, v in expected_raw.items()}

    # map to canonical categories (or NaN)
    df["MTRANS_std"] = df["MTRANS_norm"].map(lambda v: expected.get(v, np.nan) if pd.notna(v) else np.nan)

    # record unexpected raw values for review
    unexpected_values = sorted(set(df["MTRANS_norm"].dropna().unique()) - set(expected.keys()))
    mapping_report = mapping_report if 'mapping_report' in globals() else {}
    mapping_report.setdefault("MTRANS", {})
    mapping_report["MTRANS"]["expected_map"] = expected
    mapping_report["MTRANS"]["unexpected_raw_values"] = unexpected_values[:50]  # sample up to 50

    # fill unknown so OneHotEncoder creates explicit unknown column
    df["MTRANS_std"] = df["MTRANS_std"].fillna("unknown")

    # OneHot encode MTRANS_std
    ohe_t = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    arr_t = ohe_t.fit_transform(df[["MTRANS_std"]])
    cols_t = ohe_t.get_feature_names_out(["MTRANS_std"])
    df_ohe_t = pd.DataFrame(arr_t, columns=cols_t, index=df.index)

    # concat and drop originals/temporals
    df = pd.concat([df, df_ohe_t], axis=1)
    df = df.drop(columns=["MTRANS", "MTRANS_norm", "MTRANS_std"], errors="ignore")

    # Update obesity_df_clean and mapping
    obesity_df_clean = df.copy()
    mapping_report["MTRANS"]["ohe_categories"] = list(ohe_t.categories_[0])
    print("MTRANS one-hot columns added (sample):", list(cols_t)[:10])
    if unexpected_values:
        print("MTRANS unexpected raw values (sample):", unexpected_values[:20])
else:
    print("No 'MTRANS' column found in obesity_df_clean")

# ---------- FCVC, NCP, CH20, FAF, TUE: redondeo seguro a int ----------
for col in ["FCVC", "NCP", "CH2O", "FAF", "TUE"]:
    if col in obesity_df_clean.columns:
        # convertir a numérico forzando NaN donde no convertible
        obesity_df_clean.loc[:, f"{col}_num_temp"] = pd.to_numeric(obesity_df_clean[col], errors="coerce")
        n_null_before = int(obesity_df_clean[col].isnull().sum())
        obesity_df_clean.loc[:, col] = obesity_df_clean[f"{col}_num_temp"].round().astype("Int64")
        n_null_after = int(obesity_df_clean[col].isnull().sum())
        obesity_df_clean = obesity_df_clean.drop(columns=[f"{col}_num_temp"])
        mapping_report[col] = {"action": "round_to_int", "n_null_before": n_null_before, "n_null_after": n_null_after}
        print(f"{col} mapped to Int64. Nulls before/after:", n_null_before, n_null_after)
    else:
        print(f"No column {col} found.")

# ---------- Resumen final ----------
print("\nFinal shape:", obesity_df_clean.shape)
print("Null counts (top 20):")
print(obesity_df_clean.isnull().sum().sort_values(ascending=False).head(20))

print("\nMapping/report summary:")
for k, v in mapping_report.items():
    print(k, "->", v)

# Opcional: devolver o guardar limpio
# obesity_df_clean.to_csv("obesity_cleaned.csv", index=False)

Obesity: initial shape (20758, 18)
Dropped 'id' column
Possible target columns: ['0be1dad']
Duplicados exactos encontrados: 0
Age mapped to int (Int64 nullable). Nulls before/after: 0 0
count    20758.000000
mean        23.850371
std          5.700779
min         14.000000
25%         20.000000
50%         23.000000
75%         26.000000
max         61.000000
Name: Age, dtype: float64
Gender one-hot columns added: ['Gender_std_female', 'Gender_std_male']
Gender one-hot counts:
Gender_std_female    10422.0
Gender_std_male      10336.0
dtype: float64
CAEC mapped. Unmatched examples (up to 10): []
CAEC
1    17529
2     2472
3      478
0      279
Name: count, dtype: int64
CALC mapped. Unmatched examples (up to 10): []
CALC
1    15066
0     5163
2      529
Name: count, dtype: int64
MTRANS one-hot columns added (sample): ['MTRANS_std_automobile', 'MTRANS_std_bike', 'MTRANS_std_motorbike', 'MTRANS_std_public_transportation', 'MTRANS_std_walking']
FCVC mapped to Int64. Nulls before/after: 0 0


In [11]:
obesity_df_clean.head()

Unnamed: 0,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,...,TUE,CALC,0be1dad,Gender_std_female,Gender_std_male,MTRANS_std_automobile,MTRANS_std_bike,MTRANS_std_motorbike,MTRANS_std_public_transportation,MTRANS_std_walking
0,24.0,1.699998,81.66995,1,1,2.0,3.0,1,0,3.0,...,1.0,1,Overweight_Level_II,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,18.0,1.56,57.0,1,1,2.0,3.0,2,0,2.0,...,1.0,0,0rmal_Weight,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,18.0,1.71146,50.165754,1,1,2.0,1.0,1,0,2.0,...,2.0,0,Insufficient_Weight,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3,21.0,1.71073,131.274851,1,1,3.0,3.0,1,0,2.0,...,1.0,1,Obesity_Type_III,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,32.0,1.914186,93.798055,1,1,3.0,2.0,1,0,2.0,...,1.0,1,Overweight_Level_II,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [12]:
# ---------------------------
# One-hot encoder para 'sex'
# ---------------------------
def encode_sex_onehot(df, col_name='sex', fillna_with_mode=True, drop_original=True):
    """
    Normaliza variantes de sex (0/1, 'M','F','male','female'), opcionalmente rellena NaNs con la moda,
    y crea dos columnas: sex_female, sex_male (dtype int).
    Si drop_original=True elimina la columna original.
    Devuelve DataFrame modificado (no modifica la entrada en sitio salvo que reasignes).
    """
    import pandas as pd
    import numpy as np

    if df is None:
        return df
    if col_name not in df.columns:
        # intentar detectar mayúsculas/minúsculas comunes
        alt = None
        for c in df.columns:
            if c.lower() == col_name.lower():
                alt = c
                break
        if alt is None:
            # nada que hacer
            return df
        else:
            col_name = alt

    s = df[col_name].copy()

    def _norm_sex(v):
        if pd.isna(v):
            return np.nan
        vs = str(v).strip().lower()
        # casos string explícitos
        if vs in {'male', 'm', 'man'}:
            return 'male'
        if vs in {'female', 'f', 'woman', 'woman'}:
            return 'female'
        # casos numéricos en string o float
        try:
            fv = float(v)
            if fv == 1.0:
                return 'male'
            if fv == 0.0:
                return 'female'
        except Exception:
            pass
        # si queda algo como '1.0' ya cubierto; si no reconocemos, devolver NaN
        return np.nan

    s_norm = s.map(_norm_sex)

    # rellenar NaNs con moda si se pidió
    if fillna_with_mode:
        mode = s_norm.mode()
        if not mode.empty:
            s_norm = s_norm.fillna(mode.iloc[0])

    # crear dummies (dos columnas: sex_female, sex_male)
    dummies = pd.get_dummies(s_norm, prefix='sex')

    # asegurar que las columnas existan en el mismo orden y tipo (0/1)
    for expected in ['sex_female', 'sex_male']:
        if expected not in dummies.columns:
            dummies[expected] = 0

    dummies = dummies[['sex_female', 'sex_male']].astype(int)

    # construir nuevo df: opcionalmente eliminar original
    out = df.copy()
    if drop_original:
        out = out.drop(columns=[col_name])

    # concatenar (reseteando índices para evitar problemas si hay desalineación)
    out = pd.concat([out.reset_index(drop=True), dummies.reset_index(drop=True)], axis=1)

    return out

def align_dummy_columns(dfs, dummy_cols=['sex_female','sex_male']):
    """
    Asegura que cada df en la lista tenga las mismas columnas dummy (las crea con ceros si faltan).
    Modifica en sitio los DataFrames de la lista y devuelve la lista.
    """
    for i, df in enumerate(dfs):
        for c in dummy_cols:
            if c not in df.columns:
                df[c] = 0
        # opcional: garantizar orden consistente (dejar las dummies al final)
        cols_other = [c for c in df.columns if c not in dummy_cols]
        dfs[i] = df[cols_other + dummy_cols]
    return dfs

# ---------------------------
# Uso con tus dataframes
# ---------------------------
# Para diabetes: si la columna se llama 'Sex' usa col_name='Sex'; aquí intento manejar ambas variantes.
diabetes_df_clean = encode_sex_onehot(diabetes_df_clean, col_name='Sex', fillna_with_mode=False, drop_original=True)

# Para hypertension (en tu script ya rellenaste la moda), no hace daño volver a pedir fillna_with_mode=False
hypertension_df_clean = encode_sex_onehot(hypertension_df_clean, col_name='sex', fillna_with_mode=False, drop_original=True)

# Alinear columnas dummy entre datasets (por si uno no tenía ambos valores)
diabetes_df_clean, hypertension_df_clean = align_dummy_columns([diabetes_df_clean, hypertension_df_clean])

# Mostrar chequeo rápido
print("Diabetes columns ->", diabetes_df_clean.columns.tolist()[-4:])  # mostrar final
print("Hypertension columns ->", hypertension_df_clean.columns.tolist()[-4:])
print("Sex dummies (diabetes) value counts:")
print(diabetes_df_clean[['sex_female','sex_male']].sum())
print("Sex dummies (hypertension) value counts:")
print(hypertension_df_clean[['sex_female','sex_male']].sum())


Diabetes columns -> ['Diabetes', 'Age_mapped', 'sex_female', 'sex_male']
Hypertension columns -> ['thal', 'target', 'sex_female', 'sex_male']
Sex dummies (diabetes) value counts:
sex_female    34828
sex_male      29192
dtype: int64
Sex dummies (hypertension) value counts:
sex_female    13584
sex_male      13593
dtype: int64


## 4) Tratamiento de outliers (funciones reutilizables)
- Usaremos IQR-capping (winsorization) y/o imputación por mediana agrupada por sexo/edad cuando aplique.
- Para variables binarios y ordinales con pocos niveles no aplicaremos outlier removal.


In [13]:
# Utilidades de outliers
def iqr_bounds(series, k=1.5):
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    iqr = q3 - q1
    return q1 - k*iqr, q3 + k*iqr

def cap_outliers_iqr(series, k=1.5):
    low, high = iqr_bounds(series, k=k)
    return series.clip(lower=low, upper=high)

# Aplicar de forma segura: solo a variables continuas con muchos valores únicos
def cap_df_outliers(df, cols=None, k=1.5):
    df = df.copy()
    if cols is None:
        cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c]) and df[c].nunique() > 20]
    for c in cols:
        df[c] = cap_outliers_iqr(df[c], k=k)
    return df


In [14]:
# A) Diabetes: revisar BMI (valores extremos) — vamos a marcar y capear > 60 como outlier clínico
if 'diabetes_df_clean' in globals():
    df = diabetes_df_clean.copy()
    print("BMI stats before:")
    print(df["BMI"].describe())
    # marcar registros plausiblemente erróneos
    df["BMI_flag_high"] = (df["BMI"] > 60).astype(int)   # umbral configurable
    print("High BMI count:", df["BMI_flag_high"].sum())
    # opción default: capear por IQR
    df["BMI"] = cap_outliers_iqr(df["BMI"], k=1.5)
    print("BMI stats after capping:")
    print(df["BMI"].describe())
    diabetes_df_clean = df

# C) Obesity: crear BMI si Height (m) y Weight (kg) están presentes y chequear extremos
if 'obesity_df_clean' in globals():
    df = obesity_df_clean.copy()
    if "Height" in df.columns and "Weight" in df.columns:
        # algunos datasets height puede estar en cm; inferir
        median_height = df["Height"].median()
        if median_height > 3:  # asume está en cm
            df["Height_m"] = df["Height"] / 100.0
        else:
            df["Height_m"] = df["Height"]
        df["BMI_calc"] = df["Weight"] / (df["Height_m"]**2)
        print("BMI_calc stats:", df["BMI_calc"].describe())
        df["BMI_flag_high"] = (df["BMI_calc"] > 60).astype(int)
        # capear BMI_calc
        df["BMI_calc"] = cap_outliers_iqr(df["BMI_calc"], k=1.5)
    obesity_df_clean = df


BMI stats before:
count    64020.000000
mean        30.178554
std          7.287730
min         12.000000
25%         25.000000
50%         29.000000
75%         34.000000
max         98.000000
Name: BMI, dtype: float64
High BMI count: 260
BMI stats after capping:
count    64020.000000
mean        29.990245
std          6.545449
min         12.000000
25%         25.000000
50%         29.000000
75%         34.000000
max         47.500000
Name: BMI, dtype: float64
BMI_calc stats: count    20758.000000
mean        30.241842
std          8.333932
min         12.868541
25%         24.088223
50%         29.384757
75%         37.011168
max         54.997991
Name: BMI_calc, dtype: float64


## 6) Codificación categórica y feature engineering
- Obesity: codificamos Gender, CAEC, CALC, MTRANS con OneHot o TargetEncoder según requieras. También creamos `is_obese` (binaria) por defecto: `1` si la etiqueta contiene la palabra 'Obesity' o 'Overweight' (ajustable).


In [15]:
# Normalizar y mapear target ordinal de Obesity
# mapping recomendado (mayor = peor)
# ------------- mapping limpio (normalizado: lowercase + underscores) -------------
ordinal_map = {
    "insufficient_weight": 0,
    "normal_weight": 1,
    "overweight_level_i": 2,
    "overweight_level_ii": 3,
    "obesity_type_i": 4,
    "obesity_type_ii": 5,
    "obesity_type_iii": 6
}

# aliases/typos normalizados -> normalized key (all lower_case underscore)
alias_map = {
    "0rmal_weight": "normal_weight",
    "normalweight": "normal_weight",
    "insufficientweight": "insufficient_weight",
    "overweight_i": "overweight_level_i",
    "overweight_ii": "overweight_level_ii",
    "obesity_i": "obesity_type_i",
    "obesity_ii": "obesity_type_ii",
    "obesity_iii": "obesity_type_iii",
}

def normalize_target_val(s):
    if pd.isna(s):
        return np.nan
    s0 = str(s).strip().lower()
    # reemplazar espacios/guiones por underscore, eliminar caracteres no alfanuméricos salvo _
    s0 = s0.replace("-", "_").replace(" ", "_")
    s0 = re.sub(r'[^a-z0-9_]', '', s0)
    # map alias
    if s0 in alias_map:
        s0 = alias_map[s0]
    return s0

# Aplicar al dataframe (ejecutar solo si obesity_df_clean existe)
if 'obesity_df_clean' in globals():
    df = obesity_df_clean.copy()
    # detectar columna target si no la conoces exacta
    target_candidates = [c for c in df.columns if ('obes' in c.lower()) or ('0be1' in c.lower()) or ('obeldad' in c.lower()) or c.lower().startswith("ob")]
    target_col = target_candidates[0] if target_candidates else None
    if target_col is None:
        raise ValueError("No se encontró columna target para obesidad. Revisa nombres de columnas.")
    # normalizar
    df['_ob_target_norm'] = df[target_col].map(normalize_target_val)
    # Mostrar valores únicos no mapeados para inspección
    uniques = sorted([u for u in df['_ob_target_norm'].dropna().unique()])
    print("Unique normalized target values (sample):", uniques)

    # Mapear a ordinal (valores faltantes -> NaN)
    df['obesity_ordinal'] = df['_ob_target_norm'].map(ordinal_map).astype("Int64")

    # Si hay valores no mapeados, mostrarlos para corregir alias
    unmapped = sorted(set(df['_ob_target_norm'].dropna().unique()) - set(ordinal_map.keys()))
    if unmapped:
        print("Valores no mapeados (revisa y agrega a alias_map si corresponde):", unmapped)

    # Crear target binario (opcional). Umbral: >= Overweight_Level_I => obese
    # Ajusta el umbral si prefieres considerar solo 'Obesity_Type_*' como obesidad (p. ej. >=4)
    threshold_for_obese = ordinal_map["overweight_level_i"]
    df['is_obese'] = (df['obesity_ordinal'].notna() & (df['obesity_ordinal'] >= threshold_for_obese)).astype("Int64")

    # Guardar mapping y columnas en mapping_report
    mapping_report = mapping_report if 'mapping_report' in globals() else {}
    mapping_report['obesity_target'] = {
        "raw_column": target_col,
        "ordinal_map": ordinal_map,
        "alias_map": alias_map,
        "threshold_binary_is_obese": int(threshold_for_obese)
    }

    # Reemplazar en obesity_df_clean
    obesity_df_clean = df.copy()
    print("Mapped obesity_ordinal value counts:")
    print(obesity_df_clean['obesity_ordinal'].value_counts(dropna=False).sort_index())
    print("is_obese counts:")
    print(obesity_df_clean['is_obese'].value_counts(dropna=False))
else:
    print("No existe obesity_df_clean en el entorno.")


Unique normalized target values (sample): ['insufficient_weight', 'normal_weight', 'obesity_type_i', 'obesity_type_ii', 'obesity_type_iii', 'overweight_level_i', 'overweight_level_ii']
Mapped obesity_ordinal value counts:
obesity_ordinal
0    2523
1    3082
2    2427
3    2522
4    2910
5    3248
6    4046
Name: count, dtype: Int64
is_obese counts:
is_obese
1    15153
0     5605
Name: count, dtype: Int64


In [16]:
# --- Obesity: reemplazos definitivos y limpieza de temporales ---
if 'obesity_df_clean' in globals():
    df = obesity_df_clean.copy()

    # Age: si fue mapeada o convertida, asegurar que Age final sea Int64
    if "Age_numeric_temp" in df.columns:
        # si quedó por error
        df.loc[:, "Age"] = pd.to_numeric(df["Age_numeric_temp"], errors="coerce").round().astype("Int64")
        df = df.drop(columns=["Age_numeric_temp"])
    # si hay otra columna temporal Age_* similar, eliminarla
    if "Age_mapped" in df.columns:
        df.loc[:, "Age"] = df["Age_mapped"].astype("Int64")
        df = df.drop(columns=["Age_mapped"])

    #    si Gender_mapped existe por error elimínala y conserva Gender.
    if "Gender_mapped" in df.columns:
        # si Gender ya fue reemplazada, borrar la auxiliar
        df.loc[:, "Gender"] = df["Gender_mapped"].astype("Int64")
        df = df.drop(columns=["Gender_mapped"])

    # CAEC y CALC: ya fueron asignadas en su columna; si existe versión auxiliar bórrala
    for col in ["CAEC", "CALC"]:
        # si existen columnas auxiliares con sufijo _mapped elimínalas
        aux = col + "_mapped"
        if aux in df.columns:
            df.loc[:, col] = df[aux].astype("Int64")
            df = df.drop(columns=[aux])
        # asegurar tipo Int64
        if col in df.columns:
            df.loc[:, col] = df[col].astype("Int64")

    #    eliminar columnas temporales MTRANS_norm/MTRANS_std si quedaron.
    for c in ["MTRANS_norm", "MTRANS_std"]:
        if c in df.columns:
            df = df.drop(columns=[c])
    # Mantén los dummies y MTRANS_unknown (si los creaste).

    # FCVC, NCP, CH2O, FAF, TUE:  eliminar *_num_temp tras redondear.
    temp_num_cols = [c for c in df.columns if c.endswith("_num_temp")]
    if temp_num_cols:
        df = df.drop(columns=temp_num_cols)

    # Target de obesidad:
    raw_target_col = mapping_report.get("obesity_target", {}).get("raw_column") if 'mapping_report' in globals() else None
    if raw_target_col and raw_target_col in df.columns and "obesity_ordinal" in df.columns:
        # reemplazar la columna raw por el ordinal (Int64)
        df.loc[:, raw_target_col] = df["obesity_ordinal"].astype("Int64")
        df = df.drop(columns=["_ob_target_norm", "obesity_ordinal"], errors="ignore")
    else:
        if "_ob_target_norm" in df.columns:
            df = df.drop(columns=["_ob_target_norm"])

    #   is_obese como columna binaria
    # (Si no existe, recrearla)
    if "is_obese" not in df.columns and "obesity_ordinal" in df.columns:
        df.loc[:, "is_obese"] = (df["obesity_ordinal"].notna() & (df["obesity_ordinal"] >= 2)).astype("Int64")

    # Elimina cualquier columna residual con sufijos temporales comunes
    residual_temp = [c for c in df.columns if any(c.endswith(sfx) for sfx in ["_temp","_mapped","_norm","_std"])
                     and not c.endswith("_was_missing") and not c.endswith("_flag_high")]
    if residual_temp:
        df = df.drop(columns=residual_temp)

    # guardar y mostrar resumen
    obesity_df_clean = df.copy()
    print("Obesity cleaned. Columns now:", obesity_df_clean.columns.tolist())
else:
    print("No existe obesity_df_clean en el entorno.")

obesity_df_clean.drop(columns=["Height_m", "BMI_flag_high"], inplace=True)


Obesity cleaned. Columns now: ['Age', 'Height', 'Weight', 'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', '0be1dad', 'Gender_std_female', 'Gender_std_male', 'MTRANS_std_automobile', 'MTRANS_std_bike', 'MTRANS_std_motorbike', 'MTRANS_std_public_transportation', 'MTRANS_std_walking', 'Height_m', 'BMI_calc', 'BMI_flag_high', 'is_obese']


In [17]:
obesity_df_clean.drop(columns=["BMI_calc", "0be1dad"], inplace=True)

In [18]:
obesity_df_clean.head()

Unnamed: 0,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,...,TUE,CALC,Gender_std_female,Gender_std_male,MTRANS_std_automobile,MTRANS_std_bike,MTRANS_std_motorbike,MTRANS_std_public_transportation,MTRANS_std_walking,is_obese
0,24.0,1.699998,81.66995,1,1,2.0,3.0,1,0,3.0,...,1.0,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1
1,18.0,1.56,57.0,1,1,2.0,3.0,2,0,2.0,...,1.0,0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0
2,18.0,1.71146,50.165754,1,1,2.0,1.0,1,0,2.0,...,2.0,0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0
3,21.0,1.71073,131.274851,1,1,3.0,3.0,1,0,2.0,...,1.0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
4,32.0,1.914186,93.798055,1,1,3.0,2.0,1,0,2.0,...,1.0,1,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1


In [19]:
diabetes_df_clean.head()

Unnamed: 0,HighChol,CholCheck,BMI,Smoker,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,GenHlth,MentHlth,PhysHlth,DiffWalk,Stroke,HighBP,Diabetes,Age_mapped,sex_female,sex_male,BMI_flag_high
0,0.0,1.0,26.0,0.0,0.0,1.0,0.0,1.0,0.0,3.0,5.0,30.0,0.0,0.0,1.0,0.0,38,0,1,0
1,1.0,1.0,26.0,1.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0,79,0,1,0
2,0.0,1.0,26.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,10.0,0.0,0.0,0.0,0.0,94,0,1,0
3,1.0,1.0,28.0,1.0,0.0,1.0,1.0,1.0,0.0,3.0,0.0,3.0,0.0,0.0,1.0,0.0,72,0,1,0
4,0.0,1.0,29.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,59,1,0,0


In [20]:
hypertension_df_clean.head()

Unnamed: 0,age,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,sex_female,sex_male
0,57.0,3,145,233,1,0.0,150,0,2.3,0,0,1.0,1,0,1
1,64.0,2,130,250,0,1.0,187,0,3.5,0,0,2.0,1,1,0
2,52.0,1,130,204,0,0.0,172,0,1.4,2,0,2.0,1,0,1
3,56.0,1,120,236,0,1.0,178,0,0.8,2,0,2.0,1,1,0
4,66.0,0,120,354,0,1.0,163,1,0.6,2,0,2.0,1,1,0


In [21]:
#pasar todas las columnas target a que tengan el nombre target
obesity_df_clean.rename(columns={'is_obese': 'target'}, inplace=True)
diabetes_df_clean.rename(columns={'Diabetes': 'target'}, inplace=True)
hypertension_df_clean.rename(columns={'target': 'target'}, inplace=True)

## 7) Preparar datasets para modelado
-
X_train, X_test, y_train, y_test y guardar los transformadores.


In [22]:
from sklearn.model_selection import train_test_split

# Definir proporción de test y semilla para reproducibilidad
TEST_SIZE = 0.2
RANDOM_STATE = 42

# ---------- Split Diabetes ----------
if 'diabetes_df_clean' in globals():
    df = diabetes_df_clean.copy()
    X = df.drop(columns=['target'])
    y = df['target']
    X_train_diab, X_test_diab, y_train_diab, y_test_diab = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
    print(" Diabetes split ->",
          f"Train: {X_train_diab.shape}, Test: {X_test_diab.shape}")

# ---------- Split Hypertension ----------
if 'hypertension_df_clean' in globals():
    df = hypertension_df_clean.copy()
    X = df.drop(columns=['target'])
    y = df['target']
    X_train_hyp, X_test_hyp, y_train_hyp, y_test_hyp = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
    print(" Hypertension split ->",
          f"Train: {X_train_hyp.shape}, Test: {X_test_hyp.shape}")

# ---------- Split Obesity ----------
if 'obesity_df_clean' in globals():
    df = obesity_df_clean.copy()
    X = df.drop(columns=['target'])
    y = df['target']
    X_train_ob, X_test_ob, y_train_ob, y_test_ob = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
    print(" Obesity split ->",
          f"Train: {X_train_ob.shape}, Test: {X_test_ob.shape}")


 Diabetes split -> Train: (51216, 19), Test: (12804, 19)
 Hypertension split -> Train: (22838, 14), Test: (5710, 14)
 Obesity split -> Train: (16606, 21), Test: (4152, 21)


In [23]:
# extraer en csv cada train y test
if 'diabetes_df_clean' in globals():
  X_train_diab.to_csv("X_train_diab.csv", index=False)
  X_test_diab.to_csv("X_test_diab.csv", index=False)
  y_train_diab.to_csv("y_train_diab.csv", index=False)
  y_test_diab.to_csv("y_test_diab.csv", index=False)
  print(" Diabetes saved")

if 'hypertension_df_clean' in globals():
  X_train_hyp.to_csv("X_train_hyp.csv", index=False)
  X_test_hyp.to_csv("X_test_hyp.csv", index=False)
  y_train_hyp.to_csv("y_train_hyp.csv", index=False)
  y_test_hyp.to_csv("y_test_hyp.csv", index=False)
  print(" Hypertension saved")

if 'obesity_df_clean' in globals():
  X_train_ob.to_csv("X_train_ob.csv", index=False)
  X_test_ob.to_csv("X_test_ob.csv", index=False)
  y_train_ob.to_csv("y_train_ob.csv", index=False)
  y_test_ob.to_csv("y_test_ob.csv", index=False)
  print(" Obesity saved")

 Diabetes saved
 Hypertension saved
 Obesity saved
