# Implementación del modelo

In [7]:
# ============================================
# 0) Imports y utilidades
# ============================================
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

def load_data(path="data_procesada.csv", features=None):
    """Read the processed CSV and coerce the feature columns to numeric types.

    Args:
        path: CSV file to read.
        features: Column names to validate and coerce. When ``None`` the
            module-level ``FEATURES`` list is used, so ``FEATURES`` must
            already be defined at call time in that case.

    Returns:
        A DataFrame holding every column of the file. Each feature other than
        ``"dens_int"`` is cast to ``int`` with missing values replaced by 0.
        ``"dens_int"`` is parsed as numeric; values that cannot be parsed, or
        that are missing, are replaced by the column median.

    Raises:
        ValueError: If any requested feature is absent from the file.
        NameError: If ``features`` is ``None`` and ``FEATURES`` is undefined.
    """
    df = pd.read_csv(path)

    if features is None:
        # Backward-compatible default: the notebook-level feature list.
        features = FEATURES

    # Make sure the expected columns are present
    missing = [c for c in features if c not in df.columns]
    if missing:
        raise ValueError(f"Faltan columnas en data_procesada: {missing}")

    for c in features:
        if c == "dens_int":
            # Continuous column: unparseable entries become NaN, then NaN is
            # filled with the median of the remaining values.
            dens = pd.to_numeric(df[c], errors="coerce")
            if dens.isna().any():
                dens = dens.fillna(dens.median())
            df[c] = dens
        else:
            # Indicator column: go through float and nullable Int64 so that
            # True/False/NaN inputs end up as plain integers, NaN -> 0.
            df[c] = df[c].astype("float").astype("Int64")
            df[c] = df[c].fillna(0).astype(int)

    return df

In [8]:
# ============================================
# 1) Cargar datos
# ============================================
df = load_data("data_procesada.csv")

In [9]:
# --- 1) Variables para clustering  ---
FEATURES = [
    "Servicios_Telecomunicaciones_No",
    "Servicios_Telecomunicaciones_Si",
    "Dispositivos_hogar_No",
    "conexion_hogar_si",
    "interrupciones_si",
    "frec_uso_si",
    "dens_int",
]

# --- 2) Build the training matrix ---
# Work on a copy so that scaling does not modify df. Only dens_int is
# standardized; the other feature columns go to K-means unscaled.
X = df.loc[:, FEATURES].copy()
scaler = StandardScaler()
X[["dens_int"]] = scaler.fit_transform(X[["dens_int"]])

# --- 3) Fit K-means with k=12 (selected configuration) ---
kmeans = KMeans(n_clusters=12, n_init=50, random_state=123)
labels = kmeans.fit(X).labels_

# New dataframe: the original data plus the assigned cluster of each row
df_lab = df.assign(cluster=labels)

In [10]:
# --- 4) Name normalization used to match against the shapefile ---
def norm(s: pd.Series) -> pd.Series:
    """Return *s* as upper-case ASCII text with single spaces.

    Values are cast to ``str`` and upper-cased, accents are removed (NFKD
    decomposition followed by dropping every non-ASCII character), any
    character other than ``A-Z``, ``0-9`` or whitespace becomes a space,
    whitespace runs collapse to one space, and the ends are stripped.
    """
    upper = s.astype(str).str.upper()
    # Decompose accented letters, then discard what is not plain ASCII
    decomposed = upper.str.normalize("NFKD")
    ascii_only = decomposed.str.encode("ascii", "ignore").str.decode("utf-8")
    no_punct = ascii_only.str.replace(r"[^A-Z0-9\s]", " ", regex=True)
    single_spaced = no_punct.str.replace(r"\s+", " ", regex=True)
    return single_spaced.str.strip()

# Normalized department name, stored alongside the original column
df_lab["dep_norm"] = df_lab["Nombre Departamento"].pipe(norm)

In [11]:
# Write the labelled dataframe to disk, without the index column
df_lab.to_csv(
    "df_lab.csv",
    encoding="utf-8",
    index=False,
)