# Creación dataset ficticio

In [16]:
import pandas as pd
import numpy as np

def generar_datos_ficticios_completos(n=200, seed=42):
    np.random.seed(seed)

    # ---------------------------
    # Catálogos de valores
    # ---------------------------
    direccion_vals = [
        "Kr 14 # 21 105", "kra 46A # 110_64", "Calle 10 4 48 capincito",
        "Carrera 3 6 124", "Cea 3 6 20", "Kra 46A # 110_60"
    ]

    municipio_nombre_vals = [
        "Barranquilla","Medellín","Bucaramanga","Bogotá, D.C.","Cúcuta","Cali",
        "Cartagena","Sincelejo","Buenavista","San Marcos","Itagüí","Soacha",
        "Popayán","Pasto","Mocoa","Santa Marta","Villavicencio","Chía","Neiva"
    ]

    nombre_municipio_vals = [
        "BARRANQUILLA","MEDELLÍN","BUCARAMANGA","BOGOTÁ, D.C.",
        "SAN JOSÉ DE CÚCUTA","SANTIAGO DE CALI","CARTAGENA DE INDIAS","SINCELEJO",
        "POPAYÁN","PASTO","MOCOA","SANTA MARTA","VILLAVICENCIO","CHÍA","NEIVA"
    ]

    nombre_depto_vals = [
        "ATLÁNTICO","ANTIOQUIA","SANTANDER","BOGOTÁ, D.C.","NORTE DE SANTANDER",
        "VALLE DEL CAUCA","BOLÍVAR","SUCRE","CAUCA","NARIÑO","PUTUMAYO",
        "MAGDALENA","META","HUILA"
    ]

    dane5_vals = [8001, 5001, 68001, 11001, 54001, 76001, 13001, 70001, 25175]

    # ---------------------------
    # DataFrame
    # ---------------------------
    df = pd.DataFrame({
        "AÑO": np.random.choice(range(2018, 2024), size=n),
        "GPSLAT": np.random.choice([np.nan] + list(np.random.uniform(4, 11, n)), size=n),
        "GPSLONG": np.random.choice([np.nan] + list(np.random.uniform(-76, -74, n)), size=n),
        "SECTOR": np.random.randint(1, 100, size=n),
        "SECCION": np.random.randint(1, 10, size=n),
        "MANZANA": np.random.randint(1, 50, size=n),
        "DIRECCION_FILTRO": np.random.choice(direccion_vals, size=n),
        "DANE5": np.random.choice(dane5_vals, size=n),
        "PB1": np.random.choice([1,2], size=n),
        "PERSONAS": np.random.choice([1,2,3,4,5,6,7,8,9,10], size=n),
        "EDAD": np.random.randint(12, 80, size=n),
        "REDAD": np.random.choice([1,2,3,4,5,6,7], size=n),
        "GENERO": np.random.choice([1,2], size=n),
        "ESTRATO": np.random.choice([0,1,2,3,4,5,6], size=n),
        "P9": np.random.choice([np.nan] + list(np.random.rand(10)), size=n),
        "P10": np.random.choice(["88","01","010203","112123"], size=n),
        "P34": np.random.choice([np.nan] + list(np.random.rand(10)), size=n),
        "P56": np.random.choice([np.nan] + list(np.random.rand(10)), size=n),
        "P57": np.random.choice([np.nan] + list(np.random.rand(10)), size=n),
        "P64": np.random.choice([np.nan] + list(np.random.rand(10)), size=n),
        "NIVEL_PIRAMIDE": np.random.choice([0,1,2,3], size=n),
        "POBLACION_5_16": np.random.uniform(100, 5000, size=n),
        "Estudiantes_5_16": np.random.uniform(50, 3000, size=n),
        "PROP_EDUC_5_16_MEN": np.random.uniform(0, 1, size=n),
        "HOGARES_INTERNET": np.random.randint(10, 80000, size=n),
        "POBLACIÓN_ICFES": np.random.randint(50, 90000, size=n),
        "TASA_INTERNET_ICFES": np.random.uniform(0, 1, size=n),
        "MUNICIPIO_NOMBRE": np.random.choice(municipio_nombre_vals, size=n),
        "Nombre Municipio": np.random.choice(nombre_municipio_vals, size=n),
        "Nombre Departamento": np.random.choice(nombre_depto_vals, size=n),
        "Servicios_Telecomunicaciones_No": np.random.choice([0,1], size=n),
        "Servicios_Telecomunicaciones_Si": np.random.choice([0,1], size=n),
        "Dispositivos_hogar_No": np.random.choice([True, False], size=n),
        "conexion_hogar_si": np.random.choice([True, False], size=n),
        "interrupciones_si": np.random.choice([True, False], size=n),
        "frec_uso_si": True,
        "dens_int": np.random.uniform(0, 1, size=n)
    })

    return df

# Ejemplo de uso
df_ficticio = generar_datos_ficticios_completos(200)

# Implementación modelo

In [17]:
# ============================================
# 0) Imports y utilidades
# ============================================
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score, adjusted_rand_score

# -------- Helpers de preparación --------
FEATURES = [
    "Servicios_Telecomunicaciones_No",
    "Servicios_Telecomunicaciones_Si",
    "Dispositivos_hogar_No",
    "conexion_hogar_si",
    "interrupciones_si",
    "frec_uso_si",
    "dens_int",
]

# Variables SOLO para perfilar (no entrenan el modelo)
SOCIO_GEO = [
    "EDAD", "ESTRATO", "NIVEL_PIRAMIDE",
    "MUNICIPIO_NOMBRE", "Nombre Departamento"
]

def load_data(path="data_procesada.csv"):
    df = pd.read_csv(path)
    # Asegurar columnas esperadas
    missing = [c for c in FEATURES if c not in df.columns]
    if missing:
        raise ValueError(f"Faltan columnas en data_procesada: {missing}")

    # Coerción a numérico/bool
    for c in FEATURES:
        # Si es binario, mapear True/False/NaN a 1/0
        if c != "dens_int":
            df[c] = df[c].astype("float").astype("Int64")  # por si viene como 0/1/NaN
            df[c] = df[c].fillna(0).astype(int)
        else:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    # Imputación mínima para dens_int
    if df["dens_int"].isna().any():
        df["dens_int"] = df["dens_int"].fillna(df["dens_int"].median())

    return df

def apply_weights(X: pd.DataFrame, weights: dict | None):
    """Multiplica columnas por pesos; útil para el Experimento 2."""
    Xw = X.copy()
    if weights:
        for col, w in weights.items():
            if col in Xw.columns:
                Xw[col] = Xw[col].astype(float) * float(w)
    return Xw

def scale_numeric(X: pd.DataFrame, numeric_cols=("dens_int",)):
    Xs = X.copy()
    scaler = StandardScaler()
    Xs[list(numeric_cols)] = scaler.fit_transform(Xs[list(numeric_cols)])
    return Xs

# -------- Métricas y estabilidad --------
def compute_internal_metrics(X: np.ndarray, labels: np.ndarray) -> dict:
    # Silhouette requiere al menos 2 clústers y no todos los puntos iguales
    sil = silhouette_score(X, labels) if len(np.unique(labels)) > 1 else np.nan
    ch  = calinski_harabasz_score(X, labels) if len(np.unique(labels)) > 1 else np.nan
    db  = davies_bouldin_score(X, labels) if len(np.unique(labels)) > 1 else np.nan
    return {"silhouette": sil, "calinski_harabasz": ch, "davies_bouldin": db}

def profile_sharpness(centroids: pd.DataFrame, binary_cols: list[str]) -> float:
    """Promedio de distancia del centroide a 0.5 (cuanto mayor, más 'nítido')."""
    if not binary_cols:
        return np.nan
    diffs = []
    for col in binary_cols:
        if col in centroids.columns:
            diffs.append(np.abs(centroids[col] - 0.5))
    if not diffs:
        return np.nan
    return float(pd.concat(diffs, axis=1).mean().mean())

def cluster_size_ok(labels: np.ndarray, min_ratio=0.02) -> bool:
    n = len(labels)
    sizes = pd.Series(labels).value_counts(normalize=True)
    return (sizes >= min_ratio).all()

def stability_ari(X: np.ndarray, k: int, seeds: list[int], subsample=0.8) -> float:
    """Promedio de ARI entre múltiples corridas con submuestreo."""
    label_list = []
    rng = np.random.default_rng(42)
    n = X.shape[0]
    for seed in seeds:
        idx = rng.choice(n, size=int(n*subsample), replace=False)
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=seed)
        labels = kmeans.fit_predict(X[idx])
        # Volver a tamaño completo (asignación a vecinos más cercanos)
        # Para estabilidad comparamos solo intersecciones
        label_list.append((idx, labels))

    # Promedio de ARI en intersecciones
    if len(label_list) < 2:
        return np.nan
    aris = []
    for i in range(len(label_list)):
        for j in range(i+1, len(label_list)):
            idx_i, lab_i = label_list[i]
            idx_j, lab_j = label_list[j]
            common = np.intersect1d(idx_i, idx_j, assume_unique=False)
            if len(common) < 10:
                continue
            # Mapear a posiciones en cada subconjunto
            map_i = pd.Series(range(len(idx_i)), index=idx_i)
            map_j = pd.Series(range(len(idx_j)), index=idx_j)
            li = lab_i[map_i.loc[common].values]
            lj = lab_j[map_j.loc[common].values]
            aris.append(adjusted_rand_score(li, lj))
    return float(np.mean(aris)) if aris else np.nan

def geographic_entropy(df: pd.DataFrame, labels: np.ndarray, col_region="Nombre Departamento") -> float:
    """Entropía normalizada promedio de la distribución regional por clúster."""
    df_ = df[[col_region]].copy()
    df_["cluster"] = labels
    entropies = []
    for c, g in df_.groupby("cluster"):
        p = g[col_region].value_counts(normalize=True)
        H = -(p * np.log(p + 1e-12)).sum()
        H_max = np.log(len(p)) if len(p) > 1 else 1.0
        entropies.append(H / H_max)
    return float(np.mean(entropies)) if entropies else np.nan

def centroid_table(X: pd.DataFrame, labels: np.ndarray):
    """Promedio por clúster (binarios → tasas; dens_int → media)."""
    dfc = X.copy()
    dfc["cluster"] = labels
    cent = dfc.groupby("cluster").mean(numeric_only=True).sort_index()
    sizes = dfc["cluster"].value_counts().sort_index().to_frame(name="n")
    sizes["pct"] = sizes["n"] / len(X)
    return cent, sizes

# -------- Bucle de experimentos --------
def run_kmeans_grid(df: pd.DataFrame, k_values, weights=None, exp_name="exp"):
    # 1) Subconjunto de features
    X = df[FEATURES].copy()
    # 2) Pesos (opcional para Exp. 2)
    X = apply_weights(X, weights)
    # 3) Escalar dens_int
    X = scale_numeric(X, numeric_cols=("dens_int",))
    X_np = X.values

    results = []
    best = None

    for k in k_values:
        km = KMeans(n_clusters=k, n_init=50, init="k-means++", random_state=123)
        labels = km.fit_predict(X_np)

        # Métricas internas
        m = compute_internal_metrics(X_np, labels)

        # Tamaño mínimo por clúster
        ok = cluster_size_ok(labels, min_ratio=0.02)

        # Centroides y nitidez de perfil
        cent, sizes = centroid_table(X, labels)
        ps = profile_sharpness(cent, [c for c in FEATURES if c != "dens_int"])

        # Estabilidad (rápida): seeds más pequeñas para la malla
        ari = stability_ari(X_np, k, seeds=[0,1,2,3,4], subsample=0.8)

        row = {
            "exp": exp_name, "k": k, "silhouette": m["silhouette"],
            "calinski_harabasz": m["calinski_harabasz"], "davies_bouldin": m["davies_bouldin"],
            "profile_sharpness": ps, "stability_ari": ari,
            "min_size_ok": ok, "sizes": sizes["pct"].round(3).to_dict()
        }
        results.append(row)

        # Actualizar mejor (por Silhouette, luego CH, luego PS y estabilidad)
        if ok:
            if best is None:
                best = (row, labels, cent, sizes)
            else:
                br = best[0]
                better = (
                    (row["silhouette"] > br["silhouette"] + 1e-6) or
                    (np.isclose(row["silhouette"], br["silhouette"]) and row["calinski_harabasz"] > br["calinski_harabasz"] + 1e-6) or
                    (np.isclose(row["silhouette"], br["silhouette"]) and np.isclose(row["calinski_harabasz"], br["calinski_harabasz"]) and row["profile_sharpness"] > br["profile_sharpness"] + 1e-6) or
                    (np.isclose(row["silhouette"], br["silhouette"]) and np.isclose(row["calinski_harabasz"], br["calinski_harabasz"]) and np.isclose(row["profile_sharpness"], br["profile_sharpness"]) and row["stability_ari"] > br["stability_ari"] + 1e-6)
                )
                if better:
                    best = (row, labels, cent, sizes)

    results_df = pd.DataFrame(results).sort_values(["silhouette","calinski_harabasz"], ascending=[False, False])
    return results_df, best

def interpret_clusters(df: pd.DataFrame, labels: np.ndarray):
    """Perfiles con variables socio/geo (no utilizadas en el entrenamiento)."""
    out = {}

    # Anexar labels
    tmp = df.copy()
    tmp["cluster"] = labels

    # Resumen socio-demográfico
    socio_num = tmp.groupby("cluster")[["EDAD", "ESTRATO"]].agg(["mean","median","std","count"])
    # NIVEL_PIRAMIDE distribución
    piramide = (tmp
                .groupby(["cluster", "NIVEL_PIRAMIDE"])
                .size()
                .groupby(level=0)
                .apply(lambda s: (s / s.sum()).round(3))
                .unstack(fill_value=0))

    # Top departamentos y municipios por cluster
    depto = (tmp.groupby(["cluster","Nombre Departamento"]).size()
                .groupby(level=0)
                .apply(lambda s: (s / s.sum()).sort_values(ascending=False).head(5))
                .to_frame("share"))
    mpio = (tmp.groupby(["cluster","MUNICIPIO_NOMBRE"]).size()
                .groupby(level=0)
                .apply(lambda s: (s / s.sum()).sort_values(ascending=False).head(5))
                .to_frame("share"))

    out["socio_num"] = socio_num
    out["nivel_piramide"] = piramide
    out["top_departamentos"] = depto
    out["top_municipios"] = mpio
    return out


In [18]:

# ============================================
# 1) Cargar datos
# ============================================
df = load_data("data_procesada.csv")

# ============================================
# 2) Experimento 1 — Línea base
# ============================================
k_values_exp1 = [4,5,6,7,8,9,10,12]
res1, best1 = run_kmeans_grid(df, k_values_exp1, weights=None, exp_name="exp1")
print("Resumen EXP1 (top 10):")
print(res1.head(10))

# Interpretación del mejor de EXP1
row1, labels1, cent1, sizes1 = best1
geoH1 = geographic_entropy(df, labels1, col_region="Nombre Departamento")
print("\nMejor EXP1:", row1)
print("Entropía geográfica (promedio):", round(geoH1, 3))
print("\nCentroides EXP1:")
print(cent1.round(3))
print("\nTamaños EXP1:")
print(sizes1.assign(pct = (sizes1["pct"]*100).round(1)))

interp1 = interpret_clusters(df, labels1)
# Ejemplo de impresión breve:
print("\nTop departamentos EXP1:")
print(interp1["top_departamentos"].head(15))

Resumen EXP1 (top 10):
    exp   k  silhouette  calinski_harabasz  davies_bouldin  profile_sharpness  \
7  exp1  12    0.581389        5727.729515        0.786539           0.469081   
6  exp1  10    0.552298        5373.025104        0.797002           0.458214   
5  exp1   9    0.526628        5215.895152        0.862672           0.446199   
4  exp1   8    0.503722        5078.551223        0.929814           0.455454   
3  exp1   7    0.489833        5008.885657        0.978842           0.437764   
2  exp1   6    0.473070        4838.416996        0.932473           0.441245   
1  exp1   5    0.436763        4937.847432        0.936604           0.423702   
0  exp1   4    0.376060        4788.867507        1.031835           0.391998   

   stability_ari  min_size_ok  \
7       0.955508         True   
6       0.881761         True   
5       0.784771         True   
4       0.839268         True   
3       0.838855         True   
2       0.861025         True   
1       0.972946

In [19]:
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# --- 1) Features para clustering (las que usaste en EXP1) ---
FEATURES = [
    "Servicios_Telecomunicaciones_No",
    "Servicios_Telecomunicaciones_Si",
    "Dispositivos_hogar_No",
    "conexion_hogar_si",
    "interrupciones_si",
    "frec_uso_si",
    "dens_int",
]

# --- 2) Preparar matriz de entrenamiento ---
X = df[FEATURES].copy()
scaler = StandardScaler()
X[["dens_int"]] = scaler.fit_transform(X[["dens_int"]])  # solo escalar dens_int

# --- 3) Entrenar K-means con k=12 (configuración seleccionada) ---
kmeans = KMeans(n_clusters=12, n_init=50, random_state=123)
labels = kmeans.fit_predict(X)

# Añadir columna de cluster al dataframe original
df_lab = df.copy()
df_lab["cluster"] = labels

# Aplicación en dataset ficticio

In [20]:
features = X.columns

# Seleccionar solo esas columnas en la nueva muestra
X_ficticio = df_ficticio[features]

# Aplicar predicción de clusters
labels_test = kmeans.predict(X_ficticio)

# Añadir al df ficticio
df_ficticio_lab = df_ficticio.copy()
df_ficticio_lab["cluster"] = labels_test

In [21]:
sil_ficticio = silhouette_score(X_ficticio, labels_test)
print("Silhouette en nueva muestra:", sil_ficticio)

Silhouette en nueva muestra: 0.07785950073257869
