In [2]:
# ============================
# Selección de features útiles
# ============================

import pandas as pd

# --- Configuración ---
file_path = "../data/places_no_social_clean.csv"   # 👈 cámbialo a imputado si quieres
target_candidates = ["depression", "frequent_mental_distress"]

# --- 1. Cargar datos ---
df = pd.read_csv(file_path)

print("Dimensiones:", df.shape)
print("\n📌 Todas las columnas disponibles:")
print(df.columns.tolist())

# --- 2. Función para seleccionar features ---
def select_features(df, target, corr_threshold=0.25, drop_cols=None):
    if drop_cols is None:
        drop_cols = ["stateabbr", "statedesc", "locationname", "locationid", 
                     "totalpopulation", "totalpop18plus"]

    # Copia y convierte a numérico todo
    df_num = df.copy()
    for col in df_num.columns:
        if col not in drop_cols:
            df_num[col] = pd.to_numeric(df_num[col], errors="coerce")

    # Calcular correlaciones
    if target not in df_num.columns:
        raise ValueError(f"⚠️ Target {target} no está en el DataFrame.")

    corrs = df_num.corr(numeric_only=True)[target].drop(target).sort_values(ascending=False)

    # Filtrar
    selected = corrs[abs(corrs) >= corr_threshold]

    return corrs, selected

# --- 3. Ejecutar para cada target ---
for t in target_candidates:
    if t in df.columns:
        print(f"\n🔎 Target: {t}")
        corrs, selected = select_features(
            df, target=t, corr_threshold=0.25, drop_cols=target_candidates
        )
        print("\nTop correlaciones:")
        print(corrs.head(15))
        print("\n✅ Features seleccionadas:")
        print(list(selected.index))
    else:
        print(f"\n⚠️ Target '{t}' no está en este dataset.")


Dimensiones: (3077, 39)

📌 Todas las columnas disponibles:
['stateabbr', 'statedesc', 'locationname', 'totalpopulation', 'totalpop18plus', 'locationid', 'all_teeth_lost', 'annual_checkup', 'any_disability', 'arthritis', 'binge_drinking', 'copd', 'cancer_(non_skin)_or_melanoma', 'cholesterol_screening', 'cognitive_disability', 'colorectal_cancer_screening', 'coronary_heart_disease', 'current_asthma', 'current_cigarette_smoking', 'dental_visit', 'depression', 'diabetes', 'frequent_mental_distress', 'frequent_physical_distress', 'general_health', 'health_insurance', 'hearing_disability', 'high_blood_pressure', 'high_blood_pressure_medication', 'high_cholesterol', 'independent_living_disability', 'mammography', 'mobility_disability', 'obesity', 'physical_inactivity', 'self_care_disability', 'short_sleep_duration', 'stroke', 'vision_disability']

🔎 Target: depression

Top correlaciones:
frequent_mental_distress         0.720885
current_asthma                   0.633135
cognitive_disability 