In [1]:
# 📓 Notebook: eda_features_overview.ipynb
# 🎯 Objetivo: listar y comparar las features disponibles en los datasets principales
# -------------------------------------------------------------------

import pandas as pd

# ==============================
# 1. Load the main datasets
# ==============================
# Both paths are relative, so they resolve against the kernel's current
# working directory.
path_no_social, path_imputed_full = (
    "../data/places_no_social_clean.csv",
    "../data/places_imputed_full_clean.csv",
)

df_no_social = pd.read_csv(path_no_social)
df_imputed_full = pd.read_csv(path_imputed_full)

# Quick sanity check: (rows, columns) of each frame.
print("No Social Clean:", df_no_social.shape)
print("Imputed Full Clean:", df_imputed_full.shape)

# ==============================
# 2. List the columns of each dataset
# ==============================
# Column names are collected into sets; they are sorted only when printed
# so the listing is stable from run to run.
cols_no_social = {*df_no_social.columns}
cols_imputed = {*df_imputed_full.columns}

# Each call prints the header line and then the sorted names on the next line.
print("\n--- Features en No Social Clean ---", sorted(cols_no_social), sep="\n")

print("\n--- Features en Imputed Full Clean ---", sorted(cols_imputed), sep="\n")

# ==============================
# 3. Column comparison
# ==============================
# Shared columns first, then the columns exclusive to each side. Every
# group is reported as a count followed by its sorted names.
common_features = cols_no_social.intersection(cols_imputed)
print("\n✅ Features comunes:", len(common_features))
print(sorted(common_features))

only_no_social = cols_no_social.difference(cols_imputed)
print("\n⚠️ Solo en No Social:", len(only_no_social))
print(sorted(only_no_social))

only_imputed = cols_imputed.difference(cols_no_social)
print("\n⚠️ Solo en Imputed Full:", len(only_imputed))
print(sorted(only_imputed))

# ==============================
# 4. Save the summary as CSV
# ==============================
# One record per feature, tagged with where it appears. Groups are emitted
# in a fixed order (both, only No Social, only Imputed Full) and each group
# is sorted by feature name.
summary = [
    {"feature": name, "dataset": tag}
    for tag, names in (
        ("Ambos", common_features),
        ("Solo No Social", only_no_social),
        ("Solo Imputed Full", only_imputed),
    )
    for name in sorted(names)
]

df_summary = pd.DataFrame(data=summary)
# index=False keeps the row index out of the CSV file.
df_summary.to_csv("../data/features_comparison.csv", index=False)

print("\n📊 Resumen guardado en ../data/features_comparison.csv")
# Last expression of the cell: rendered by the notebook as a preview table.
df_summary.head(20)


No Social Clean: (3077, 39)
Imputed Full Clean: (3144, 46)

--- Features en No Social Clean ---
['all_teeth_lost', 'annual_checkup', 'any_disability', 'arthritis', 'binge_drinking', 'cancer_(non_skin)_or_melanoma', 'cholesterol_screening', 'cognitive_disability', 'colorectal_cancer_screening', 'copd', 'coronary_heart_disease', 'current_asthma', 'current_cigarette_smoking', 'dental_visit', 'depression', 'diabetes', 'frequent_mental_distress', 'frequent_physical_distress', 'general_health', 'health_insurance', 'hearing_disability', 'high_blood_pressure', 'high_blood_pressure_medication', 'high_cholesterol', 'independent_living_disability', 'locationid', 'locationname', 'mammography', 'mobility_disability', 'obesity', 'physical_inactivity', 'self_care_disability', 'short_sleep_duration', 'stateabbr', 'statedesc', 'stroke', 'totalpop18plus', 'totalpopulation', 'vision_disability']

--- Features en Imputed Full Clean ---
['all_teeth_lost', 'annual_checkup', 'any_disability', 'arthritis', 'b

Unnamed: 0,feature,dataset
0,all_teeth_lost,Ambos
1,annual_checkup,Ambos
2,any_disability,Ambos
3,arthritis,Ambos
4,binge_drinking,Ambos
5,cancer_(non_skin)_or_melanoma,Ambos
6,cholesterol_screening,Ambos
7,cognitive_disability,Ambos
8,colorectal_cancer_screening,Ambos
9,copd,Ambos
