In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Opt in to pandas' future behavior of not silently downcasting dtypes
# (relevant to the `.replace(' ', np.nan)` calls used when cleaning columns).
pd.set_option('future.no_silent_downcasting', True)

In [2]:
# ==========================================
# 1. DATA LOADING
# ==========================================
# NOTE(review): absolute, machine-specific path; the notebook only runs as-is
# on the author's machine. Point `ruta` at your local copy of the CSV.
ruta = "/Users/danielarenee/Desktop/MD_proyecto_final/data_adolescentes/adolescentes_ensanut2023_w_n.csv"

# The file uses ';' as its field separator.
data = pd.read_csv(ruta, delimiter=";")

# Report the loaded shape so the output recorded for this cell is actually
# reproduced on a fresh "Restart & Run All".
print(f"Dataset cargado: {data.shape}")

Dataset cargado: (1924, 467)


In [4]:
# ==========================================
# 2. TARGET VARIABLE CREATION
# ==========================================

# Survey items that feed the depression score.
variables_depresion = ['d0601a', 'd0601b', 'd0601c', 'd0601d', 'd0601f', 'd0601g']

# Work on a copy so the frame loaded from disk stays untouched.
data_copy = data.copy()

# Convert the items to numeric; blank strings and unparseable values become NaN.
for var in variables_depresion:
    data_copy[var] = pd.to_numeric(data_copy[var].replace(' ', np.nan), errors='coerce')

# Reverse d0601f (5 - value) before summing; presumably this item is worded in
# the opposite direction to the others — confirm against the codebook.
# NOTE(review): the original comment gave the score range as 6-23; if every
# item is coded 1-4, six items sum to 6-24 — TODO confirm.
data_copy['d0601f_inv'] = 5 - data_copy['d0601f']
variables_score = ['d0601a', 'd0601b', 'd0601c', 'd0601d', 'd0601f_inv', 'd0601g']

# skipna=False: a respondent with ANY missing item gets a NaN score instead of
# a partial sum. The default (skipna=True) treats missing items as 0, which
# deflates the score and pushes incomplete rows into the lowest category.
data_copy['score_depresion'] = data_copy[variables_score].sum(axis=1, skipna=False)

# Map a score to one of 4 ordinal categories (<=6, 7-8, 9-11, >=12)
def clasificar_depresion(score):
    """Return the depression-level label for a numeric score.

    Parameters
    ----------
    score : number or NaN
        Total depression score for one respondent.

    Returns
    -------
    str or float
        "Sin depresión" (score <= 6), "Leve" (<= 8), "Moderada" (<= 11),
        "Severa" (anything higher), or ``np.nan`` when the score is missing.
    """
    # Missing scores stay missing; they must not fall into any category.
    if pd.isna(score):
        return np.nan

    # Inclusive upper bounds, checked in ascending order.
    limites = ((6, "Sin depresión"), (8, "Leve"), (11, "Moderada"))
    for tope, etiqueta in limites:
        if score <= tope:
            return etiqueta
    return "Severa"

# Label every row with its depression category.
data_copy['nivel_depresion'] = data_copy['score_depresion'].apply(clasificar_depresion)

# Show the class distribution: count per label and its share of ALL rows
# (value_counts leaves NaN labels out by default).
conteo_niveles = data_copy['nivel_depresion'].value_counts()
total_filas = len(data_copy)

print("Distribución de clases:")
for nivel, n in conteo_niveles.items():
    porcentaje = n / total_filas * 100
    print(f"  {nivel:<15}: {n:>6} ({porcentaje:>5.1f}%)")

Distribución de clases:
  Sin depresión  :    714 ( 37.1%)
  Leve           :    501 ( 26.0%)
  Moderada       :    490 ( 25.5%)
  Severa         :    219 ( 11.4%)


In [6]:
# ==========================================
# 3. FEATURE SELECTION
# ==========================================
# Total: 21 features (2 demographic + 10 food-related + 9 extra)
ft_demograficas = ['edad', 'sexo']

# Food-related items d06a1 ... d06a10
ft_alimentarias = [f'd06a{i}' for i in range(1, 11)]

# Extra features, labelled as in the original notes
ft_extras = [
    'd0701',   # accidents
    'd0801',   # assault
    'd0810',   # sexual abuse
    'd0817',   # suicidal thoughts
    'd0819',   # self-harm
    'd0101',   # tobacco use
    'd0108',   # alcohol use
    'd0421',   # anxiety
    'd0601e',  # sleep
]

# Final ordered list: demographic, then food-related, then extras
selected_features = [*ft_demograficas, *ft_alimentarias, *ft_extras]


In [10]:
# ==========================================
# 4. DATA PREPARATION
# ==========================================
# Keep only the selected features plus the target label
columnas_modelo = selected_features + ['nivel_depresion']
df_model = data_copy[columnas_modelo].copy()

# Coerce every feature to numeric; blanks and unparseable values become NaN
df_model = df_model.assign(**{
    nombre: pd.to_numeric(df_model[nombre].replace(' ', np.nan), errors='coerce')
    for nombre in selected_features
})

# Drop every row that lacks the target or has any missing feature
df_clean = df_model.dropna()

# Report how many observations survived the filter
n_inicial, n_final = len(data_copy), len(df_clean)
print(f"Observaciones: {n_inicial} → {n_final} (perdidas: {n_inicial - n_final})")


Observaciones: 1924 → 1592 (perdidas: 332)
