# Explore here

In [2]:
#Paso 1: Definición del problema
#🎯 Problema de negocio
#La depresión es uno de los trastornos mentales más prevalentes a nivel mundial. Se estima que millones de personas la padecen, y muchas de ellas no reciben el tratamiento adecuado debido a estigmas sociales, falta de diagnóstico o acceso limitado a servicios de salud mental. La detección temprana es esencial para intervenir oportunamente y prevenir consecuencias graves como el deterioro en la calidad de vida o incluso el suicidio.

#El objetivo de este proyecto es desarrollar un sistema de predicción que, basándose en información sociodemográfica, historial médico y hábitos de vida de los individuos, sea capaz de detectar indicios de depresión. Esto permitiría generar herramientas de apoyo para psicólogos, profesionales de salud, instituciones educativas o incluso en ambientes laborales, facilitando intervenciones tempranas y preventivas.

#🤖 Problema de Machine Learning
#Se plantea un problema de clasificación binaria, donde a partir de variables como la edad, estado civil, nivel educativo, hábitos alimenticios, patrones de sueño, historial médico, entre otras, se busca predecir si una persona se encuentra en un estado depresivo (1) o no (0).



In [None]:
#Paso 2: Obtencion y carga del conjunto de datos

In [3]:
import pandas as pd

df = pd.read_csv('../data/raw/depression_data.csv')
print(df.head())


               Name  Age Marital Status    Education Level  \
0  Christine Barker   31        Married  Bachelor's Degree   
1  Jacqueline Lewis   55        Married        High School   
2    Shannon Church   78        Widowed    Master's Degree   
3    Charles Jordan   58       Divorced    Master's Degree   
4      Michael Rich   18         Single        High School   

   Number of Children Smoking Status Physical Activity Level  \
0                   2     Non-smoker                  Active   
1                   1     Non-smoker               Sedentary   
2                   1     Non-smoker               Sedentary   
3                   3     Non-smoker                Moderate   
4                   0     Non-smoker               Sedentary   

  Employment Status     Income Alcohol Consumption Dietary Habits  \
0        Unemployed   26265.67            Moderate       Moderate   
1          Employed   42710.36                High      Unhealthy   
2          Employed  125332.79     

In [None]:
#Nota sobre la variable objetivo:
#Dado que el dataset original no incluía una columna explícita que indicara si una persona se encuentra en estado depresivo,
# se generó una variable sintética Depressed basada en criterios clínicamente reconocidos como factores de riesgo de depresión,
# tales como: historial de enfermedad mental, trastornos del sueño, abuso de sustancias, antecedentes familiares, entre otros.
# Esta aproximación se utilizó exclusivamente para fines académicos, con el objetivo de demostrar el proceso completo de Machine
# Learning aplicado a datos de salud mental.

#Depressed = 1 → indica un caso considerado con alto riesgo de depresión.
#Depressed = 0 → indica un caso considerado sin riesgo significativo de depresión.
def calcular_depresion(row):
    factores = 0
    if row['History of Mental Illness'] == 'Yes':
        factores += 1
    if row['History of Substance Abuse'] == 'Yes':
        factores += 1
    if row['Family History of Depression'] == 'Yes':
        factores += 1
    if row['Sleep Patterns'] == 'Poor':
        factores += 1
    if row['Chronic Medical Conditions'] == 'Yes':
        factores += 1
    if row['Physical Activity Level'] == 'Sedentary':
        factores += 1
    if row['Alcohol Consumption'] == 'High':
        factores += 1
    return 1 if factores >= 2 else 0

df['Depressed'] = df.apply(calcular_depresion, axis=1)


In [None]:
#Paso 3: Almacenar la información

In [5]:
import sqlite3
import pandas as pd

#Conectar df a SQL
conn = sqlite3.connect("depression_data.db")

# Almacenar los datos en la base de datos SQLite en una tabla llamada 'health_data'
df.to_sql('health_data', conn, if_exists='replace', index=False)
print("Datos cargados correctamente en la base de datos 'mental_health.db'")


Datos cargados correctamente en la base de datos 'mental_health.db'


In [7]:
# 1: Ver las primeras 5 filas
query1 = "SELECT * FROM health_data LIMIT 5"
print("▶ Consulta 1: Primeras 5 filas")
print(pd.read_sql(query1, conn), "\n")



▶ Consulta 1: Primeras 5 filas
               Name  Age Marital Status    Education Level  \
0  Christine Barker   31        Married  Bachelor's Degree   
1  Jacqueline Lewis   55        Married        High School   
2    Shannon Church   78        Widowed    Master's Degree   
3    Charles Jordan   58       Divorced    Master's Degree   
4      Michael Rich   18         Single        High School   

   Number of Children Smoking Status Physical Activity Level  \
0                   2     Non-smoker                  Active   
1                   1     Non-smoker               Sedentary   
2                   1     Non-smoker               Sedentary   
3                   3     Non-smoker                Moderate   
4                   0     Non-smoker               Sedentary   

  Employment Status     Income Alcohol Consumption Dietary Habits  \
0        Unemployed   26265.67            Moderate       Moderate   
1          Employed   42710.36                High      Unhealthy   
2   

In [8]:
# 2: Contar personas deprimidas (1) vs no deprimidas (0)
query2 = "SELECT Depressed, COUNT(*) as Total FROM health_data GROUP BY Depressed"
print("▶ Consulta 2: Conteo deprimidos vs no")
print(pd.read_sql(query2, conn), "\n")



▶ Consulta 2: Conteo deprimidos vs no
   Depressed   Total
0          0  128135
1          1  285633 



In [9]:
# Consulta 3: Ver distribución por nivel de actividad física
query3 = "SELECT [Physical Activity Level], COUNT(*) as Total FROM health_data GROUP BY [Physical Activity Level]"
print("▶ Consulta 3: Conteo por nivel de actividad física")
print(pd.read_sql(query3, conn), "\n")

▶ Consulta 3: Conteo por nivel de actividad física
  Physical Activity Level   Total
0                  Active   78905
1                Moderate  158013
2               Sedentary  176850 



In [10]:
query4 = "SELECT [Sleep Patterns], COUNT(*) as Total FROM health_data GROUP BY [Sleep Patterns]"
print("▶ Consulta 4: Conteo por patrón de sueño")
print(pd.read_sql(query4, conn), "\n")

▶ Consulta 4: Conteo por patrón de sueño
  Sleep Patterns   Total
0           Fair  196789
1           Good   87397
2           Poor  129582 



In [11]:
# Consulta 5: Porcentaje promedio de depresión por consumo de alcohol
query5 = """
    SELECT [Alcohol Consumption], ROUND(AVG(Depressed), 2) as Porcentaje_Deprimidos
    FROM health_data
    GROUP BY [Alcohol Consumption]
"""
print("▶ Consulta 5: Porcentaje de deprimidos por consumo de alcohol")
print(pd.read_sql(query5, conn), "\n")

# Cerrar conexión
conn.close()

▶ Consulta 5: Porcentaje de deprimidos por consumo de alcohol
  Alcohol Consumption  Porcentaje_Deprimidos
0                High                   0.94
1                 Low                   0.61
2            Moderate                   0.61 



In [12]:
df.head()

Unnamed: 0,Name,Age,Marital Status,Education Level,Number of Children,Smoking Status,Physical Activity Level,Employment Status,Income,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions,Depressed
0,Christine Barker,31,Married,Bachelor's Degree,2,Non-smoker,Active,Unemployed,26265.67,Moderate,Moderate,Fair,Yes,No,Yes,Yes,1
1,Jacqueline Lewis,55,Married,High School,1,Non-smoker,Sedentary,Employed,42710.36,High,Unhealthy,Fair,Yes,No,No,Yes,1
2,Shannon Church,78,Widowed,Master's Degree,1,Non-smoker,Sedentary,Employed,125332.79,Low,Unhealthy,Good,No,No,Yes,No,1
3,Charles Jordan,58,Divorced,Master's Degree,3,Non-smoker,Moderate,Unemployed,9992.78,Moderate,Moderate,Poor,No,No,No,No,0
4,Michael Rich,18,Single,High School,0,Non-smoker,Sedentary,Unemployed,8595.08,Low,Moderate,Fair,Yes,No,Yes,Yes,1


In [13]:
#No aporta nada Name
df = df.drop(columns=['Name'])

In [None]:
#Paso 4: Realiza un análisis descriptivo

In [14]:
df.describe()


Unnamed: 0,Age,Number of Children,Income,Depressed
count,413768.0,413768.0,413768.0,413768.0
mean,49.000713,1.298972,50661.707971,0.690322
std,18.158759,1.237054,40624.100565,0.462362
min,18.0,0.0,0.41,0.0
25%,33.0,0.0,21001.03,0.0
50%,49.0,1.0,37520.135,1.0
75%,65.0,2.0,76616.3,1.0
max,80.0,4.0,209995.22,1.0


In [15]:
df.describe(include=['object'])

Unnamed: 0,Marital Status,Education Level,Smoking Status,Physical Activity Level,Employment Status,Alcohol Consumption,Dietary Habits,Sleep Patterns,History of Mental Illness,History of Substance Abuse,Family History of Depression,Chronic Medical Conditions
count,413768,413768,413768,413768,413768,413768,413768,413768,413768,413768,413768,413768
unique,4,5,3,3,2,3,3,3,2,2,2,2
top,Married,Bachelor's Degree,Non-smoker,Sedentary,Employed,Moderate,Unhealthy,Fair,No,No,No,No
freq,240444,124329,247416,176850,265659,173440,170817,196789,287943,284880,302515,277561


In [17]:
df.isnull().sum()

Age                             0
Marital Status                  0
Education Level                 0
Number of Children              0
Smoking Status                  0
Physical Activity Level         0
Employment Status               0
Income                          0
Alcohol Consumption             0
Dietary Habits                  0
Sleep Patterns                  0
History of Mental Illness       0
History of Substance Abuse      0
Family History of Depression    0
Chronic Medical Conditions      0
Depressed                       0
dtype: int64

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413768 entries, 0 to 413767
Data columns (total 16 columns):
 #   Column                        Non-Null Count   Dtype  
---  ------                        --------------   -----  
 0   Age                           413768 non-null  int64  
 1   Marital Status                413768 non-null  object 
 2   Education Level               413768 non-null  object 
 3   Number of Children            413768 non-null  int64  
 4   Smoking Status                413768 non-null  object 
 5   Physical Activity Level       413768 non-null  object 
 6   Employment Status             413768 non-null  object 
 7   Income                        413768 non-null  float64
 8   Alcohol Consumption           413768 non-null  object 
 9   Dietary Habits                413768 non-null  object 
 10  Sleep Patterns                413768 non-null  object 
 11  History of Mental Illness     413768 non-null  object 
 12  History of Substance Abuse    413768 non-nul

In [None]:
#Paso 5: Realiza un EDA completo

In [19]:
#Revisar subniveles (opciones a elegir por columna) de las columnas 
columnas_categoricas=['Marital Status','Education Level','Smoking Status','Physical Activity Level','Employment Status','Dietary Habits','Sleep Patterns','History of Mental Illness','History of Substance Abuse','Family History of Depression','Chronic Medical Conditions']

for col in columnas_categoricas:
    print(f'Columna {col}: {df[col].nunique()} subniveles')

Columna Marital Status: 4 subniveles
Columna Education Level: 5 subniveles
Columna Smoking Status: 3 subniveles
Columna Physical Activity Level: 3 subniveles
Columna Employment Status: 2 subniveles
Columna Dietary Habits: 3 subniveles
Columna Sleep Patterns: 3 subniveles
Columna History of Mental Illness: 2 subniveles
Columna History of Substance Abuse: 2 subniveles
Columna Family History of Depression: 2 subniveles
Columna Chronic Medical Conditions: 2 subniveles


In [20]:
columnas_numericas = df.select_dtypes(include=['int64', 'float64']).columns

for col in columnas_numericas:
        print(f'Columna {col}: {df[col].nunique()} subniveles')

Columna Age: 63 subniveles
Columna Number of Children: 5 subniveles
Columna Income: 405282 subniveles
Columna Depressed: 2 subniveles


In [None]:
#Dimensiones DF
df.shape

(413768, 16)

In [22]:
df.drop_duplicates(inplace=True)

In [23]:
df.shape

(413768, 16)