# 🧪 Prueba de Hipótesis 3: Relación entre Hábito de Fumar/Beber y Diabetes

**Objetivo:** Verificar si existe relación significativa entre el hábito de fumar y/o beber alcohol y la presencia de diabetes.

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

sns.set(style="whitegrid")

In [8]:
# Cargar datos
data = pd.read_csv('datasets/diabetes_health_indicators.csv')
data.head()

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [9]:
# Revisar columnas para identificar variables de fumar y beber
data.columns

Index(['Diabetes_012', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
       'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
       'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
       'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
       'Income'],
      dtype='object')

In [10]:
# Tabla de contingencia: Fumar vs Diabetes
cont_smoke = pd.crosstab(data['Diabetes_012'], data['Smoker'])
print(cont_smoke)

Smoker           0.0    1.0
Diabetes_012               
0.0           121879  91824
1.0             2349   2282
2.0            17029  18317


In [11]:
# Prueba de chi-cuadrado para fumar
chi2_smoke, p_smoke, _, _ = chi2_contingency(cont_smoke)
print(f"Estadístico chi-cuadrado (Fumar): {chi2_smoke:.4f}")
print(f"p-valor (Fumar): {p_smoke:.4f}")
if p_smoke < 0.05:
    print("Se rechaza H0: Hay relación significativa entre fumar y diabetes.")
else:
    print("No se rechaza H0: No hay evidencia de relación significativa entre fumar y diabetes.")

Estadístico chi-cuadrado (Fumar): 1010.5118
p-valor (Fumar): 0.0000
Se rechaza H0: Hay relación significativa entre fumar y diabetes.


In [12]:
# Tabla de contingencia: Beber vs Diabetes
cont_drink = pd.crosstab(data['Diabetes_012'], data['HvyAlcoholConsump'])
print(cont_drink)

HvyAlcoholConsump     0.0    1.0
Diabetes_012                    
0.0                200487  13216
1.0                  4423    208
2.0                 34514    832


In [13]:
# Prueba de chi-cuadrado para beber
chi2_drink, p_drink, _, _ = chi2_contingency(cont_drink)
print(f"Estadístico chi-cuadrado (Beber): {chi2_drink:.4f}")
print(f"p-valor (Beber): {p_drink:.4f}")
if p_drink < 0.05:
    print("Se rechaza H0: Hay relación significativa entre beber y diabetes.")
else:
    print("No se rechaza H0: No hay evidencia de relación significativa entre beber y diabetes.")

Estadístico chi-cuadrado (Beber): 850.3240
p-valor (Beber): 0.0000
Se rechaza H0: Hay relación significativa entre beber y diabetes.
