# Análisis del efecto de fumar en el peso de los recien nacidos

Puedes encontrar información del dataset [aquí](https://rpubs.com/phil1234/916282)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm

In [16]:
babies = pd.read_csv("data/ncbirths.csv")

In [17]:
babies.head()

Unnamed: 0,fage,mage,mature,weeks,premie,visits,marital,gained,weight,lowbirthweight,gender,habit,whitemom
0,,13,younger mom,39.0,full term,10.0,not married,38.0,7.63,not low,male,nonsmoker,not white
1,,14,younger mom,42.0,full term,15.0,not married,20.0,7.88,not low,male,nonsmoker,not white
2,19.0,15,younger mom,37.0,full term,11.0,not married,38.0,6.63,not low,female,nonsmoker,white
3,21.0,15,younger mom,41.0,full term,6.0,not married,34.0,8.0,not low,male,nonsmoker,white
4,,15,younger mom,39.0,full term,9.0,not married,27.0,6.38,not low,female,nonsmoker,not white


In [18]:
babies.groupby(["whitemom", "marital"], as_index=False)["mage"].count()

Unnamed: 0,whitemom,marital,mage
0,not white,married,102
1,not white,not married,182
2,white,married,510
3,white,not married,203


In [19]:
babies.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   fage            829 non-null    float64
 1   mage            1000 non-null   int64  
 2   mature          1000 non-null   object 
 3   weeks           998 non-null    float64
 4   premie          998 non-null    object 
 5   visits          991 non-null    float64
 6   marital         999 non-null    object 
 7   gained          973 non-null    float64
 8   weight          1000 non-null   float64
 9   lowbirthweight  1000 non-null   object 
 10  gender          1000 non-null   object 
 11  habit           999 non-null    object 
 12  whitemom        998 non-null    object 
dtypes: float64(5), int64(1), object(7)
memory usage: 101.7+ KB


In [20]:
babies.dropna(inplace=True)

In [21]:
babies["mature"] = babies["mature"].map({"younger mom": 1, "mature mom":0})
babies["premie"] = babies["premie"].map({"premie": 1, "full term":0})
babies["marital"] = babies["marital"].map({"not married": 1, "married":0})
babies["whitemom"] = babies["whitemom"].map({"not white": 1, "white":0})
babies["gender"] = babies["gender"].map({"female": 1, "male":0})
babies["habit"] = babies["habit"].map({"smoker": 1, "nonsmoker":0})

In [22]:
babies

Unnamed: 0,fage,mage,mature,weeks,premie,visits,marital,gained,weight,lowbirthweight,gender,habit,whitemom
2,19.0,15,1,37.0,0,11.0,1,38.0,6.63,not low,1,0,0
3,21.0,15,1,41.0,0,6.0,1,34.0,8.00,not low,0,0,0
6,18.0,15,1,37.0,0,12.0,1,76.0,8.44,not low,0,0,1
7,17.0,15,1,35.0,1,5.0,1,15.0,4.69,low,0,0,1
9,20.0,16,1,37.0,0,13.0,1,52.0,6.94,not low,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,47.0,42,0,40.0,0,10.0,0,26.0,8.44,not low,0,0,1
996,34.0,42,0,38.0,0,18.0,0,20.0,6.19,not low,1,0,0
997,39.0,45,0,40.0,0,15.0,0,32.0,6.94,not low,1,0,0
998,55.0,46,0,31.0,1,8.0,0,25.0,4.56,low,1,0,1


In [31]:
# Creación del modelo utilizando matrices como en scikitlearn
# ==============================================================================
# A la matriz de predictores se le tiene que añadir una columna de 1s para el intercept del modelo

# La convenciones determinan que las features se llamen X 
# Además, tendremos habitualmente un conjunto de train y otro de test (con un split del 80/20, habitualmente)
# Por lo tanto, X_train contiene las features del train
# Este caso es tan sencillo y con pocos datos que no tenemos X_test, pero aparecerá
X_train = babies[["mage", "weeks", "visits", "marital", "gained", "gender", "habit", "whitemom"]]
#X_train = babies.drop(columns=["weight", "lowbirthweight"])

# Añadimos una columna de 1 porque necesitamos una constante que determine el intercept o B0, el punto donde corta la recta con el eje Y
# Por eso añadimos una columna constante de 0s
# Veremos que estoy aparece en muchos otros modelos (en redes neuronales se llama bias)
X_train = sm.add_constant(X_train, prepend=True)
# OLS: creamos el modelo del tipo Ordinary Least Squares (por alguna razón creo que una vez dije Optimal pero es Ordinary)
modelo = sm.OLS(endog=babies["weight"], exog=X_train)
# Ajustamos el modelo, es decir, hacemos el calculo de la mejor recta posible según los criterios del OLS
modelo = modelo.fit()
print(modelo.summary())

                            OLS Regression Results                            
Dep. Variable:                 weight   R-squared:                       0.454
Model:                            OLS   Adj. R-squared:                  0.449
Method:                 Least Squares   F-statistic:                     82.31
Date:                Wed, 08 May 2024   Prob (F-statistic):           9.24e-99
Time:                        19:25:41   Log-Likelihood:                -1184.6
No. Observations:                 800   AIC:                             2387.
Df Residuals:                     791   BIC:                             2429.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -5.3270      0.592     -8.994      0.0

In [32]:
# Creación del modelo utilizando matrices como en scikitlearn
# ==============================================================================
# A la matriz de predictores se le tiene que añadir una columna de 1s para el intercept del modelo

# La convenciones determinan que las features se llamen X 
# Además, tendremos habitualmente un conjunto de train y otro de test (con un split del 80/20, habitualmente)
# Por lo tanto, X_train contiene las features del train
# Este caso es tan sencillo y con pocos datos que no tenemos X_test, pero aparecerá
X_train = babies[["mage", "weeks", "marital", "gained", "gender", "habit", "whitemom"]]
#X_train = babies.drop(columns=["weight", "lowbirthweight"])

# Añadimos una columna de 1 porque necesitamos una constante que determine el intercept o B0, el punto donde corta la recta con el eje Y
# Por eso añadimos una columna constante de 0s
# Veremos que estoy aparece en muchos otros modelos (en redes neuronales se llama bias)
X_train = sm.add_constant(X_train, prepend=True)
# OLS: creamos el modelo del tipo Ordinary Least Squares (por alguna razón creo que una vez dije Optimal pero es Ordinary)
modelo = sm.OLS(endog=babies["weight"], exog=X_train)
# Ajustamos el modelo, es decir, hacemos el calculo de la mejor recta posible según los criterios del OLS
modelo = modelo.fit()
print(modelo.summary())

                            OLS Regression Results                            
Dep. Variable:                 weight   R-squared:                       0.454
Model:                            OLS   Adj. R-squared:                  0.449
Method:                 Least Squares   F-statistic:                     94.15
Date:                Wed, 08 May 2024   Prob (F-statistic):          9.76e-100
Time:                        19:40:24   Log-Likelihood:                -1184.7
No. Observations:                 800   AIC:                             2385.
Df Residuals:                     792   BIC:                             2423.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -5.3200      0.592     -8.992      0.0

In [33]:
# Creación del modelo utilizando matrices como en scikitlearn
# ==============================================================================
# A la matriz de predictores se le tiene que añadir una columna de 1s para el intercept del modelo

# La convenciones determinan que las features se llamen X 
# Además, tendremos habitualmente un conjunto de train y otro de test (con un split del 80/20, habitualmente)
# Por lo tanto, X_train contiene las features del train
# Este caso es tan sencillo y con pocos datos que no tenemos X_test, pero aparecerá
X_train = babies[["weeks", "marital", "gained", "gender", "habit", "whitemom"]]
#X_train = babies.drop(columns=["weight", "lowbirthweight"])

# Añadimos una columna de 1 porque necesitamos una constante que determine el intercept o B0, el punto donde corta la recta con el eje Y
# Por eso añadimos una columna constante de 0s
# Veremos que estoy aparece en muchos otros modelos (en redes neuronales se llama bias)
X_train = sm.add_constant(X_train, prepend=True)
# OLS: creamos el modelo del tipo Ordinary Least Squares (por alguna razón creo que una vez dije Optimal pero es Ordinary)
modelo = sm.OLS(endog=babies["weight"], exog=X_train)
# Ajustamos el modelo, es decir, hacemos el calculo de la mejor recta posible según los criterios del OLS
modelo = modelo.fit()
print(modelo.summary())

                            OLS Regression Results                            
Dep. Variable:                 weight   R-squared:                       0.453
Model:                            OLS   Adj. R-squared:                  0.449
Method:                 Least Squares   F-statistic:                     109.6
Date:                Wed, 08 May 2024   Prob (F-statistic):          1.69e-100
Time:                        19:40:45   Log-Likelihood:                -1185.4
No. Observations:                 800   AIC:                             2385.
Df Residuals:                     793   BIC:                             2418.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         -5.0427      0.539     -9.357      0.0

In [35]:
modelo.resid.mean()

1.2077006061872453e-14

In [24]:
X_train.corr()

Unnamed: 0,const,fage,mage,mature,weeks,premie,visits,marital,gained,gender,habit,whitemom
const,,,,,,,,,,,,
fage,,1.0,0.780648,-0.496405,-0.007997,-0.011323,0.085239,-0.349831,-0.038946,-0.060528,-0.087924,-0.122129
mage,,0.780648,1.0,-0.641403,-0.040008,0.002518,0.142897,-0.419957,-0.059486,-0.014025,-0.113418,-0.121613
mature,,-0.496405,-0.641403,1.0,0.064255,-0.045385,-0.047617,0.123954,0.028885,-0.001851,0.073479,0.032331
weeks,,-0.007997,-0.040008,0.064255,1.0,-0.725519,0.166717,-0.057601,0.09858,-0.013934,-0.01818,-0.079184
premie,,-0.011323,0.002518,-0.045385,-0.725519,1.0,-0.122986,0.054222,-0.139807,-0.034384,0.046545,0.051758
visits,,0.085239,0.142897,-0.047617,0.166717,-0.122986,1.0,-0.183586,0.055893,0.055895,-0.032514,-0.024758
marital,,-0.349831,-0.419957,0.123954,-0.057601,0.054222,-0.183586,1.0,0.0148,0.00342,0.051816,0.243026
gained,,-0.038946,-0.059486,0.028885,0.09858,-0.139807,0.055893,0.0148,1.0,-0.030932,0.010478,-0.062082
gender,,-0.060528,-0.014025,-0.001851,-0.013934,-0.034384,0.055895,0.00342,-0.030932,1.0,-0.036905,-0.048532
