

> # **1. Importación de las librerías**



In [52]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

import matplotlib.pyplot as plt
import seaborn as sns

from pandas_profiling import ProfileReport

# **2. Importación del data set**

In [None]:
base = pd.read_csv(r'insurance.csv')

In [None]:
base.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [None]:
base.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


# 3. **Limpleza y transformación de datos**

In [None]:
base.describe(percentiles = [.25, .5, .75, .95, .99])

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
95%,62.0,41.106,3.0,41181.827787
99%,64.0,46.4079,5.0,48537.480726
max,64.0,53.13,5.0,63770.42801


In [36]:
from sklearn.preprocessing import OneHotEncoder
base = pd.concat([base,pd.get_dummies(base['sex'],prefix='sex')],axis=1).drop(['sex'],axis=1)
base = pd.concat([base,pd.get_dummies(base['region'],prefix='region')],axis=1).drop(['region'],axis=1)
base = pd.concat([base,pd.get_dummies(base['smoker'],prefix='smoker')],axis=1).drop(['smoker'],axis=1)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,female,male
0,19,female,27.900,0,yes,southwest,16884.92400,1,0
1,18,male,33.770,1,no,southeast,1725.55230,0,1
2,28,male,33.000,3,no,southeast,4449.46200,0,1
3,33,male,22.705,0,no,northwest,21984.47061,0,1
4,32,male,28.880,0,no,northwest,3866.85520,0,1
...,...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,0,1
1334,18,female,31.920,0,no,northeast,2205.98080,1,0
1335,18,female,36.850,0,no,southeast,1629.83350,1,0
1336,21,female,25.800,0,no,southwest,2007.94500,1,0


In [43]:
base.head()

Unnamed: 0,age,bmi,children,charges,sex_female,sex_male,region_northeast,region_northwest,region_southeast,region_southwest,smoker_no,smoker_yes
0,19,27.9,0,16884.924,1,0,0,0,0,1,0,1
1,18,33.77,1,1725.5523,0,1,0,0,1,0,1,0
2,28,33.0,3,4449.462,0,1,0,0,1,0,1,0
3,33,22.705,0,21984.47061,0,1,0,1,0,0,1,0
4,32,28.88,0,3866.8552,0,1,0,1,0,0,1,0


# 4. Generación de datos de entrenamiento y prueba

In [45]:
X = base[base.columns.difference(['charges'])]

In [44]:
Y = base['charges'] 

In [46]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 1)

In [47]:
X_train.shape

(1070, 11)

In [48]:
X_test.shape

(268, 11)

# 5 Normalicación de datos

In [49]:
scaler = StandardScaler()

In [50]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. Entrenamiento de la regresión

In [56]:
from sklearn.linear_model import LinearRegression 
regr = LinearRegression()

In [77]:
regr.fit(X_train_scaled, Y_train)

LinearRegression()

In [58]:
regr.coef_

array([ 3595.43999401,  1969.23968492,   492.75838658,   254.34544825,
          83.36933982,  -197.21188927,  -134.97505592,    60.52769018,
         -60.52769018, -4806.59265853,  4806.59265853])

In [59]:
regr.intercept_

13230.161574933647

## **6. Normalización del modelo**

In [73]:
from sklearn.metrics import mean_squared_error

preds_train = regr.predict(X_train_scaled)
preds_test = regr.predict(X_test_scaled)
mean_squared_error(preds_test, Y_test)

35479352.80730363

# **7. Análisis de error**

In [64]:
(Y_test - preds_test).describe(percentiles = [.25, .5, .75, .95, .99])

count      268.000000
mean        65.443115
std       5967.238510
min     -10696.432397
25%      -2533.838274
50%       -993.576702
75%       1222.670644
95%       9795.606908
99%      20253.425024
max      25439.753770
Name: charges, dtype: float64

Conclusión: El error obtenido es considerado alto, para poder realizar un tratamiento a este problema es necesario regularizar el modelo

# **8.Regresión polinomial y regularización**

## **8.1 Regresión polinomial**

**Regresión polinomial Grado 2**

In [78]:
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree=2,include_bias=False)
X_poly_train = poly_features.fit_transform(X_train_scaled)
X_poly_test = poly_features.fit_transform(X_test_scaled)
regr_poly=regr.fit(X_poly_train, Y_train)

In [79]:
from sklearn.metrics import mean_squared_error

preds_train_poly = regr_poly.predict(X_poly_train)
preds_test_poly = regr_poly.predict(X_poly_test)
mean_squared_error(preds_test_poly, Y_test)

20517044.29127224

**Regresión polinomial Grado 3**

In [97]:
poly_features2 = PolynomialFeatures(degree=3,include_bias=False)
X_poly_train2 = poly_features.fit_transform(X_train_scaled)
X_poly_test2 = poly_features.fit_transform(X_test_scaled)
regr_poly2=regr.fit(X_poly_train2, Y_train)
preds_train_poly2 = regr_poly.predict(X_poly_train2)
preds_test_poly2 = regr_poly.predict(X_poly_test2)
mean_squared_error(preds_test_poly2, Y_test)

20517044.29127224

A pesar de disminuir el error con este polinomio es necesario regularizar el modelo

## **8.2 Regresión ridge alpha = 0.5**

In [94]:
from sklearn.linear_model import Ridge
ridge_reg1 = Ridge(alpha=5,solver="cholesky")
ridge_reg1.fit(X_train_scaled,Y_train)
preds_train_ridge1 = ridge_reg1.predict(X_train_scaled)
preds_test_ridge1 = ridge_reg1.predict(X_test_scaled)
mean_squared_error(preds_test_ridge1, Y_test)


35496894.440046735

## **8.3 Regresión ridge alpha = 1.5**

In [93]:
from sklearn.linear_model import Ridge
ridge_reg2 = Ridge(alpha=1.5,solver="cholesky")
ridge_reg2.fit(X_train_scaled,Y_train)
preds_train_ridge2 = ridge_reg2.predict(X_train_scaled)
preds_test_ridge2 = ridge_reg2.predict(X_test_scaled)
mean_squared_error(preds_test_ridge2, Y_test)
 

35484444.1786725

## **8.4 Regresión ridge alpha = 2**

In [95]:
from sklearn.linear_model import Ridge
ridge_reg3 = Ridge(alpha=1.5,solver="cholesky")
ridge_reg3.fit(X_train_scaled,Y_train)
preds_train_ridge3 = ridge_reg3.predict(X_train_scaled)
preds_test_ridge3 = ridge_reg3.predict(X_test_scaled)
mean_squared_error(preds_test_ridge3, Y_test)


35486174.03044901

En conclusión la mejor apriximación es obtenida con la aproximación polinomial.