### Analysis data

In [353]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

In [354]:

data = pd.read_csv('https://storage.googleapis.com/esmartdata-courses-files/ml-course/insurance.csv')
df = data.copy()
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [355]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [356]:
print("Num of duplicates -",df.duplicated().sum())

df = df[~df.duplicated()]

print("Num of duplicates -",df.duplicated().sum())

Num of duplicates - 1
Num of duplicates - 0


In [357]:
df.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [358]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1337.0,1337.0,1337.0,1337.0
mean,39.222139,30.663452,1.095737,13279.121487
std,14.044333,6.100468,1.205571,12110.359656
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29,0.0,4746.344
50%,39.0,30.4,1.0,9386.1613
75%,51.0,34.7,2.0,16657.71745
max,64.0,53.13,5.0,63770.42801


In [359]:
df.describe(include=['O'])

Unnamed: 0,sex,smoker,region
count,1337,1337,1337
unique,2,2,4
top,male,no,southeast
freq,675,1063,364


In [360]:
df.smoker.value_counts()

no     1063
yes     274
Name: smoker, dtype: int64

In [361]:
df.sex.value_counts()

male      675
female    662
Name: sex, dtype: int64

In [362]:
df.region.value_counts()

southeast    364
southwest    325
northeast    324
northwest    324
Name: region, dtype: int64

In [363]:
fig = px.histogram(df, 'charges', nbins=40,  width=700, height=400)
fig.update_layout(template='presentation')
fig.show()

### Preprocessing and cleaning

In [364]:
df_dummy = pd.get_dummies(df, drop_first=True)

df_dummy

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.900,0,16884.92400,0,1,0,0,1
1,18,33.770,1,1725.55230,1,0,0,1,0
2,28,33.000,3,4449.46200,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.880,0,3866.85520,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0,1,0,0
1334,18,31.920,0,2205.98080,0,0,0,0,0
1335,18,36.850,0,1629.83350,0,0,0,1,0
1336,21,25.800,0,2007.94500,0,0,0,0,1


In [365]:
df_dummy.corr()['charges'].sort_values(ascending=False)

charges             1.000000
smoker_yes          0.787234
age                 0.298308
bmi                 0.198401
region_southeast    0.073578
children            0.067389
sex_male            0.058044
region_northwest   -0.038695
region_southwest   -0.043637
Name: charges, dtype: float64

In [366]:
import statsmodels.api as sm

df_new = df_dummy.copy()
results = df_new.pop('charges')
df_new['const'] = np.ones(len(df_new))
cols = ['const']  +  list(df_new.columns[:-1])
df_new = df_new[cols]

ols = sm.OLS(endog=np.array(results),
             exog=np.array(df_new) ).fit()

print(ols.summary(xname=list(df_new.columns)) )


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     500.0
Date:                Tue, 27 Apr 2021   Prob (F-statistic):               0.00
Time:                        23:10:21   Log-Likelihood:                -13538.
No. Observations:                1337   AIC:                         2.709e+04
Df Residuals:                    1328   BIC:                         2.714e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const            -1.194e+04    988.227  

In [367]:
df_new.drop('sex_male', axis=1, inplace=True)

ols = sm.OLS(endog=np.array(results),
             exog=np.array(df_new) ).fit()

print(ols.summary(xname=list(df_new.columns)) )

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     571.8
Date:                Tue, 27 Apr 2021   Prob (F-statistic):               0.00
Time:                        23:10:21   Log-Likelihood:                -13538.
No. Observations:                1337   AIC:                         2.709e+04
Df Residuals:                    1329   BIC:                         2.713e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const            -1.199e+04    979.209  

In [368]:
df_new.drop(['region_northwest', 'region_southwest', 'region_southeast'] , axis=1, inplace=True)

ols = sm.OLS(endog=np.array(results),
             exog=np.array(df_new) ).fit()

print(ols.summary(xname=list(df_new.columns)) )


df_new.drop('const', axis=1, inplace=True)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.750
Model:                            OLS   Adj. R-squared:                  0.749
Method:                 Least Squares   F-statistic:                     996.5
Date:                Tue, 27 Apr 2021   Prob (F-statistic):               0.00
Time:                        23:10:21   Log-Likelihood:                -13541.
No. Observations:                1337   AIC:                         2.709e+04
Df Residuals:                    1332   BIC:                         2.712e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       -1.21e+04    942.630    -12.835      0.0

In [369]:
df_new.head()

Unnamed: 0,age,bmi,children,smoker_yes
0,19,27.9,0,1
1,18,33.77,1,0
2,28,33.0,3,0
3,33,22.705,0,0
4,32,28.88,0,0





### Comparison of regression methods


In [370]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(df_new, results, test_size=0.2)

In [371]:
from sklearn.linear_model import LinearRegression

regression = LinearRegression()
regression.fit(X_train, Y_train)


df_result = pd.DataFrame(np.array(Y_test), columns=["True"])
df_result['LinearRegression'] = regression.predict(X_test)

df_result.head()


Unnamed: 0,True,LinearRegression
0,16115.3045,26248.787106
1,2803.69785,449.679954
2,11837.16,10870.780327
3,9377.9047,11155.007475
4,8342.90875,12937.206193


In [372]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2)
X_poly_train = poly.fit_transform(X_train)
X_poly_test = poly.transform(X_test)

regressor_poly = LinearRegression()
regressor_poly.fit(X_poly_train, Y_train)

df_result['PolynominalRegression'] = regressor_poly.predict(X_poly_test)
df_result.head()


Unnamed: 0,True,LinearRegression,PolynominalRegression
0,16115.3045,26248.787106,15662.961923
1,2803.69785,449.679954,4453.231068
2,11837.16,10870.780327,12631.575461
3,9377.9047,11155.007475,11596.807986
4,8342.90875,12937.206193,9467.388662


In [373]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

tree = DecisionTreeRegressor()
grid = {'max_depth':np.arange(1,7)}
grid_search = GridSearchCV(tree, grid)
grid_search.fit(X_train, Y_train)

tree = grid_search.best_estimator_
tree.fit(X_train, Y_train)

df_result['TreeRegression'] = tree.predict(X_test)
df_result.head()

Unnamed: 0,True,LinearRegression,PolynominalRegression,TreeRegression
0,16115.3045,26248.787106,15662.961923,15332.214712
1,2803.69785,449.679954,4453.231068,6831.862776
2,11837.16,10870.780327,12631.575461,12703.894975
3,9377.9047,11155.007475,11596.807986,10424.443264
4,8342.90875,12937.206193,9467.388662,10424.443264


In [374]:
from sklearn.metrics import mean_squared_error

print("MSE for Linear Regression -", mean_squared_error(df_result['True'], df_result['LinearRegression']))
print("MSE for Polynominal Regression -", mean_squared_error(df_result['True'], df_result['PolynominalRegression']))
print("MSE for Tree Regression -", mean_squared_error(df_result['True'], df_result['TreeRegression']))


MSE for Linear Regression - 41220573.47051991
MSE for Polynominal Regression - 27829508.167245362
MSE for Tree Regression - 21541315.70644854


In [375]:
from sklearn.metrics import r2_score

print("R2 score for Linear Regression -",r2_score(df_result['True'], df_result['LinearRegression']))
print("R2 score for Polynominal Regression -",r2_score(df_result['True'], df_result['PolynominalRegression']))
print("R2 score for Tree Regression -",r2_score(df_result['True'], df_result['TreeRegression']))


R2 score for Linear Regression - 0.7475588866521824
R2 score for Polynominal Regression - 0.8295678241670762
R2 score for Tree Regression - 0.8680776791278321
