In [139]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor


In [140]:
df = pd.read_csv('insurance.csv')

In [141]:
num_data = df[['age','bmi','children']]
cat_data = df[['sex','smoker','region']]
class_target = df[['charges']]

In [142]:
num_data.head()

Unnamed: 0,age,bmi,children
0,19,27.9,0
1,18,33.77,1
2,28,33.0,3
3,33,22.705,0
4,32,28.88,0


In [143]:
cat_data.head()

Unnamed: 0,sex,smoker,region
0,female,yes,southwest
1,male,no,southeast
2,male,no,southeast
3,male,no,northwest
4,male,no,northwest


In [144]:
cat_data = pd.get_dummies(cat_data)
cat_data

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,1,0,0,1,0,0,0,1
1,0,1,1,0,0,0,1,0
2,0,1,1,0,0,0,1,0
3,0,1,1,0,0,1,0,0
4,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...
1333,0,1,1,0,0,1,0,0
1334,1,0,1,0,1,0,0,0
1335,1,0,1,0,0,0,1,0
1336,1,0,1,0,0,0,0,1


In [145]:
num_data = pd.concat([num_data, cat_data], axis=1)
num_data

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.900,0,1,0,0,1,0,0,0,1
1,18,33.770,1,0,1,1,0,0,0,1,0
2,28,33.000,3,0,1,1,0,0,0,1,0
3,33,22.705,0,0,1,1,0,0,1,0,0
4,32,28.880,0,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,0,1,1,0,0,1,0,0
1334,18,31.920,0,1,0,1,0,1,0,0,0
1335,18,36.850,0,1,0,1,0,0,0,1,0
1336,21,25.800,0,1,0,1,0,0,0,0,1


In [146]:
class_target.head()

Unnamed: 0,charges
0,16884.924
1,1725.5523
2,4449.462
3,21984.47061
4,3866.8552


In [147]:
#defining features (X) and (Y)
X = num_data
Y = class_target
mse = []
train_acc = []
cols = list(X.columns) 

In [None]:
# loop through each feature (column)
for i in cols: 
    LR = LinearRegression()
    cv_results = cross_validate(LR,X[[i]],Y, cv=10, scoring='neg_mean_squared_error')
    #LR.fit(x_train[[i]], y_train)
    #y_pred = LR.predict(x_test[[i]])
    mse.append(abs(cv_results['test_score']).mean())

In [148]:
def accuracy_score(model):
    y_pred = model.predict(X)
    train_acc = model.score(X, Y)
    mse = (mean_squared_error(Y, y_pred))
    rmse = np.sqrt(mse)
    print('Training accuracy is ', train_acc)
    print('Root Mean Square Error is ', rmse)
    return [(train_acc * 100),  rmse]

In [149]:
LR = LinearRegression()
LR.fit(X, Y)

LinearRegression()

In [150]:
accuracy_score(LR)

Training accuracy is  0.7509130345985207
Root Mean Square Error is  6041.6796511744515


[75.09130345985207, 6041.6796511744515]

In [151]:
LassoR = Lasso(10)
LassoR.fit(X, Y)

Lasso(alpha=10)

In [152]:
accuracy_score(LassoR)

Training accuracy is  0.7508927852939449
Root Mean Square Error is  6041.925222685856


[75.08927852939449, 6041.925222685856]

In [153]:
RidgeR = Ridge(0.1)
RidgeR.fit(X, Y)

Ridge(alpha=0.1)

In [154]:
accuracy_score(RidgeR)

Training accuracy is  0.7509130006886889
Root Mean Square Error is  6041.680062421047


[75.09130006886889, 6041.680062421047]

In [155]:
DTR = DecisionTreeRegressor(max_depth = 3)
DTR.fit(X, Y)

DecisionTreeRegressor(max_depth=3)

In [156]:
accuracy_score(DTR)

Training accuracy is  0.857364547897558
Root Mean Square Error is  4571.891703786278


[85.7364547897558, 4571.891703786278]

In [157]:
RFR = RandomForestRegressor(max_depth = 4)
RFR.fit(X, Y)

  RFR.fit(X, Y)


RandomForestRegressor(max_depth=4)

In [158]:
accuracy_score(RFR)

Training accuracy is  0.8751676627651116
Root Mean Square Error is  4277.063951621027


[87.51676627651116, 4277.063951621027]