In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv("insurance.csv")

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [5]:
df.children.value_counts()

0    574
1    324
2    240
3    157
4     25
5     18
Name: children, dtype: int64

In [6]:
le=LabelEncoder()
df['sex']=le.fit_transform(df['sex'])
df['children']=le.fit_transform(df['children'])
df['smoker']=le.fit_transform(df['smoker'])
df['region']=le.fit_transform(df['region'])

In [7]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [8]:
y=df['charges']
X=df.drop('charges',axis=1)

In [9]:
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.900,0,1,3
1,18,1,33.770,1,0,2
2,28,1,33.000,3,0,2
3,33,1,22.705,0,0,1
4,32,1,28.880,0,0,1
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1
1334,18,0,31.920,0,0,0
1335,18,0,36.850,0,0,2
1336,21,0,25.800,0,0,3


In [10]:
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [12]:
sc=StandardScaler()
numeric=['age', 'bmi', 'children']
X_train[numeric]=sc.fit_transform(X_train[numeric])
X_test[numeric]=sc.transform(X_test[numeric])

In [39]:
lr=LinearRegression()
lr.fit(X_train,y_train)
y_pre=lr.predict(X_test)
print("Linear Regression")
print("R2 Score:",r2_score(y_test,y_pre),"\n","MAE:",mean_absolute_error(y_test,y_pre),"\n","MSE:",np.sqrt(mean_squared_error(y_test,y_pre)))

Linear Regression
R2 Score: 0.7602640802497019 
 MAE: 4204.415654724193 
 MSE: 5927.226827909312


In [38]:
dt=DecisionTreeRegressor()
dt.fit(X_train,y_train)
y_pre=dt.predict(X_test)
print("DecisionTreeRegressor")
print("R2 Score:",r2_score(y_test,y_pre),"\n","MAE:",mean_absolute_error(y_test,y_pre),"\n","MSE:",np.sqrt(mean_squared_error(y_test,y_pre)))

DecisionTreeRegressor
R2 Score: 0.7294442146664364 
 MAE: 3050.8779097262445 
 MSE: 6296.70584247296


In [37]:
svr=SVR()
svr.fit(X_train,y_train)
y_pre=svr.predict(X_test)
print("SVR")
print("R2 Score:",r2_score(y_test,y_pre),"\n","MAE:",mean_absolute_error(y_test,y_pre),"\n","MSE:",np.sqrt(mean_squared_error(y_test,y_pre)))

SVR
R2 Score: -0.0813150467358068 
 MAE: 8284.522600650624 
 MSE: 12588.127003492918


In [36]:
rf=RandomForestRegressor()
rf.fit(X_train,y_train)
y_pre=rf.predict(X_test)
print("RandomForestRegressor")
print("R2 Score:",r2_score(y_test,y_pre),"\n","MAE:",mean_absolute_error(y_test,y_pre),"\n","MSE:",np.sqrt(mean_squared_error(y_test,y_pre)))

RandomForestRegressor
R2 Score: 0.8482409971501101 
 MAE: 2563.641983173304 
 MSE: 4715.8743467297045


In [35]:
knn=KNeighborsRegressor()
knn.fit(X_train,y_train)
y_pre=knn.predict(X_test)
print('KNN')
print("R2 Score:",r2_score(y_test,y_pre),"\n","MAE:",mean_absolute_error(y_test,y_pre),"\n","MSE:",np.sqrt(mean_squared_error(y_test,y_pre)))

KNN
R2 Score: 0.705990982031571 
 MAE: 3878.395332119909 
 MSE: 6563.950820437874


# Using Cross Validation

In [21]:
rf=RandomForestRegressor(random_state=42)
score = cross_val_score(rf,X,y,cv=5,n_jobs=-1)
print(score.mean())

<IPython.core.display.Javascript object>

0.835954346067321


In [22]:
lr=LinearRegression()
score = cross_val_score(lr,X,y,cv=10,n_jobs=-1)
print(score.mean())

<IPython.core.display.Javascript object>

0.7448047213193172


In [23]:
dt=DecisionTreeRegressor(random_state=42)
score = cross_val_score(dt,X,y,cv=10,n_jobs=-1)
print(score.mean())

<IPython.core.display.Javascript object>

0.6994528680806197
