In [39]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
import matplotlib.pyplot as plt

In [133]:
import warnings

warnings.filterwarnings('ignore')

In [48]:
def get_cleaned_data():
    df = pd.read_csv('insurance.csv')
    df.dropna(inplace=True)
    df = df.drop_duplicates(keep='first')
    return df

In [103]:
def showshapes(dfsplit):
    print('x_train ',dfsplit[0].shape)
    print('y_train ',dfsplit[2].shape)
    print('x_test ',dfsplit[1].shape)
    print('y_test ',dfsplit[3].shape)

In [104]:
def splitdata(x,y,test_size=0.25,random_state=43):
    x_train,x_test,y_train,y_test  = train_test_split(x,y,test_size=test_size,random_state=random_state)
    return (x_train,x_test,y_train,y_test)

## 1  x = age,sex,bmi,smoker

###  without scaling the data

In [225]:
df = get_cleaned_data()
df.head(2)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523


In [226]:
ll = LabelEncoder()
df['sex'] = ll.fit_transform(df['sex'])
df['smoker'] = ll.fit_transform(df['smoker'])

In [227]:
df.head(2)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523


In [236]:
x = df[['age','sex','bmi','smoker']]
y = df[['charges']]

In [237]:
dfsplit = splitdata(x,y)

In [238]:
showshapes(dfsplit)

x_train  (1002, 4)
y_train  (1002, 1)
x_test  (335, 4)
y_test  (335, 1)


In [243]:
def showscore(svr):
    score = svr.score(dfsplit[1],dfsplit[3])
    print('Score =',score)

In [239]:
def result(mse):
    print('MSE  = ',mse)
    print('RMSE = ',np.sqrt(mse))

In [240]:
svr = SVR(kernel='rbf')
svr.fit(dfsplit[0],np.array(dfsplit[2]).ravel())

SVR()

In [241]:
dfsplit[1]

Unnamed: 0,age,sex,bmi,smoker
1162,30,1,38.830,0
1191,41,0,21.755,0
134,20,0,28.785,0
723,19,1,35.400,0
1180,42,0,41.325,0
...,...,...,...,...
485,31,0,31.065,0
1071,63,1,31.445,0
1254,34,0,27.720,0
1070,37,1,37.070,1


In [242]:
y_pred_1 = svr.predict(dfsplit[1])
mse1 = mean_squared_error(y_pred_1,dfsplit[3])

result(mse1)

MSE  =  169801514.46258458
RMSE =  13030.791014462038


In [245]:
showscore(svr)

Score = -0.13480310366359527


## 2 x = age,sex,bmi,smoker

###  with scaled data

In [246]:
x.head(2)

Unnamed: 0,age,sex,bmi,smoker
0,19,0,27.9,1
1,18,1,33.77,0


In [247]:
ss = StandardScaler()
x['age'] = ss.fit_transform(x[['age']])
x['bmi'] = ss.fit_transform(x[['bmi']])

x.head(2)

Unnamed: 0,age,sex,bmi,smoker
0,-1.440418,0,-0.45316,1
1,-1.511647,1,0.509422,0


In [248]:
dfsplit = splitdata(x,y)

showshapes(dfsplit)

x_train  (1002, 4)
y_train  (1002, 1)
x_test  (335, 4)
y_test  (335, 1)


In [249]:
dfsplit[0].head(2)

Unnamed: 0,age,sex,bmi,smoker
547,1.052623,0,2.629724,0
672,-0.229512,1,-0.15799,0


In [250]:
svr2 = SVR(kernel='rbf')
svr2.fit(dfsplit[0],dfsplit[2])

SVR()

In [251]:
y_pred_2 = svr2.predict(dfsplit[1])
mse2 = mean_squared_error(y_pred_2,dfsplit[3])
result(mse2)

MSE  =  169395565.29365125
RMSE =  13015.205157570557


In [252]:
showscore(svr2)

Score = -0.13209009855116616


## 3. x = age bmi smoker

### svr model without scaling the data

In [253]:
df = get_cleaned_data()

In [254]:
x = df[['age','bmi','smoker',]]
y = df[['charges']]

In [255]:
x

Unnamed: 0,age,bmi,smoker
0,19,27.900,yes
1,18,33.770,no
2,28,33.000,no
3,33,22.705,no
4,32,28.880,no
...,...,...,...
1333,50,30.970,no
1334,18,31.920,no
1335,18,36.850,no
1336,21,25.800,no


In [256]:
ll = LabelEncoder()
x['smoker'] = ll.fit_transform(x[['smoker']])

In [257]:
dfsplit = splitdata(x,y)
showshapes(dfsplit)

x_train  (1002, 3)
y_train  (1002, 1)
x_test  (335, 3)
y_test  (335, 1)


In [258]:
svr = SVR(kernel='rbf')

svr.fit(dfsplit[0],dfsplit[2])

y_pred_3 = svr.predict(dfsplit[1])
mse3 = mean_squared_error(y_pred1,dfsplit[3])

result(mse3)

MSE  =  168580885.84010407
RMSE =  12983.870218086135


In [259]:
showscore(svr)

Score = -0.1345551353207166


## 4. x = age bmi smoker

### svr model with scaled data

In [260]:
df = get_cleaned_data()

In [261]:
x = df[['age','bmi','smoker',]]
y = df[['charges']]

In [262]:
ss = StandardScaler()

In [263]:
x['age'] = ss.fit_transform(x[['age']])
x['bmi'] = ss.fit_transform(x[['bmi']])

In [264]:
ll = LabelEncoder()

In [265]:
x['smoker'] = ll.fit_transform(x[['smoker']])

In [266]:
dfsplit = splitdata(x,y)
showshapes(dfsplit)

x_train  (1002, 3)
y_train  (1002, 1)
x_test  (335, 3)
y_test  (335, 1)


In [267]:
dfsplit[0].head(2)

Unnamed: 0,age,bmi,smoker
547,1.052623,2.629724,0
672,-0.229512,-0.15799,0


In [268]:
svr = SVR(kernel='rbf')

svr.fit(dfsplit[0],np.array(dfsplit[2]).ravel())

y_pred_4 = svr.predict(dfsplit[1])
mse4 = mean_squared_error(y_pred_4,np.array(dfsplit[3]).ravel())

result(mse4)

MSE  =  169267350.66863406
RMSE =  13010.278654534424


In [270]:
showscore(svr)

Score = -0.13123322542571092
