In [70]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [71]:
dataset = pd.read_csv('insurance.csv')
dataset.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [72]:
dataset.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [73]:
df = dataset.copy()

In [74]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [75]:
df.describe(include='O')

Unnamed: 0,sex,smoker,region
count,1338,1338,1338
unique,2,2,4
top,male,no,southeast
freq,676,1064,364


In [76]:
df.head(2)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523


In [77]:
df.region.unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [78]:
df.children.unique()

array([0, 1, 3, 2, 5, 4], dtype=int64)

In [79]:
X = df.iloc[:,:-1].values
Y = df.iloc[:,-1].values

In [80]:
#spliting the data into train test split
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(X,Y,random_state=42,test_size=0.2)

In [81]:
xtrain.shape,xtest.shape

((1070, 6), (268, 6))

In [82]:
xtrain[0]

array([46, 'female', 19.95, 2, 'no', 'northwest'], dtype=object)

In [83]:
#performing one hot encoding and standardization
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[1,4,5])],remainder='passthrough')
xtrain = ct.fit_transform(xtrain)

In [84]:
xtrain[0]

array([1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 46, 19.95, 2], dtype=object)

In [85]:
xtest = ct.transform(xtest)

In [86]:
xtest[0]

array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 45, 25.175, 2],
      dtype=object)

In [87]:
# feature scaling 
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
xtrain[:,[8,9,10]] = sc.fit_transform(xtrain[:,[8,9,10]])

In [88]:
xtrain[0]

array([1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.4722265067674694,
       -1.7565251299004097, 0.734336259529673], dtype=object)

In [90]:
xtest[:,[8,9,10]] =  sc.transform(xtest[:,[8,9,10]])

array([[1.0, 0.0, 1.0, ..., 0.401140074263081, -0.891539245398062,
        0.734336259529673],
       [1.0, 0.0, 1.0, ..., -0.23863781827641423, -0.08946142522315799,
        -0.9111921126101178],
       [1.0, 0.0, 0.0, ..., 1.7517822918464598, -0.6084529559245663,
        -0.9111921126101178],
       ...,
       [0.0, 1.0, 1.0, ..., -0.0964649532676375, -0.4197287629422363,
        -0.08842792654022237],
       [1.0, 0.0, 0.0, ..., 1.0409179668025763, 2.789410255445899,
        -0.9111921126101178],
       [1.0, 0.0, 1.0, ..., 0.8276586692894112, 0.6025272823787207,
        -0.08842792654022237]], dtype=object)

In [91]:
xtest[0]

array([1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.401140074263081,
       -0.891539245398062, 0.734336259529673], dtype=object)

In [92]:
#creating svr model
from sklearn.svm import SVR
model = SVR(kernel='linear')
model.fit(xtrain,ytrain)

In [93]:
model.coef_

array([[  -2.        ,    2.        , -220.        ,  220.        ,
          21.        ,    0.        ,   -5.        ,  -16.        ,
         513.81273414,   82.53786361,   22.21463302]])

In [94]:
predict = model.predict(xtest)

In [95]:
pd.DataFrame({'actual':ytest,'predict':predict})

Unnamed: 0,actual,predict
0,9095.06825,9344.728771
1,5272.17580,9024.649724
2,29330.98315,10444.516471
3,9301.89355,9390.319736
4,33750.29180,8873.684268
...,...,...
263,47055.53210,10519.103985
264,12222.89830,9789.260008
265,6067.12675,9113.717797
266,63770.42801,10354.717699


In [97]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(ytest,predict)

In [98]:
mse,np.sqrt(mse)

(160737190.04582283, 12678.217147762647)

In [99]:
model1 = SVR(kernel='rbf')
model1.fit(xtrain,ytrain)

In [100]:
predict = model1.predict(xtest)

In [101]:
mse = mean_squared_error(ytest,predict)
np.sqrt(mse)

12892.023995416239

In [103]:
# lets try multiple liear regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(xtrain,ytrain)
predict = lr.predict(xtest)
mse = mean_squared_error(ytest,predict)
np.sqrt(mse)

5810.464634287665

In [104]:
# multiple linear regression work better than svr model