In [1]:
# Import pandas library
import pandas as pd

In [2]:
# Load the insurance data from the CSV file into a pandas DataFrame.
dataset = pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [3]:
# One-hot encode the nominal (categorical) columns so every feature is numeric.
# drop_first=True drops the first level of each category, leaving one
# indicator column per remaining level.
# dtype=int pins the indicator columns to integer 0/1; without it the dtype
# follows the pandas default, which differs between pandas versions.
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [4]:
# Show the column labels of the DataFrame currently held in `dataset`.
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [5]:
# Select the input (independent) columns and store them in the
# `independent` variable.
independent = dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [6]:
# Select the output (dependent) column and store it in `dependent`.
# The double brackets select a list of columns, so the result is a
# one-column DataFrame (2-D) rather than a Series.
dependent = dataset[['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


### Splitting of train and test data

In [7]:
# train_test_split (from sklearn) divides the data into a training part and
# a test part.
from sklearn.model_selection import train_test_split

# It returns four objects, in this order: training inputs, test inputs,
# training outputs, test outputs. test_size=0.30 puts 30% of the rows in the
# test part; random_state=0 fixes the random shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size=0.30,
    random_state=0,
)

### Model creation

#### 1. Multiple Regression

In [8]:
# Create a linear regression model and fit it on the training split.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression()

In [9]:
# Coefficients (slopes/weights) learned by the fitted regressor.
weight = regressor.coef_
weight

array([[  257.8006705 ,   321.06004271,   469.58113407,   -41.74825718,
        23418.6671912 ]])

In [10]:
# Intercept (bias term) learned by the fitted regressor.
bais = regressor.intercept_
bais

array([-12057.244846])

In [11]:
# Predict the outputs for the held-out test inputs.
y_pred = regressor.predict(X_test)

#### Validation

In [12]:
# Score the predictions against the true test outputs with r2_score.
from sklearn.metrics import r2_score
r_score = r2_score(y_test, y_pred)
r_score

0.7894790349867009

#### 2. Support Vector Machine (SVM)

In [13]:
from sklearn.svm import SVR
# The linear kernel (SVR(kernel="linear")) took a very long time to fit, so
# the non-linear RBF (radial basis function) kernel is used instead.
regressor = SVR(kernel="rbf")
# y_train is a one-column DataFrame; flatten it to shape (n_samples,) so the
# estimator receives a 1-D target instead of a column vector (which triggers
# a data-conversion warning).
regressor.fit(X_train, y_train.values.ravel())

  return f(*args, **kwargs)


SVR()

In [14]:
# Show the intercept of the fitted regressor.
regressor.intercept_

array([9539.41080564])

In [15]:
# Number of support vectors in the fitted model.
regressor.n_support_

array([936])

In [16]:
# Show the indices of the support vectors.
regressor.support_

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [17]:
# Predict the outputs for the test inputs with the fitted SVR.
y_pred = regressor.predict(X_test)

In [18]:
# Validation: score the predictions against the true test outputs.
from sklearn.metrics import r2_score

r_score = r2_score(y_test, y_pred)
r_score

-0.08842732776913875

In [19]:
# Standardize the inputs.
#
# The previous fit gave a low r2_score, so standardize the inputs and then
# compute r2_score again.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# fit_transform computes the mean and standard deviation from the training
# inputs and scales them; transform applies those same statistics to the
# test inputs.
# NOTE: X_train and X_test are overwritten here, so running this cell a
# second time would rescale data that is already scaled.
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [20]:
# Display the current contents of X_train.
X_train

array([[-1.5330973 , -0.40713453, -0.89833872, -0.97676557, -0.50466988],
       [-0.03364163,  0.32855417, -0.89833872, -0.97676557, -0.50466988],
       [ 0.89459283,  2.56690911,  3.25603402, -0.97676557, -0.50466988],
       ...,
       [ 0.03776102, -0.91016269, -0.89833872,  1.02378711, -0.50466988],
       [-1.46169465,  0.76659782, -0.89833872,  1.02378711, -0.50466988],
       [-0.46205754, -1.96596021, -0.06746417, -0.97676557, -0.50466988]])

In [21]:
from sklearn.svm import SVR
# RBF (radial basis function) kernel again (the linear kernel took a very
# long time to fit), this time with an explicit, large C and gamma="scale".
# See the SVR documentation for the remaining parameters:
# https://scikit-learn.org/1.5/modules/generated/sklearn.svm.SVR.html
regressor = SVR(kernel="rbf", C=1000000, gamma="scale")
# y_train is a one-column DataFrame; flatten it to shape (n_samples,) so the
# estimator receives a 1-D target instead of a column vector (which triggers
# a data-conversion warning).
regressor.fit(X_train, y_train.values.ravel())

  return f(*args, **kwargs)


SVR(C=1000000)

In [22]:
# Predict the outputs for the test inputs with the refitted SVR.
y_pred = regressor.predict(X_test)

In [23]:
# Validation: score the predictions against the true test outputs.
from sklearn.metrics import r2_score

r_score = r2_score(y_test, y_pred)
r_score

0.869634389000845

#### 3. Decision Tree

In [24]:
# Reload the insurance data from the CSV file into a fresh DataFrame.
dataset = pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [25]:
# One-hot encode the nominal (categorical) columns so every feature is numeric.
# drop_first=True drops the first level of each category, leaving one
# indicator column per remaining level.
# dtype=int pins the indicator columns to integer 0/1; without it the dtype
# follows the pandas default, which differs between pandas versions.
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [26]:
# Select the input (independent) columns and store them in the
# `independent` variable.
independent = dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [27]:
# Select the output (dependent) column and store it in `dependent`.
# The double brackets select a list of columns, so the result is a
# one-column DataFrame (2-D) rather than a Series.
dependent = dataset[['charges']]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [28]:
# Train/test split: 30% of the rows go to the test part, and random_state=0
# fixes the random shuffle.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size=0.30,
    random_state=0,
)

In [42]:
# Model creation
from sklearn.tree import DecisionTreeRegressor

# max_features='log2' makes each split consider a random subset of the
# features, so random_state is fixed to make the fitted tree (and the score
# computed from it) reproducible from run to run.
regressor = DecisionTreeRegressor(
    criterion='friedman_mse',
    max_features='log2',
    splitter='best',
    random_state=0,
)
# Assigning the result of fit() keeps the cell from echoing the estimator.
regressor = regressor.fit(X_train, y_train)

In [43]:
# Predict the outputs for the test inputs with the fitted decision tree.
y_pred = regressor.predict(X_test)

In [44]:
# Validation: score the predictions against the true test outputs.
from sklearn.metrics import r2_score

r_score = r2_score(y_test, y_pred)
r_score

0.7901428039759743

#### 4. Random Forest

In [45]:
# Train/test split: 30% of the rows go to the test part, and random_state=0
# fixes the random shuffle.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    independent,
    dependent,
    test_size=0.30,
    random_state=0,
)

In [46]:
from sklearn.ensemble import RandomForestRegressor

# 50 trees; random_state=0 fixes the randomness of the forest.
regressor = RandomForestRegressor(
    n_estimators=50,
    random_state=0,
    criterion='friedman_mse',
    max_features='log2',
)
# y_train is a one-column DataFrame; flatten it to shape (n_samples,) so the
# estimator receives a 1-D target instead of a column vector (which triggers
# a data-conversion warning).
regressor.fit(X_train, y_train.values.ravel())

  regressor.fit(X_train, y_train)


RandomForestRegressor(criterion='friedman_mse', max_features='log2',
                      n_estimators=50, random_state=0)

In [47]:
# Predict the outputs for the test inputs with the fitted random forest.
y_pred = regressor.predict(X_test)

In [48]:
# Validation: score the predictions against the true test outputs.
from sklearn.metrics import r2_score

r_score = r2_score(y_test, y_pred)
r_score

0.8702417511198071

### Saving the model

In [49]:
# Save the random forest model, as it shows the best result among the
# regression models tried in this notebook.
import pickle

filename = "finalized_model_Random_Forest.sav"

# The context manager closes the file handle even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(regressor, model_file)

# Reload from the same path that was just written.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

### Testing the deployed model

In [50]:
# Build the sample as a DataFrame whose columns match the training inputs, so
# each value is tied to a feature name instead of relying on list position.
# NOTE(review): these values (e.g. age 1234) look far outside the ranges shown
# in the data preview — confirm they are only meant as a smoke test.
sample = pd.DataFrame(
    [[1234, 345, 4565, 1, 0]],
    columns=['age', 'bmi', 'children', 'sex_male', 'smoker_yes'],
)
result = loaded_model.predict(sample)

In [51]:
# Display the prediction returned by the reloaded model.
result

array([16973.7197674])