In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv("insurance_pre.csv")

In [3]:
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [4]:
dataset = pd.get_dummies(dataset, drop_first=True)

In [5]:
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [6]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [7]:
independent = dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]

In [8]:
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [9]:
dependent = dataset[['charges']]

In [10]:
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [11]:
from sklearn.ensemble import RandomForestRegressor

In [13]:
from sklearn.model_selection import GridSearchCV

param_grid = { 'criterion' : [ 'mse', 'mae'], 
             'max_features' :['auto', 'sqrt', 'log2'],
             'n_estimators' : [10,100]}
grid = GridSearchCV(RandomForestRegressor(), param_grid, refit = True, verbose = 3, n_jobs = -1)

#fitting the model for grid search
grid.fit(independent, dependent)

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  36 | elapsed:    4.2s remaining:    4.2s
[Parallel(n_jobs=-1)]: Done  31 out of  36 | elapsed:    4.9s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:    5.2s finished
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=RandomForestRegressor(bootstrap=True, criterion='mse',
                                             max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators='warn', n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae'],

In [17]:
#print best parameter after tuning
#print(grid, best_params_)
re = grid.cv_results_

print("The R_score value for best parameter {}:". format(grid.best_params_))


The R_score value for best parameter {'criterion': 'mae', 'max_features': 'log2', 'n_estimators': 100}:


In [18]:
table = pd.DataFrame.from_dict(re)

In [19]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.044353,0.003264,0.006669,0.0004721011,mse,auto,10,"{'criterion': 'mse', 'max_features': 'auto', '...",0.828041,0.828343,0.800178,0.818854,0.013206,8
1,0.291137,0.00356,0.017337,0.001247426,mse,auto,100,"{'criterion': 'mse', 'max_features': 'auto', '...",0.837193,0.830833,0.820561,0.829529,0.006852,5
2,0.033007,0.001634,0.006334,0.0009429655,mse,sqrt,10,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.832986,0.82323,0.809835,0.822017,0.00949,7
3,0.204344,0.003415,0.018338,0.001246683,mse,sqrt,100,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.84406,0.836204,0.817903,0.832722,0.010959,2
4,0.03341,0.003331,0.007447,0.002528321,mse,log2,10,"{'criterion': 'mse', 'max_features': 'log2', '...",0.8187,0.814976,0.799269,0.810981,0.00842,11
5,0.201589,0.002331,0.017582,0.0003209484,mse,log2,100,"{'criterion': 'mse', 'max_features': 'log2', '...",0.840387,0.837334,0.818208,0.831976,0.009815,3
6,0.189991,0.005214,0.004668,0.000471539,mae,auto,10,"{'criterion': 'mae', 'max_features': 'auto', '...",0.82171,0.812712,0.807174,0.813865,0.00599,10
7,1.193244,0.018121,0.009002,2.247832e-07,mae,auto,100,"{'criterion': 'mae', 'max_features': 'auto', '...",0.827957,0.829367,0.815681,0.824335,0.006146,6
8,0.112985,0.002566,0.004298,0.0004202324,mae,sqrt,10,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.818204,0.809745,0.798472,0.808807,0.008083,12
9,0.772577,0.021902,0.010669,0.00124749,mae,sqrt,100,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.842637,0.832795,0.820472,0.831968,0.009068,4


In [21]:
age_input = float(input("Age:"))
bmi_input = float(input("BMI:"))
children_input = float(input("children:"))
sex_male_input = int(input("Sex Male 0 or 1:"))
smoker_yes_input = int(input("Smoker Yes 0 or 1:"))

Age:40
BMI:47
children:1
Sex Male 0 or 1:0
Smoker Yes 0 or 1:1


In [22]:
future_prediction = grid.predict([[age_input, bmi_input, children_input, sex_male_input,smoker_yes_input]])
print("Future_prediction={}".format(future_prediction))

Future_prediction=[43369.8144861]
