In [1]:
# Importing Libraries

import matplotlib.pyplot as plt
import numpy as np

In [2]:
import pandas as pd

# Reading the Dataset

dataset = pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [3]:
# Converting the nominal data to numeric data

dataset = pd.get_dummies(dataset, drop_first = True)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [4]:
# Getting the column names

dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [5]:
# Splitting the dataset into input and output variables

independent = dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [6]:
dependent = dataset[["charges"]]
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [7]:
#Importing GridSearchCv and RandomForestRegressor to create the best model 

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
param_grid = {'criterion' :['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
              'max_features' : [None, 'sqrt', 'log2'], 'n_estimators' : [10,25,50,100]}

grid = GridSearchCV(RandomForestRegressor(), param_grid, refit = True, verbose = 3, n_jobs = -1 )
grid.fit(independent,dependent)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


  return fit_method(estimator, *args, **kwargs)


In [8]:
# Printing the best model from the grid

re = grid.cv_results_

print("The R_Score for Best parameter is {} : ".format(grid.best_params_))

The R_Score for Best parameter is {'criterion': 'squared_error', 'max_features': 'sqrt', 'n_estimators': 100} : 


In [9]:
# getting all the grid values in table format

table = pd.DataFrame.from_dict(re)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.241619,0.119554,0.043982,0.050527,squared_error,,10,"{'criterion': 'squared_error', 'max_features':...",0.836774,0.759124,0.853801,0.811425,0.822446,0.816714,0.03211,44
1,0.33825,0.101524,0.015083,0.002806,squared_error,,25,"{'criterion': 'squared_error', 'max_features':...",0.846669,0.770341,0.849372,0.819636,0.832652,0.823734,0.028742,32
2,0.417308,0.031783,0.021206,0.006177,squared_error,,50,"{'criterion': 'squared_error', 'max_features':...",0.845393,0.774282,0.858889,0.824486,0.836494,0.827909,0.029075,21
3,0.904524,0.016113,0.021833,0.012122,squared_error,,100,"{'criterion': 'squared_error', 'max_features':...",0.844789,0.770551,0.854026,0.820111,0.839315,0.825758,0.029748,26
4,0.078448,0.009839,0.004157,0.006304,squared_error,sqrt,10,"{'criterion': 'squared_error', 'max_features':...",0.848989,0.757344,0.85911,0.813995,0.834906,0.822869,0.036091,34
5,0.21122,0.043269,0.010836,0.003261,squared_error,sqrt,25,"{'criterion': 'squared_error', 'max_features':...",0.84932,0.770072,0.851406,0.826099,0.828831,0.825146,0.0294,30
6,0.264823,0.010358,0.011745,0.01052,squared_error,sqrt,50,"{'criterion': 'squared_error', 'max_features':...",0.854378,0.775455,0.863571,0.827703,0.839815,0.832184,0.030901,11
7,0.629016,0.025465,0.018504,0.006617,squared_error,sqrt,100,"{'criterion': 'squared_error', 'max_features':...",0.858187,0.782591,0.867186,0.828353,0.836687,0.834601,0.029552,1
8,0.064938,0.007494,0.005807,0.00112,squared_error,log2,10,"{'criterion': 'squared_error', 'max_features':...",0.83372,0.774619,0.844316,0.824328,0.812371,0.817871,0.024055,43
9,0.195663,0.008601,0.004234,0.003565,squared_error,log2,25,"{'criterion': 'squared_error', 'max_features':...",0.852051,0.779324,0.850303,0.826261,0.833203,0.828228,0.026363,20


In [10]:
# Getting the inputs for future prediction

age_input = float(input('Age: '))
bmi_input = float(input('BMI: '))
children_input = float(input('Children: '))
sex_male_input = float(input('Sex Male 0 or 1: '))
smoker_yes_input = float(input('Smoker yes 0 or 1: '))

Age: 32
BMI: 43
Children: 2
Sex Male 0 or 1: 0
Smoker yes 0 or 1: 1


In [11]:
# Evaluating and printing the Predicted value

Future_Prediction = grid.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])
print("Future Prediction {} : ".format(Future_Prediction))                                  

Future Prediction [45866.0823868] : 


