In [1]:
#Importing the required python libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Loading the dataset present in a CSV file
insur_data=pd.read_csv('insurance_pre.csv')
insur_data

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [3]:
#Getting the column names of insurance dataset
insur_data.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'charges'], dtype='object')

In [4]:
#Converting categorical data into numerical data
ins_data_numerical=pd.get_dummies(insur_data,drop_first=True)
ins_data_numerical

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [5]:
#Getting the column names of insurance dataset, after converting categorical data to numerical 
ins_data_numerical.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [6]:
#Storing input and output into separate variables
indep_ins_Data=ins_data_numerical[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]
dep_ins_data=ins_data_numerical[['charges']]

In [7]:
#Splitting data into train and test
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(indep_ins_Data,dep_ins_data,test_size=1/3,random_state=0)

In [8]:
#Applying standardization to bring data into same scale
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)

In [9]:
#Building a Grid search cross validated Random Forest model using train data
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
parameter_grid={'criterion':['squared_error','absolute_error','friedman_mse','poisson'], 'max_features':['sqrt','log2'], 'n_estimators':[10,100,1000]}
gridSearch_Rf_Model=GridSearchCV(RandomForestRegressor(), parameter_grid, refit=True, verbose=3, n_jobs=-1)
gridSearch_Rf_Model=gridSearch_Rf_Model.fit(X_train,y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


  self.best_estimator_.fit(X, y, **fit_params)


In [10]:
#Getting the best params from the grid and displaying the same
result=gridSearch_Rf_Model.cv_results_
print("The best parameter for which R_score value would be high is{}:".format(gridSearch_Rf_Model.best_params_))

The best parameter for which R_score value would be high is{'criterion': 'friedman_mse', 'max_features': 'sqrt', 'n_estimators': 1000}:


In [11]:
#Creating a table and displaying the results
table=pd.DataFrame().from_dict(result)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.03067,0.005635,0.005297,0.002925,squared_error,sqrt,10,"{'criterion': 'squared_error', 'max_features':...",0.794843,0.757557,0.813258,0.815899,0.748075,0.785926,0.028151,17
1,0.201894,0.003677,0.01594,0.002657,squared_error,sqrt,100,"{'criterion': 'squared_error', 'max_features':...",0.801717,0.765512,0.835598,0.831429,0.758408,0.798533,0.032145,12
2,1.786919,0.008144,0.135593,0.009777,squared_error,sqrt,1000,"{'criterion': 'squared_error', 'max_features':...",0.809985,0.771938,0.836483,0.834832,0.763423,0.803332,0.030705,3
3,0.021306,0.003409,0.004602,0.003775,squared_error,log2,10,"{'criterion': 'squared_error', 'max_features':...",0.774548,0.763258,0.817075,0.813499,0.775106,0.788697,0.022147,13
4,0.188102,0.011474,0.015724,0.000582,squared_error,log2,100,"{'criterion': 'squared_error', 'max_features':...",0.80544,0.776795,0.842073,0.828663,0.759296,0.802453,0.03094,8
5,1.767179,0.013343,0.122842,0.004287,squared_error,log2,1000,"{'criterion': 'squared_error', 'max_features':...",0.808485,0.770532,0.839122,0.831921,0.763217,0.802655,0.031008,7
6,0.097153,0.004927,0.003241,0.004096,absolute_error,sqrt,10,"{'criterion': 'absolute_error', 'max_features'...",0.784985,0.765633,0.82348,0.806714,0.751318,0.786426,0.026274,15
7,0.847671,0.009582,0.009824,0.008035,absolute_error,sqrt,100,"{'criterion': 'absolute_error', 'max_features'...",0.807557,0.774699,0.840709,0.82852,0.760546,0.802406,0.030645,9
8,12.532484,0.331753,0.135043,0.006275,absolute_error,sqrt,1000,"{'criterion': 'absolute_error', 'max_features'...",0.8061,0.775527,0.839191,0.830503,0.765404,0.803345,0.029132,2
9,0.097908,0.007241,0.004171,0.005603,absolute_error,log2,10,"{'criterion': 'absolute_error', 'max_features'...",0.807136,0.77373,0.814034,0.789745,0.745723,0.786074,0.024574,16


In [12]:
#Getting the inputs from the user and predicting for future
age_input=float(input("Age:"))
bmi_input=float(input("BMI:"))
children_input=float(input("Children:"))
sex_male_input=int(input("Sex Male 0 or 1:"))
smoker_yes_input=int(input("Smoker Yes 0 or 1:"))

Age:30
BMI:20
Children:0
Sex Male 0 or 1:1
Smoker Yes 0 or 1:0


In [13]:
Future_Prediction=gridSearch_Rf_Model.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[16406.93723046]
