In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Loading the dataset
insurance_data=pd.read_csv('insurance_pre.csv')
insurance_data

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [3]:
#Getting the column names of insurance dataset
insurance_data.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'charges'], dtype='object')

In [4]:
#Converting categorical data into numerical data
ins_data_numerical=pd.get_dummies(insurance_data,drop_first=True)
ins_data_numerical

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [5]:
#Getting the column names of insurance dataset, after converting categorical data to numerical 
ins_data_numerical.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [6]:
#Storing input and output into separate variables
indep_ins_Data=ins_data_numerical[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]
dep_ins_data=ins_data_numerical[['charges']]

In [7]:
#Splitting data into train and test
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(indep_ins_Data,dep_ins_data,test_size=1/3,random_state=0)

In [8]:
#Applying standardization to bring data into same scale
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)

In [9]:
#Building a Grid search cross validated Decision tree model using train data
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
parameter_grid={'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'], 'max_features':['auto','sqrt','log2'], 'splitter':['best','random']}
gridSearch_DtModel=GridSearchCV(DecisionTreeRegressor(), parameter_grid, refit=True, verbose=3, n_jobs=-1)
gridSearch_DtModel=gridSearch_DtModel.fit(X_train, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


In [10]:
#Getting the best params from the grid and displaying the same
result=gridSearch_DtModel.cv_results_
print("The best parameter for which R_score value would be high is{}:".format(gridSearch_DtModel.best_params_))

The best parameter for which R_score value would be high is{'criterion': 'squared_error', 'max_features': 'auto', 'splitter': 'best'}:


In [11]:
#Creating a table and displaying the results
table=pd.DataFrame().from_dict(result)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.009104,0.00101,0.001603,0.003205,squared_error,auto,best,"{'criterion': 'squared_error', 'max_features':...",0.708347,0.510887,0.798292,0.633189,0.647724,0.659688,0.09441,1
1,0.006141,0.003139,0.003215,0.003055,squared_error,auto,random,"{'criterion': 'squared_error', 'max_features':...",0.538085,0.626312,0.634287,0.644144,0.623242,0.613214,0.038255,7
2,0.00719,0.005967,0.0036,0.004033,squared_error,sqrt,best,"{'criterion': 'squared_error', 'max_features':...",0.619703,0.55589,0.549744,0.626418,0.70178,0.610707,0.055396,9
3,0.003589,0.001862,0.003679,0.001016,squared_error,sqrt,random,"{'criterion': 'squared_error', 'max_features':...",0.560363,0.505091,0.689262,0.541072,0.564641,0.572086,0.062247,18
4,0.004851,0.001014,0.002666,0.001046,squared_error,log2,best,"{'criterion': 'squared_error', 'max_features':...",0.576994,0.572226,0.688197,0.572245,0.663761,0.614684,0.05067,6
5,0.002937,0.000879,0.001268,0.001161,squared_error,log2,random,"{'criterion': 'squared_error', 'max_features':...",0.689232,0.47044,0.604776,0.496421,0.601773,0.572528,0.079661,17
6,0.0,0.0,0.0,0.0,friedman_mse,auto,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.692959,0.490227,0.763978,0.609323,0.668736,0.645044,0.09192,2
7,0.003125,0.00625,0.0,0.0,friedman_mse,auto,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.687227,0.556639,0.688905,0.516765,0.704147,0.630737,0.07803,3
8,0.006249,0.007654,0.0,0.0,friedman_mse,sqrt,best,"{'criterion': 'friedman_mse', 'max_features': ...",0.582684,0.571598,0.712563,0.583092,0.614887,0.612965,0.051853,8
9,0.0,0.0,0.0,0.0,friedman_mse,sqrt,random,"{'criterion': 'friedman_mse', 'max_features': ...",0.563502,0.622233,0.662111,0.653209,0.599745,0.62016,0.036009,4


In [12]:
#Getting the inputs from the user and predicting for future
age_input=float(input("Age:"))
bmi_input=float(input("BMI:"))
children_input=float(input("Children:"))
sex_male_input=int(input("Sex Male 0 or 1:"))
smoker_yes_input=int(input("Smoker Yes 0 or 1:"))

Age:30
BMI:20
Children:0
Sex Male 0 or 1:1
Smoker Yes 0 or 1:0


In [13]:
Future_Prediction=gridSearch_DtModel.predict([[age_input,bmi_input,children_input,sex_male_input,smoker_yes_input]])
print("Future_Prediction={}".format(Future_Prediction))

Future_Prediction=[16085.1275]
