In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Read the insurance CSV from the notebook's working directory.
csv_path = 'insurance_pre.csv'
dataset = pd.read_csv(csv_path)

In [3]:
# Bare last expression: the notebook renders the frame returned by read_csv.
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [4]:
# One-hot encode the non-numeric columns as 0/1 integers. drop_first=True
# drops the first level of each encoded column, so every two-level column
# becomes a single indicator column.
dataset = pd.get_dummies(
    data=dataset,
    drop_first=True,
    dtype=int,
)

In [5]:
# Bare last expression: show the frame after the get_dummies encoding step.
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [6]:
# Split the encoded frame into the predictor columns and the target column.
feature_columns = ['age', 'bmi', 'children', 'sex_male', 'smoker_yes']
target_column = 'charges'

indep = dataset[feature_columns]
dep = dataset[target_column]

In [7]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the raw predictor columns taken straight from `dataset`
# rather than on `indep` itself. The previous form (`indep = SC.fit_transform(indep)`)
# fed the cell's own output back in as input, so re-running the cell refitted
# SC on already-standardized values and made later SC.transform(...) calls on
# raw user input meaningless. Reading from `dataset` keeps the cell idempotent.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# fold statistics are shared between train and validation splits.
SC = StandardScaler()
indep = SC.fit_transform(dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']])

In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Hyper-parameter grid: 4 criteria x 2 splitters x 3 max_features = 24 candidates.
param_grid = {
    'criterion': ['poisson', 'squared_error', 'friedman_mse', 'absolute_error'],
    'splitter': ['random', 'best'],
    'max_features': [None, 'sqrt', 'log2']}

# Seed the tree: splitter='random' and max_features='sqrt'/'log2' draw random
# numbers, so without a fixed random_state the candidate ranking (and the
# model that gets pickled later) changes from one run to the next.
grid = GridSearchCV(
    DecisionTreeRegressor(random_state=0),
    param_grid,
    refit=True,   # refit the best candidate on all rows so predict() works
    verbose=3,
    n_jobs=-1,    # use every available core
)

# fit() returns the search object itself, so `regressor` and `grid` are the
# same fitted GridSearchCV instance.
regressor = grid.fit(indep, dep)

# Predictions on the same rows used for fitting (in-sample), not held-out data.
y_pred = regressor.predict(indep)
y_pred

Fitting 5 folds for each of 24 candidates, totalling 120 fits


array([16884.924 ,  1725.5523,  4449.462 , ...,  1629.8335,  2007.945 ,
       29141.3603])

In [9]:
from sklearn.metrics import r2_score

# Coefficient of determination between the observed target and y_pred.
# y_pred was produced by predicting on the same rows the search was fitted
# on, so this is an in-sample score.
r_score = r2_score(y_true=dep, y_pred=y_pred)
r_score

0.998667156135576

In [10]:
# Per-candidate cross-validation results from the fitted search. The name
# `re` is kept because a later cell reads it; note that it would shadow the
# standard-library `re` module if that were ever imported here.
re = grid.cv_results_

# The message announces a score, so print one: best_score_ is the mean
# cross-validated score of the best candidate (previously the line ended
# with a colon and no value).
print("The R_score value for best parameter {}: {}".format(grid.best_params_, grid.best_score_))

The R_score value for best parameter {'criterion': 'poisson', 'max_features': None, 'splitter': 'best'}:


In [11]:
# Turn the cv_results_ mapping (column name -> per-candidate values) into a
# DataFrame with one row per hyper-parameter combination.
table = pd.DataFrame(re)

In [12]:
# Bare last expression: render the cross-validation results table.
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008577,0.003133,0.00243,0.002255,poisson,,random,"{'criterion': 'poisson', 'max_features': None,...",0.699136,0.563067,0.704595,0.693111,0.694084,0.670799,0.054021,11
1,0.010583,0.002516,0.0016,0.000994,poisson,,best,"{'criterion': 'poisson', 'max_features': None,...",0.740656,0.660119,0.766234,0.736776,0.701817,0.72112,0.036758,1
2,0.002324,0.003001,0.006865,0.004141,poisson,sqrt,random,"{'criterion': 'poisson', 'max_features': 'sqrt...",0.688857,0.617165,0.723608,0.638047,0.559755,0.645486,0.056906,18
3,0.008087,0.001874,0.000811,0.001622,poisson,sqrt,best,"{'criterion': 'poisson', 'max_features': 'sqrt...",0.694136,0.587857,0.289817,0.751359,0.716335,0.607901,0.168139,23
4,0.009065,0.001807,0.001079,0.001322,poisson,log2,random,"{'criterion': 'poisson', 'max_features': 'log2...",0.649792,0.677655,0.66438,0.667496,0.674889,0.666842,0.009786,14
5,0.006853,0.003738,0.001407,0.002815,poisson,log2,best,"{'criterion': 'poisson', 'max_features': 'log2...",0.672774,0.53195,0.7008,0.675187,0.711554,0.658453,0.064964,17
6,0.006339,0.001582,0.0,0.0,squared_error,,random,"{'criterion': 'squared_error', 'max_features':...",0.695103,0.614745,0.70004,0.728361,0.708989,0.689447,0.039041,6
7,0.008684,0.002439,0.000202,0.000403,squared_error,,best,"{'criterion': 'squared_error', 'max_features':...",0.735635,0.658737,0.742996,0.724848,0.681467,0.708737,0.032865,3
8,0.006124,0.001877,0.001685,0.001773,squared_error,sqrt,random,"{'criterion': 'squared_error', 'max_features':...",0.731523,0.529259,0.617781,0.674702,0.486601,0.607973,0.090267,22
9,0.00528,0.002534,0.002282,0.001563,squared_error,sqrt,best,"{'criterion': 'squared_error', 'max_features':...",0.756182,0.450965,0.761558,0.688223,0.666715,0.664729,0.113129,15


In [13]:
import pickle

In [14]:
# Persist the fitted search object. The context manager guarantees the file
# handle is flushed and closed even if pickling raises; the previous bare
# open(...) call left the handle to be closed whenever it was garbage-collected.
filename = "DT_grid.sav"
with open(filename, "wb") as model_file:
    pickle.dump(regressor, model_file)

In [15]:
# Reload the persisted model, closing the file handle deterministically.
# SECURITY: pickle.load can execute arbitrary code; only load files that
# this notebook (or another trusted source) produced.
with open("DT_grid.sav", 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [39]:
def _ask(prompt, cast):
    """Prompt on stdin and convert the reply with `cast` (float or int)."""
    return cast(input(prompt))


# Collect one raw (unscaled) feature value per model input, in the same
# order as the training columns. The two flags are read as integers.
age = _ask("Age:", float)
bmi = _ask("BMI:", float)
children = _ask("Children:", float)
sex_male = _ask("Sex Male 0 or 1:", int)
smoker = _ask("Smoker Yes 0 or 1:", int)

Age: 25
BMI: 25
Children: 0
Sex Male 0 or 1: 1
Smoker Yes 0 or 1: 0


In [17]:
# Standardize the user-supplied row with the scaler fitted on the training
# features. The scaler was fitted on a DataFrame, so the row is passed as a
# one-row DataFrame with the same column names and order: this avoids
# scikit-learn's "X does not have valid feature names" warning and lets it
# verify that the columns line up with those seen during fit.
Preinput = SC.transform(
    pd.DataFrame(
        [[age, bmi, children, sex_male, smoker]],
        columns=['age', 'bmi', 'children', 'sex_male', 'smoker_yes'],
    )
)



In [18]:
# Bare last expression: show the standardized feature row returned by SC.transform.
Preinput

array([[-1.01155712, -0.92904899, -0.07876719, -1.0105187 ,  1.97058663]])

In [19]:
# Predict with the model reloaded from disk, using the standardized input
# row; predict returns an array with one value per input row.
Future_Prediction = loaded_model.predict(Preinput)

prediction_message = "Future_Prediction={}".format(Future_Prediction)
print(prediction_message)

Future_Prediction=[16577.7795]
