In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
dataset = pd.read_csv("insurance_pre.csv")

In [3]:
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [4]:
dataset = pd.get_dummies(dataset, drop_first=True)

In [5]:
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [6]:
dataset.columns

Index(['age', 'bmi', 'children', 'charges', 'sex_male', 'smoker_yes'], dtype='object')

In [7]:
independent = dataset[['age', 'bmi', 'children', 'sex_male', 'smoker_yes']]

In [8]:
independent

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [9]:
dependent = dataset[['charges']]

In [10]:
dependent

Unnamed: 0,charges
0,16884.92400
1,1725.55230
2,4449.46200
3,21984.47061
4,3866.85520
...,...
1333,10600.54830
1334,2205.98080
1335,1629.83350
1336,2007.94500


In [11]:
from sklearn.tree import DecisionTreeRegressor

In [14]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    "criterion": ["mae"],
    "max_depth": [3, 4, 5],
    "min_samples_split": [2, 3, 4],
    "max_features": ["auto", "sqrt", "log2"],
    "min_weight_fraction_leaf": [0.0, 0.01, 0.05],
}
grid = GridSearchCV(DecisionTreeRegressor(), param_grid, refit = True, verbose = 3, n_jobs = -1)

#fitting the model for grid search
grid.fit(independent, dependent)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 108 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 130 out of 243 | elapsed:    4.6s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done 243 out of 243 | elapsed:    4.7s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['mae'], 'max_depth': [3, 4, 5],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_split': [2, 3, 4],
              

In [15]:
#print best parameter after tuning
#print(grid, best_params_)
re = grid.cv_results_

print("The R_score value for best parameter {}:". format(grid.best_params_))


The R_score value for best parameter {'criterion': 'mae', 'max_depth': 5, 'max_features': 'auto', 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.01}:


In [16]:
table = pd.DataFrame.from_dict(re)

In [17]:
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_min_samples_split,param_min_weight_fraction_leaf,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.026545,0.003862,0.000000,0.000000,mae,3,auto,2,0,"{'criterion': 'mae', 'max_depth': 3, 'max_feat...",0.848083,0.838964,0.823073,0.836707,0.010334,19
1,0.028345,0.002321,0.000000,0.000000,mae,3,auto,2,0.01,"{'criterion': 'mae', 'max_depth': 3, 'max_feat...",0.848083,0.838964,0.823073,0.836707,0.010334,19
2,0.026670,0.003771,0.005333,0.003771,mae,3,auto,2,0.05,"{'criterion': 'mae', 'max_depth': 3, 'max_feat...",0.847923,0.837327,0.817121,0.834124,0.012777,25
3,0.026184,0.004158,0.000000,0.000000,mae,3,auto,3,0,"{'criterion': 'mae', 'max_depth': 3, 'max_feat...",0.848083,0.838964,0.823073,0.836707,0.010334,19
4,0.028320,0.003054,0.002667,0.003772,mae,3,auto,3,0.01,"{'criterion': 'mae', 'max_depth': 3, 'max_feat...",0.848083,0.838964,0.823073,0.836707,0.010334,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76,0.016248,0.001666,0.002942,0.000182,mae,5,log2,3,0.01,"{'criterion': 'mae', 'max_depth': 5, 'max_feat...",0.840962,0.798349,0.809661,0.816324,0.018023,28
77,0.015995,0.003139,0.002077,0.000981,mae,5,log2,3,0.05,"{'criterion': 'mae', 'max_depth': 5, 'max_feat...",0.827714,0.755105,0.638978,0.740599,0.077731,41
78,0.015080,0.002303,0.001614,0.001187,mae,5,log2,4,0,"{'criterion': 'mae', 'max_depth': 5, 'max_feat...",0.556855,0.835875,0.607658,0.666796,0.121342,48
79,0.015788,0.005401,0.001876,0.001475,mae,5,log2,4,0.01,"{'criterion': 'mae', 'max_depth': 5, 'max_feat...",0.852471,0.742043,0.758160,0.784225,0.048704,33


In [18]:
age_input = float(input("Age:"))
bmi_input = float(input("BMI:"))
children_input = float(input("children:"))
sex_male_input = int(input("Sex Male 0 or 1:"))
smoker_yes_input = int(input("Smoker Yes 0 or 1:"))

Age:23
BMI:45
children:0
Sex Male 0 or 1:1
Smoker Yes 0 or 1:0


In [19]:
future_prediction = grid.predict([[age_input, bmi_input, children_input, sex_male_input,smoker_yes_input]])
print("Future_prediction={}".format(future_prediction))

Future_prediction=[2901.197925]
