In [1]:
import pandas as pd

In [2]:
#read dataset
dataset=pd.read_csv("insurance_pre.csv")
dataset

Unnamed: 0,age,sex,bmi,children,smoker,charges
0,19,female,27.900,0,yes,16884.92400
1,18,male,33.770,1,no,1725.55230
2,28,male,33.000,3,no,4449.46200
3,33,male,22.705,0,no,21984.47061
4,32,male,28.880,0,no,3866.85520
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,10600.54830
1334,18,female,31.920,0,no,2205.98080
1335,18,female,36.850,0,no,1629.83350
1336,21,female,25.800,0,no,2007.94500


In [3]:
dataset=pd.get_dummies(dataset,drop_first=True)
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [4]:
independent=dataset[['age','bmi','children','sex_male','smoker_yes']]
dependent=dataset['charges']

In [5]:
#split into training set and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(independent, dependent, test_size=1/3, random_state=0)


In [6]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)


In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor 
param_grid={'criterion':['mse','mae','friedman_mse'],'max_features': ['auto','sqrt','log2'],'splitter':['best','random']}


In [8]:
#gridsearch cv hyperparameter tuning model
#systematically search for the best combination for hyperparameters given model
grid=GridSearchCV(DecisionTreeRegressor(),param_grid,refit=True,verbose=3,n_jobs=-1)
grid.fit(X_train,y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done  39 out of  54 | elapsed:    3.7s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:    3.7s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae', 'friedman_mse'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['best', 'random']},
             p

In [9]:
#print best parameter after tuning
print(grid.best_params_)

{'criterion': 'mse', 'max_features': 'auto', 'splitter': 'random'}


In [10]:
#(cv_results_) contains a dictionary of results from the cross-validation search.
re=grid.cv_results_
print("The R_score value for best parameter is {}:".format(grid.best_params_))


The R_score value for best parameter is {'criterion': 'mse', 'max_features': 'auto', 'splitter': 'random'}:


In [11]:
#convert table
table=pd.DataFrame.from_dict(re)
table

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_features,param_splitter,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.006622,0.001521,0.00179,0.000473,mse,auto,best,"{'criterion': 'mse', 'max_features': 'auto', '...",0.536884,0.680679,0.686407,0.634547,0.069214,7
1,0.00335,0.002475,0.00525,0.004062,mse,auto,random,"{'criterion': 'mse', 'max_features': 'auto', '...",0.726168,0.65377,0.684487,0.688184,0.02968,1
2,0.004281,0.00253,0.002727,0.001972,mse,sqrt,best,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.437119,0.659998,0.697542,0.598039,0.115004,11
3,0.005035,0.000627,0.002367,0.001674,mse,sqrt,random,"{'criterion': 'mse', 'max_features': 'sqrt', '...",0.452546,0.616851,0.564672,0.544587,0.06858,16
4,0.005751,0.001531,0.002731,0.001417,mse,log2,best,"{'criterion': 'mse', 'max_features': 'log2', '...",0.687809,0.662964,0.686056,0.678953,0.011319,2
5,0.004033,0.000649,0.002117,0.001516,mse,log2,random,"{'criterion': 'mse', 'max_features': 'log2', '...",0.501273,0.612895,0.590082,0.568008,0.048176,13
6,0.023871,0.004845,0.004621,0.005429,mae,auto,best,"{'criterion': 'mae', 'max_features': 'auto', '...",0.663261,0.625392,0.695729,0.661463,0.028727,3
7,0.022973,0.001677,0.001762,0.000609,mae,auto,random,"{'criterion': 'mae', 'max_features': 'auto', '...",0.605465,0.658801,0.671192,0.645108,0.028531,5
8,0.018863,0.00239,0.002493,0.001058,mae,sqrt,best,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.581318,0.63938,0.609125,0.609909,0.023717,10
9,0.012693,0.001657,0.001112,8.1e-05,mae,sqrt,random,"{'criterion': 'mae', 'max_features': 'sqrt', '...",0.478468,0.576573,0.64515,0.566632,0.068428,15


In [12]:
grid_predictions=grid.predict(X_test)
grid_predictions

array([ 9748.9106  ,  8930.93455 , 47269.854   , 13143.33665 ,
        9264.797   ,  4399.731   ,  2196.4732  , 10848.1343  ,
        7418.522   ,  5253.524   ,  6184.2994  , 30284.64294 ,
        6849.026   ,  4571.41305 , 35147.52848 , 10600.5483  ,
       12142.5786  ,  3292.52985 ,  6455.86265 , 33750.2918  ,
       17904.52705 , 10806.839   ,  9625.92    , 24535.69855 ,
        2250.8352  ,  4673.3922  ,  3161.454   ,  7954.517   ,
        3757.8448  ,  8027.968   ,  7954.517   , 48673.5588  ,
       13405.3903  , 20781.48892 , 15359.1045  ,  3554.203   ,
        8733.22925 , 44585.45587 , 38415.474   ,  1880.07    ,
        5266.3656  ,  2866.091   , 21082.16    , 49577.6624  ,
       34617.84065 ,  3579.8287  , 10600.5483  ,  5966.8874  ,
        4719.52405 , 12044.342   , 13126.67745 ,  2331.519   ,
       24535.69855 , 46661.4424  , 12235.8392  ,  2689.4954  ,
        1981.5819  ,  8442.667   ,  6849.026   , 14349.8544  ,
        1252.407   , 47291.055   , 15612.19335 , 25333.

In [13]:
# print classification report
from sklearn.metrics import r2_score
r_score=r2_score(y_test,grid_predictions)
print("The R_score value for best parameter{}:".format(grid.best_params_),r_score)

The R_score value for best parameter{'criterion': 'mse', 'max_features': 'auto', 'splitter': 'random'}: 0.7465522816362338


In [14]:
Future_Prediction=grid.predict([[38,46.7,2,1,1]])
#change the paramter,play with it.
print("Future_Prediction={}".format(Future_Prediction))


Future_Prediction=[36580.28216]


In [15]:
r_score

0.7465522816362338

In [16]:
# Save the model
import pickle
filename="final_DR_model.sav"
pickle.dump(grid,open(filename,"wb"))

In [17]:
loaded_model=pickle.load(open("final_DR_model.sav","rb"))
loaded_model
                         

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=DecisionTreeRegressor(criterion='mse', max_depth=None,
                                             max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort=False, random_state=None,
                                             splitter='best'),
             iid='warn', n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae', 'friedman_mse'],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'splitter': ['best', 'random']},
             p