<a href="https://colab.research.google.com/github/WoradeeKongthong/medical_cost_regression/blob/master/06_Medical_Cost_Decision_Tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd

In [0]:
## importing the dataset

df = pd.read_csv('/content/drive/My Drive/life-long learning/MyProjects/Medical Cost/cleaned_insurance.csv', usecols=[0,1,2,3,4,5,6])

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,True,southwest,16884.924
1,18,male,33.77,1,False,southeast,1725.5523
2,28,male,33.0,3,False,southeast,4449.462
3,33,male,22.705,0,False,northwest,21984.47061
4,32,male,28.88,0,False,northwest,3866.8552


In [0]:
## Data Preprocessing

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [5]:
print(X)

[[19 'female' 27.9 0 True 'southwest']
 [18 'male' 33.77 1 False 'southeast']
 [28 'male' 33.0 3 False 'southeast']
 ...
 [18 'female' 36.85 0 False 'southeast']
 [21 'female' 25.8 0 False 'southwest']
 [61 'female' 29.07 0 True 'northwest']]


In [6]:
print(y)

[16884.924   1725.5523  4449.462  ...  1629.8335  2007.945  29141.3603]


In [0]:
# encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'), [1,4,5])], remainder='passthrough')
X = ct.fit_transform(X)

In [8]:
print(X[:5])

[[0.0 1.0 0.0 0.0 1.0 19 27.9 0]
 [1.0 0.0 0.0 1.0 0.0 18 33.77 1]
 [1.0 0.0 0.0 1.0 0.0 28 33.0 3]
 [1.0 0.0 1.0 0.0 0.0 33 22.705 0]
 [1.0 0.0 1.0 0.0 0.0 32 28.88 0]]


In [0]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [38]:
# Training the Decision Tree model on the Training set
from sklearn.tree import DecisionTreeRegressor

regressor = DecisionTreeRegressor(criterion='mse', splitter='best', max_features='auto', max_depth=5)
regressor.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=5,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [39]:
# Predicting the Test set results
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1)[:10])

[[ 1837.85  2200.83]
 [ 8324.53  4500.34]
 [ 1837.85  1628.47]
 [ 6487.59  4239.89]
 [43835.6  41919.1 ]
 [ 4604.56  2850.68]
 [14991.62 12142.58]
 [ 9515.7   9361.33]
 [ 6681.58 24671.66]
 [ 6487.59  6775.96]]


In [40]:
# Model Evaluation
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from math import sqrt
n = X_test.shape[0]
p = X_test.shape[1]
r_square = r2_score(y_test, y_pred)
adj_r_square = 1 - (1 - r_square) * ((n - 1) / (n - p - 1)) 
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mean_squared_error(y_test, y_pred))

print("R-square : ",r_square)
print("Adjusted R-square : ",adj_r_square)
print("MAE : ", mae)
print("MSE : ",mse)
print("RMSE : ",rmse)

R-square :  0.840570230254406
Adjusted R-square :  0.8356457586020325
MAE :  2716.937386277255
MSE :  22958287.12831985
RMSE :  4791.480682244252


In [41]:
# Applying K-Fold Cross Validation
from sklearn.model_selection import cross_val_score
y_train = y_train.reshape(len(y_train),)
accuracies = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10)
print("Accuracy: {}".format(accuracies.mean()))
print("Standard Deviation: {}".format(accuracies.std()))

Accuracy: 0.8350922835031925
Standard Deviation: 0.057692244353634514


In [36]:
# Improving the model

# Grid Search
from sklearn.model_selection import GridSearchCV
parameters = [{'criterion':['mse','friedman_mse'],'splitter':['best','random'],'max_features':['auto','sqrt','log2'],'max_depth':[3,5,7,9]}]
grid_search = GridSearchCV(estimator = regressor,
                          param_grid = parameters,
                          cv = 10)
grid_search = grid_search.fit(X_train,y_train)

best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy : {}".format(best_accuracy))
print("Best Parameters : ", best_parameters)

Best Accuracy : 0.835071481793929
Best Parameters :  {'criterion': 'mse', 'max_depth': 5, 'max_features': 'auto', 'splitter': 'best'}


In [42]:
#-----------------training evaluation--------------------------------
n = X_train.shape[0]
p = X_train.shape[1]
r_square = r2_score(y_train, regressor.predict(X_train))
adj_r_square = 1 - (1 - r_square) * ((n - 1) / (n - p - 1)) 
mae = mean_absolute_error(y_train,  regressor.predict(X_train))
mse = mean_squared_error(y_train, regressor.predict(X_train))
rmse = sqrt(mean_squared_error(y_train,  regressor.predict(X_train)))

print("R-square : ",r_square)
print("Adjusted R-square : ",adj_r_square)
print("MAE : ", mae)
print("MSE : ",mse)
print("RMSE : ",rmse)

R-square :  0.885061717890517
Adjusted R-square :  0.8841950767436029
MAE :  2310.9091673664225
MSE :  16916420.66452167
RMSE :  4112.957654112387
