<a href="https://colab.research.google.com/github/WoradeeKongthong/medical_cost_regression/blob/master/08_Medical_Cost_XGBRegressor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd

In [0]:
## Data Preprocessing

# Load the cleaned insurance data, keeping only the first seven CSV columns.
# NOTE(review): the path points into a Google Drive folder under /content/drive;
# no mount step is visible in this notebook, so presumably the drive has to be
# mounted beforehand -- confirm.
DATA_PATH = '/content/drive/My Drive/life-long learning/MyProjects/Medical Cost/cleaned_insurance.csv'
df = pd.read_csv(DATA_PATH, usecols=list(range(7)))

In [3]:
# Quick look at the first five rows of the loaded frame
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,True,southwest,16884.924
1,18,male,33.77,1,False,southeast,1725.5523
2,28,male,33.0,3,False,southeast,4449.462
3,33,male,22.705,0,False,northwest,21984.47061
4,32,male,28.88,0,False,northwest,3866.8552


In [0]:
# Every column except the last one is a feature; the last column (charges) is the target.
X = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [5]:
# Inspect the raw feature matrix (columns: age, sex, bmi, children, smoker, region);
# NumPy abbreviates the middle rows of a long array with "..." when printing.
print(X)

[[19 'female' 27.9 0 True 'southwest']
 [18 'male' 33.77 1 False 'southeast']
 [28 'male' 33.0 3 False 'southeast']
 ...
 [18 'female' 36.85 0 False 'southeast']
 [21 'female' 25.8 0 False 'southwest']
 [61 'female' 29.07 0 True 'northwest']]


In [6]:
# Inspect the target vector (the last column of df, i.e. the charges).
print(y)

[16884.924   1725.5523  4449.462  ...  1629.8335  2007.945  29141.3603]


In [0]:
# encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Positions of the categorical features (sex, smoker, region) in the raw
# feature matrix, i.e. in every column of df except the last one.
categorical_columns = [1, 4, 5]

# One-hot encode the categorical columns (dropping the first level of each to
# avoid redundant dummies) and pass the remaining columns through unchanged.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(drop='first'), categorical_columns)],
    remainder='passthrough',
)

# Encode from the untouched DataFrame instead of from X itself: the previous
# `X = ct.fit_transform(X)` was not idempotent -- re-running the cell fed the
# already-encoded matrix back in, so positions 1/4/5 pointed at the wrong
# columns and the features were silently corrupted.
X = ct.fit_transform(df.iloc[:, :-1].to_numpy())

In [8]:
# Preview the first five encoded rows: the one-hot dummy columns come first,
# followed by the columns that were passed through unchanged.
print(X[:5])

[[0.0 1.0 0.0 0.0 1.0 19 27.9 0]
 [1.0 0.0 0.0 1.0 0.0 18 33.77 1]
 [1.0 0.0 0.0 1.0 0.0 28 33.0 3]
 [1.0 0.0 1.0 0.0 0.0 33 22.705 0]
 [1.0 0.0 1.0 0.0 0.0 32 28.88 0]]


In [0]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is seeded so that
# "Restart & Run All" reproduces the same partition (and therefore the same
# metrics); without random_state every run shuffles the rows differently.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [53]:
# Training XGBoost on the Training set
from xgboost import XGBRegressor

# Hyper-parameters of the baseline model, gathered in one place.
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=20,
    learning_rate=0.3,
    max_depth=3,
)
regressor = XGBRegressor(**xgb_params)

# Last expression of the cell: fits the model and displays its repr.
regressor.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.3, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=20,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [54]:
# Predicting the Test set results
y_pred = regressor.predict(X_test)

# Show predictions (left column) next to the true charges (right column)
# for the first ten test rows, rounded to two decimals.
# Note: set_printoptions changes NumPy's print precision for the whole session.
np.set_printoptions(precision=2)
predicted_column = y_pred.reshape(-1, 1)
actual_column = y_test.reshape(-1, 1)
print(np.concatenate((predicted_column, actual_column), axis=1)[:10])

[[ 9628.74  7441.5 ]
 [ 8287.28  7261.74]
 [ 8547.56  6414.18]
 [ 9858.82  8827.21]
 [27240.51 28950.47]
 [11122.28  9800.89]
 [10229.56  9288.03]
 [ 7428.43  6551.75]
 [13313.48 13063.88]
 [13759.23 36910.61]]


In [55]:
# Model Evaluation
from math import sqrt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# n = number of test samples, p = number of predictors (columns of X_test)
n, p = X_test.shape

r_square = r2_score(y_test, y_pred)
# Adjusted R-square: R-square corrected for the number of predictors
penalty = (n - 1) / (n - p - 1)
adj_r_square = 1 - (1 - r_square) * penalty
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)

# Report every metric on its own line
for label, value in (
    ("R-square : ", r_square),
    ("Adjusted R-square : ", adj_r_square),
    ("MAE : ", mae),
    ("MSE : ", mse),
    ("RMSE : ", rmse),
):
    print(label, value)

R-square :  0.8036815803683599
Adjusted R-square :  0.7976176909588883
MAE :  2792.8236628224113
MSE :  28105179.994468458
RMSE :  5301.431881526769


In [56]:
# Applying K-Fold Cross Validation
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training set. For a regressor the score is
# R-square, not a classification accuracy, so the metric is requested
# explicitly and the report is labelled accordingly. The variable keeps its
# historical name `accuracies`, but it holds one R-square value per fold.
accuracies = cross_val_score(
    estimator=regressor, X=X_train, y=y_train, cv=10, scoring='r2'
)
print("Mean CV R-square: {}".format(accuracies.mean()))
print("Standard Deviation: {}".format(accuracies.std()))

Accuracy: 0.8761244503107577
Standard Deviation: 0.04252241114987262


In [58]:
# Improving the model

# Grid Search
from sklearn.model_selection import GridSearchCV

# Candidate regularisation strengths to try for the model (5 x 5 combinations).
parameters = {
    'reg_alpha': [0, 0.1, 0.3, 1, 3],
    'reg_lambda': [0, 0.1, 0.3, 1, 3],
}

# Exhaustive search with 10-fold cross-validation on the training set.
# The score being optimised is R-square (a regression metric), requested
# explicitly here; it was previously reported under the label "Accuracy".
grid_search = GridSearchCV(
    estimator=regressor,
    param_grid=parameters,
    scoring='r2',
    cv=10,
)
grid_search.fit(X_train, y_train)

# `best_accuracy` keeps its historical name but holds the best mean CV R-square.
# Note: `regressor` itself is not modified by the search; the tuned model is
# available as grid_search.best_estimator_.
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best CV R-square : {}".format(best_accuracy))
print("Best Parameters : ", best_parameters)

Best Accuracy : 0.8761245505217563
Best Parameters :  {'reg_alpha': 3, 'reg_lambda': 1}


In [57]:
#-----------------training evaluation--------------------------------
# Computes R-square, adjusted R-square, MAE, MSE and RMSE of `regressor` on
# the training data. Relies on r2_score, mean_absolute_error,
# mean_squared_error and sqrt having been imported by an earlier cell.

# Predict once and reuse the result; previously the model was asked for the
# same training predictions four times and the MSE was computed twice.
y_train_pred = regressor.predict(X_train)

# n = number of training samples, p = number of predictors
n = X_train.shape[0]
p = X_train.shape[1]
r_square = r2_score(y_train, y_train_pred)
# Adjusted R-square: R-square corrected for the number of predictors
adj_r_square = 1 - (1 - r_square) * ((n - 1) / (n - p - 1))
mae = mean_absolute_error(y_train, y_train_pred)
mse = mean_squared_error(y_train, y_train_pred)
rmse = sqrt(mse)

print("R-square : ",r_square)
print("Adjusted R-square : ",adj_r_square)
print("MAE : ", mae)
print("MSE : ",mse)
print("RMSE : ",rmse)

R-square :  0.9004480518606484
Adjusted R-square :  0.8996974245419728
MAE :  2004.8648965279424
MSE :  14664977.386843367
RMSE :  3829.4878752704476
