## Import libraries

In [None]:
# import libraries
import data_handler as dh
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV
from joblib import dump
import os

import numpy as np
from sklearn.metrics import mean_squared_error

In [None]:
# Fetch the four objects returned by the project's data_handler helper and
# bind them to the train/test names used by every cell below.
# NOTE(review): presumably a train/test split of features and targets --
# confirm against data_handler.get_data.
csv_path = "./insurance.csv"
x_train, x_test, y_train, y_test = dh.get_data(csv_path)

## Get the best Decision Tree Regression model

In [None]:
# Hyper-parameter grid searched exhaustively for the decision tree.
param_grid = {
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 56],
    'ccp_alpha': np.arange(0, 5, 0.5),
}

decisionTreeRegressor = DecisionTreeRegressor(random_state=0)
# 10-fold cross-validated grid search; no `scoring` is given, so candidates
# are ranked by the estimator's own default score.
gs = GridSearchCV(decisionTreeRegressor, param_grid, cv=10)
gs = gs.fit(x_train, y_train)

decision_tree_model = gs.best_estimator_
print("Best estimator found by grid search:", decision_tree_model)

Train_pred_y = gs.predict(x_train)
Test_pred_y = gs.predict(x_test)

# RMSE computed as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly works on every version.
# NOTE(review): equivalent to the old call for a single regression target.
train_error = np.sqrt(mean_squared_error(y_train, Train_pred_y))
test_error = np.sqrt(mean_squared_error(y_test, Test_pred_y))

print('Train Error {:0.2f}'.format(train_error))
print('Test Error {:0.2f}'.format(test_error))

## Get the best GradientBoostingRegressor model

In [None]:
# Hyper-parameter grid for gradient boosting. Only values the estimator
# accepts are listed; the previous grid contained candidates that can never
# fit and only produced failed fits / NaN scores:
#   * subsample must lie in (0, 1]  (was np.arange(-3, 3, 0.5))
#   * criterion accepts only 'friedman_mse' and 'squared_error'
#     (the old aliases 'mse' and 'mae' were removed from scikit-learn)
#   * alpha must lie strictly inside (0, 1)  (was reaching 1.0)
# NOTE(review): this is still an exhaustive search over several hundred
# thousand candidates times 10 folds. Use the reduced grid below (or a
# randomized search) when a full run is not affordable.
param_grid = {
    'loss': ['squared_error', 'absolute_error', 'huber', 'quantile'],
    'learning_rate': np.exp(np.arange(-3, 3, 1)),
    'n_estimators': np.arange(50, 300, 50),
    'subsample': [0.5, 0.75, 1.0],
    'criterion': ['friedman_mse', 'squared_error'],
    # alpha is only used by the 'huber' and 'quantile' losses.
    'alpha': [0.1, 0.4, 0.7],
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20],
    'ccp_alpha': np.arange(0, 5, 0.5),
}

# Reduced grid for quick experiments:
# param_grid = {
#     'learning_rate': np.exp(np.arange(-2, 2, 1)),
#     'criterion': ['friedman_mse', 'squared_error'],
#     'max_depth': [2, 3, 4, 5, 6],
#     'ccp_alpha': np.arange(0, 2, 0.5),
# }

model = GradientBoostingRegressor(random_state=0)
# 10-fold cross-validated grid search ranked by the estimator's default score.
gs = GridSearchCV(model, param_grid, cv=10)
gs = gs.fit(x_train, y_train)

gradient_boost_model = gs.best_estimator_
print("Best estimator found by grid search:", gradient_boost_model)

Train_pred_y = gs.predict(x_train)
Test_pred_y = gs.predict(x_test)

# RMSE computed as sqrt(MSE); the `squared=False` keyword is deprecated in
# scikit-learn 1.4 and removed in 1.6.
train_error = np.sqrt(mean_squared_error(y_train, Train_pred_y))
test_error = np.sqrt(mean_squared_error(y_test, Test_pred_y))

# RMSE is expressed in the target's units, not percent, so no '%' suffix.
print('Train Error {:0.2f}'.format(train_error))
print('Test Error {:0.2f}'.format(test_error))

## Get the best RandomForestRegressor model

In [None]:
# Settings shared by every random-forest candidate.
rf_shared_grid = {
    'criterion': ['absolute_error', 'squared_error', 'poisson'],
    'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 56],
    'ccp_alpha': np.arange(0, 5, 0.5),
}

# Out-of-bag scoring is only available when bootstrap=True, so the grid is
# split in two: crossing bootstrap=False with oob_score=True (as a single
# dict did before) only generates candidates whose fit raises.
param_grid = [
    {**rf_shared_grid, 'bootstrap': [True], 'oob_score': [True, False]},
    {**rf_shared_grid, 'bootstrap': [False], 'oob_score': [False]},
]

# Reduced grid for quick experiments:
# param_grid = {
#     'criterion': ['absolute_error', 'squared_error'],
#     'max_depth': [2, 3, 4, 5, 6],
#     'bootstrap': [True, False],
#     'ccp_alpha': np.arange(0, 2, 0.5),
# }

model = RandomForestRegressor(random_state=0)
# 10-fold cross-validated grid search ranked by the estimator's default score.
gs = GridSearchCV(model, param_grid, cv=10)
gs = gs.fit(x_train, y_train)

random_forest_model = gs.best_estimator_
print("Best estimator found by grid search:", random_forest_model)

Train_pred_y = gs.predict(x_train)
Test_pred_y = gs.predict(x_test)

# RMSE computed as sqrt(MSE); the `squared=False` keyword is deprecated in
# scikit-learn 1.4 and removed in 1.6.
train_error = np.sqrt(mean_squared_error(y_train, Train_pred_y))
test_error = np.sqrt(mean_squared_error(y_test, Test_pred_y))

# RMSE is expressed in the target's units, not percent, so no '%' suffix.
print('Train Error {:0.2f}'.format(train_error))
print('Test Error {:0.2f}'.format(test_error))

## Choose and Save Best model

In [None]:
# Pick the tuned model with the lowest RMSE on the held-out data.
# NOTE(review): selecting on x_test/y_test means the winning test error is an
# optimistic estimate; consider comparing cross-validation scores instead.
best_model = None
best_pred = np.inf  # lowest RMSE seen so far
for model in [decision_tree_model, gradient_boost_model, random_forest_model]:
    Test_pred_y = model.predict(x_test)
    # RMSE as sqrt(MSE); `squared=False` is deprecated in scikit-learn 1.4
    # and removed in 1.6.
    test_error = np.sqrt(mean_squared_error(y_test, Test_pred_y))
    # `<=` means that on an exact tie the later model in the list wins.
    if test_error <= best_pred:
        best_model, best_pred = model, test_error
        

In [None]:
# Save the selected model to disk with joblib.
path = './data/best_model.joblib'

# Refuse to persist a placeholder: best_model stays None when no candidate
# produced a comparable (non-NaN) error in the selection step.
if best_model is None:
    raise ValueError("No model was selected; nothing to save.")

# Make sure the target directory exists, otherwise dump() fails.
os.makedirs(os.path.dirname(path), exist_ok=True)

# Drop any previously saved model; a missing file is the only expected
# failure, so nothing else is swallowed.
try:
    os.remove(path)
except FileNotFoundError:
    pass
dump(best_model, path)
