In [1]:
# General libraries
import pandas as pd
import numpy as np

# Scikit Learn libraries
from sklearn import ensemble
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Scipy libraries
from scipy import stats
import joblib
import warnings
warnings.filterwarnings('ignore')

# Utils functions
from utils.utils import kfold, five_two

# Load Dataset

In [2]:
folder_path = "../data/"

data_path = folder_path + "complex_processed_data.csv"
standardized_data_path = folder_path + 'complex_processed_standardized_data.csv'
standardized_poutliers_removed_data_path = folder_path + 'complex_processed_standardized_outliers_removed_data.csv'

df_solubility = pd.read_csv(standardized_data_path)

# Process Dataset

Process Dataset before the model creation.
The following actions were done:
* Split the independent variable from the dependent ones;
* Split Dataset for training and testing.

In [3]:
# Split dataset into X and Y for machine learning

df_sol_X = df_solubility.copy()
df_sol_X.drop(columns=['solubility'], axis=1, inplace=True)

df_sol_y = df_solubility[['solubility']]

In [4]:
x_train, x_test, y_train, y_test = train_test_split(
                        df_sol_X, df_sol_y, 
                        train_size = 0.8,
                        test_size = 0.2,
                        random_state = 10
                        )

# Gradient Boosting regression - XGBoost

In [5]:
# The cross validation scheme to be used for train and test
folds = kfold()

In [6]:
# Create basic xgboost model

params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error",
}

xgboost = ensemble.GradientBoostingRegressor(**params)
xgboost.fit(x_train, y_train)

r2 = r2_score(y_test, xgboost.predict(x_test))
print("The r2 score on test set: {:.4f}".format(r2))

The r2 score on test set: 0.1356


## GridSearch

In [8]:
# Specify range of hyperparameters to tune
hyper_params = {
    'n_estimators':[100, 200, 300, 350, 400, 500],
    'max_depth':[2, 3, 4, 5],
    "min_samples_split": [1,2,3,4],
    "min_samples_leaf": [1,1.5,2],
    "learning_rate": [0.01,0.02,0.03,0.4,0.5],
    "loss": ["squared_error", "absolute_error", "huber", "quantile"],
    "criterion": ["friedman_mse", "squared_error", "mse"],
    }


# Call GridSearchCV()
model_cv = GridSearchCV(
    estimator = ensemble.GradientBoostingRegressor(),
    param_grid = hyper_params,
    scoring= 'r2',
    cv = folds,
    verbose = 1,
    return_train_score=True,
    n_jobs = -1,
    refit = True
    )


# Fit the model
best_model = model_cv.fit(x_train, np.ravel(y_train)) 

print(model_cv.best_params_)

Fitting 10 folds for each of 17280 candidates, totalling 172800 fits
{'criterion': 'mse', 'learning_rate': 0.03, 'loss': 'squared_error', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}


In [None]:
# Create new model with best_params_ from grid search
# Use cross validation on the best_params_ model


xgboost_best = ensemble.GradientBoostingRegressor(
    n_estimators=model_cv.best_params_['n_estimators'],
    max_depth=model_cv.best_params_['max_depth'],
    min_samples_split=model_cv.best_params_['min_samples_split'],
    min_samples_leaf=model_cv.best_params_['min_samples_leaf'],
    learning_rate=model_cv.best_params_['learning_rate'],
    loss=model_cv.best_params_['loss']
    )

xgboost_best.fit(x_train, y_train)

In [10]:
r2 = r2_score(y_test, xgboost_best.predict(x_test))
print("The r2 score on test set: {:.4f}".format(r2))

The r2 score on test set: 0.2585


# Saving trained model

In [11]:
filename = '../models/xgboost_model.joblib'
joblib.dump(xgboost_best, filename)

['../models/xgboost_model.joblib']

# Compare before and after gridSearch

In [11]:
five_two(
    reg1=xgboost,
    reg2=xgboost_best,
    X=df_sol_X,
    y=df_sol_y
)

Fold  1 score difference = -0.003338
Fold  2 score difference = -0.063062
Fold  1 score difference = -0.141471
Fold  2 score difference = -0.010327
Fold  1 score difference = -0.126887
Fold  2 score difference = -0.056949
Fold  1 score difference = -0.031220
Fold  2 score difference = -0.060850
Fold  1 score difference = -0.040075
Fold  2 score difference = -0.132171
Regression 1 mean score and stdev : 0.131289 + 0.050382
Regression 2 mean score and stdev : 0.197924 + 0.039552
Score difference mean + stdev : -0.066635 + 0.047752
t_value for the current test is -0.056402


In [12]:
from sklearn.model_selection import cross_val_score
cross_val_score(estimator=xgboost_best, X=df_sol_X, y=df_sol_y, cv=folds, scoring='r2')

array([ 0.20218046,  0.25680103,  0.24075854,  0.34294074,  0.24914948,
        0.26067139, -0.15171532,  0.15028745,  0.26910739,  0.20538644])

In [13]:
cross_val_score(estimator=xgboost, X=df_sol_X, y=df_sol_y, cv=folds, scoring='r2')

array([ 0.10321528,  0.15026094,  0.18005148,  0.12085832,  0.21122356,
        0.22748294, -0.30682261,  0.10017498,  0.1331599 ,  0.15394367])

In [14]:
five_two(
    reg1=xgboost,
    reg2=xgboost_best,
    X=x_test,
    y=y_test
)

Fold  1 score difference = -0.050991
Fold  2 score difference = -0.029920
Fold  1 score difference = 0.092379
Fold  2 score difference = -0.093822
Fold  1 score difference = -0.336986
Fold  2 score difference = -0.150148
Fold  1 score difference = 0.086856
Fold  2 score difference = -0.077419
Fold  1 score difference = -0.213519
Fold  2 score difference = -0.030706
Regression 1 mean score and stdev : 0.004064 + 0.199994
Regression 2 mean score and stdev : 0.084492 + 0.168357
Score difference mean + stdev : -0.080428 + 0.123418
t_value for the current test is -0.446487


In [15]:
cross_val_score(estimator=xgboost_best, X=x_test, y=y_test, cv=folds, scoring='r2')

array([-0.44776424,  0.47917305,  0.47277477, -0.37593265, -0.46241663,
        0.53907536,  0.15501931, -0.20648296,  0.04277133, -0.37305712])

In [16]:
cross_val_score(estimator=xgboost, X=x_test, y=y_test, cv=folds, scoring='r2')

array([-0.36968716,  0.50656648,  0.37753683, -0.35287715, -0.89388905,
        0.61267725,  0.27556721, -0.52265983, -0.09172963, -0.50688134])