In [1]:
# Standard library
import os
import warnings

# General libraries
import pandas as pd
import numpy as np

# Modelling libraries (XGBoost / scikit-learn)
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Scipy and model persistence
from scipy import stats
import joblib

# Silence every warning for the rest of the notebook
warnings.filterwarnings('ignore')

# Utils functions
from utils.utils import kfold, five_two

# Load Dataset

In [2]:
# Directory that holds the prepared CSV files.
folder_path = "../data/"

# Candidate input files (the names suggest: processed, standardized, and
# standardized with outliers removed). Only the standardized file is loaded here.
data_path = f"{folder_path}complex_processed_data.csv"
standardized_data_path = f"{folder_path}complex_processed_standardized_data.csv"
standardized_poutliers_removed_data_path = (
    f"{folder_path}complex_processed_standardized_outliers_removed_data.csv"
)

# Read the standardized dataset into a DataFrame.
df_solubility = pd.read_csv(standardized_data_path)

# Process Dataset

Process the dataset before creating the model.
The following steps are performed:
* Split the independent variables (features) from the dependent variable (`solubility`);
* Split the dataset into training and test sets.

In [3]:
# Separate the target from the features for machine learning.

# Target: the 'solubility' column, kept as a one-column DataFrame.
df_sol_y = df_solubility[['solubility']]

# Features: every remaining column. drop() returns a new DataFrame,
# so df_solubility itself is left untouched.
df_sol_X = df_solubility.drop(columns=['solubility'])

In [4]:
# 80/20 train/test partition of features and target; the fixed random_state
# makes the partition repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    df_sol_X,
    df_sol_y,
    test_size=0.2,
    train_size=0.8,
    random_state=10,
)

# Gradient Boosting regression - XGBoost

In [5]:
# Cross-validation scheme built by the project helper `kfold` (imported from
# utils.utils); it is passed to GridSearchCV as the `cv` argument below.
# NOTE(review): the number of folds and any shuffling/seed are decided inside
# kfold(), which is not visible in this notebook — confirm there.
folds = kfold()

## GridSearch

In [6]:
# Hyperparameter grid searched exhaustively by GridSearchCV
# (every combination of the values below is one candidate).
hyper_params = {
    'colsample_bytree': [0.2, 0.3, 0.5],
    # XGBoost documents learning_rate (eta) as a shrinkage factor in [0, 1].
    # The grid previously contained 5, which is outside that range; it is
    # replaced by 0.05 so every candidate is a valid step size.
    'learning_rate': [0.5, 0.1, 0.15, 0.05],
    'n_estimators': [25, 50, 100],
    'subsample': [0.5, 0.8, 1],
    'max_depth': [1, 2, 3],
    # NOTE(review): 'gblinear' is a linear booster; tree-specific settings such
    # as max_depth are presumably not used by it, so those candidates may be
    # redundant fits — confirm against the XGBoost parameter docs.
    'booster': ['gbtree', 'gblinear', 'dart'],
    # NOTE(review): 'reg:logistic' expects targets in [0, 1]. If the
    # standardized solubility has values outside that range these candidates
    # will fail, and because warnings are silenced in this notebook the
    # failures would go unnoticed — confirm the target range.
    'objective': ['reg:logistic', "reg:squarederror"]
}

# Exhaustive search scored by R^2 using the cross-validation scheme in `folds`.
# refit=True retrains the best configuration on all of x_train once the search
# is done; n_jobs=-1 uses every available core.
model_cv = GridSearchCV(
    estimator=xgb.XGBRegressor(),
    param_grid=hyper_params,
    scoring='r2',
    cv=folds,
    verbose=1,
    return_train_score=True,
    n_jobs=-1,
    refit=True
    )

# Run the search. y_train is a one-column DataFrame, so it is flattened to 1-D.
# fit() returns the fitted search object itself, so `best_model` refers to the
# same object as `model_cv`; the refit estimator is model_cv.best_estimator_.
best_model = model_cv.fit(x_train, np.ravel(y_train))

print(model_cv.best_params_)

Fitting 10 folds for each of 1458 candidates, totalling 14580 fits
{'booster': 'gbtree', 'colsample_bytree': 0.2, 'learning_rate': 0.15, 'max_depth': 1, 'n_estimators': 50, 'objective': 'reg:squarederror', 'subsample': 0.5}


In [7]:
# Build a fresh XGBoost regressor from the winning grid-search settings
# (model_cv.best_params_) and train it on the training split.
xgboost_best = xgb.XGBRegressor(
    **{
        name: model_cv.best_params_[name]
        for name in (
            'colsample_bytree',
            'n_estimators',
            'subsample',
            'max_depth',
            'learning_rate',
            'booster',
            'objective',
        )
    }
)

# Last expression of the cell: fits the model and displays its repr.
xgboost_best.fit(x_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.2,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.15, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=1, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [8]:
# Coefficient of determination of the refit model on the held-out test split.
r2 = r2_score(
    y_true=y_test,
    y_pred=xgboost_best.predict(x_test),
)
print("The r2 score on test set: {:.4f}".format(r2))

The r2 score on test set: 0.2622


# Saving trained model

In [9]:
# Persist the fitted model to disk so it can be reloaded later with joblib.load.
filename = '../models/xgboost_model.joblib'

# joblib.dump does not create missing parent directories. Create the target
# folder first so a fresh checkout does not fail at this final step, after the
# whole grid search has already run.
os.makedirs(os.path.dirname(filename), exist_ok=True)

joblib.dump(xgboost_best, filename)

['../models/xgboost_model.joblib']