# Model tuning

Tuning models hyperparameters

- Model: xgboost and random forest model
- Tuner: optuna

In [1]:
import numpy as np
import pandas as pd

In [9]:
PATH = '../data/maccs_data.csv'

data = pd.read_csv(PATH)
data['gap'] = data['lumo'] - data['homo']
data = data.drop(['smile', 'homo', 'lumo'], axis = 1)

In [10]:
data.head()

Unnamed: 0,maccs_1,maccs_2,maccs_3,maccs_4,maccs_5,maccs_6,maccs_7,maccs_8,maccs_9,maccs_10,...,maccs_158,maccs_159,maccs_160,maccs_161,maccs_162,maccs_163,maccs_164,maccs_165,maccs_166,gap
0,0,0,0,0,0,0,0,1,0,0,...,1,1,0,1,0,1,1,1,0,0.2561
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,1,0,0.1526
2,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,1,1,0,0.2286
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,1,0,1,1,0,0.1958
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,1,0,0.2864


In [24]:
from sklearn.model_selection import train_test_split

X, y = data.drop(['gap'], axis = 1), data['gap']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.2, random_state = 1)

In [25]:
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import optuna

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [33]:
dtrain = xgb.DMatrix(X_train, label = y_train)
dvalid = xgb.DMatrix(X_valid, label = y_valid)

# define objective function
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 400, 600),
        'max_depth': trial.suggest_int('max_depth', 10, 20),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, .1),
        'subsample': trial.suggest_uniform('subsample', 0.50, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.50, 1),
        'gamma': trial.suggest_int('gamma', 0, 10),
        'objective': 'reg:squarederror'
    }

    clf = xgb.train(params, dtrain)
    y_pred = clf.predict(dvalid)

    rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
    return rmse
                                                        
# optuna optimize
study = optuna.create_study(direction = "maximize")
study.optimize(objective, n_trials=10)
    
print('Best trial: score {}, params {}'.format(study.best_trial.value, study.best_trial.params))
    
# classifier
best_params = study.best_trial.params
best_params['objective'] = 'reg:squarederror'
clf = xgb.XGBRegressor()
    
clf.fit(X_train, y_train)

[32m[I 2021-03-05 21:52:39,472][0m A new study created in memory with name: no-name-23e4f05c-ffb2-4028-8529-9443c34f3334[0m
[32m[I 2021-03-05 21:52:39,648][0m Trial 0 finished with value: 0.12262049044584077 and parameters: {'n_estimators': 575, 'max_depth': 20, 'learning_rate': 0.07570624006261806, 'subsample': 0.9123603661838464, 'colsample_bytree': 0.5507663570509362, 'gamma': 3}. Best is trial 0 with value: 0.12262049044584077.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-05 21:52:39,832][0m Trial 1 finished with value: 0.15091203367359993 and parameters: {'n_estimators': 417, 'max_depth': 10, 'learning_rate': 0.053363380896637905, 'subsample': 0.8505028742725644, 'colsample_bytree': 0.7628612487682878, 'gamma': 8}. Best is trial 1 with value: 0.15091203367359993.[0m
[32m[I 2021-03-05 21:52:40,033][0m Trial 2 finished with value: 0.12093314150535892 and parameters: {'n_estimators': 422, 'max_depth': 16, 'learning_rate': 0.07722130459346976, 'subsample': 0.8904801121544121, 'colsample_bytree': 0.761884539615871, 'gamma': 8}. Best is trial 1 with value: 0.15091203367359993.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-05 21:52:40,156][0m Trial 3 finished with value: 0.21573505322913472 and parameters: {'n_estimators': 442, 'max_depth': 18, 'learning_rate': 0.01605418424822754, 'subsample': 0.5289205573693312, 'colsample_bytree': 0.5590611031962702, 'gamma': 2}. Best is trial 3 with value: 0.21573505322913472.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-05 21:52:40,291][0m Trial 4 finished with value: 0.18182159619871976 and parameters: {'n_estimators': 587, 'max_depth': 14, 'learning_rate': 0.03382137136545998, 'subsample': 0.7085055224065627, 'colsample_bytree': 0.5231890105565586, 'gamma': 9}. Best is trial 3 with value: 0.21573505322913472.[0m
[32m[I 2021-03-05 21:52:40,488][0m Trial 5 finished with value: 0.12649019672880185 and parameters: {'n_estimators': 434, 'max_depth': 15, 'learning_rate': 0.06848378544212519, 'subsample': 0.874807019109507, 'colsample_bytree': 0.670562724023281, 'gamma': 0}. Best is trial 3 with value: 0.21573505322913472.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-05 21:52:40,761][0m Trial 6 finished with value: 0.18582296711954935 and parameters: {'n_estimators': 592, 'max_depth': 18, 'learning_rate': 0.031565065587690774, 'subsample': 0.6398174755199284, 'colsample_bytree': 0.6364250818095577, 'gamma': 4}. Best is trial 3 with value: 0.21573505322913472.[0m
[32m[I 2021-03-05 21:52:40,944][0m Trial 7 finished with value: 0.13390489506909603 and parameters: {'n_estimators': 484, 'max_depth': 15, 'learning_rate': 0.06622120298696631, 'subsample': 0.5728107263931106, 'colsample_bytree': 0.5957017214172236, 'gamma': 4}. Best is trial 3 with value: 0.21573505322913472.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-05 21:52:41,109][0m Trial 8 finished with value: 0.12242863501171654 and parameters: {'n_estimators': 429, 'max_depth': 20, 'learning_rate': 0.07587177855847219, 'subsample': 0.6945884889375569, 'colsample_bytree': 0.54287868358516, 'gamma': 10}. Best is trial 3 with value: 0.21573505322913472.[0m


Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




[32m[I 2021-03-05 21:52:41,384][0m Trial 9 finished with value: 0.10319126448431772 and parameters: {'n_estimators': 510, 'max_depth': 12, 'learning_rate': 0.09514037750924906, 'subsample': 0.854359637634585, 'colsample_bytree': 0.8969471623216986, 'gamma': 3}. Best is trial 3 with value: 0.21573505322913472.[0m


Best trial: score 0.21573505322913472, params {'n_estimators': 442, 'max_depth': 18, 'learning_rate': 0.01605418424822754, 'subsample': 0.5289205573693312, 'colsample_bytree': 0.5590611031962702, 'gamma': 2}


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [34]:
y_pred = clf.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred)))
print(mean_absolute_error(y_test, y_pred))
print(r2_score(y_test, y_pred))

0.01574441079742409
0.01168375595381411
0.8931247241501626


In [35]:
clf

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)