In [1]:
# Fixing a problem with Skopt (see https://github.com/scikit-optimize/scikit-optimize/issues/981)
!conda install scipy=='1.5.3' --y

Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ done
Solving environment: / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | 

In [2]:
!pip install scikit-learn=='0.23.2'



In [3]:
# Importing core libraries
import numpy as np
import pandas as pd
from time import time
import pprint
import joblib
from functools import partial

# Suppressing warnings because of skopt verbosity
import warnings
warnings.filterwarnings("ignore")

# Classifier/Regressor
from xgboost import XGBRegressor

# Model selection
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args # decorator to convert a list of parameters to named arguments
from skopt import gp_minimize, forest_minimize
from skopt import gbrt_minimize, dummy_minimize

# Data processing
from sklearn.preprocessing import OrdinalEncoder

In [4]:
# Read the raw competition files
X_train = pd.read_csv("../input/30-days-of-ml/train.csv")
X_test = pd.read_csv("../input/30-days-of-ml/test.csv")

# Split off the target first (so it keeps the default positional index),
# then turn both frames into feature matrices indexed by `id`
y_train = X_train["target"]
X_train = X_train.set_index("id").drop(columns="target")
X_test = X_test.set_index("id")

# Every column whose name contains 'cat' is treated as categorical
categoricals = [column for column in X_train.columns if "cat" in column]

# Fit the ordinal encoding on the training categories only, then apply it to both sets
ordinal_encoder = OrdinalEncoder().fit(X_train[categoricals])
X_train[categoricals] = ordinal_encoder.transform(X_train[categoricals])
X_test[categoricals] = ordinal_encoder.transform(X_test[categoricals])

In [5]:
# Setting the scoring function: mean_squared_error with squared=False, i.e. the RMSE
# (a loss, so lower values are better). Called as scoring(y_true, y_pred).
scoring = partial(mean_squared_error, squared=False)

In [6]:
# Setting the cv strategy: 5 shuffled folds, with a fixed seed so that the
# same splits are produced every time the splitter is used
kf = KFold(n_splits=5, shuffle=True, random_state=0)

In [7]:
# Search space: one named dimension per XGBoost hyper-parameter being tuned.
# The names must match the estimator's parameter names, since they are passed to set_params.
space = [
    Real(0.01, 1.0, prior='uniform', name='learning_rate'),
    Integer(1, 8, name='max_depth'),
    Real(0.1, 1.0, prior='uniform', name='subsample'),
    Real(0.1, 1.0, prior='uniform', name='colsample_bytree'),  # subsample ratio of columns by tree
    Real(0, 100., prior='uniform', name='reg_lambda'),         # L2 regularization
    Real(0, 100., prior='uniform', name='reg_alpha'),          # L1 regularization
    Real(1, 30, prior='uniform', name='min_child_weight'),     # minimum sum of instance weight (hessian)
]

In [None]:
# Base regressor shared by every trial. The 10,000-tree budget acts as an upper
# bound only: the fit calls in this notebook are made with early stopping.
model = XGBRegressor(
    booster='gbtree',
    n_estimators=10_000,
    random_state=0,
)

In [8]:
# The objective function to be minimized
def make_objective(model, X, y, space, cv, scoring, validation=0.2, history=None):
    """Build the objective function that the skopt minimizers will call.

    The returned function takes a list of hyper-parameter values (ordered as in
    ``space``), sets them on ``model`` and cross-validates it. Inside every CV
    fold the tail ``validation`` fraction of the training indices is held out
    and used as the early-stopping evaluation set; the fold's test part is then
    scored with ``scoring``.

    Pruning: once a fold has accumulated at least 10 scores from trials that
    completed it, a trial whose score on that fold is worse (higher) than the
    25th percentile of those scores is abandoned immediately and the mean of
    the fold scores computed so far is returned. Scores of abandoned folds are
    not added to the history.

    Parameters
    ----------
    model : estimator
        Must provide ``set_params``, ``fit`` (accepting ``early_stopping_rounds``,
        ``eval_set`` and ``verbose``), ``predict`` and ``best_iteration``.
        It is mutated by every trial.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : array-like
        Target values aligned row-by-row with ``X``. Rows are selected
        positionally, whatever index ``y`` carries.
    space : list
        Named skopt dimensions; the names become the keyword arguments passed
        to ``model.set_params``.
    cv : splitter
        Object with a ``split(X, y)`` method yielding ``(train_index, test_index)``.
    scoring : callable
        ``scoring(y_true, y_pred) -> float``; lower is better.
    validation : float, default 0.2
        Fraction of each training fold reserved for early stopping.
    history : dict, optional
        Mapping ``fold number -> list of scores`` used for pruning. When omitted
        a fresh, empty mapping owned by the returned objective is created.

    Returns
    -------
    callable
        The objective; its pruning state is exposed as ``objective.history``.
    """
    # Positional view of the target, so that fold indices are never mistaken for index labels
    y_values = np.asarray(y)
    # Pruning state lives with the objective instead of in a notebook-level global
    fold_history = {} if history is None else history
    min_history = 10        # scores required on a fold before pruning can trigger
    prune_percentile = 25   # a fold score above this percentile of its history is "under-performing"

    # This decorator converts your objective function with named arguments into one that
    # accepts a list as argument, while doing the conversion automatically.
    @use_named_args(space)
    def objective(**params):
        model.set_params(**params)
        print("\nTesting: ", params)
        validation_scores = list()
        # Use the splitter that was passed in, not whatever `kf` happens to exist globally
        for k, (train_index, test_index) in enumerate(cv.split(X, y)):
            # Hold out the tail of the training indices as the early-stopping set
            train_examples = int(len(train_index) * (1 - validation))
            train_index, val_index = train_index[:train_examples], train_index[train_examples:]

            start_time = time()
            model.fit(X.iloc[train_index, :], y_values[train_index],
                      early_stopping_rounds=50,
                      eval_set=[(X.iloc[val_index, :], y_values[val_index])],
                      verbose=0
                      )
            end_time = time()

            rounds = model.best_iteration

            test_preds = model.predict(X.iloc[test_index, :])
            test_score = scoring(y_values[test_index], test_preds)
            print(f"CV Fold {k+1} rmse:{test_score:0.5f} - {rounds} rounds - it took {end_time-start_time:0.0f} secs")
            validation_scores.append(test_score)

            # Works for any number of folds: the per-fold list is created on first use
            fold_scores = fold_history.setdefault(k, [])
            if len(fold_scores) >= min_history:
                threshold = np.percentile(fold_scores, q=prune_percentile)
                if test_score > threshold:
                    print(f"Early stopping for under-performing fold: threshold is {threshold:0.5f}")
                    # The mean covers only the folds evaluated so far
                    return np.mean(validation_scores)

            fold_scores.append(test_score)
        return np.mean(validation_scores)

    # Let callers inspect or reset the pruning state (e.g. objective.history.clear())
    objective.history = fold_history
    return objective

In [10]:
# Bind the model, the training data, the search space, the CV splitter and the
# metric into the single function that the skopt minimizers will call
objective = make_objective(
    model=model,
    X=X_train,
    y=y_train,
    space=space,
    cv=kf,
    scoring=scoring,
)

In [None]:
def onstep(res):
    """Optimizer callback: report progress and checkpoint the evaluations so far.

    Prints the most recent evaluation and the best result found, writes the
    evaluated points and their scores to ``checkpoint.pkl``, and increments
    the notebook-level ``counter``.

    Parameters
    ----------
    res : optimization result
        Must expose ``x_iters``, ``func_vals``, ``fun`` and ``x``.
    """
    global counter
    # All points tried so far and the score observed for each of them
    evaluated_points, observed_scores = res.x_iters, res.func_vals
    print('Last eval: ', evaluated_points[-1], ' - Score ', observed_scores[-1])
    print('Current iter: ', counter, ' - Best Score ', res.fun, ' - Best Args: ', res.x)
    # Persist everything evaluated so far, so a later round can resume from it
    joblib.dump((evaluated_points, observed_scores), 'checkpoint.pkl')
    counter += 1

In [None]:
# Fresh bookkeeping for an optimisation run
counter, used_time = 0, 0                  # callback step counter; used_time is not read by the visible cells
history = {fold: [] for fold in range(5)}  # one empty score list per CV fold (0..4)

In [11]:
# Warm-up round: 30 randomly sampled configurations; the callback reports and
# checkpoints the results after every evaluation
gp_round = dummy_minimize(
    objective,
    space,
    n_calls=30,
    random_state=0,
    callback=[onstep],
)


Testing:  {'learning_rate': 0.5969161720427683, 'max_depth': 1, 'subsample': 0.6424870384644795, 'colsample_bytree': 0.5903948646972073, 'reg_lambda': 42.36547993389048, 'reg_alpha': 64.58941130666562, 'min_child_weight': 13.690029126618084}
CV Fold 1 rmse:0.72157 - 2190 rounds - it took 119 secs
CV Fold 2 rmse:0.71872 - 2307 rounds - it took 124 secs
CV Fold 3 rmse:0.72394 - 2166 rounds - it took 116 secs
CV Fold 4 rmse:0.71826 - 1681 rounds - it took 91 secs
CV Fold 5 rmse:0.71747 - 2670 rounds - it took 143 secs
Last eval:  [0.5969161720427683, 1, 0.6424870384644795, 0.5903948646972073, 42.36547993389048, 64.58941130666562, 13.690029126618084]  - Score  0.7199923709490351
Current iter:  0  - Best Score  0.7199923709490351  - Best Args:  [0.5969161720427683, 1, 0.6424870384644795, 0.5903948646972073, 42.36547993389048, 64.58941130666562, 13.690029126618084]

Testing:  {'learning_rate': 0.8928552707742591, 'max_depth': 1, 'subsample': 0.3453906651221019, 'colsample_bytree': 0.5298986

In [12]:
# Reload the points and scores saved in checkpoint.pkl and show how many evaluations it holds
x0, y0 = joblib.load('checkpoint.pkl')
print(len(x0))

30


In [13]:
# Resume from the checkpoint: the points already evaluated seed the
# Gaussian-process search, so no additional initial random points are requested
x0, y0 = joblib.load('checkpoint.pkl')

gp_round = gp_minimize(
    objective,
    space,
    x0=x0,                 # already examined values for x
    y0=y0,                 # observed values for x0
    n_initial_points=0,
    n_calls=30,
    acq_func='gp_hedge',
    callback=[onstep],
    random_state=0,
)

Last eval:  [0.3188255455248059, 6, 0.29115144900127776, 0.3001989019481369, 21.874937373677188, 56.95735345747381, 14.111162015969347]  - Score  0.725149009914586
Current iter:  30  - Best Score  0.7192252469331295  - Best Args:  [0.26773751831153064, 6, 0.7334997251863298, 0.19020419858107102, 91.94826137446736, 71.42412995491115, 29.966563190468133]

Testing:  {'learning_rate': 0.04750495343906687, 'max_depth': 4, 'subsample': 0.868163246780575, 'colsample_bytree': 0.11711450684675054, 'reg_lambda': 80.20861835126621, 'reg_alpha': 84.49765104431548, 'min_child_weight': 12.063897523196292}
CV Fold 1 rmse:0.71945 - 5956 rounds - it took 454 secs
CV Fold 2 rmse:0.71676 - 5201 rounds - it took 394 secs
CV Fold 3 rmse:0.72145 - 4923 rounds - it took 375 secs
CV Fold 4 rmse:0.71493 - 5251 rounds - it took 398 secs
CV Fold 5 rmse:0.71628 - 4407 rounds - it took 334 secs
Last eval:  [0.04750495343906687, 4, 0.868163246780575, 0.11711450684675054, 80.20861835126621, 84.49765104431548, 12.063

In [14]:
# Reload the points and scores saved in checkpoint.pkl and show how many evaluations it holds
x0, y0 = joblib.load('checkpoint.pkl')
print(len(x0))

60


In [15]:
# Report the best result of the last optimisation round, one hyper-parameter per line
print(f"Best score: {gp_round.fun:0.5f}")
print("Best hyperparameters:")
for dimension, best_value in zip(gp_round.space, gp_round.x):
    print(f"{dimension.name:25} : {best_value}")

Best score: 0.71742
Best hyperparameters:
learning_rate             : 0.01
max_depth                 : 6
subsample                 : 1.0
colsample_bytree          : 0.1
reg_lambda                : 20.733696423681973
reg_alpha                 : 57.3382961174106
min_child_weight          : 30.0
