In [1]:
# Fixing a problem with Skopt (see https://github.com/scikit-optimize/scikit-optimize/issues/981)
!conda install scipy=='1.5.3' --y

Collecting package metadata (current_repodata.json): - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ done
Solving environment: / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - \ | 

In [2]:
!pip install scikit-learn=='0.23.2'



In [3]:
# Importing core libraries
import numpy as np
import pandas as pd
import shutil
from time import time
import pprint
import joblib
from functools import partial
import gc

# Suppressing warnings because of skopt verbosity
import warnings
warnings.filterwarnings("ignore")

# Classifier/Regressor
from xgboost import XGBRegressor

# Model selection
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args # decorator to convert a list of parameters to named arguments
from skopt import gp_minimize, forest_minimize, gbrt_minimize, dummy_minimize

# Data processing
from sklearn.preprocessing import OrdinalEncoder

In [4]:
# Loading data 
X_train = pd.read_csv("../input/30-days-of-ml/train.csv")
X_test = pd.read_csv("../input/30-days-of-ml/test.csv")

# Preparing data as a tabular matrix
X_train = X_train.set_index('id')
X_test = X_test.set_index('id')

# Pointing out categorical features
categoricals = [item for item in X_train.columns if 'cat' in item]

# Dealing with categorical data using OrdinalEncoder
ordinal_encoder = OrdinalEncoder()
X_train[categoricals] = ordinal_encoder.fit_transform(X_train[categoricals]).astype(int)
X_test[categoricals] = ordinal_encoder.transform(X_test[categoricals]).astype(int)

In [5]:
# Setting the scoring function
scoring = partial(mean_squared_error, squared=False)

In [6]:
# Setting the cv strategy
kf = KFold(n_splits=5, shuffle=True, random_state=0)

In [7]:
import tensorflow as tf

In [8]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('target')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    return ds

tf.keras.utils.get_custom_objects().update({'leaky-relu': tf.keras.layers.Activation(tf.keras.layers.LeakyReLU(alpha=0.2))})

In [9]:
def create_model(cat0_dim, cat1_dim, cat2_dim,
                 cat3_dim, cat4_dim, cat5_dim, 
                 cat6_dim, cat7_dim, cat8_dim, cat9_dim,
                 layers, layer_1, layer_2, layer_3, layer_4, layer_5, 
                 activation, dropout, batch_normalization, learning_rate, **others):
    
    dims = {'cat0': cat0_dim, 'cat1': cat1_dim, 'cat2': cat2_dim, 
            'cat3': cat3_dim, 'cat4': cat4_dim, 'cat5': cat5_dim,
            'cat6': cat6_dim, 'cat7': cat7_dim, 'cat8': cat8_dim, 'cat9': cat9_dim}
    
    vocab = {h:X_train['cat4'].unique().astype(int) for h in ['cat0', 'cat1', 'cat2', 'cat3', 
                                                              'cat4', 'cat5', 'cat6', 'cat7', 
                                                              'cat8', 'cat9']}
    
    layers = [layer_1, layer_2, layer_3, layer_4, layer_5][:layers]
    
    feature_columns = list()

    for header in ['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6',
                   'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13']:
        feature_columns.append(tf.feature_column.numeric_column(header))

    for header in ['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 
                   'cat6', 'cat7', 'cat8', 'cat9']:
        feature_columns.append(tf.feature_column.embedding_column(
            tf.feature_column.categorical_column_with_vocabulary_list(header, vocabulary_list=vocab[header]), 
            dimension=dims[header]))

    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

    network_struct = [feature_layer]

    for nodes in layers:
        network_struct.append(tf.keras.layers.Dense(nodes, activation=activation))
        if batch_normalization is True:
            network_struct.append(tf.keras.layers.BatchNormalization())
        if dropout > 0:
            network_struct.append(tf.keras.layers.Dropout(dropout))    

    model = tf.keras.Sequential(network_struct + [tf.keras.layers.Dense(1)])

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss= tf.keras.losses.MeanSquaredError(),
                  metrics=['mean_squared_error'])
    
    return model

In [10]:
# Setting the search space
    
space = [Integer(1, 2, name='cat0_dim'),
         Integer(1, 2, name='cat1_dim'),
         Integer(1, 2, name='cat2_dim'),
         Integer(1, 3, name='cat3_dim'),
         Integer(1, 3, name='cat4_dim'),
         Integer(1, 3, name='cat5_dim'),
         Integer(1, 4, name='cat6_dim'),
         Integer(1, 4, name='cat7_dim'),
         Integer(1, 6, name='cat8_dim'),
         Integer(1, 8, name='cat9_dim'),
         Integer(1, 5, name='layers'),
         Integer(2, 256, name='layer_1'),
         Integer(2, 256, name='layer_2'),
         Integer(2, 256, name='layer_3'),
         Integer(2, 256, name='layer_4'),
         Integer(2, 256, name='layer_5'),
         Categorical(['relu', 'leaky-relu'], name='activation'),
         Real(0.0, 0.5, 'uniform', name='dropout'),
         Categorical([True, False], name='batch_normalization'),
         Categorical([0.01, 0.005, 0.002, 0.001], name='learning_rate'),
         Integer(256, 1024, name='batch_size')
        ]

In [None]:
# The objective function to be minimized
def make_objective(model_fn, X, space, cv, scoring, validation=0.2):
    # This decorator converts your objective function with named arguments into one that
    # accepts a list as argument, while doing the conversion automatically.
    @use_named_args(space) 
    def objective(**params):
        
        print("\nTesting: ", params)
        validation_scores = list()
        
        for k, (train_index, test_index) in enumerate(kf.split(X)):
            val_index = list()
            train_examples = len(train_index)
            train_examples = int(train_examples * (1 - validation))
            train_index, val_index = train_index[:train_examples], train_index[train_examples:]
            
            start_time = time()
            
            model = model_fn(**params)
            measure_to_monitor = 'val_mean_squared_error'
            modality='min'

            early_stopping = tf.keras.callbacks.EarlyStopping(monitor=measure_to_monitor, 
                                                              mode=modality, 
                                                              patience=5, 
                                                              verbose=0)

            model_checkpoint = tf.keras.callbacks.ModelCheckpoint('best.model', 
                                               monitor=measure_to_monitor, 
                                               mode=modality, 
                                               save_best_only=True, 
                                               verbose=0)

            run = model.fit(df_to_dataset(X_train.iloc[train_index, :], batch_size=params['batch_size']),
                            validation_data=df_to_dataset(X_train.iloc[val_index, :], batch_size=1024),
                            epochs=1_000,
                            callbacks=[model_checkpoint, early_stopping],
                            verbose=0)
            
            end_time = time()
            
            rounds = np.argmin(run.history['val_mean_squared_error']) + 1
            
            model = tf.keras.models.load_model('best.model')
            shutil.rmtree('best.model')
            
            test_preds = model.predict(df_to_dataset(X.iloc[test_index, :], shuffle=False, batch_size=1024)).flatten()
            test_score = scoring(X.iloc[test_index, :]['target'], test_preds)
            print(f"CV Fold {k+1} rmse:{test_score:0.5f} - {rounds} rounds - it took {end_time-start_time:0.0f} secs")
            validation_scores.append(test_score)
            
            if len(history[k]) >= 10:
                threshold = np.percentile(history[k], q=25)
                if test_score > threshold:
                    print(f"Early stopping for under-performing fold: threshold is {threshold:0.5f}")
                    return np.mean(validation_scores)
                
            history[k].append(test_score)
        return np.mean(validation_scores)

    return objective

In [11]:
def onstep(res):
    global counter
    x0 = res.x_iters   # List of input points
    y0 = res.func_vals # Evaluation of input points
    print('Last eval: ', x0[-1], 
          ' - Score ', y0[-1])
    print('Current iter: ', counter, 
          ' - Best Score ', res.fun, 
          ' - Best Args: ', res.x)
    joblib.dump((x0, y0), 'checkpoint.pkl') # Saving a checkpoint to disk
    counter += 1

In [12]:
objective = make_objective(create_model,
                           X_train,
                           space=space,
                           cv=kf,
                           scoring=scoring)

In [None]:
counter = 0
history = {i:list() for i in range(5)}
used_time = 0

In [13]:
gp_round = dummy_minimize(func=objective,
                          dimensions=space,
                          n_calls=10,
                          callback=[onstep],
                          random_state=0)

gc.collect()


Testing:  {'cat0_dim': 2, 'cat1_dim': 2, 'cat2_dim': 1, 'cat3_dim': 2, 'cat4_dim': 2, 'cat5_dim': 3, 'cat6_dim': 1, 'cat7_dim': 4, 'cat8_dim': 1, 'cat9_dim': 1, 'layers': 5, 'layer_1': 60, 'layer_2': 195, 'layer_3': 232, 'layer_4': 41, 'layer_5': 89, 'activation': 'relu', 'dropout': 0.19639239805041492, 'batch_normalization': False, 'learning_rate': 0.005, 'batch_size': 371}
CV Fold 1 rmse:0.73977 - 13 rounds - it took 272 secs
CV Fold 2 rmse:0.73682 - 19 rounds - it took 376 secs
CV Fold 3 rmse:0.74341 - 6 rounds - it took 173 secs
CV Fold 4 rmse:0.73713 - 8 rounds - it took 191 secs
CV Fold 5 rmse:0.73633 - 18 rounds - it took 335 secs
Last eval:  [2, 2, 1, 2, 2, 3, 1, 4, 1, 1, 5, 60, 195, 232, 41, 89, 'relu', 0.19639239805041492, False, 0.005, 371]  - Score  0.7386918524594882
Current iter:  0  - Best Score  0.7386918524594882  - Best Args:  [2, 2, 1, 2, 2, 3, 1, 4, 1, 1, 5, 60, 195, 232, 41, 89, 'relu', 0.19639239805041492, False, 0.005, 371]

Testing:  {'cat0_dim': 1, 'cat1_dim':

72432

In [15]:
x0, y0 = joblib.load('checkpoint.pkl')

gp_round = gp_minimize(func=objective,
                       x0=x0,              # already examined values for x
                       y0=y0,              # observed values for x0
                       dimensions=space,
                       acq_func='gp_hedge',
                       n_calls=30,
                       n_initial_points=0,
                       callback=[onstep],
                       random_state=0)

gc.collect()

Last eval:  [2, 2, 2, 3, 2, 2, 4, 1, 1, 8, 2, 20, 42, 184, 209, 13, 'relu', 0.17702333104190512, False, 0.01, 472]  - Score  0.7381087316889905
Current iter:  10  - Best Score  0.7381087316889905  - Best Args:  [2, 2, 2, 3, 2, 2, 4, 1, 1, 8, 2, 20, 42, 184, 209, 13, 'relu', 0.17702333104190512, False, 0.01, 472]

Testing:  {'cat0_dim': 2, 'cat1_dim': 1, 'cat2_dim': 1, 'cat3_dim': 2, 'cat4_dim': 3, 'cat5_dim': 3, 'cat6_dim': 2, 'cat7_dim': 4, 'cat8_dim': 4, 'cat9_dim': 2, 'layers': 2, 'layer_1': 140, 'layer_2': 17, 'layer_3': 72, 'layer_4': 85, 'layer_5': 225, 'activation': 'relu', 'dropout': 0.20626493255557954, 'batch_normalization': False, 'learning_rate': 0.005, 'batch_size': 990}
CV Fold 1 rmse:0.74059 - 12 rounds - it took 167 secs
Early stopping for under-performing fold: threshold is 0.73979
Last eval:  [2, 1, 1, 2, 3, 3, 2, 4, 4, 2, 2, 140, 17, 72, 85, 225, 'relu', 0.20626493255557954, False, 0.005, 990]  - Score  0.7405885357711931
Current iter:  11  - Best Score  0.7381087316

78767

In [16]:
x0, y0 = joblib.load('checkpoint.pkl')
print(len(x0))

40


In [17]:
print(f"Best score: {gp_round.fun:0.5f}")
print("Best hyperparameters:")
for sp, x in zip(gp_round.space, gp_round.x):
    print(f"{sp.name:25} : {x}")

Best score: 0.73811
Best hyperparameters:
cat0_dim                  : 2
cat1_dim                  : 2
cat2_dim                  : 2
cat3_dim                  : 3
cat4_dim                  : 2
cat5_dim                  : 2
cat6_dim                  : 4
cat7_dim                  : 1
cat8_dim                  : 1
cat9_dim                  : 8
layers                    : 2
layer_1                   : 20
layer_2                   : 42
layer_3                   : 184
layer_4                   : 209
layer_5                   : 13
activation                : relu
dropout                   : 0.17702333104190512
batch_normalization       : False
learning_rate             : 0.01
batch_size                : 472
