In [1]:
!pip install scikit-learn -U -q

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 0.22.0 requires google-cloud-bigquery[bqstorage,pandas]>=3.10.0, but you have google-cloud-bigquery 2.34.4 which is incompatible.
bigframes 0.22.0 requires google-cloud-storage>=2.0.0, but you have google-cloud-storage 1.44.0 which is incompatible.
bigframes 0.22.0 requires pandas<2.1.4,>=1.5.0, but you have pandas 2.2.2 which is incompatible.
cesium 0.12.3 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
dataproc-jupyter-plugin 0.1.79 requires pydantic~=1.10.0, but you have pydantic 2.9.2 which is incompatible.[0m[31m
[0m

In [2]:
# Importing core libraries
import numpy as np
import pandas as pd
import shutil
from time import time
import pprint
import joblib
from functools import partial
import gc
import os
# Suppressing warnings because of skopt verbosity
import warnings
warnings.filterwarnings("ignore")

# Classifier/Regressor
from xgboost import XGBRegressor

# Model selection
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from skopt.space import Real, Categorical, Integer
from skopt.utils import use_named_args # decorator to convert a list of parameters to named arguments
from skopt import gp_minimize, forest_minimize, gbrt_minimize, dummy_minimize

# Data processing
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Read the competition files and use the 'id' column as the row index
X_train = pd.read_csv("../input/30-days-of-ml/train.csv").set_index('id')
X_test = pd.read_csv("../input/30-days-of-ml/test.csv").set_index('id')

# Categorical features are the columns whose name contains 'cat'
categoricals = [column for column in X_train.columns if 'cat' in column]

# Fit the ordinal encoder on the training categories only, then apply the
# same mapping to the test set; the encoded values are stored as integers
ordinal_encoder = OrdinalEncoder()
X_train[categoricals] = ordinal_encoder.fit_transform(X_train[categoricals]).astype(int)
X_test[categoricals] = ordinal_encoder.transform(X_test[categoricals]).astype(int)

In [4]:
# Setting the scoring function
def scoring(y_true, y_pred, **kwargs):
    """Return the root mean squared error between y_true and y_pred.

    The square root is taken explicitly instead of passing `squared=False`
    to `mean_squared_error`: that argument was deprecated in scikit-learn 1.4
    and removed in 1.6, so it fails after an unpinned upgrade.
    For a single-output target (as used in this notebook) the result is the
    same value that `squared=False` produced.

    Any extra keyword arguments (e.g. `sample_weight`) are forwarded to
    `mean_squared_error`.
    """
    return np.sqrt(mean_squared_error(y_true, y_pred, **kwargs))

In [5]:
# Cross-validation strategy: five shuffled folds with a fixed seed
kf = KFold(
    n_splits=5,
    random_state=0,
    shuffle=True,
)

In [6]:
import tensorflow as tf

In [7]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Convert a DataFrame with a 'target' column into a batched tf.data.Dataset.

    The input frame is copied, so the caller's frame keeps its 'target'
    column. Each dataset element is a pair of (dict mapping column name to
    feature values, label values).

    Args:
        dataframe: frame holding the feature columns plus 'target'.
        shuffle: when true, shuffle with a buffer as large as the frame.
        batch_size: number of rows per batch.

    Returns:
        The batched dataset.
    """
    features = dataframe.copy()
    targets = features.pop('target')
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), targets))
    if shuffle:
        # Buffer covers every row of the frame
        dataset = dataset.shuffle(buffer_size=len(features))
    return dataset.batch(batch_size)

In [8]:
def create_model(cat0_dim, cat1_dim, cat2_dim,
                 cat3_dim, cat4_dim, cat5_dim,
                 cat6_dim, cat7_dim, cat8_dim, cat9_dim,
                 layers, layer_1, layer_2, layer_3, layer_4, layer_5,
                 activation, dropout, batch_normalization, learning_rate, **others):
    """Build and compile a Keras regression network with categorical embeddings.

    Args:
        cat0_dim .. cat9_dim: embedding output size for each categorical feature.
        layers: how many of the five hidden layer sizes are actually used.
        layer_1 .. layer_5: candidate hidden layer widths, taken in order.
        activation: activation passed to every hidden Dense layer.
        dropout: dropout rate; a Dropout layer is added only when it is > 0.
        batch_normalization: when truthy, add BatchNormalization after each
            hidden Dense layer.
        learning_rate: learning rate of the Adam optimizer.
        **others: extra keyword arguments, accepted and ignored.

    Returns:
        A compiled tf.keras.Model with one named input per feature and a
        single linear output unit, trained with mean squared error.

    Note:
        The embedding vocabulary sizes are read from the module-level X_train.
    """
    # Feature names: 'cat0'..'cat9' and 'cont1'..'cont13'
    categorical_labels = [f'cat{i}' for i in range(10)]
    numeric_labels = [f'cont{i}' for i in range(1, 14)]

    # Embedding width requested for each categorical feature
    embedding_dims = dict(zip(categorical_labels,
                              (cat0_dim, cat1_dim, cat2_dim, cat3_dim, cat4_dim,
                               cat5_dim, cat6_dim, cat7_dim, cat8_dim, cat9_dim)))

    # Largest encoded value per categorical column of X_train
    max_codes = {label: X_train[label].max() for label in categorical_labels}

    # Keep only the first `layers` hidden widths
    hidden_units = [layer_1, layer_2, layer_3, layer_4, layer_5][:layers]

    # One scalar input per continuous feature
    numeric_inputs = [tf.keras.Input(shape=(1,), name=label) for label in numeric_labels]

    # One scalar input per categorical feature, each followed by its own
    # embedding that is flattened before concatenation
    categorical_inputs = []
    embedded_features = []
    for label in categorical_labels:
        cat_input = tf.keras.Input(shape=(1,), name=label)
        embedding = tf.keras.layers.Embedding(input_dim=max_codes[label] + 1,
                                              output_dim=embedding_dims[label],
                                              name=f'{label}_embedding')(cat_input)
        flat_embedding = tf.keras.layers.Flatten()(embedding)
        categorical_inputs.append(cat_input)
        embedded_features.append(flat_embedding)

    # Continuous inputs first, then the flattened embeddings
    hidden = tf.keras.layers.Concatenate()(numeric_inputs + embedded_features)

    # Hidden stack: Dense -> optional BatchNormalization -> optional Dropout
    for units in hidden_units:
        hidden = tf.keras.layers.Dense(units, activation=activation)(hidden)
        if batch_normalization:
            hidden = tf.keras.layers.BatchNormalization()(hidden)
        if dropout > 0:
            hidden = tf.keras.layers.Dropout(dropout)(hidden)

    # Single linear unit for the regression output
    prediction = tf.keras.layers.Dense(1)(hidden)

    model = tf.keras.Model(inputs=numeric_inputs + categorical_inputs, outputs=prediction)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss=tf.keras.losses.MeanSquaredError(),
                  metrics=['mean_squared_error'])

    return model


In [9]:
# Setting the search space

from tensorflow.keras.activations import relu, leaky_relu

space = [
    # Embedding width for cat0..cat9; the upper bound differs per feature
    *(Integer(1, upper, name=f'cat{idx}_dim')
      for idx, upper in enumerate((2, 2, 2, 3, 3, 3, 4, 4, 6, 8))),
    # Number of hidden layers and the width of each candidate layer
    Integer(1, 5, name='layers'),
    *(Integer(2, 256, name=f'layer_{idx}') for idx in range(1, 6)),
    # Regularisation / optimisation settings
    Categorical([leaky_relu, relu], name='activation'),
    Real(0.0, 0.5, 'uniform', name='dropout'),
    Categorical([True, False], name='batch_normalization'),
    Categorical([0.01, 0.005, 0.002, 0.001], name='learning_rate'),
    Integer(256, 1024, name='batch_size'),
]

In [10]:
# Shared state for the optimisation callbacks:
# number of completed optimiser steps, per-fold score history (folds 0..4),
# and an elapsed-time accumulator
counter = 0
history = {fold: [] for fold in range(5)}
used_time = 0

from time import time

def onstep(res):
    """Optimiser callback: report progress and checkpoint the evaluations.

    Prints the most recent point and its score, then the step counter with
    the best score and arguments so far. All evaluated points and their
    scores are dumped to 'checkpoint.pkl', and the module-level `counter`
    is incremented.
    """
    global counter
    evaluated_points = res.x_iters   # every point tried so far
    observed_scores = res.func_vals  # objective value of each point
    print('Last eval: ', evaluated_points[-1], ' - Score ', observed_scores[-1])
    print('Current iter: ', counter, ' - Best Score ', res.fun, ' - Best Args: ', res.x)
    # Persist progress so a later search can be warm-started from disk
    joblib.dump((evaluated_points, observed_scores), 'checkpoint.pkl')
    counter += 1
    
# The objective function to be minimized
def make_objective(model_fn, X, space, cv, scoring, validation=0.2, batch_size=50_000):
    """Create a cross-validated objective for the skopt minimizers.

    Args:
        model_fn: callable that receives the sampled hyperparameters as
            keyword arguments and returns a compiled Keras model.
        X: DataFrame holding the features and the 'target' column; all
            training, validation and test slices are taken from it.
        space: list of named skopt dimensions; used to map the positional
            point handed over by the optimiser to keyword arguments.
        cv: splitter exposing `split(X)`, yielding (train, test) index arrays.
        scoring: callable `scoring(y_true, y_pred)` returning the value to
            minimise.
        validation: fraction taken from the tail of each training fold and
            used as validation data for early stopping.
        batch_size: training batch size used when the sampled point does not
            contain a 'batch_size' entry.

    Returns:
        A function accepting a list of parameter values and returning the
        mean test score over the evaluated folds.

    Notes:
        The module-level `history` dict stores the scores of every fold
        position. Once a fold position has at least 10 recorded scores, a
        candidate scoring above their 25th percentile on that fold is
        abandoned and the mean of the folds evaluated so far is returned.
    """
    # This decorator converts your objective function with named arguments into one that
    # accepts a list as argument, while doing the conversion automatically.
    @use_named_args(space)
    def objective(**params):

        print("\nTesting: ", params)
        validation_scores = list()

        # Prefer the sampled batch size; fall back to the factory default
        # when the search space does not tune it
        train_batch_size = params.get('batch_size', batch_size)

        # Use the splitter and the data that were passed in, not notebook globals
        for k, (train_index, test_index) in enumerate(cv.split(X)):
            # Hold out the tail of the training fold for early stopping
            train_examples = int(len(train_index) * (1 - validation))
            train_index, val_index = train_index[:train_examples], train_index[train_examples:]

            start_time = time()

            model = model_fn(**params)
            measure_to_monitor = 'val_mean_squared_error'
            modality = 'min'

            early_stopping = tf.keras.callbacks.EarlyStopping(monitor=measure_to_monitor,
                                                              mode=modality,
                                                              patience=5,
                                                              verbose=0)

            model_checkpoint = tf.keras.callbacks.ModelCheckpoint('best_model.keras',
                                               monitor=measure_to_monitor,
                                               mode=modality,
                                               save_best_only=True,
                                               verbose=0)

            run = model.fit(df_to_dataset(X.iloc[train_index, :], batch_size=train_batch_size),
                            validation_data=df_to_dataset(X.iloc[val_index, :], batch_size=1024),
                            epochs=1_000,
                            callbacks=[model_checkpoint, early_stopping],
                            verbose=0)

            end_time = time()

            # Epoch (1-based) with the lowest validation error
            rounds = np.argmin(run.history[measure_to_monitor]) + 1

            # Score with the best checkpoint rather than the last epoch
            model = tf.keras.models.load_model('best_model.keras')
            os.remove('best_model.keras')

            test_preds = model.predict(df_to_dataset(X.iloc[test_index, :],
                                                     shuffle=False,
                                                     batch_size=1024), verbose=0).flatten()
            test_score = scoring(X.iloc[test_index, :]['target'], test_preds)
            print(f"CV Fold {k+1} rmse:{test_score:0.5f} - {rounds} rounds - it took {end_time-start_time:0.0f} secs")
            validation_scores.append(test_score)

            # setdefault keeps this working for splitters with more folds
            # than the keys pre-created in `history`
            fold_history = history.setdefault(k, [])
            if len(fold_history) >= 10:
                threshold = np.percentile(fold_history, q=25)
                if test_score > threshold:
                    print(f"Early stopping for under-performing fold: threshold is {threshold:0.5f}")
                    return np.mean(validation_scores)

            fold_history.append(test_score)
        return np.mean(validation_scores)

    return objective

In [11]:
# Bind the model factory, data, search space, CV splitter and metric
objective = make_objective(
    model_fn=create_model,
    X=X_train,
    space=space,
    cv=kf,
    scoring=scoring,
)

In [12]:
# Warm-up phase: 10 randomly sampled configurations; each step is
# checkpointed by the onstep callback
gp_round = dummy_minimize(
    func=objective,
    dimensions=space,
    callback=[onstep],
    n_calls=10,
    random_state=0,
)

gc.collect()


Testing:  {'cat0_dim': 2, 'cat1_dim': 2, 'cat2_dim': 1, 'cat3_dim': 2, 'cat4_dim': 2, 'cat5_dim': 3, 'cat6_dim': 1, 'cat7_dim': 4, 'cat8_dim': 1, 'cat9_dim': 1, 'layers': 5, 'layer_1': 60, 'layer_2': 195, 'layer_3': 232, 'layer_4': 41, 'layer_5': 89, 'activation': <function leaky_relu at 0x79322f30e200>, 'dropout': 0.19639239805041492, 'batch_normalization': False, 'learning_rate': 0.005, 'batch_size': 371}


I0000 00:00:1728169537.912735      67 service.cc:145] XLA service 0x7931b40020b0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1728169537.912799      67 service.cc:153]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1728169547.365487      67 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


CV Fold 1 rmse:0.74001 - 7 rounds - it took 99 secs
CV Fold 2 rmse:0.73718 - 3 rounds - it took 70 secs
CV Fold 3 rmse:0.74225 - 6 rounds - it took 89 secs
CV Fold 4 rmse:0.73626 - 3 rounds - it took 71 secs
CV Fold 5 rmse:0.73594 - 19 rounds - it took 172 secs
Last eval:  [2, 2, 1, 2, 2, 3, 1, 4, 1, 1, 5, 60, 195, 232, 41, 89, <function leaky_relu at 0x79322f30e200>, 0.19639239805041492, False, 0.005, 371]  - Score  0.7383292049045582
Current iter:  0  - Best Score  0.7383292049045582  - Best Args:  [2, 2, 1, 2, 2, 3, 1, 4, 1, 1, 5, 60, 195, 232, 41, 89, <function leaky_relu at 0x79322f30e200>, 0.19639239805041492, False, 0.005, 371]

Testing:  {'cat0_dim': 1, 'cat1_dim': 2, 'cat2_dim': 2, 'cat3_dim': 3, 'cat4_dim': 1, 'cat5_dim': 3, 'cat6_dim': 4, 'cat7_dim': 1, 'cat8_dim': 2, 'cat9_dim': 4, 'layers': 4, 'layer_1': 149, 'layer_2': 144, 'layer_3': 169, 'layer_4': 34, 'layer_5': 195, 'activation': <function leaky_relu at 0x79322f30e200>, 'dropout': 0.47233445852479206, 'batch_normaliza

84734

In [13]:
# Reload the checkpoint and report how many evaluated points it holds
x0, y0 = joblib.load(filename='checkpoint.pkl')
print(len(x0))

10


In [14]:
# Warm-start the Gaussian-process search from the checkpointed evaluations
x0, y0 = joblib.load('checkpoint.pkl')

gp_round = gp_minimize(
    func=objective,
    dimensions=space,
    x0=x0,                # points that were already evaluated
    y0=y0,                # scores observed at those points
    n_initial_points=0,   # no extra random points are requested
    n_calls=30,
    acq_func='gp_hedge',
    callback=[onstep],
    random_state=0,
)

gc.collect()

Last eval:  [2, 2, 2, 3, 2, 2, 4, 1, 1, 8, 2, 20, 42, 184, 209, 13, <function leaky_relu at 0x79322f30e200>, 0.17702333104190512, False, 0.01, 472]  - Score  0.7383688549900136
Current iter:  10  - Best Score  0.737617355775065  - Best Args:  [2, 1, 2, 3, 3, 1, 3, 4, 6, 5, 2, 172, 86, 205, 70, 8, <function leaky_relu at 0x79322f30e200>, 0.10443837804741737, True, 0.002, 399]

Testing:  {'cat0_dim': 1, 'cat1_dim': 1, 'cat2_dim': 2, 'cat3_dim': 1, 'cat4_dim': 1, 'cat5_dim': 3, 'cat6_dim': 2, 'cat7_dim': 4, 'cat8_dim': 2, 'cat9_dim': 2, 'layers': 2, 'layer_1': 79, 'layer_2': 228, 'layer_3': 3, 'layer_4': 55, 'layer_5': 204, 'activation': <function leaky_relu at 0x79322f30e200>, 'dropout': 0.105496665074676, 'batch_normalization': True, 'learning_rate': 0.001, 'batch_size': 956}
CV Fold 1 rmse:0.73896 - 37 rounds - it took 281 secs
CV Fold 2 rmse:0.74849 - 1 rounds - it took 50 secs
Early stopping for under-performing fold: threshold is 0.73663
Last eval:  [1, 1, 2, 1, 1, 3, 2, 4, 2, 2, 2,

64128

In [15]:
# Reload the checkpoint written during the second search and count its points
x0, y0 = joblib.load(filename='checkpoint.pkl')
print(len(x0))

40


In [16]:
# Report the best score found by the last search and the corresponding
# hyperparameter values, one "name : value" line per search dimension
print(f"Best score: {gp_round.fun:0.5f}")
print("Best hyperparameters:")
# Pair each dimension of the result's search space with its best value
for sp, x in zip(gp_round.space, gp_round.x):
    print(f"{sp.name:25} : {x}")

Best score: 0.73697
Best hyperparameters:
cat0_dim                  : 2
cat1_dim                  : 2
cat2_dim                  : 1
cat3_dim                  : 2
cat4_dim                  : 3
cat5_dim                  : 3
cat6_dim                  : 2
cat7_dim                  : 2
cat8_dim                  : 3
cat9_dim                  : 2
layers                    : 5
layer_1                   : 219
layer_2                   : 89
layer_3                   : 246
layer_4                   : 175
layer_5                   : 79
activation                : <function relu at 0x79322f471900>
dropout                   : 0.04619831589851348
batch_normalization       : True
learning_rate             : 0.001
batch_size                : 265
