In [None]:
import os

import optuna
import numpy as np
import pandas as pd
import xgboost as xgb
from kaggle_secrets import UserSecretsClient
from sklearn.model_selection import cross_val_score

In [None]:
# copy the Kaggle API credentials from the notebook secrets into the environment
for secret_name in ('KAGGLE_USERNAME', 'KAGGLE_KEY'):
    os.environ[secret_name] = UserSecretsClient().get_secret(secret_name)

# Load Data
To minimise the number of moving parts in this experiment, no filtering of samples or features of any kind is performed.

Action is defined as `resp > 0`.

In [None]:
# load pre-loaded parquet file from private dataset, otherwise build it from raw csv
try:
    dtrain = pd.read_parquet('../input/dtrain-parquet/dtrain.parquet')
except Exception:
    # `Exception` instead of a bare `except`: a missing/unreadable parquet file still
    # triggers the csv fallback, but KeyboardInterrupt/SystemExit are no longer swallowed
    dtrain = pd.read_csv('../input/jane-street-market-prediction/train.csv', index_col='ts_id')
    # downcast every float64 column to float32
    dtrain = dtrain.astype({c: np.float32 for c, t in dtrain.dtypes.items() if t == np.float64})

In [None]:
# separate target(s) and various labels from independent variables
label_columns = ['date', 'weight', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']
# explicit copy: the 'action' column added below must be written to an independent
# frame, not to a slice of dtrain (avoids pandas' SettingWithCopyWarning)
dlabels = dtrain[label_columns].copy()
dtrain = dtrain.drop(label_columns, axis=1)
# action is defined as `resp > 0`
dlabels['action'] = (dlabels['resp'] > 0).astype('int')

# Baseline

Again, the fewer variables in this experiment the better, so XGBoost is only altered along three of its influential parameters. When one parameter is changed, the other ones stay constant. Leaderboard scores were retrieved using another notebook: https://www.kaggle.com/jorijnsmit/benchmarking-the-public-leaderboard

Basically, this is the training set we are going to tune the cv 'model' on:

In [None]:
# results from submissions to public leaderboard
# the rows form a full grid: seed varies fastest, then colsample_bytree,
# then max_depth, and learning_rate varies slowest
logbook = pd.DataFrame({
    'seed': [13, 24, 35] * 8,
    'colsample_bytree': ([1.] * 3 + [.5] * 3) * 4,
    'max_depth': ([6] * 6 + [8] * 6) * 2,
    'learning_rate': [.3] * 12 + [.1] * 12,
    'leaderboard_pub': [
        4356.358, 4356.358, 4356.358, 3360.083, 3370.690, 3672.200,
        2926.589, 2926.589, 2926.589, 4723.146, 3384.003, 2889.649,
        3430.554, 3430.554, 3430.554, 3425.223, 3626.656, 3941.692,
        3543.335, 3543.335, 3543.335, 3626.274, 3917.678, 3557.301,
    ],
})
logbook

# Tune CV
Goal now is to test various values for the parameters of `PurgedGroupTimeSeriesSplit` and compare the cross-validation scores to the observed leaderboard scores.

Import of the class `PurgedGroupTimeSeriesSplit` hidden below:

In [None]:
from sklearn.model_selection._split import _BaseKFold, indexable, _num_samples
from sklearn.utils.validation import _deprecate_positional_args

# https://www.kaggle.com/marketneutral/purged-rolling-time-series-cv-split
# https://github.com/getgaurav2/scikit-learn/blob/d4a3af5cc9da3a76f0266932644b884c99724c57/sklearn/model_selection/_split.py#L2243
class PurgedGroupTimeSeriesSplit(_BaseKFold):
    """Time Series cross-validator variant with non-overlapping groups.

    Allows for a gap in groups to avoid potentially leaking info from
    train into test if the model has windowed or lag features.
    Provides train/test indices to split time series data samples
    that are observed at fixed time intervals according to a
    third-party provided group.
    In each split, test indices must be higher than before, and thus shuffling
    in cross validator is inappropriate.
    This cross-validation object is a variation of :class:`KFold`.
    Groups are ordered by first appearance in ``groups``. For every split the
    test set is a block of consecutive groups and the training set is the
    window of groups that ends ``group_gap`` groups before that block.
    The same group will not appear in two different folds (the number of
    distinct groups has to be at least equal to the number of folds).
    Note that unlike standard cross-validation methods, successive
    training sets are supersets of those that come before them (unless
    ``max_train_group_size`` caps the window).
    Read more in the :ref:`User Guide <cross_validation>`.

    Parameters
    ----------
    n_splits : int, default=5
        Number of splits. Must be at least 2.
    max_train_group_size : int, default=Inf
        Maximum number of groups in a single training set.
    group_gap : int, default=None
        Number of groups discarded from the end of each train split, i.e. the
        gap between train and test. ``None`` is treated as no gap (0).
    max_test_group_size : int, default=Inf
        Maximum number of groups in a single test set.
    """

    @_deprecate_positional_args
    def __init__(self,
                 n_splits=5,
                 *,
                 max_train_group_size=np.inf,
                 max_test_group_size=np.inf,
                 group_gap=None,
                 ):
        super().__init__(n_splits, shuffle=False, random_state=None)
        self.max_train_group_size = max_train_group_size
        self.group_gap = group_gap
        self.max_test_group_size = max_test_group_size

    def split(self, X, y=None, groups=None):
        """Generate indices to split data into training and test set.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like of shape (n_samples,)
            Always ignored, exists for compatibility.
        groups : array-like of shape (n_samples,)
            Group labels for the samples used while splitting the dataset into
            train/test set.

        Yields
        ------
        train : list of int
            The sorted positional training set indices for that split. Empty
            when the gap leaves no group before the test block.
        test : list of int
            The sorted positional testing set indices for that split.

        Raises
        ------
        ValueError
            If ``groups`` is None or there are fewer groups than folds.
        """
        if groups is None:
            raise ValueError(
                "The 'groups' parameter should not be None")
        X, y, groups = indexable(X, y, groups)
        n_splits = self.n_splits
        # the documented default is None; treat it as "no gap" instead of
        # failing on `int - None` further down
        group_gap = 0 if self.group_gap is None else self.group_gap
        max_test_group_size = self.max_test_group_size
        max_train_group_size = self.max_train_group_size
        n_folds = n_splits + 1

        # work on a plain array so that samples are addressed by position,
        # whatever index a pandas `groups` object carries
        group_values = np.asarray(groups).reshape(-1)
        _, first_seen, inverse = np.unique(
            group_values, return_index=True, return_inverse=True)
        n_groups = first_seen.size
        if n_folds > n_groups:
            raise ValueError(
                f'Cannot have number of folds={n_folds} greater than the number of groups={n_groups}'
            )

        # rank of every group in order of first appearance, then the rank of
        # the group each sample belongs to
        appearance_rank = np.empty(n_groups, dtype=np.intp)
        appearance_rank[np.argsort(first_seen)] = np.arange(n_groups)
        sample_rank = appearance_rank[inverse.reshape(-1)]

        group_test_size = min(n_groups // n_folds, max_test_group_size)
        group_test_starts = range(n_groups - n_splits * group_test_size,
                                  n_groups, group_test_size)
        for group_test_start in group_test_starts:
            # clamp at 0: a gap larger than the number of preceding groups
            # must give an empty training set, not a negative slice bound
            # that wraps around into the test/future groups
            train_stop = max(0, group_test_start - group_gap)
            train_start = max(0, train_stop - max_train_group_size)
            test_stop = group_test_start + group_test_size

            # flatnonzero returns ascending, duplicate-free positions
            train_idx = np.flatnonzero(
                (sample_rank >= train_start) & (sample_rank < train_stop))
            test_idx = np.flatnonzero(
                (sample_rank >= group_test_start) & (sample_rank < test_stop))

            # NOTE(review): kept from the reference implementation - this
            # drops the first `group_gap` *samples* (not groups) of the test
            # block, on top of the group gap already applied to the train
            # window; confirm this is intended
            test_idx = test_idx[group_gap:]

            yield train_idx.tolist(), test_idx.tolist()

There is a big bug in some of the shared implementations of the utility score: `np.bincount` is great, but it also produces zero counts. Therefore, `p_i.size` or `len(p_i)` produce faulty results. For cross-validation folds `date[-1]` is not usable either. In the implementation below those issues are fixed with the use of `np.count_nonzero`.

In [None]:
def utility(estimator, X, y):
    """Score the estimator's predicted actions on one fold with the utility metric.

    Has the ``scorer(estimator, X, y)`` signature so it can be passed as
    ``scoring``. Dates, weights and responses are looked up in the module-level
    ``dlabels`` frame through the index of ``y``.

    Parameters
    ----------
    estimator : fitted estimator
        Its ``predict(X)`` output is used as the action per sample.
    X : array-like
        Features of the fold, passed to ``estimator.predict``.
    y : pandas.Series
        Only its index is used, to select the matching rows of ``dlabels``.

    Returns
    -------
    float
        ``clip(t, 0, 6) * sum(p_i)``; 0.0 when no day has a non-zero return.
    """
    date = dlabels.loc[y.index, 'date'].values
    weight = dlabels.loc[y.index, 'weight'].values
    resp = dlabels.loc[y.index, 'resp'].values
    action = estimator.predict(X)

    # p_i: summed weight * resp * action per date; bincount also emits zero
    # entries for every date number that is absent from this fold
    p_i = np.bincount(date, weight * resp * action)

    # count only the non-zero days, so the padding zeros do not inflate the count
    n_days = np.count_nonzero(p_i)
    if n_days == 0:
        # nothing was earned or lost (e.g. the model never acts): the utility
        # is 0, and the formula below would divide by zero
        return 0.0

    t = p_i.sum() / np.sqrt((p_i ** 2).sum()) * np.sqrt(250 / n_days)
    u = np.clip(t, 0, 6) * p_i.sum()

    return u

In [None]:
def scaled_utility(estimator, X, y):
    """Return ``utility`` extrapolated to the size of the public test set.

    Same scorer signature as ``utility``; the fold's utility is multiplied by
    ``1000000 / n_samples`` of the fold.

    Parameters
    ----------
    estimator : fitted estimator
        Forwarded to ``utility``.
    X : array-like
        Forwarded to ``utility``.
    y : pandas.Series
        Forwarded to ``utility``; its index size is the fold size.

    Returns
    -------
    float
        The scaled utility score.
    """
    # scale of the public test set (~1mm rows) w.r.t. the given test set
    scale = 1000000 / y.index.size

    # delegate to `utility` so that the metric is defined in a single place
    return utility(estimator, X, y) * scale

It is common to take the mean of all cross-validation scores. However, since we are working with a time-ordered dataset, an argument could be made for weighting cv scores from folds that occur later more heavily. Donate et al. suggest that "in the forecasting domain, recent patterns should have a higher importance when compared with older ones". [Donate et al., p. 6](https://doi.org/10.1016/j.neucom.2012.02.053) They go on to define a weighted cross-validation function based on $w_i = 1 / 2^{n + 1 - i}$ and are able to conclude that it "improves the accuracy of the forecasts, outperforming both the no weight n−fold ensemble and the simpler holdout validation (0-fold)" (p. 13).

Instead of using his fixed weights method we implement [`pandas`' exponentially weighted mean](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.window.ewm.ExponentialMovingWindow.mean.html), where $(1 - \alpha)^i$ is the weight $w_i$:

$$
y_{t} = \frac
{x_{t} + (1 - \alpha)x_{t-1} + (1 - \alpha)^2x_{t-2} + ... + (1 - \alpha)^tx_0}
{1 + (1 - \alpha) + (1 - \alpha)^2 + ... + (1 - \alpha)^t}
$$

This allows for a gradual adjustment of the decay of the weights, giving a better ability to tune its use. Notice that the formula above yields a value for every $t$ (a moving window, so the result is still a vector); we only take its last value, which covers all folds. On top of that we force support for `alpha=0`, which falls back to a plain, unweighted mean:

In [None]:
def smooth_mean(array, alpha=0):
    """Average ``array``, optionally weighting later entries more heavily.

    With ``alpha == 0`` the plain ``array.mean()`` is returned. Otherwise the
    last value of pandas' exponentially weighted mean with that ``alpha`` is
    returned.
    """
    if alpha != 0:
        weighted = pd.Series(array).ewm(alpha=alpha).mean()
        # the last element of the moving window covers the whole array
        return weighted.iat[-1]

    return array.mean()

Build the experiment. Extract the model's parameters from the leaderboard dataframe and pass them to the `XGBClassifier` model. The crucial point here is to calculate the squared error `(leaderboard_pub - cv_score)**2` for every row of the logbook and return the mean of those errors as the outcome.

We square root it later to compare root mean squared error (RMSE).

In [None]:
def objective(trial):
    """Return the mean squared error between cv scores and leaderboard scores.

    One trial fixes a scorer, the parameters of `PurgedGroupTimeSeriesSplit`
    and the smoothing `alpha`. Every row of `logbook` is then cross-validated
    with its own booster settings, and the squared differences between the
    smoothed cv score and the row's `leaderboard_pub` are averaged.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Supplies the suggested cross-validation settings.

    Returns
    -------
    float
        Mean over all logbook rows of `(leaderboard_pub - cv_score)**2`.
    """
    # A suggestion is fixed for the whole trial (asking again under the same
    # name returns the same value), so draw each one once, outside the row loop.
    if trial.suggest_int('scale_utility_score', 0, 1) == 1:
        scorer = scaled_utility
    else:
        scorer = utility

    cv_params = {
        'n_splits': trial.suggest_int('n_splits', 3, 20),
        'group_gap': trial.suggest_int('group_gap', 0, 498),
        'max_train_group_size': trial.suggest_int('max_train_group_size', 1, 499),
        'max_test_group_size': trial.suggest_int('max_test_group_size', 1, 499),
    }
    alpha = trial.suggest_float('smooth_mean_alpha', 0, 1)

    # Rows that share all booster settings get the same model, so their cv
    # score is computed once and reused.
    cv_score_by_config = {}
    errors = []

    for idx in logbook.index:
        booster_params = {
            'max_depth': logbook.at[idx, 'max_depth'],
            'learning_rate': logbook.at[idx, 'learning_rate'],
            'colsample_bytree': logbook.at[idx, 'colsample_bytree'],
        }
        config = tuple(booster_params.values())

        if config not in cv_score_by_config:
            # NOTE(review): the logbook's 'seed' column is not passed to the
            # booster (the seed is fixed at 0) - confirm that this is intended
            booster = xgb.XGBClassifier(
                **booster_params,
                tree_method='gpu_hist',
                n_jobs=-1,
                seed=0,
            )

            cv_scores = cross_val_score(
                estimator=booster,
                X=dtrain,
                y=dlabels['action'],
                groups=dlabels['date'],
                scoring=scorer,
                cv=PurgedGroupTimeSeriesSplit(**cv_params),
            )

            cv_score_by_config[config] = smooth_mean(cv_scores, alpha=alpha)

        cv_score = cv_score_by_config[config]
        errors.append((logbook.at[idx, 'leaderboard_pub'] - cv_score)**2)

    return np.array(errors).mean()

In [None]:
# just pipeline stuff where trial results are saved and loaded
if not os.path.exists('./optunadb/optuna.db'):
    !cp -r ../input/optunadb/ ./optunadb/

study = optuna.create_study(
    storage='sqlite:///optunadb/optuna.db',
    study_name='cv_purged_full_logbook',
    direction='minimize',
    load_if_exists=True,
)

In [None]:
# run trials until the timeout (in seconds) is reached
study.optimize(
    objective,
    #n_trials=3,
    timeout=60*60*1, # 1 hour per run, which stays well under kaggle's notebook runtime limit of 9 hours
    catch=(AttributeError, ValueError), # ignore some bugs in PurgedGroupTimeSeriesSplit that sometimes occur
)

In [None]:
# more pipeline stuff: push the ./optunadb folder as a new version of the kaggle dataset
# the three variables below are interpolated into the version message of the last shell line
best = study.best_value
name = study.study_name
n_trials = study.trials_dataframe().index.size

!pip install kaggle==1.5.4
!kaggle datasets metadata -p optunadb jorijnsmit/optunadb
!kaggle datasets version -p optunadb -m 'updated {name} up to trial {n_trials} with best of {best}'

In [None]:
# rank the trials by objective value and report its square root as RMSE
experiments = (
    study.trials_dataframe()
    .sort_values('value')
    .set_index('number')
    .assign(value=lambda frame: np.sqrt(frame['value']))
    # the temporary 'params_' prefix lets the score pass the params-only column filter below
    .rename(columns={'value': 'params_RMSE'})
)
param_columns = [column for column in experiments.columns if 'params' in column]
experiments[param_columns].rename(columns={'params_RMSE': 'RMSE'}).head(20)

In [None]:
# parameters of the best trial; the study minimises, so this is the trial with the lowest objective value
study.best_params