In [1]:
import os, time, gc, random
import numpy as np
import datatable as dt
import pandas as pd
from tqdm.notebook import tqdm
from random import choices
from numba import njit
import xgboost as xgb
import treelite, treelite_runtime

from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization

import tensorflow as tf
import tensorflow_addons as tfa

import warnings
warnings.filterwarnings('ignore')

# Processing

In [2]:
%%time
train = dt.fread('/kaggle/working/input/train.csv').to_pandas()
# train = dt.fread('/kaggle/input/jane-street-market-prediction/train.csv').to_pandas()
train = train.query('date > 85').reset_index(drop=True)
train = train.loc[train.weight > 0].reset_index(drop = True)

features = [c for c in train.columns if 'feature' in c]
f_mean = train[features[1:]].mean()
train[features[1:]] = train[features[1:]].fillna(f_mean)
f_mean = f_mean.values

train['action'] = (train['resp'] > 0).astype('int')

resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']

CPU times: user 24min 8s, sys: 35.7 s, total: 24min 44s
Wall time: 48.3 s


In [3]:
X_train = train.loc[:, features].values
tf_y_train = (train[resp_cols] > 0).astype(int).values
xgb_y_train = (train['action'] > 0).astype(int).values
del train
gc.collect()

44

# XGBoost

In [4]:
%%time
dtrain = xgb.DMatrix(X_train, xgb_y_train)
params = {
    'n_estimators': 500,
    'max_depth': 8,
    'learning_rate': 0.014,
    'subsample': 0.46,
    'colsample_bytree': 0.99,
    'objective': 'binary:logistic',
    'random_state': 2020,
    'tree_method': 'gpu_hist',
    'eval_metric': 'logloss'
}
xgb_clf = xgb.train(params, dtrain, 200)

Parameters: { n_estimators } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


CPU times: user 41.1 s, sys: 11.7 s, total: 52.8 s
Wall time: 26.8 s


In [5]:
model = treelite.Model.from_xgboost(xgb_clf)
toolchain = 'gcc'
model.export_lib(toolchain=toolchain, libpath='./xgb_model.so', params={'parallel_comp': 32}, verbose=True)
xgb_predictor = treelite_runtime.Predictor('./xgb_model.so', verbose=True)
del xgb_clf
gc.collect()

[01:19:40] ../src/compiler/ast_native.cc:45: Using ASTNativeCompiler
[01:19:40] ../src/compiler/ast/split.cc:31: Parallel compilation enabled; member trees will be divided into 32 translation units.
[01:19:41] ../src/c_api/c_api.cc:120: Code generation finished. Writing code to files...
[01:19:41] ../src/c_api/c_api.cc:125: Writing file recipe.json...
[01:19:41] ../src/c_api/c_api.cc:125: Writing file tu27.c...
[01:19:41] ../src/c_api/c_api.cc:125: Writing file tu22.c...
[01:19:41] ../src/c_api/c_api.cc:125: Writing file tu24.c...
[01:19:41] ../src/c_api/c_api.cc:125: Writing file tu21.c...
[01:19:41] ../src/c_api/c_api.cc:125: Writing file tu8.c...
[01:19:41] ../src/c_api/c_api.cc:125: Writing file tu5.c...
[01:19:41] ../src/c_api/c_api.cc:125: Writing file tu6.c...
[01:19:41] ../src/c_api/c_api.cc:125: Writing file tu3.c...
[01:19:41] ../src/c_api/c_api.cc:125: Writing file tu7.c...
[01:19:41] ../src/c_api/c_api.cc:125: Writing file tu26.c...
[01:19:41] ../src/c_api/c_api.cc:125: Wri

0

# Tensorflow

In [6]:
tf.random.set_seed(1111)
def create_mlp(num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate):
    inp = tf.keras.layers.Input(shape=(num_columns,))
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dropout(dropout_rates[0])(x)
    for i in range(len(hidden_units)):
        x = tf.keras.layers.Dense(hidden_units[i])(x)
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.Activation(tf.keras.activations.swish)(x)
        x = tf.keras.layers.Dropout(dropout_rates[i + 1])(x)
        
    x = tf.keras.layers.Dense(num_labels)(x)
    out = tf.keras.layers.Activation("sigmoid")(x)

    model = tf.keras.models.Model(inputs=inp, outputs=out)
    model.compile(
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=learning_rate), # RectifiedAdam Optimizer (known to be robust to the choice in learning rate)
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=tf.keras.metrics.AUC(name="AUC"),
    ) 
    
    return model

In [7]:
epochs = 100
batch_size = 4096
hidden_units = [160, 160, 160]
dropout_rates = [0.2, 0.2, 0.2, 0.2]
label_smoothing = 1e-2
learning_rate = 1e-3

tf.keras.backend.clear_session()
tf_predictor = create_mlp(len(features), 5, hidden_units, dropout_rates, label_smoothing, learning_rate)
tf_predictor.fit(X_train, tf_y_train, epochs=epochs, batch_size=batch_size, verbose=2)
tf_predictor.save('tf_model.h5')

Epoch 1/100
384/384 - 5s - loss: 0.7170 - AUC: 0.5120
Epoch 2/100
384/384 - 6s - loss: 0.6941 - AUC: 0.5275
Epoch 3/100
384/384 - 6s - loss: 0.6913 - AUC: 0.5342
Epoch 4/100
384/384 - 6s - loss: 0.6904 - AUC: 0.5383
Epoch 5/100
384/384 - 6s - loss: 0.6899 - AUC: 0.5412
Epoch 6/100
384/384 - 6s - loss: 0.6897 - AUC: 0.5428
Epoch 7/100
384/384 - 6s - loss: 0.6895 - AUC: 0.5440
Epoch 8/100
384/384 - 6s - loss: 0.6893 - AUC: 0.5451
Epoch 9/100
384/384 - 6s - loss: 0.6890 - AUC: 0.5468
Epoch 10/100
384/384 - 6s - loss: 0.6888 - AUC: 0.5477
Epoch 11/100
384/384 - 6s - loss: 0.6887 - AUC: 0.5482
Epoch 12/100
384/384 - 6s - loss: 0.6885 - AUC: 0.5495
Epoch 13/100
384/384 - 6s - loss: 0.6883 - AUC: 0.5502
Epoch 14/100
384/384 - 6s - loss: 0.6882 - AUC: 0.5506
Epoch 15/100
384/384 - 5s - loss: 0.6881 - AUC: 0.5509
Epoch 16/100
384/384 - 6s - loss: 0.6880 - AUC: 0.5519
Epoch 17/100
384/384 - 5s - loss: 0.6878 - AUC: 0.5525
Epoch 18/100
384/384 - 5s - loss: 0.6877 - AUC: 0.5530
Epoch 19/100
384/38

# Inference

In [8]:
import janestreet
env = janestreet.make_env()
iter_test = env.iter_test()

In [9]:
@njit
def fast_fillna(array, values):
    if np.isnan(array.sum()):
        array = np.where(np.isnan(array), values, array)
    return array

In [10]:
%%time
opt_th = 0.503
tmp = np.zeros(len(features))
for (test_df, prediction_df) in tqdm(iter_test):
    if test_df['weight'].values[0] > 0:
        x_tt = test_df.loc[:, features].values
        x_tt[0, :] = fast_fillna(x_tt[0, :], tmp)
        tmp = x_tt[0, :]
        xgb_preds = xgb_predictor.predict(treelite_runtime.DMatrix(x_tt))
#         xgb_preds = xgb_predictor.predict(treelite_runtime.Batch.from_npy2d(x_tt))
        tf_preds = np.median(tf_predictor(x_tt))
        prediction_df["action"].values[0] = int((0.6 * tf_preds + 0.4 * xgb_preds) >= opt_th)
    else:
        prediction_df["action"].values[0] = 0
    env.predict(prediction_df)

0it [00:00, ?it/s]

CPU times: user 3min 15s, sys: 2.7 s, total: 3min 18s
Wall time: 3min 18s
