In [1]:
import warnings
warnings.filterwarnings('ignore')

import os, time, gc, random
import datatable as dt
import numpy as np
import janestreet

from sklearn.metrics import roc_auc_score, roc_curve, log_loss
from sklearn.model_selection import GroupKFold
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from joblib import dump, load
from numba import njit

import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
from torch.nn.modules.loss import _WeightedLoss
from torch.autograd import Variable
from torch.utils.data import DataLoader
import torch.nn.functional as F
# Target device for the PyTorch model and its batches: CUDA when torch reports
# it as available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Concatenate, Lambda, GaussianNoise, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers.experimental.preprocessing import Normalization
import tensorflow as tf
import tensorflow_addons as tfa

# Processing

In [2]:
%%time
# Read the training CSV with datatable and hand it over to pandas.
# Alternative location (Kaggle competition input):
#   '/kaggle/input/jane-street-market-prediction/train.csv'
train = dt.fread('/kaggle/working/input/train.csv').to_pandas()

# Keep only rows with date > 85, then only rows with a strictly positive weight.
train = train[train['date'] > 85].reset_index(drop=True)
train = train[train['weight'] > 0].reset_index(drop=True)

# Every column whose name contains 'feature' is a model input.
features = [col for col in train.columns if 'feature' in col]

# Impute missing values with the per-column mean for all features except the
# first one; the means are kept afterwards as a plain NumPy array.
fill_cols = features[1:]
f_mean = train[fill_cols].mean()
train[fill_cols] = train[fill_cols].fillna(f_mean)
f_mean = f_mean.values

# Binary target: 1 when `resp` is positive, 0 otherwise.
train['action'] = train['resp'].gt(0).astype('int')

# Response columns used as the multi-label targets.
resp_cols = ['resp_{}'.format(i) for i in range(1, 5)] + ['resp']

CPU times: user 5min 36s, sys: 19.5 s, total: 5min 56s
Wall time: 16.1 s


In [3]:
# Dense NumPy views of the inputs and of the multi-label targets
# (one 0/1 column per entry of `resp_cols`, 1 when the response is positive).
X_train = train[features].values
y_train = train[resp_cols].gt(0).astype(int).values

# The DataFrame is no longer needed; release it and force a collection.
del train
gc.collect()

22

# PyTorch

In [4]:
# Seed every random number generator the notebook has imported so that a
# Restart & Run All reproduces the same run: Python's `random`, NumPy, and
# torch (CPU plus all CUDA devices).
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Ask cuDNN for deterministic kernels and disable its auto-tuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

class MarketDataset:
    """Map-style dataset pairing each feature row with its label row.

    Indexing returns a dict with two float tensors, ``'features'`` and
    ``'label'``, built from the arrays supplied at construction time.
    """

    def __init__(self, X_train, y_train):
        self.features, self.label = X_train, y_train

    def __len__(self):
        # The dataset size is taken from the labels.
        return len(self.label)

    def __getitem__(self, idx):
        feature_row = torch.tensor(self.features[idx], dtype=torch.float)
        label_row = torch.tensor(self.label[idx], dtype=torch.float)
        return dict(features=feature_row, label=label_row)
    
class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are pulled towards 0.5 before the loss is computed:
    ``t * (1 - smoothing) + 0.5 * smoothing``.

    Args:
        weight: optional rescaling weight forwarded to
            ``F.binary_cross_entropy_with_logits``.
        reduction: ``'mean'`` (default), ``'sum'`` or ``'none'``; forwarded
            to ``F.binary_cross_entropy_with_logits``.
        smoothing: smoothing factor in ``[0, 1)``; ``0.0`` disables smoothing.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        """Return the smoothed targets.

        ``n_labels`` is accepted for interface compatibility but is not used
        by the computation.
        """
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed BCE-with-logits loss for ``inputs``/``targets``."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        # Pass the reduction through explicitly. Relying on the functional's
        # default ('mean') and reducing afterwards made 'sum' and 'none'
        # silently return the mean, because the loss was already a scalar.
        return F.binary_cross_entropy_with_logits(
            inputs, targets, weight=self.weight, reduction=self.reduction
        )

class Model(nn.Module):
    """Fully connected network: input batch-norm/dropout, four hidden blocks, linear head.

    Each hidden block is Linear -> BatchNorm1d -> ``x * sigmoid(x)`` -> Dropout.
    The head returns raw logits (no final activation).

    Args:
        num_colunms: number of input features per sample.
        num_labels: number of output logits per sample.
    """

    def __init__(self, num_colunms, num_labels):
        super().__init__()
        drop_p = 0.2
        # Layer widths from the input through the four hidden layers.
        widths = (num_colunms, 384, 896, 896, 394)

        self.batch_norm0 = nn.BatchNorm1d(widths[0])
        self.dropout0 = nn.Dropout(drop_p)

        # Register dense{i} / batch_norm{i} / dropout{i} for i = 1..4,
        # in that order for each block.
        for i in range(1, len(widths)):
            setattr(self, 'dense{}'.format(i), nn.Linear(widths[i - 1], widths[i]))
            setattr(self, 'batch_norm{}'.format(i), nn.BatchNorm1d(widths[i]))
            setattr(self, 'dropout{}'.format(i), nn.Dropout(drop_p))

        self.dense5 = nn.Linear(widths[-1], num_labels)

        # Activation modules registered on the model; forward() below does not
        # call any of them.
        self.Relu = nn.ReLU(inplace=True)
        self.PReLU = nn.PReLU()
        self.LeakyReLU = nn.LeakyReLU(negative_slope=0.01, inplace=True)
        # self.GeLU = nn.GELU()
        self.RReLU = nn.RReLU()

    def forward(self, x):
        """Return the logits for a batch ``x``."""
        hidden = self.dropout0(self.batch_norm0(x))

        blocks = (
            (self.dense1, self.batch_norm1, self.dropout1),
            (self.dense2, self.batch_norm2, self.dropout2),
            (self.dense3, self.batch_norm3, self.dropout3),
            (self.dense4, self.batch_norm4, self.dropout4),
        )
        for dense, norm, drop in blocks:
            hidden = norm(dense(hidden))
            hidden = hidden * F.sigmoid(hidden)
            hidden = drop(hidden)

        return self.dense5(hidden)

def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    Each batch is expected to be a mapping with ``'features'`` and ``'label'``
    tensors. When ``scheduler`` is truthy it is stepped once per batch, right
    after the optimizer step.
    """
    model.train()
    running_total = 0

    for batch in dataloader:
        optimizer.zero_grad()
        inputs = batch['features'].to(device)
        targets = batch['label'].to(device)

        batch_loss = loss_fn(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()
        if scheduler:
            scheduler.step()

        running_total += batch_loss.item()

    # Average over the number of batches.
    return running_total / len(dataloader)

In [5]:
# Hyperparameters for the PyTorch model.
epochs, batch_size = 5, 4096
num_colunms, num_labels = len(features), len(resp_cols)
label_smoothing, learning_rate = 1e-2, 1e-3

# Shuffled mini-batches over the in-memory training arrays.
train_set = MarketDataset(X_train, y_train)
train_loader = DataLoader(
    train_set,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
)

torch.cuda.empty_cache()
pt_predictor = Model(num_colunms=num_colunms, num_labels=num_labels).to(device)

optimizer = torch.optim.Adam(pt_predictor.parameters(), lr=learning_rate)
loss_fn = SmoothBCEwLogits(smoothing=label_smoothing)

# Train without a scheduler, reporting the wall time and mean loss per epoch.
tick = time.time()
for epoch in range(epochs):
    train_loss = train_fn(pt_predictor, optimizer, None, loss_fn, train_loader, device)
    now = time.time()
    print('Epoch:{}, Time:{:.2f}s, Loss {}'.format(epoch, now - tick, train_loss))
    tick = now

# Persist the whole model object (not just its state dict).
torch.save(pt_predictor, 'pt_model.pth')

Epoch:0, Time:14.46s, Loss 0.6913047307170928
Epoch:1, Time:12.66s, Loss 0.6892876237009963
Epoch:2, Time:13.65s, Loss 0.6887070592492819
Epoch:3, Time:12.57s, Loss 0.6883050406662127
Epoch:4, Time:12.36s, Loss 0.6879889594080547


# TensorFlow

In [6]:
tf.random.set_seed(1111)
def create_mlp(num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate):
    """Build and compile a Keras MLP with sigmoid outputs.

    Architecture: BatchNormalization -> Dropout on the input, then for each
    entry of ``hidden_units`` a Dense -> BatchNormalization -> swish -> Dropout
    block, and finally a Dense layer with ``num_labels`` units followed by a
    sigmoid.

    Args:
        num_columns: number of input features.
        num_labels: number of output units.
        hidden_units: width of each hidden Dense layer, in order.
        dropout_rates: ``dropout_rates[0]`` is applied to the input and
            ``dropout_rates[i + 1]`` after hidden layer ``i``.
        label_smoothing: smoothing passed to the binary cross-entropy loss.
        learning_rate: learning rate for the RectifiedAdam optimizer.

    Returns:
        The compiled ``tf.keras`` model, tracking an ``AUC`` metric.
    """
    layers = tf.keras.layers

    inputs = layers.Input(shape=(num_columns,))
    hidden = layers.BatchNormalization()(inputs)
    hidden = layers.Dropout(dropout_rates[0])(hidden)

    for depth, width in enumerate(hidden_units, start=1):
        hidden = layers.Dense(width)(hidden)
        hidden = layers.BatchNormalization()(hidden)
        hidden = layers.Activation(tf.keras.activations.swish)(hidden)
        hidden = layers.Dropout(dropout_rates[depth])(hidden)

    logits = layers.Dense(num_labels)(hidden)
    probabilities = layers.Activation("sigmoid")(logits)

    mlp = tf.keras.models.Model(inputs=inputs, outputs=probabilities)
    # RectifiedAdam Optimizer (known to be robust to the choice in learning rate)
    mlp.compile(
        optimizer=tfa.optimizers.RectifiedAdam(learning_rate=learning_rate),
        loss=tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing),
        metrics=tf.keras.metrics.AUC(name="AUC"),
    )
    return mlp

In [7]:
# Hyperparameters for the Keras MLP.
epochs = 10
batch_size = 4096
hidden_units = [160, 160, 160]
# One rate for the input dropout plus one per hidden layer.
dropout_rates = [0.2, 0.2, 0.2, 0.2]
label_smoothing = 1e-2
learning_rate = 1e-3

tf.keras.backend.clear_session()
# Derive the number of outputs from `resp_cols` (the columns `y_train` was
# built from) instead of hard-coding it, matching the PyTorch cell.
tf_predictor = create_mlp(len(features), len(resp_cols), hidden_units, dropout_rates, label_smoothing, learning_rate)
tf_predictor.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=2)
tf_predictor.save('tf_model.h5')

Epoch 1/10
384/384 - 4s - loss: 0.7170 - AUC: 0.5120
Epoch 2/10
384/384 - 5s - loss: 0.6941 - AUC: 0.5275
Epoch 3/10
384/384 - 5s - loss: 0.6913 - AUC: 0.5342
Epoch 4/10
384/384 - 4s - loss: 0.6904 - AUC: 0.5383
Epoch 5/10
384/384 - 4s - loss: 0.6899 - AUC: 0.5412
Epoch 6/10
384/384 - 5s - loss: 0.6897 - AUC: 0.5428
Epoch 7/10
384/384 - 5s - loss: 0.6895 - AUC: 0.5440
Epoch 8/10
384/384 - 5s - loss: 0.6893 - AUC: 0.5451
Epoch 9/10
384/384 - 5s - loss: 0.6890 - AUC: 0.5468
Epoch 10/10
384/384 - 5s - loss: 0.6888 - AUC: 0.5477


# Inference

In [8]:
import janestreet
# Create the competition environment and the iterator that yields the
# (test_df, prediction_df) pairs consumed by the inference loop; `env` is also
# used there to submit each prediction via `env.predict`.
# NOTE(review): presumably make_env() may only be called once per kernel
# session — confirm before re-running this cell.
env = janestreet.make_env()
iter_test = env.iter_test()

In [9]:
@njit
def fast_fillna(array, values):
    """Return ``array`` with NaN entries replaced by the matching ``values`` entries.

    The sum of the array is used as a cheap NaN probe: when it is not NaN the
    input is returned untouched and no element-wise work is done.
    """
    if not np.isnan(array.sum()):
        return array
    return np.where(np.isnan(array), values, array)

In [10]:
%%time
# Decision threshold on the blended probability.
opt_th = 0.503
# Ensemble weights for the TensorFlow and PyTorch predictions.
tf_weight, pt_weight = 0.6, 0.4

# Fill values for missing features: zeros for the first row, afterwards the
# (already filled) feature row of the previous traded sample.
tmp = np.zeros(len(features))

pt_predictor.eval()
for (test_df, prediction_df) in tqdm(iter_test):
    if test_df['weight'].values[0] > 0:
        x_tt = test_df.loc[:, features].values
        x_tt[0, :] = fast_fillna(x_tt[0, :], tmp)
        tmp = x_tt[0, :]

        # Inference only: skip autograd bookkeeping for the PyTorch forward pass.
        with torch.no_grad():
            pt_logits = pt_predictor(torch.tensor(x_tt, dtype=torch.float).to(device))
        # Each model emits one value per response column; the median across
        # them is the model's score for this sample.
        pt_preds = np.median(pt_logits.sigmoid().cpu().numpy())
        # Make inference mode explicit for the Keras model.
        tf_preds = np.median(tf_predictor(x_tt, training=False))

        blended = tf_weight * tf_preds + pt_weight * pt_preds
        prediction_df["action"].values[0] = int(blended >= opt_th)
    else:
        # Zero-weight rows are never acted on.
        prediction_df["action"].values[0] = 0
    env.predict(prediction_df)

0it [00:00, ?it/s]

CPU times: user 3min 30s, sys: 2.82 s, total: 3min 33s
Wall time: 3min 29s
