# How to validate a model on chronologically ordered data which also contains groups?
Since it takes quite some time to get a utility (leaderboard) score back for our model, it would be nice to be able to calculate an indication of a model's performance locally, independent of the (time-consuming and limited) submission API. This would allow for much better tuning of hyper-parameters and other aspects of the model's training process.

In this notebook I want to lay out a couple of techniques that can be used to do this. For every step we will see that there is a problem with using it for this particular competition. Fortunately, the last chapter provides a solution! If you are not interested in an introduction to test and validation techniques, skip to the bottom. First up: train and test subsets.

I used this notebook as a reference: https://www.kaggle.com/gogo827jz/jane-street-neural-network-starter

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Import Libraries 📚**

In [None]:
import warnings
warnings.filterwarnings('ignore')

import os, gc
# import cudf
import pandas as pd
import numpy as np
# import cupy as cp
import janestreet
import xgboost as xgb
from hyperopt import hp, fmin, tpe, Trials
from hyperopt.pyll.base import scope
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from joblib import dump, load
import seaborn as sns


import tensorflow as tf
tf.random.set_seed(42)
import tensorflow.keras.backend as K
import tensorflow.keras.layers as layers
from tensorflow.keras.callbacks import Callback, ReduceLROnPlateau, ModelCheckpoint, EarlyStopping

# **Importing Data ✍**

In [None]:
# Load the competition training set into memory as a single DataFrame.
train = pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv')
# Model inputs: every column whose name contains the substring 'feature'.
features = [c for c in train.columns if 'feature' in c]





In [None]:
# Show the (rows, columns) dimensions of the loaded training frame.
train.shape

# **Understanding Data Features 📈**

In [None]:
# Per-column summary statistics of the training frame.
train.describe()


# **Features Correlation**

In [None]:
# Pairwise correlation between all columns; pandas' default method is Pearson.
correlations = train.corr()

In [None]:
# Render the correlation matrix as a heatmap on a large square canvas.
fig, axs = plt.subplots(figsize=(16, 16))
sns.heatmap(correlations, ax=axs)

# **Missing Values**

In [None]:
# Missing values: report how many columns contain at least one NaN.
nan_column_count = train.isna().any().sum()
print(f'Train Nan Valued colas: {nan_column_count}')

In [None]:
# Number of columns (those with the most missing values) shown in the chart.
n_features = 40

# Count missing values per column once and reuse the result for both the
# filter mask and the values being filtered; each count is a full scan of
# the frame, so doing it twice is wasted work.
nan_counts = train.isna().sum()
nan_val = nan_counts[nan_counts > 0].sort_values(ascending=False)
print(nan_val)


fig, axs = plt.subplots(figsize=(10, 10))

# Bar chart of the `n_features` columns with the highest NaN counts.
sns.barplot(y = nan_val.index[0:n_features],
            x = nan_val.values[0:n_features],
            alpha = 0.8
           )

plt.title(f'NaN values of train dataset (Top {n_features})')
plt.xlabel('NaN values')
# Save before show(): the figure is written to disk while it is still open.
fig.savefig(f'nan_values_top_{n_features}_features.png')
plt.show()


# **Exploratory Data Analysis 📊**

**Weight and Resp Distribution Plots**

In [None]:
# Side-by-side distribution plots: 'resp' on the left axis, 'weight' on the right.
fig, axs = plt.subplots(1, 2, figsize=(16, 6))
# NOTE(review): `sns.distplot` is reported as deprecated in newer seaborn
# releases — confirm against the installed version before upgrading.
sns.distplot(train['resp'], ax=axs[0])
sns.distplot(train['weight'], ax=axs[1])
fig.savefig('resp_weight_distplot.png')

In [None]:
# Plot the running total of each response column in row order.
fig, ax = plt.subplots(figsize=(16, 8))

# Compute all cumulative sums first, then draw them in the same order.
resp, resp_1, resp_2, resp_3, resp_4 = (
    train[column].cumsum()
    for column in ('resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4')
)

for cumulative in (resp, resp_1, resp_2, resp_3, resp_4):
    cumulative.plot(linewidth=2)

ax.set_xlabel("Trade", fontsize=12)
ax.set_title("Cumulative Trade returns", fontsize=18)

plt.legend(loc="upper left")
plt.savefig('cummulative_trade_growth.png')

**Resp Violin Plots**

In [None]:
# Violin plot comparing the distributions of the five response columns.
resp_columns = ["resp", "resp_1", "resp_2", "resp_3", "resp_4"]
violin_options = {
    "inner": "points",
    "linewidth": 1,
    "palette": "Set3",
}

fig, ax = plt.subplots(figsize=(16, 12))
sns.violinplot(data=train[resp_columns], ax=ax, **violin_options)
fig.savefig('resp_violinplot.png')

# **Understanding Data Spread**

***preprocessing***

In [None]:
# Column means for every feature after the first one, computed over all rows
# (i.e. before the weight filter below is applied).
impute_columns = features[1:]
f_mean = train[impute_columns].mean()
# Keep only rows with strictly positive weight and renumber the index from 0.
train = train[train['weight'] > 0].reset_index(drop=True)


In [None]:
# Fill missing feature values with the previously computed column means.
impute_columns = features[1:]
train[impute_columns] = train[impute_columns].fillna(f_mean)
# Binary target: 1 where 'resp' is positive, 0 otherwise.
train['action'] = train['resp'].gt(0).astype('int')
# Persist the means so the same fill values can be reloaded later.
np.save('f_mean.npy', f_mean)

# **Training**

In [None]:
def create_mlp(num_columns, num_labels, hidden_units, dropout_rates, label_smoothing, learning_rate):
    """Build and compile a feed-forward Keras classifier.

    Architecture: input -> BatchNormalization -> Dropout, followed by one
    Dense -> BatchNormalization -> swish -> Dropout stage per entry of
    ``hidden_units``, and finally a Dense layer with ``num_labels`` units
    passed through a sigmoid.

    Args:
        num_columns: Number of input features (width of the input layer).
        num_labels: Number of output units.
        hidden_units: Sequence with the width of each hidden Dense layer.
        dropout_rates: Sequence of dropout rates; element 0 is applied right
            after the input normalisation and element ``i + 1`` after hidden
            stage ``i``.
        label_smoothing: Smoothing value passed to the binary cross-entropy loss.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A compiled ``tf.keras`` model that reports a metric named ``'AUC'``.
    """
    keras = tf.keras
    nn = keras.layers

    # Input block: normalise the raw features, then apply the first dropout.
    inputs = nn.Input(shape=(num_columns,))
    hidden = nn.BatchNormalization()(inputs)
    hidden = nn.Dropout(dropout_rates[0])(hidden)

    # Hidden stages; stage numbering starts at 1 to index its dropout rate.
    for stage, units in enumerate(hidden_units, start=1):
        hidden = nn.Dense(units)(hidden)
        hidden = nn.BatchNormalization()(hidden)
        hidden = nn.Activation(keras.activations.swish)(hidden)
        hidden = nn.Dropout(dropout_rates[stage])(hidden)

    # Output head: one sigmoid unit per label.
    logits = nn.Dense(num_labels)(hidden)
    outputs = nn.Activation('sigmoid')(logits)

    model = keras.models.Model(inputs=inputs, outputs=outputs)

    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    loss = keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing)
    auc_metric = keras.metrics.AUC(name='AUC')
    model.compile(optimizer=optimizer, loss=loss, metrics=auc_metric)

    return model

# **validation**

In [None]:
# Hyper-parameters shared by every fold's network.
batch_size = 4096
hidden_units = [384, 896, 896, 394]
# One rate for the input dropout plus one per hidden layer (see create_mlp).
dropout_rates = [
    0.10143786981358652,
    0.19720339053599725,
    0.2703017847244654,
    0.23148340929571917,
    0.2357768967777311,
]
label_smoothing = 1e-2
# NOTE(review): 1e-7 is a very small step size for Adam, and the fine-tuning
# stage below divides it by a further 100 — confirm this value is intentional.
learning_rate = 1e-7

# Out-of-fold predictions, one slot per training row.
oof = np.zeros(len(train['action']))
# Group the folds by 'date' so a given date is never split across the
# training and validation parts of the same fold.
gkf = GroupKFold(n_splits = 5)
# The label array is passed as both X and y to split(); only the group
# argument ('date') drives the fold assignment here.
for fold, (tr, te) in enumerate(gkf.split(train['action'].values, train['action'].values, train['date'].values)):
    
    X_tr, X_val = train.loc[tr, features].values, train.loc[te, features].values
    y_tr, y_val = train.loc[tr, 'action'].values, train.loc[te, 'action'].values
    
    # Per-fold weight file; written by the checkpoint callback and rewritten
    # after the fine-tuning step at the bottom of the loop.
    ckp_path = f'JSModel_{fold}.hdf5'
    model = create_mlp(X_tr.shape[1], 1, hidden_units, dropout_rates, label_smoothing, learning_rate)
    # All three callbacks track validation AUC (higher is better).
    rlr = ReduceLROnPlateau(monitor = 'val_AUC', factor = 0.1, patience = 3, verbose = 0, 
                            min_delta = 1e-4, mode = 'max')
    ckp = ModelCheckpoint(ckp_path, monitor = 'val_AUC', verbose = 0, 
                          save_best_only = True, save_weights_only = True, mode = 'max')
    es = EarlyStopping(monitor = 'val_AUC', min_delta = 1e-4, patience = 7, mode = 'max', 
                       baseline = None, restore_best_weights = True, verbose = 0)
    # The epoch count is only an upper bound; early stopping ends training.
    model.fit(X_tr, y_tr, validation_data = (X_val, y_val), epochs = 1000, 
              batch_size = batch_size, callbacks = [rlr, ckp, es], verbose = 0)
                
    # Store this fold's validation predictions and report its ROC AUC.
    oof[te] += model.predict(X_val, batch_size = batch_size * 4).ravel()
    score = roc_auc_score(y_val, oof[te])
    print(f'Fold {fold} ROC AUC:\t', score)
    
    # Finetune for 4 epochs on the validation set with a 100x smaller
    # learning rate, starting from the checkpointed weights, then overwrite
    # the checkpoint with the fine-tuned weights.
    model = create_mlp(X_tr.shape[1], 1, hidden_units, dropout_rates, label_smoothing, learning_rate / 100)
    model.load_weights(ckp_path)
    model.fit(X_val, y_val, epochs = 4, batch_size = batch_size, verbose = 0)
    model.save_weights(ckp_path)

In [None]:
# Overall ROC AUC of the out-of-fold predictions against the true labels.
score_oof = roc_auc_score(y_true=train['action'].values, y_score=oof)
print(score_oof)

# **Load model**

In [None]:
# Number of fold checkpoints to load (folds 0 .. num_models - 1).
num_models = 2

# Rebuild the network for each selected fold and restore its saved weights.
models = []
for fold_idx in range(num_models):
    fold_model = create_mlp(
        len(features),
        1,
        hidden_units,
        dropout_rates,
        label_smoothing,
        learning_rate,
    )
    fold_model.load_weights(f'./JSModel_{fold_idx}.hdf5')
    models.append(fold_model)


In [None]:
# Reload the feature means saved to 'f_mean.npy' during preprocessing; they
# are used to fill missing feature values in the submission loop.
f_mean = np.load('./f_mean.npy')


# **Submitting**

In [None]:
# Create the competition environment and the iterator that yields
# (test_df, pred_df) pairs consumed by the submission loop.
env = janestreet.make_env()
env_iter = env.iter_test()

In [None]:
# Decision threshold: averaged predictions at or above it become action 1.
opt_th =  0.502
for (test_df, pred_df) in tqdm(env_iter):
    # `.item()` requires a single value, so each test_df is expected to hold
    # exactly one row. Rows with non-positive weight get action 0 directly.
    if test_df['weight'].item() > 0:
        x_tt = test_df.loc[:, features].values
        # A NaN sum means at least one feature (after the first) is missing.
        if np.isnan(x_tt[:, 1:].sum()):
            # nan_to_num zeroes the NaNs; the NaN mask times f_mean then adds
            # the stored mean back in exactly those positions.
            x_tt[:, 1:] = np.nan_to_num(x_tt[:, 1:]) + np.isnan(x_tt[:, 1:]) * f_mean
        # Average the predictions of all loaded models.
        pred = 0.
        for clf in models:
            pred += clf(x_tt, training = False).numpy().item() / num_models
        pred_df.action = np.where(pred >= opt_th, 1, 0).astype(int)
    else:
        pred_df.action = 0
    # Submit the prediction for this row back to the environment.
    env.predict(pred_df)