# Import Libraries 📂

In [1]:
import numpy as np
import pandas as pd

from tensorflow.keras.layers import Input, Dense, BatchNormalization, Dropout, Activation
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.layers.experimental.preprocessing import Normalization
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
import tensorflow_addons as tfa

import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm

# Importing Data 📚



In [2]:
%%time
train = pd.read_csv('../input/jane-street-market-prediction/train.csv')
train = train.astype({c: np.float32 for c in train.select_dtypes(include='float64').columns}) #limit memory use

CPU times: user 1min 23s, sys: 6.67 s, total: 1min 30s
Wall time: 2min 25s


# Preparing Data

In [3]:
#Нам не нужны сделки с нулевым весом, поэтому мы их игнорируем
train = train.query('weight > 0').reset_index(drop = True)
train.shape

(1981287, 138)

In [4]:
#Данные будут с 86 дня
train = train.query('date > 85').reset_index(drop = True)
train.shape

(1571415, 138)

In [5]:
#Заполняем пропущенные значения средним 
train.fillna(train.mean(),inplace=True)

In [6]:
#Генерируем значения 0 или 1 на основе функций resp и сохраняем их в столбце 'action'
train['action'] = (train['resp'] > 0 ).astype('int')

In [7]:
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']

In [8]:
features_train_data  = train.iloc[:,7:137]

In [9]:
# Найдем пары признаков с корреляцией > |0.9|
def corrFilter(x: pd.DataFrame, bound: float):
    xCorr = x.corr()
    xFiltered = xCorr[((xCorr >= bound) | (xCorr <= -bound)) & (xCorr !=1.000)]
    xFlattened = xFiltered.unstack().sort_values().drop_duplicates()
    return xFlattened

high_correlations=corrFilter(features_train_data, .9).to_frame()

In [10]:
all_drop_cols = set(high_correlations.index.get_level_values(0))

In [11]:
features = features_train_data.columns.tolist()

In [12]:
for i in all_drop_cols:
    features.remove(i)

In [13]:
f_mean = train.loc[:, features].mean()

# Creating Train and Test DataFrame 

In [14]:
# VALID_DAYS = 50  # using for valid
VALID_DAYS = 0   # using for LB
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp_4', 'resp']

# ------------------------------- #
#            NN sample            #
# ------------------------------- #
print('Make NN sample...')
# train
X_train = train[train['date'] <= 499-VALID_DAYS]
y_train = np.stack([(X_train[c]>0).astype('int') for c in resp_cols]).T
X_train = X_train.loc[:, features].values
print(X_train.shape, y_train.shape)

# valid
X_valid = train[train['date'] > 499-VALID_DAYS]
y_valid = np.stack([(X_valid[c]>0).astype('int') for c in resp_cols]).T
X_valid = X_valid.loc[:, features].values
print(X_valid.shape, y_valid.shape)

Make NN sample...
(1571415, 90) (1571415, 5)
(0, 90) (0, 5)


In [15]:
# ------------------------------------------- #
#                   model                     #
# ------------------------------------------- #
HIDDEN_LAYER_1 = [256, 256]
HIDDEN_LAYER_2 = [160, 160, 160]
HIDDEN_LAYER_3 = [128, 128, 128, 128]
TARGET_NUM = 5   # Оптимизируем эти 5 респ

input = tf.keras.layers.Input(shape=(X_train.shape[1], ))

#part_1
x1 = tf.keras.layers.BatchNormalization()(input)
x1 = tf.keras.layers.Dropout(0.25)(x1)
for units in HIDDEN_LAYER_1:
    x1 = tf.keras.layers.Dense(units)(x1)
    x1 = tf.keras.layers.BatchNormalization()(x1)
    x1 = tf.keras.layers.Activation(tf.keras.activations.swish)(x1)
    x1 = tf.keras.layers.Dropout(0.25)(x1)

# part_2
x2 = tf.keras.layers.BatchNormalization()(input)
x2 = tf.keras.layers.Dropout(0.25)(x2)
for units in HIDDEN_LAYER_2:
    x2 = tf.keras.layers.Dense(units)(x2)
    x2 = tf.keras.layers.BatchNormalization()(x2)
    x2 = tf.keras.layers.Activation(tf.keras.activations.swish)(x2)
    x2 = tf.keras.layers.Dropout(0.25)(x2)
    
# part_3
x3 = tf.keras.layers.BatchNormalization()(input)
x3 = tf.keras.layers.Dropout(0.25)(x3)
for units in HIDDEN_LAYER_3:
    x3 = tf.keras.layers.Dense(units)(x3)
    x3 = tf.keras.layers.BatchNormalization()(x3)
    x3 = tf.keras.layers.Activation(tf.keras.activations.swish)(x3)
    x3 = tf.keras.layers.Dropout(0.25)(x3)

x = tf.keras.layers.concatenate([x1, x2, x3])
x = tf.keras.layers.Dense(TARGET_NUM)(x)

output = tf.keras.layers.Activation("sigmoid")(x)

model = tf.keras.models.Model(inputs=input, outputs=output)
model.compile(
    optimizer = tfa.optimizers.RectifiedAdam(learning_rate=1e-3),
    metrics   = tf.keras.metrics.AUC(name="AUC"),
    loss      = tf.keras.losses.BinaryCrossentropy(label_smoothing=1e-2),
)

In [16]:
history = model.fit(
    x = X_train, 
    y = y_train, 
    epochs=25, 
    batch_size=4096, 
    validation_data=(X_valid, y_valid),
)
models = []
models.append(model)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


# Prediction

In [17]:
THRESHOLD = 0.502

import janestreet
from tqdm import tqdm
janestreet.make_env.__called__ = False
env = janestreet.make_env()

print('predicting...')

for (test_df, pred_df) in tqdm(env.iter_test()):
    if test_df['weight'].item() > 0:
        
        X_test = test_df.loc[:, features].values
        if np.isnan(X_test.sum()):  # Заполните numpy, скорость будет быстрее
            X_test = np.nan_to_num(X_test) + np.isnan(X_test) * f_mean.values
        pred = model(X_test, training = False).numpy()
        pred = np.mean(pred)
        
        pred_df.action = np.where(pred >= THRESHOLD, 1, 0).astype(int)

    else:
        pred_df.action = 0
    env.predict(pred_df)

0it [00:00, ?it/s]

predicting...


15219it [05:58, 42.41it/s]
