In [27]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras
from keras import layers
from keras.models import Model
from keras import Input
from keras.layers import LSTM
from keras.utils import to_categorical
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# Any results you write to the current directory are saved as output.

/kaggle/input/fi2010/FI2010/DeepLOB.ipynb
/kaggle/input/fi2010/FI2010/FI2010_train.csv
/kaggle/input/fi2010/FI2010/FI2010_test.csv


In [28]:
# Locations of the FI-2010 train / test CSV files inside the Kaggle input directory.
train_data_file, test_data_file = (
    '/kaggle/input/fi2010/FI2010/FI2010_train.csv',
    '/kaggle/input/fi2010/FI2010/FI2010_test.csv',
)

In [30]:
# Load the training split; the first CSV column is used as the row index.
train_data = pd.read_csv(train_data_file, index_col=0)
# Report (rows, columns), then display the first five rows as the cell output.
print(train_data.shape)
train_data.head()


(362400, 149)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,139,140,141,142,143,144,145,146,147,148
0,0.318116,-0.564619,0.313539,-0.551889,0.319726,-0.731228,0.312891,-0.425448,0.319404,-0.844157,...,-0.816832,-0.825238,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0
1,0.318116,-0.662079,0.313539,-0.551889,0.320706,-0.751891,0.312891,-0.425448,0.320383,-0.854876,...,0.4643,0.452887,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0
2,0.317136,-0.723163,0.313539,-0.551889,0.316787,-0.731228,0.312891,-0.425448,0.317445,-0.762942,...,-0.798788,-0.807237,0.0,0.0,0.0,3.0,3.0,2.0,2.0,2.0
3,0.317136,-0.585895,0.313539,-0.551889,0.318747,-0.307628,0.312891,-0.425448,0.319404,-0.561348,...,0.465974,0.454558,0.0,0.0,0.0,2.0,2.0,3.0,2.0,2.0
4,0.317136,-0.585895,0.313539,-0.551889,0.318747,-0.307628,0.312891,-0.425448,0.319404,-0.561348,...,-0.410306,-0.419666,0.0,0.0,0.0,1.0,1.0,1.0,2.0,2.0


In [31]:
# Prepare the data for training:
# 1. sample_size=100 (the `lookback` argument): the 100 most recent updates form one sample
# 2. feature_num=40: 40 features per time stamp
# 3. target_num=5: relative changes for the next 1, 2, 3, 5 and 10 events (5 in total)
def data_generator(data, batch_size=32, lookback=100,
                   feature_num=40, target_delay=1, shuffle=False):
    """Endlessly yield ``(samples, targets)`` batches of look-back windows.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose first ``feature_num`` columns are the input features.
        The label is read from column ``target_delay - 6`` (counted from the
        end), and is expected to take the values 1, 2 or 3.
    batch_size : int
        Number of windows per batch.
    lookback : int
        Number of consecutive rows that make up one window.
    feature_num : int
        Number of leading columns used as features.
    target_delay : int
        Selects the label column: 1 reads the fifth column from the end,
        5 reads the last column. Per the notes accompanying this notebook
        these presumably correspond to horizons of 1, 2, 3, 5 and 10 events.
    shuffle : bool
        If True, each batch is made of ``batch_size`` window end positions
        drawn uniformly at random (with replacement). If False, windows are
        produced in order and the cursor wraps around to the start.

    Yields
    ------
    samples : numpy.ndarray
        Shape ``(n, lookback, feature_num, 1)`` (trailing axis is the channel).
    targets : numpy.ndarray
        One-hot labels of shape ``(n, 3)``.

    Raises
    ------
    ValueError
        If ``data`` has no more than ``lookback`` rows, so that not even one
        window can be built.
    """
    data = data.values
    max_index = data.shape[0]
    min_index = 0
    first_row = min_index + lookback
    if max_index <= first_row:
        raise ValueError(
            "data has %d rows but more than %d are required to build a window"
            % (max_index, first_row))
    i = first_row
    while True:
        if shuffle:
            # `size` is required: without it randint returns a single scalar,
            # which cannot be iterated over to build a batch.
            rows = np.random.randint(first_row, max_index, size=batch_size)
        else:
            # Wrap around once a full batch no longer fits before the end.
            if i + batch_size >= max_index:
                i = first_row
            rows = np.arange(i, min(i + batch_size, max_index))
            i += len(rows)
        samples = np.zeros((len(rows), lookback, feature_num))
        targets = np.zeros((len(rows),))
        for j, row in enumerate(rows):
            # The window covers rows [row - lookback, row); features are the
            # first `feature_num` columns.
            samples[j] = data[row - lookback: row, 0: feature_num]
            # The label is taken from the last row of the window.
            targets[j] = data[row - 1, target_delay - 6]
        # Add the 4th dimension: 1 channel.
        samples = samples.reshape(samples.shape[0], samples.shape[1],
                                  samples.shape[2], 1)
        # "Benchmark dataset for mid-price forecasting of limit order book data with machine learning"
        # labels 1: equal to or greater than 0.002
        # labels 2: -0.00199 to 0.00199
        # labels 3: smaller or equal to -0.002
        # Subtracting 1 relabels the classes as 0, 1, 2 before one-hot encoding.
        targets = (targets - 1).astype(int)
        targets = to_categorical(targets, num_classes=3)
        yield samples, targets


In [35]:
# Network definition: three convolutional blocks, an inception module,
# an LSTM layer and a 3-way softmax output.

def _conv_leaky(tensor, filters, kernel_size, **conv_kwargs):
    """Apply a Conv2D layer followed by a LeakyReLU(alpha=0.01) activation."""
    tensor = layers.Conv2D(filters, kernel_size, **conv_kwargs)(tensor)
    return layers.LeakyReLU(alpha=0.01)(tensor)


# the size of a single input is (100,40), plus one channel axis
input_tensor = Input(shape=(100, 40, 1))

# Each block starts with a convolution along the feature axis
# ((1,2) with stride (1,2) twice, then (1,10)), followed by two
# (4,1) convolutions along the time axis with 'same' padding.
feature_axis_convs = [
    {"kernel_size": (1, 2), "strides": (1, 2)},
    {"kernel_size": (1, 2), "strides": (1, 2)},
    {"kernel_size": (1, 10)},
]
layer_x = input_tensor
for feature_conv in feature_axis_convs:
    layer_x = _conv_leaky(layer_x, 16, **feature_conv)
    layer_x = _conv_leaky(layer_x, 16, (4, 1), padding='same')
    layer_x = _conv_leaky(layer_x, 16, (4, 1), padding='same')

# Inception Module: three parallel towers fed by the same tensor
tower_1 = _conv_leaky(layer_x, 32, (1, 1), padding='same')
tower_1 = _conv_leaky(tower_1, 32, (3, 1), padding='same')

tower_2 = _conv_leaky(layer_x, 32, (1, 1), padding='same')
tower_2 = _conv_leaky(tower_2, 32, (5, 1), padding='same')

tower_3 = layers.MaxPooling2D((3, 1), padding='same', strides=(1, 1))(layer_x)
tower_3 = _conv_leaky(tower_3, 32, (1, 1), padding='same')

# concatenate features of tower_1, tower_2, tower_3 along the channel axis
# (3 x 32 = 96 channels), then drop the feature axis to get a
# (time steps, channels) sequence for the LSTM
layer_x = layers.concatenate([tower_1, tower_2, tower_3], axis=-1)
layer_x = layers.Reshape((100, 96))(layer_x)

# 64 LSTM units
layer_x = LSTM(64)(layer_x)
# The last output layer uses a softmax activation function (3 classes)
output = layers.Dense(3, activation='softmax')(layer_x)
model = Model(input_tensor, output)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100, 40, 1)   0                                            
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 100, 20, 16)  48          input_1[0][0]                    
__________________________________________________________________________________________________
leaky_re_lu_1 (LeakyReLU)       (None, 100, 20, 16)  0           conv2d_1[0][0]                   
__________________________________________________________________________________________________
conv2d_2 (Conv2D)               (None, 100, 20, 16)  1040        leaky_re_lu_1[0][0]              
__________________________________________________________________________________________________
leaky_re_l

In [None]:
# Training setup: batches are drawn in order from the generator.
batch_size = 32
# Number of generator batches consumed per epoch.
steps_per_epoch = len(train_data) // batch_size
train_gen = data_generator(
    train_data,
    batch_size=batch_size,
    lookback=100,
    feature_num=40,
    target_delay=1,
    shuffle=False,
)

# learning rate and epsilon are the same as paper DeepLOB
opt = keras.optimizers.Adam(lr=0.01, epsilon=1)
model.compile(
    loss='categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)
model.fit_generator(
    train_gen,
    steps_per_epoch=steps_per_epoch,
    epochs=2,
)

Epoch 1/2
  450/11325 [>.............................] - ETA: 33:36 - loss: 1.0852 - acc: 0.4260

In [34]:
# Scratch cell: runs the body of data_generator() inline for two batches and
# prints the resulting arrays so the intermediate values can be inspected.
# 1. sample_size=100 (lookback): the 100 most recent updates form one sample
# 2. feature_num=40: 40 features per time stamp
# 3. target_num=5: relative changes for the next 1, 2, 3, 5 and 10 events (5 in total)
data = train_data
batch_size = 32
lookback = 100
feature_num = 40
target_delay = 1
shuffle = False

data = data.values
shape = data.shape
max_index = shape[0]
min_index = 0
i = min_index + lookback

preview_batches = 2  # number of batches to build and print
for _ in range(preview_batches):
    if shuffle:
        # `size` is required so that a whole batch of row positions is drawn;
        # without it randint returns a scalar and len(rows) fails.
        rows = np.random.randint(min_index + lookback, max_index, size=batch_size)
    else:
        # Wrap around once a full batch no longer fits before the end.
        if i + batch_size >= max_index:
            i = min_index + lookback
        rows = np.arange(i, min(i + batch_size, max_index))
        i += len(rows)
    samples = np.zeros((len(rows), lookback, feature_num))
    targets = np.zeros((len(rows),))
    for j, row in enumerate(rows):
        samples[j] = data[row - lookback: row, 0: feature_num]  # take the first 40 columns as features
        targets[j] = data[row - 1, target_delay - 6]  # label of the last row in the window
    # add the 4th dimension: 1 channel
    samples = samples.reshape(samples.shape[0], samples.shape[1],
                              samples.shape[2], 1)
    # "Benchmark dataset for mid-price forecasting of limit order book data with machine learning"
    # labels 1: equal to or greater than 0.002
    # labels 2: -0.00199 to 0.00199
    # labels 3: smaller or equal to -0.002
    # Y=Y-1 relabels as 0,1,2
    targets = targets - 1
    targets = targets.astype(int)
    targets = to_categorical(targets, num_classes=3)  # one-hot labels for the chosen horizon (k=1 here)
    print(f"{samples} \n samples.shape, {samples.shape}")
    print(f"{targets} \n target.shape, {targets.shape}")
    

[[[[ 0.318116  ]
   [-0.56461858]
   [ 0.31353946]
   ...
   [-0.42776259]
   [ 0.300703  ]
   [-0.48082799]]

  [[ 0.318116  ]
   [-0.66207857]
   [ 0.31353946]
   ...
   [-0.46217348]
   [ 0.30956523]
   [-0.48194432]]

  [[ 0.31713601]
   [-0.72316264]
   [ 0.31353946]
   ...
   [-0.42776259]
   [ 0.30956523]
   [-0.48194432]]

  ...

  [[ 0.32889583]
   [-0.58314971]
   [ 0.32433729]
   ...
   [-0.46038332]
   [ 0.32335092]
   [-0.42121607]]

  [[ 0.32889583]
   [-0.66962828]
   [ 0.32433729]
   ...
   [-0.46217348]
   [ 0.32335092]
   [-0.42121607]]

  [[ 0.32987581]
   [-0.53304704]
   [ 0.32433729]
   ...
   [-0.44984125]
   [ 0.32335092]
   [-0.42121607]]]


 [[[ 0.318116  ]
   [-0.66207857]
   [ 0.31353946]
   ...
   [-0.46217348]
   [ 0.30956523]
   [-0.48194432]]

  [[ 0.31713601]
   [-0.72316264]
   [ 0.31353946]
   ...
   [-0.42776259]
   [ 0.30956523]
   [-0.48194432]]

  [[ 0.31713601]
   [-0.58589506]
   [ 0.31353946]
   ...
   [-0.46217348]
   [ 0.300703  ]
   [-0.4808