In [None]:
# only for Google Colab
!mkdir data
!mkdir checkpoints

!wget -O data/data.zip 'https://docs.google.com/uc?export=download&id=1g6TxegO19bZCWIeuG3hSelPtglmbLWoE' --no-check-certificate
!unzip data/* -d data/

In [None]:
import math
import glob
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import sys
from sklearn.utils import shuffle
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm
tqdm.pandas()


# Collect the per-session dumps in a stable (sorted) order.
npy_data_path = os.path.join('data', 'AAPL*.npy')
files_to_load = sorted(glob.glob(npy_data_path))

if len(files_to_load) == 0:
    sys.exit('Files to load not found')

# Layout of the combined array `d`: (layer, price level, minute).
d_num_layers = 6  # Price, Ordered volume, Filled volume, Canceled volume, Pending volume, Time index
d_num_price_levels = 10 * 2 * 2  # price level ($10) per 50 cents per level (*2) per side (*2)
d_minutes_per_day = int(6.5 * 60)  # 6 hours 30 minutes of data per trading session, from 9:30 to 16:00
d_total_minutes = d_minutes_per_day * len(files_to_load)

d = np.zeros(shape=(d_num_layers, d_num_price_levels, d_total_minutes), dtype=np.float32)

# Each file fills the next d_minutes_per_day columns of the minute axis.
load_pointer = 0
for file in files_to_load:
    day_end = load_pointer + d_minutes_per_day
    d[:, :, load_pointer:day_end] = np.load(file)
    load_pointer = day_end


# make X and y
# Sliding window over the minute axis of `d`: each sample is x_block_length
# consecutive minutes of every layer / price level, flattened; each label is a
# one-hot direction of the reference price: [1, 0, 0] down, [0, 1, 0] flat, [0, 0, 1] up.

d_pointer = 0
x_block_length = 10 # in minutes
y_block_length = 1 # in minutes
highest_bid_position = int(d_num_price_levels / 2)  # middle row of the price-level axis
error_severity_multiplier = 1000  # not referenced by the code below
X_y_pointer = 0

# The loop below runs exactly d_total_minutes - x_block_length - y_block_length
# times. Allocating d_total_minutes - x_block_length rows left one trailing
# all-zero sample with an all-zero (invalid) label.
X_y_entries_count = max(d_total_minutes - x_block_length - y_block_length, 0)

X = np.zeros((X_y_entries_count, d_num_price_levels * x_block_length * d_num_layers), np.float32)
y = np.zeros((X_y_entries_count, 3), np.float32)

while d_pointer + x_block_length + y_block_length < d_total_minutes:
    new_X = d[:, :, d_pointer:d_pointer + x_block_length]

    # Layer 0 is the price layer; compare the last minute of the window with the label minute.
    last_X_price = new_X[0, highest_bid_position, -1]
    # NOTE(review): this index is y_block_length + 1 minutes after the last input
    # minute (the window ends at d_pointer + x_block_length - 1) - confirm that
    # this horizon, rather than exactly y_block_length minutes, is intended.
    raw_new_y = d[0, highest_bid_position, d_pointer + x_block_length + y_block_length]
    price_change = raw_new_y - last_X_price

    if price_change > 0:
        new_y = np.array([0, 0, 1], np.float32)
    elif price_change < 0:
        new_y = np.array([1, 0, 0], np.float32)
    else:
        new_y = np.array([0, 1, 0], np.float32)

    X[X_y_pointer] = new_X.flatten()
    y[X_y_pointer] = new_y

    X_y_pointer += 1
    d_pointer += 1


# Balance the classes: split the samples by answer (-1 / 0 / 1), shuffle each
# group and keep the same number of samples from every group.
X_with_minus_one_answer = X[y[:, 0] == 1]
y_with_minus_one_answer = y[y[:, 0] == 1]
X_with_zero_answer = X[y[:, 1] == 1]
y_with_zero_answer = y[y[:, 1] == 1]
X_with_one_answer = X[y[:, 2] == 1]
y_with_one_answer = y[y[:, 2] == 1]

X_with_minus_one_answer, y_with_minus_one_answer = shuffle(X_with_minus_one_answer, y_with_minus_one_answer)
X_with_zero_answer, y_with_zero_answer = shuffle(X_with_zero_answer, y_with_zero_answer)
X_with_one_answer, y_with_one_answer = shuffle(X_with_one_answer, y_with_one_answer)

# The smallest group sets the size of every group in the balanced set.
min_answer_group_length = min(len(y_with_minus_one_answer), len(y_with_zero_answer), len(y_with_one_answer))

X = np.concatenate((
    X_with_minus_one_answer[:min_answer_group_length],
    X_with_zero_answer[:min_answer_group_length],
    X_with_one_answer[:min_answer_group_length]
))
y = np.concatenate((
    y_with_minus_one_answer[:min_answer_group_length],
    y_with_zero_answer[:min_answer_group_length],
    y_with_one_answer[:min_answer_group_length]
))

# Mix the three groups before splitting so both parts contain every class.
X, y = shuffle(X, y)

# Hold out the last 10% of the shuffled samples for testing.
train_data_pointer = len(X) - int(len(X) / 10) # 10%
X_train = X[0:train_data_pointer]
y_train = y[0:train_data_pointer]
# Slice to the end: the previous `[train_data_pointer:-1]` dropped the last sample.
X_test = X[train_data_pointer:]
y_test = y[train_data_pointer:]

def get_accuracy(predictions=None, y_test=None):
    """Print the overall and per-class share of correctly predicted answers.

    Args:
        predictions: 2-D array of per-class scores, one row per sample. When
            omitted, the notebook-level `predictions` variable is used.
        y_test: 2-D array of one-hot labels with the same number of rows;
            columns 0, 1 and 2 are the -1, 0 and 1 answers. When omitted, the
            notebook-level `y_test` variable is used.

    Prints four lines with percentages rounded down. A ratio whose denominator
    is zero (no samples at all, or no samples of a class) is printed as 'n/a'
    instead of raising ZeroDivisionError.
    """
    # Fall back to the notebook globals so the original zero-argument call keeps working.
    scope = globals()
    if predictions is None:
        predictions = scope['predictions']
    if y_test is None:
        y_test = scope['y_test']

    predictions = np.asarray(predictions)
    y_test = np.asarray(y_test)

    def _percent(hits, total):
        # Whole percent, rounded down; guarded against an empty denominator.
        if not total:
            return 'n/a'
        return str(math.floor(hits * 100 / total)) + '%'

    # Predicted / expected class index of every sample, computed once.
    p_indexes = np.argmax(predictions, axis=1)
    y_indexes = np.argmax(y_test, axis=1)

    true_answers = int(np.count_nonzero(p_indexes == y_indexes))
    print('Total true answers: ' + _percent(true_answers, len(predictions)))

    for class_index, label in enumerate(('-1', '0', '1')):
        # Samples that belong to the class vs. those of them predicted as that class.
        class_total = int(np.count_nonzero(y_test[:, class_index] == 1))
        class_hits = int(np.count_nonzero((y_indexes == class_index) & (p_indexes == class_index)))
        print(label + ' true answers: ' + _percent(class_hits, class_total))

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

def _leaky_relu(x):
    """Leaky ReLU with alpha=0.01, shared by all hidden layers."""
    return tf.nn.leaky_relu(x, alpha=0.01)


# Fully connected classifier: four leaky-ReLU hidden layers (dropout after the
# first two only) followed by a 3-way softmax matching the one-hot labels.
model = Sequential([
    Dense(2400, activation=_leaky_relu),
    Dropout(0.2),
    Dense(2400, activation=_leaky_relu),
    Dropout(0.2),
    Dense(600, activation=_leaky_relu),
    Dense(50, activation=_leaky_relu),
    Dense(3, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Import from the public Keras namespace, the same one the model classes come from;
# `tensorflow.python.*` is not part of the public API.
from tensorflow.keras.callbacks import ModelCheckpoint

best_checkpoint_path = os.path.join('checkpoints', 'best_weights.hdf5')
# The directory is otherwise only created by the Colab setup cell.
os.makedirs(os.path.dirname(best_checkpoint_path), exist_ok=True)

batch_size = 4096
# Truncate both sets to a whole number of batches; when a set is smaller than one
# batch use all of it, so fit() never receives empty arrays.
train_data_slice = (len(X_train) // batch_size) * batch_size or len(X_train)
test_data_slice = (len(X_test) // batch_size) * batch_size or len(X_test)

model.fit(x=X_train[:train_data_slice], y=y_train[:train_data_slice],
          validation_data=(X_test[:test_data_slice], y_test[:test_data_slice]),
          batch_size=batch_size, epochs=1000,
          callbacks=[
              ModelCheckpoint(
                  best_checkpoint_path,
                  # Pick the "best" weights by held-out accuracy; monitoring the
                  # training accuracy keeps the weights that fit the training set best.
                  monitor='val_accuracy',
                  mode='max',
                  save_best_only=True,
                  verbose=1,
                  save_weights_only=True
              )
          ],
          )

In [None]:
# Training curves: one line per value recorded in the fit() history.
training_history = pd.DataFrame(model.history.history)
training_history.plot()

In [None]:
# Restore the weights written to best_checkpoint_path by the ModelCheckpoint callback.
model.load_weights(best_checkpoint_path)

In [None]:
# Model output (3-way softmax) for every test sample.
predictions = model.predict(X_test)

In [None]:
# Print the overall and per-class hit rates of `predictions` against `y_test`.
get_accuracy()

In [None]:
# Mean absolute difference between the one-hot labels and the predicted softmax outputs.
mean_absolute_error(y_test, predictions)

In [None]:
# Model outputs (purple) drawn together with the one-hot test labels (green).
fig, ax = plt.subplots(figsize=(20, 12))
ax.plot(predictions, 'purple')
ax.plot(y_test, 'green')

In [None]:
# One-hot training labels, one line per class column.
fig, ax = plt.subplots(figsize=(20, 12))
ax.plot(y_train, 'green')

In [None]:
# Model outputs on their own, one line per class column.
fig, ax = plt.subplots(figsize=(20, 12))
ax.plot(predictions, 'green')