In [2]:
import glob
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Sequential
from tqdm import tqdm
tqdm.pandas()


# Collect the per-session AAPL arrays; sorting fixes the order in which
# sessions are laid out along the time axis of `d`.
npy_data_path = os.path.join('data', 'AAPL*.npy')
files_to_load = sorted(glob.glob(npy_data_path))

if len(files_to_load) == 0:
    sys.exit('Files to load not found')

# Layers: Price, Ordered volume, Filled volume, Canceled volume, Pending volume, Time index
d_num_layers = 6
# $10 of price range, 2 levels per dollar (50-cent steps), 2 sides
d_num_price_levels = 10 * 2 * 2
# One trading session is 6 hours 30 minutes (09:30 to 16:00), one column per minute
d_minutes_per_day = int(6.5 * 60)
d_total_minutes = d_minutes_per_day * len(files_to_load)

# All sessions back to back: (layer, price level, minute).
d = np.zeros((d_num_layers, d_num_price_levels, d_total_minutes), np.float32)

# Copy each file into its own block of `d_minutes_per_day` columns.
# Each file is expected to hold an array that can be assigned to a
# (d_num_layers, d_num_price_levels, d_minutes_per_day) slice.
day_offsets = range(0, d_total_minutes, d_minutes_per_day)
for load_pointer, file in zip(day_offsets, files_to_load):
    d[:, :, load_pointer:load_pointer + d_minutes_per_day] = np.load(file)
load_pointer = d_total_minutes  # one past the last column written


# Build supervised samples (X, y) from the minute-level array `d` and split
# them into train and test sets.
#
# Each sample's input is a window of `x_block_length` consecutive minutes
# across every layer and price level. Its target is the change in the layer-0
# value at row `highest_bid_position` between the window's last minute and the
# minute at offset `x_block_length + y_block_length` from the window start,
# multiplied by `error_severity_multiplier`.

x_block_length = 10  # input window length, in minutes
y_block_length = 1  # prediction horizon, in minutes
window_stride = x_block_length  # minutes between window starts; equal to the window length, so windows do not overlap
highest_bid_position = d_num_price_levels // 2  # presumably the row holding the best bid -- TODO confirm against the data layout
error_severity_multiplier = 10000  # scales the price deltas up before they are used as targets

# NOTE(review): the target is read at start + x_block_length + y_block_length,
# i.e. y_block_length + 1 minutes after the last minute of the window.
# Confirm that this horizon is the intended one.
# NOTE(review): `d` holds the sessions back to back, so the last window of a
# session takes its target from the following session's data.

# A window may only start where its target index is still inside `d`.
# Bounding on the target (not just on the window) keeps this safe for window
# lengths and horizons that do not divide the session length evenly.
window_starts = np.arange(
    0, d_total_minutes - x_block_length - y_block_length, window_stride)
last_minute_index = window_starts + x_block_length - 1
target_index = window_starts + x_block_length + y_block_length

# Allocate X once and fill it in place; growing it with np.append would copy
# the whole array on every iteration.
X = np.empty(
    (len(window_starts), d_num_layers, d_num_price_levels, x_block_length),
    np.float32)
for sample_index, window_start in enumerate(window_starts):
    X[sample_index] = d[:, :, window_start:window_start + x_block_length]

last_X_price = d[0, highest_bid_position, last_minute_index]
raw_new_y = d[0, highest_bid_position, target_index]
# Targets are floating-point price deltas; widen to float64 before scaling.
y = (raw_new_y - last_X_price).astype(np.float64) * error_severity_multiplier

batch_split_pointer = len(window_starts) * window_stride  # where the next window would have started

assert len(X) == len(y), 'every input window needs exactly one target'

# Hold out the last 10% of the samples (in build order) as the test set.
# No shuffling is applied, so the split follows the order of the loaded files.
train_data_pointer = len(X) - len(X) // 10

X_train, X_test = X[:train_data_pointer], X[train_data_pointer:]
y_train, y_test = y[:train_data_pointer], y[train_data_pointer:]

# The open-ended test slice keeps the final sample (a `:-1` slice would drop it).
assert len(X_train) + len(X_test) == len(X)
assert len(y_train) + len(y_test) == len(y)

In [6]:
# Fully connected regression network that maps one input window to a single
# scalar target, trained with mean squared error.
model = Sequential()

# Dense only transforms the last axis of its input. Each sample here is 3-D
# (layers, price levels, minutes), so without flattening the network would
# emit one value per (layer, price level) pair instead of one per sample,
# which cannot be matched against the 1-D targets.
model.add(Flatten())

# 2400 = 6 * 40 * 10, the size of one flattened sample.
model.add(Dense(2400, activation='relu'))
model.add(Dense(600, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(1))  # linear output for regression

model.compile(optimizer='adam', loss='mse')

In [None]:
# Train for 200 epochs in mini-batches of 32, evaluating on the held-out
# split after every epoch.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=200,
    validation_data=(X_test, y_test),
)

In [None]:
# Metrics recorded by the last fit() call: one column per metric, one row per epoch.
history_frame = pd.DataFrame(model.history.history)
history_frame.plot()

In [None]:
# Model output for every held-out window.
predictions = model.predict(x=X_test)

In [None]:
# Mean absolute error on the held-out split, in the same scaled units as y
# (price delta times error_severity_multiplier).
mean_absolute_error(y_true=y_test, y_pred=predictions)

In [None]:
# Overlay the model output (purple) on the true targets (green) for the
# held-out split, one point per test sample.
fig, ax = plt.subplots(figsize=(20, 12))
ax.plot(predictions, color='purple')
ax.plot(y_test, color='green')

In [None]:
# Training targets in sample order, to eyeball their range and any outliers.
fig, ax = plt.subplots(figsize=(20, 12))
ax.plot(y_train, color='green')

In [None]:
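# Scaling (disabled): two alternative min-max normalisations of X_train / X_test,
# kept for reference. If re-enabled, run one of them before model.fit. The
# MinMaxScaler variant also needs `from sklearn.preprocessing import MinMaxScaler`,
# which is not among the imports at the top of this notebook.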

# v_min = X_train.min(axis=(0, 1, 2, 3), keepdims=True)
# v_max = X_train.max(axis=(0, 1, 2, 3), keepdims=True)
#
# X_train = (X_train - v_min)/(v_max - v_min)
# X_test = (X_test - v_min)/(v_max - v_min)

# scaler = MinMaxScaler()
#
# n_train_samples, n_train_x, n_train_y, n_train_z = X_train.shape
# X_train = X_train.reshape((n_train_samples, n_train_x * n_train_y * n_train_z))
#
# n_test_samples, n_test_x, n_test_y, n_test_z = X_test.shape
# X_test = X_test.reshape((n_test_samples, n_test_x * n_test_y * n_test_z))
#
# scaler.fit(X_train)
#
# X_train = scaler.transform(X_train)
# X_test = scaler.transform(X_test)
#
# X_train = X_train.reshape((n_train_samples, n_train_x, n_train_y, n_train_z))
# X_test = X_test.reshape((n_test_samples, n_test_x, n_test_y, n_test_z))