In [1]:
import pandas as pd
import numpy as np
import os
import warnings
from sklearn import preprocessing 
from collections import deque
import random
import time
# tensor flow stuff
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.layers import CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

In [3]:
SEQ_LEN = 60
FUTURE_PERIOD_PREDICT = 3
RATIO_TO_PREDICT = "ltc"
EPOCHS = 10
BATCH_SIZE=64
NAME = "{}-SEQ-{}-PRED-{}".format(SEQ_LEN, FUTURE_PERIOD_PREDICT, int(time.time()))

In [4]:
def classify(current, future):
  if(float(future) > float(current)):
    return 1
  else:
    return 0

In [5]:
def preprocess_df(df):
  df = df.drop('future', axis=1)
  for col in df.columns:
    if col != "target":
      # helps normalize the data
      # btc, lth and others have different units
      df[col] = df[col].pct_change()
      df.dropna(inplace=True)
      df[col] = preprocessing.scale(df[col].values)
  df.dropna(inplace=True)
  sequential_data = []
  # deque removes items from the front
  # if more than SEQ_LEN is appended
  # convinient for the task below
  # creating 60 sequences and when quota fills
  # save the sequences in another list
  # and the deque will continue removing
  # the old sequences as we add new ones
  # everytime a deque fills, append that
  # sequence to the other list
  prev_days = deque(maxlen=SEQ_LEN)
  for i in df.values:
    # ignore the target, append a list to prev_days list
    prev_days.append([n for n in i[:-1]])
    # when 60 feature sets are collected
    # append the sequence with the target of the 60th feature set
    if(len(prev_days) == SEQ_LEN):
      sequential_data.append([np.array(prev_days), i[-1]])
  random.shuffle(sequential_data)
  buys = []  # list that will store our buy sequences and targets
  sells = []  # list that will store our sell sequences and targets

  for seq, target in sequential_data:  # iterate over the sequential data
      if target == 0:  # if it's a "not buy"
          sells.append([seq, target])  # append to sells list
      elif target == 1:  # otherwise if the target is a 1...
          buys.append([seq, target])  # it's a buy!

  random.shuffle(buys)  # shuffle the buys
  random.shuffle(sells)  # shuffle the sells!

  lower = min(len(buys), len(sells))  # what's the shorter length?

  buys = buys[:lower]  # make sure both lists are only up to the shortest length.
  sells = sells[:lower]  # make sure both lists are only up to the shortest length.

  sequential_data = buys+sells  # add them together
  random.shuffle(sequential_data)  # another shuffle, so the model doesn't get confused with all 1 class then the other.

  X = []
  y = []

  for seq, target in sequential_data:  # going over our new sequential data
      X.append(seq)  # X is the sequences
      y.append(target)  # y is the targets/labels (buys vs sell/notbuy)

  return np.array(X), y  # return X and y...and make X a numpy array!


In [9]:
# read all data
import os
directory_in_str = "/home/tigial3535/crypto-data/"
directory = os.fsencode(directory_in_str)
four_dfs = []
for file in os.listdir(directory):
     # get filename with extension
     filename = os.fsdecode(file)
     url = os.path.join(directory_in_str, filename)
     # column titles
     c_titles = ["time", "low", "high", "open", "close", "volume"] 
     # returns the first 3 characters before - from filename
     get_prefix = lambda x: x.split(".")[0].split("-")[0].lower() + "_"
     # time is not prefixed, prefix the others
     u_titles = ["time"] + [get_prefix(filename) + c for c in c_titles[1:]]
     # append it to list of dataframes
     four_dfs.append(pd.read_csv(url, names=u_titles))

In [10]:
# merge them together
merged_df = four_dfs[0]
for df in four_dfs[1:]:
  merged_df = pd.merge(merged_df, df, on='time', how='left')

In [11]:
merged_df.head()

Unnamed: 0,time,eth_low,eth_high,eth_open,eth_close,eth_volume,bch_low,bch_high,bch_open,bch_close,...,ltc_low,ltc_high,ltc_open,ltc_close,ltc_volume,btc_low,btc_high,btc_open,btc_close,btc_volume
0,1528968720,485.98999,486.5,486.019989,486.01001,26.019083,870.859985,871.719971,871.719971,870.859985,...,96.449997,96.669998,96.589996,96.660004,314.387024,6487.370117,6489.560059,6489.549805,6487.379883,7.706374
1,1528968780,486.0,486.0,486.0,486.0,8.4494,870.099976,871.090027,871.090027,870.099976,...,96.470001,96.57,96.57,96.57,77.129799,6479.410156,6487.370117,6487.370117,6479.410156,3.088252
2,1528968840,485.75,486.0,486.0,485.75,26.994646,868.830017,870.950012,868.830017,870.789978,...,96.449997,96.57,96.57,96.5,7.216067,6479.410156,6479.419922,6479.419922,6479.410156,1.4041
3,1528968900,485.75,486.0,485.75,486.0,77.355759,870.0,870.0,870.0,870.0,...,96.279999,96.540001,96.5,96.389999,524.539978,6475.930176,6479.97998,6479.410156,6479.97998,0.753
4,1528968960,485.98999,486.0,486.0,486.0,7.5033,869.98999,870.0,870.0,869.98999,...,96.459999,96.519997,96.459999,96.519997,16.991997,6477.959961,6480.0,6477.959961,6480.0,1.4909


In [12]:
# # fill null values forward first.
merged_df = merged_df.fillna(method="ffill")
## drop any remaing nulls
merged_df.dropna(inplace=True)

In [13]:
# filter volume and close columns
main_df = merged_df.filter(regex='volume|close|time')

In [14]:
with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  main_df.index = main_df.time
  main_df.drop("time", axis=1, inplace=True)

In [15]:
main_df.head()

Unnamed: 0_level_0,eth_close,eth_volume,bch_close,bch_volume,ltc_close,ltc_volume,btc_close,btc_volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1528968720,486.01001,26.019083,870.859985,26.856577,96.660004,314.387024,6487.379883,7.706374
1528968780,486.0,8.4494,870.099976,1.1243,96.57,77.129799,6479.410156,3.088252
1528968840,485.75,26.994646,870.789978,1.749862,96.5,7.216067,6479.410156,1.4041
1528968900,486.0,77.355759,870.0,1.6805,96.389999,524.539978,6479.97998,0.753
1528968960,486.0,7.5033,869.98999,1.669014,96.519997,16.991997,6480.0,1.4909


In [16]:
main_df.shape

(102831, 8)

In [18]:
with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  main_df.loc[:,'future'] = main_df["{}_close".format(RATIO_TO_PREDICT)] \
                          .shift(-FUTURE_PERIOD_PREDICT)

In [19]:
main_df.head()

Unnamed: 0_level_0,eth_close,eth_volume,bch_close,bch_volume,ltc_close,ltc_volume,btc_close,btc_volume,future
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1528968720,486.01001,26.019083,870.859985,26.856577,96.660004,314.387024,6487.379883,7.706374,96.389999
1528968780,486.0,8.4494,870.099976,1.1243,96.57,77.129799,6479.410156,3.088252,96.519997
1528968840,485.75,26.994646,870.789978,1.749862,96.5,7.216067,6479.410156,1.4041,96.440002
1528968900,486.0,77.355759,870.0,1.6805,96.389999,524.539978,6479.97998,0.753,96.470001
1528968960,486.0,7.5033,869.98999,1.669014,96.519997,16.991997,6480.0,1.4909,96.400002


In [21]:
with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  main_df['target'] = list(map(
                      classify, 
                      main_df["{}_close".format(RATIO_TO_PREDICT)], 
                      main_df["future"]))

In [22]:
# see the result
main_df[["ltc_close", "future", "target"]].head(6)

Unnamed: 0_level_0,ltc_close,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1528968720,96.660004,96.389999,0
1528968780,96.57,96.519997,0
1528968840,96.5,96.440002,0
1528968900,96.389999,96.470001,1
1528968960,96.519997,96.400002,0
1528969020,96.440002,96.400002,0


In [23]:
# there are nulls on the buttom
main_df.tail()

Unnamed: 0_level_0,eth_close,eth_volume,bch_close,bch_volume,ltc_close,ltc_volume,btc_close,btc_volume,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1535215020,279.359985,8.790519,531.47998,0.016868,58.009998,7.301921,6714.52002,1.002652,58.080002,1
1535215080,279.369995,1.311763,531.469971,0.013854,58.02,23.802017,6714.52002,1.021925,58.09,1
1535215140,279.660004,11.752819,531.47998,0.0169,58.02,6.953497,6715.0,3.645508,,0
1535215200,279.649994,8.35171,531.47998,0.29952,58.080002,202.403183,6715.0,0.51356,,0
1535215260,279.649994,10.511729,531.630005,3.528913,58.09,160.602554,6715.0,0.51356,,0


In [24]:
# sort them(they may be already sorted)
times = sorted(main_df.index.values)
# the timestamp for 5% of the data
# 95% of the data happened before this time
last_5pct = times[-int(0.05*len(times))]

In [25]:
last_5pct

1534905120

In [26]:
validation_main_df_sp = main_df[(main_df.index >= last_5pct)]

In [27]:
main_df_sp = main_df[(main_df.index < last_5pct)]

In [24]:
main_df_sp.head()

Unnamed: 0_level_0,bch_close,bch_volume,btc_close,btc_volume,eth_close,eth_volume,ltc_close,ltc_volume,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1528968720,870.859985,26.856577,6487.379883,7.706374,486.01001,26.019083,96.660004,314.387024,96.389999,0
1528968780,870.099976,1.1243,6479.410156,3.088252,486.0,8.4494,96.57,77.129799,96.519997,0
1528968840,870.789978,1.749862,6479.410156,1.4041,485.75,26.994646,96.5,7.216067,96.440002,0
1528968900,870.0,1.6805,6479.97998,0.753,486.0,77.355759,96.389999,524.539978,96.470001,1
1528968960,869.98999,1.669014,6480.0,1.4909,486.0,7.5033,96.519997,16.991997,96.400002,0


In [28]:
train_x, train_y = preprocess_df(main_df_sp)
validation_x, validation_y = preprocess_df(validation_main_df_sp)

In [29]:
train_x.shape

(82166, 60, 8)

In [30]:
len(train_y)

82166

In [33]:
model = Sequential()
# remove return_sequences for dense layer
model.add(CuDNNLSTM(128, input_shape=(train_x.shape[1:]),
                    return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(CuDNNLSTM(128,
                    return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())


model.add(CuDNNLSTM(128,
                    return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(CuDNNLSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(2, activation="softmax"))


opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(loss='sparse_categorical_crossentropy',
             optimizer=opt,metrics=['accuracy'])
tensorboard = TensorBoard(log_dir='/home/tigial3535/logs/{}'.format(NAME))

# unique file name that will include the epoch and 
# the validation acc for that epoch

filepath = "/home/tigial3535/models/RNN_Final-{epoch:02d}-{val_acc:.3f}"  
# saves only the best ones
checkpoint = ModelCheckpoint("{}.model".format(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max'))

# Train model
history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[tensorboard, checkpoint])
# Score model
score = model.evaluate(validation_x, validation_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
model.save("/home/tigial3535/models/{}".format(NAME))

Train on 82166 samples, validate on 4018 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: 0.6867448023862065
Test accuracy: 0.5764062
