In [1]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
!ls "/content/drive/My Drive/crypto_data"

BCH-USD.csv  BTC-USD.csv  ETH-USD.csv  LTC-USD.csv


In [0]:
import pandas as pd
import numpy as np
import os
import warnings
from sklearn import preprocessing 
from collections import deque
import random
import time
# tensor flow stuff
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.layers import CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

In [0]:
SEQ_LEN = 60
FUTURE_PERIOD_PREDICT = 3
RATIO_TO_PREDICT = "ltc"
EPOCHS = 10
BATCH_SIZE=64
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

In [0]:
def classify(current, future):
  if(float(future) > float(current)):
    return 1
  else:
    return 0

In [0]:
def preprocess_df(df):
  df = df.drop('future', axis=1)
  for col in df.columns:
    if col != "target":
      # helps normalize the data
      # btc, lth and others have different units
      df[col] = df[col].pct_change()
      df.dropna(inplace=True)
      df[col] = preprocessing.scale(df[col].values)
  df.dropna(inplace=True)
  sequential_data = []
  # deque removes items from the front
  # if more than SEQ_LEN is appended
  # convinient for the task below
  # creating 60 sequences and when quota fills
  # save the sequences in another list
  # and the deque will continue removing
  # the old sequences as we add new ones
  # everytime a deque fills, append that
  # sequence to the other list
  prev_days = deque(maxlen=SEQ_LEN)
  for i in df.values:
    # ignore the target, append a list to prev_days list
    prev_days.append([n for n in i[:-1]])
    # when 60 feature sets are collected
    # append the sequence with the target of the 60th feature set
    if(len(prev_days) == SEQ_LEN):
      sequential_data.append([np.array(prev_days), i[-1]])
  random.shuffle(sequential_data)
  buys = []  # list that will store our buy sequences and targets
  sells = []  # list that will store our sell sequences and targets

  for seq, target in sequential_data:  # iterate over the sequential data
      if target == 0:  # if it's a "not buy"
          sells.append([seq, target])  # append to sells list
      elif target == 1:  # otherwise if the target is a 1...
          buys.append([seq, target])  # it's a buy!

  random.shuffle(buys)  # shuffle the buys
  random.shuffle(sells)  # shuffle the sells!

  lower = min(len(buys), len(sells))  # what's the shorter length?

  buys = buys[:lower]  # make sure both lists are only up to the shortest length.
  sells = sells[:lower]  # make sure both lists are only up to the shortest length.

  sequential_data = buys+sells  # add them together
  random.shuffle(sequential_data)  # another shuffle, so the model doesn't get confused with all 1 class then the other.

  X = []
  y = []

  for seq, target in sequential_data:  # going over our new sequential data
      X.append(seq)  # X is the sequences
      y.append(target)  # y is the targets/labels (buys vs sell/notbuy)

  return np.array(X), y  # return X and y...and make X a numpy array!


In [0]:
# def preprocess_df(df):
#     df = df.drop("future", 1)  # don't need this anymore.

#     for col in df.columns:  # go through all of the columns
#         if col != "target":  # normalize all ... except for the target itself!
#             df[col] = df[col].pct_change()  # pct change "normalizes" the different currencies (each crypto coin has vastly diff values, we're really more interested in the other coin's movements)
#             df.dropna(inplace=True)  # remove the nas created by pct_change
#             df[col] = preprocessing.scale(df[col].values)  # scale between 0 and 1.

#     df.dropna(inplace=True)  # cleanup again... jic.


#     sequential_data = []  # this is a list that will CONTAIN the sequences
#     prev_days = deque(maxlen=SEQ_LEN)  # These will be our actual sequences. They are made with deque, which keeps the maximum length by popping out older values as new ones come in

#     for i in df.values:  # iterate over the values
#         prev_days.append([n for n in i[:-1]])  # store all but the target
#         if len(prev_days) == SEQ_LEN:  # make sure we have 60 sequences!
#             sequential_data.append([np.array(prev_days), i[-1]])  # append those bad boys!

#     random.shuffle(sequential_data)  # shuffle for good measure.

#     buys = []  # list that will store our buy sequences and targets
#     sells = []  # list that will store our sell sequences and targets

#     for seq, target in sequential_data:  # iterate over the sequential data
#         if target == 0:  # if it's a "not buy"
#             sells.append([seq, target])  # append to sells list
#         elif target == 1:  # otherwise if the target is a 1...
#             buys.append([seq, target])  # it's a buy!

#     random.shuffle(buys)  # shuffle the buys
#     random.shuffle(sells)  # shuffle the sells!

#     lower = min(len(buys), len(sells))  # what's the shorter length?

#     buys = buys[:lower]  # make sure both lists are only up to the shortest length.
#     sells = sells[:lower]  # make sure both lists are only up to the shortest length.

#     sequential_data = buys+sells  # add them together
#     random.shuffle(sequential_data)  # another shuffle, so the model doesn't get confused with all 1 class then the other.

#     X = []
#     y = []

#     for seq, target in sequential_data:  # going over our new sequential data
#         X.append(seq)  # X is the sequences
#         y.append(target)  # y is the targets/labels (buys vs sell/notbuy)

#     return np.array(X), y  # return X and y...and make X a numpy array!



In [0]:
# read all data
import os
directory_in_str = "/content/drive/My Drive/crypto_data/"
directory = os.fsencode(directory_in_str)
four_dfs = []
for file in os.listdir(directory):
     # get filename with extension
     filename = os.fsdecode(file)
     url = os.path.join(directory_in_str, filename)
     # column titles
     c_titles = ["time", "low", "high", "open", "close", "volume"] 
     # returns the first 3 characters before - from filename
     get_prefix = lambda x: x.split(".")[0].split("-")[0].lower() + "_"
     # time is not prefixed, prefix the others
     u_titles = ["time"] + [get_prefix(filename) + c for c in c_titles[1:]]
     # append it to list of dataframes
     four_dfs.append(pd.read_csv(url, names=u_titles))

In [0]:
# merge them together
merged_df = four_dfs[0]
for df in four_dfs[1:]:
  merged_df = pd.merge(merged_df, df, on='time', how='left')

In [9]:
merged_df.head()

Unnamed: 0,time,bch_low,bch_high,bch_open,bch_close,bch_volume,btc_low,btc_high,btc_open,btc_close,...,eth_low,eth_high,eth_open,eth_close,eth_volume,ltc_low,ltc_high,ltc_open,ltc_close,ltc_volume
0,1528968660,871.650024,871.72998,871.650024,871.719971,5.675361,6489.549805,6489.560059,6489.560059,6489.549805,...,,,,,,96.580002,96.589996,96.589996,96.580002,9.6472
1,1528968720,870.859985,871.719971,871.719971,870.859985,26.856577,6487.370117,6489.560059,6489.549805,6487.379883,...,485.98999,486.5,486.019989,486.01001,26.019083,96.449997,96.669998,96.589996,96.660004,314.387024
2,1528968780,870.099976,871.090027,871.090027,870.099976,1.1243,6479.410156,6487.370117,6487.370117,6479.410156,...,486.0,486.0,486.0,486.0,8.4494,96.470001,96.57,96.57,96.57,77.129799
3,1528968840,868.830017,870.950012,868.830017,870.789978,1.749862,6479.410156,6479.419922,6479.419922,6479.410156,...,485.75,486.0,486.0,485.75,26.994646,96.449997,96.57,96.57,96.5,7.216067
4,1528968900,870.0,870.0,870.0,870.0,1.6805,6475.930176,6479.97998,6479.410156,6479.97998,...,485.75,486.0,485.75,486.0,77.355759,96.279999,96.540001,96.5,96.389999,524.539978


In [0]:
# # fill null values forward first.
merged_df = merged_df.fillna(method="ffill")
## drop any remaing nulls
merged_df.dropna(inplace=True)

In [0]:
# filter volume and close columns
main_df = merged_df.filter(regex='volume|close|time')

In [0]:
with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  main_df.index = main_df.time
  main_df.drop("time", axis=1, inplace=True)

In [13]:
main_df.head()

Unnamed: 0_level_0,bch_close,bch_volume,btc_close,btc_volume,eth_close,eth_volume,ltc_close,ltc_volume
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1528968720,870.859985,26.856577,6487.379883,7.706374,486.01001,26.019083,96.660004,314.387024
1528968780,870.099976,1.1243,6479.410156,3.088252,486.0,8.4494,96.57,77.129799
1528968840,870.789978,1.749862,6479.410156,1.4041,485.75,26.994646,96.5,7.216067
1528968900,870.0,1.6805,6479.97998,0.753,486.0,77.355759,96.389999,524.539978
1528968960,869.98999,1.669014,6480.0,1.4909,486.0,7.5033,96.519997,16.991997


In [14]:
main_df.shape

(92224, 8)

In [0]:
with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  main_df.loc[:,'future'] = main_df[f"{RATIO_TO_PREDICT}_close"] \
                          .shift(-FUTURE_PERIOD_PREDICT)

In [16]:
main_df.head()

Unnamed: 0_level_0,bch_close,bch_volume,btc_close,btc_volume,eth_close,eth_volume,ltc_close,ltc_volume,future
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1528968720,870.859985,26.856577,6487.379883,7.706374,486.01001,26.019083,96.660004,314.387024,96.389999
1528968780,870.099976,1.1243,6479.410156,3.088252,486.0,8.4494,96.57,77.129799,96.519997
1528968840,870.789978,1.749862,6479.410156,1.4041,485.75,26.994646,96.5,7.216067,96.440002
1528968900,870.0,1.6805,6479.97998,0.753,486.0,77.355759,96.389999,524.539978,96.470001
1528968960,869.98999,1.669014,6480.0,1.4909,486.0,7.5033,96.519997,16.991997,96.400002


In [0]:
with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  main_df['target'] = list(map(
                      classify, 
                      main_df[f"{RATIO_TO_PREDICT}_close"], 
                      main_df["future"]))

In [18]:
# see the result
main_df[["ltc_close", "future", "target"]].head(6)

Unnamed: 0_level_0,ltc_close,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1528968720,96.660004,96.389999,0
1528968780,96.57,96.519997,0
1528968840,96.5,96.440002,0
1528968900,96.389999,96.470001,1
1528968960,96.519997,96.400002,0
1528969020,96.440002,96.400002,0


In [19]:
# there are nulls on the buttom
main_df.tail()

Unnamed: 0_level_0,bch_close,bch_volume,btc_close,btc_volume,eth_close,eth_volume,ltc_close,ltc_volume,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1535215020,531.47998,0.016868,6714.52002,1.002652,279.359985,8.790519,58.009998,7.301921,58.080002,1
1535215080,531.469971,0.013854,6714.52002,1.021925,279.369995,1.311763,58.02,23.802017,58.09,1
1535215140,531.47998,0.0169,6715.0,3.645508,279.660004,11.752819,58.02,6.953497,,0
1535215200,531.47998,0.29952,6715.0,0.51356,279.649994,8.35171,58.080002,202.403183,,0
1535215260,531.630005,3.528913,6715.0,0.51356,279.649994,10.511729,58.09,160.602554,,0


In [0]:
# sort them(they may be already sorted)
times = sorted(main_df.index.values)
# the timestamp for 5% of the data
# 95% of the data happened before this time
last_5pct = times[-int(0.05*len(times))]

In [21]:
last_5pct

1534879920

In [0]:
validation_main_df_sp = main_df[(main_df.index >= last_5pct)]

In [0]:
main_df_sp = main_df[(main_df.index < last_5pct)]

In [24]:
main_df_sp.head()

Unnamed: 0_level_0,bch_close,bch_volume,btc_close,btc_volume,eth_close,eth_volume,ltc_close,ltc_volume,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1528968720,870.859985,26.856577,6487.379883,7.706374,486.01001,26.019083,96.660004,314.387024,96.389999,0
1528968780,870.099976,1.1243,6479.410156,3.088252,486.0,8.4494,96.57,77.129799,96.519997,0
1528968840,870.789978,1.749862,6479.410156,1.4041,485.75,26.994646,96.5,7.216067,96.440002,0
1528968900,870.0,1.6805,6479.97998,0.753,486.0,77.355759,96.389999,524.539978,96.470001,1
1528968960,869.98999,1.669014,6480.0,1.4909,486.0,7.5033,96.519997,16.991997,96.400002,0


In [0]:
train_x, train_y = preprocess_df(main_df_sp)
validation_x, validation_y = preprocess_df(validation_main_df_sp)

In [26]:
train_x.shape

(75008, 60, 8)

In [27]:
len(train_y)

75008

In [33]:
from tensorboardcolab import TensorBoardColab, TensorBoardColabCallback
tbc=TensorBoardColab('/content/drive/My Drive/logs')

TypeError: ignored

In [29]:
model = Sequential()
# remove return_sequences for dense layer
model.add(CuDNNLSTM(128, input_shape=(train_x.shape[1:]),
                    return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(CuDNNLSTM(128,
                    return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())


model.add(CuDNNLSTM(128,
                    return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(CuDNNLSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(2, activation="softmax"))


opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(loss='sparse_categorical_crossentropy',
             optimizer=opt,metrics=['accuracy'])
tensorboard = TensorBoard(log_dir=f'/content/drive/My Drive/logs/{NAME}')

# unique file name that will include the epoch and 
# the validation acc for that epoch

filepath = "/content/drive/My Drive/models/RNN_Final-{epoch:02d}-{val_acc:.3f}"  
# saves only the best ones
checkpoint = ModelCheckpoint("{}.model".format(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max'))

# Train model
history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[tensorboard, checkpoint])
    #callbacks=[TensorBoardColabCallback(tbc)])

# Score model
score = model.evaluate(validation_x, validation_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
model.save("/content/drive/My Drive/models/{}".format(NAME))



Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Train on 75008 samples, validate on 3810 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/10


AttributeError: ignored