In [1]:
import pandas as pd
import os
from sklearn import preprocessing
from collections import deque
import random
import numpy as np
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

In [2]:
SEQ_LEN = 7                   # 1 week of data
FUTURE_PERIOD_PREDICT = 2     # 2 days in the future
COIN_TO_PREDICT = "LTC"
EPOCHS = 10
BATCH_SIZE = 64
NAME = f"{COIN_TO_PREDICT}-{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

In [3]:
# Function to classify predictions as boolean values
def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0

# Preprocess values and remove future predictions column
def preprocess_df(df):
    df = df.drop('future', 1)
    
    # Drop non-numeric rows and change prices to percentage change values, then scale each column
    for col in df.columns:
        if col != "target":
            df[col] = df[col].pct_change()
            df.dropna(inplace=True)
            df[col] = preprocessing.scale(df[col].values)
    
    df.dropna(inplace=True)
    
    # Sort values into sequential chunks before randomizing for more accurate model predictions
    sequential_data = []
    prev_days = deque(maxlen=SEQ_LEN)
    
    for i in df.values:
        prev_days.append([n for n in i[:-1]])
        if len(prev_days) == SEQ_LEN:
            sequential_data.append([np.array(prev_days), i[-1]])
            
    # Randomize the sequence
    random.shuffle(sequential_data)
    
    buys = []
    sells = []
    
    # If future price is greater than current price, buy, if future price less than current price, sell
    for seq, target in sequential_data:
        if target == 0:
            sells.append([seq, target])
        elif target == 1:
            buys.append([seq, target])
    
    # Randomize each buy or sell
    random.shuffle(buys)
    random.shuffle(sells)
    
    # Balance buys/sells by creating equal length lists for buys/sells
    # Use the lowest length value of the two lists as constraints
    lower = min(len(buys), len(sells))
    
    buys = buys[:lower]
    sells = sells[:lower]
    
    
    sequential_data = buys+sells
    random.shuffle(sequential_data)
    
    
    X = []
    y = []
    for seq, target in sequential_data:
        X.append(seq)
        y.append(target)
        
    return np.array(X), y

In [4]:
main_df = pd.DataFrame()

# Loop through all data, add csv columns into main_df
altcoins = ["BTC", "LTC", "ETH", "ETC", "XMR", "XRP"]
for coin in altcoins:
    dataset = f"Data/Altcoins/{coin}.csv"
    
    df = pd.read_csv(dataset)
    df.rename(columns={"close": f"{coin}_close", "volume": f"{coin}_volume"}, inplace=True)
    
    df.set_index('date', inplace=True)
    df = df[[f"{coin}_close", f"{coin}_volume"]]
    
    if len(main_df) == 0:
        main_df = df
    else:
        main_df = main_df.join(df)
        
main_df['future'] = main_df[f"{COIN_TO_PREDICT}_close"].shift(-FUTURE_PERIOD_PREDICT)
main_df['target'] = list(map(classify, main_df[f"{COIN_TO_PREDICT}_close"], main_df["future"]))
print(main_df[[f"{COIN_TO_PREDICT}_close","future","target"]].head(10))

times = sorted(main_df.index.values)
last_10pct = times[-int(0.1*len(times))]

validate_main_df = main_df[(main_df.index >= last_10pct)]
main_df = main_df[(main_df.index < last_10pct)]

train_x, train_y = preprocess_df(main_df)
validate_x, validate_y = preprocess_df(validate_main_df)

            LTC_close    future  target
date                                   
2017-02-10   3.779899  3.719400       0
2017-02-11   3.757000  3.790000       1
2017-02-12   3.719400  3.783455       1
2017-02-13   3.790000  3.904180       1
2017-02-14   3.783455  3.860000       1
2017-02-15   3.904180  3.818332       0
2017-02-16   3.860000  3.721817       0
2017-02-17   3.818332  3.718663       0
2017-02-18   3.721817  3.710000       0
2017-02-19   3.718663  3.856600       1


In [5]:
print(f"train data: {len(train_x)} validation: {len(validate_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"TEST Dont buys: {validate_y.count(0)}, buys: {validate_y.count(1)}")

train data: 620 validation: 50
Dont buys: 310, buys: 310
TEST Dont buys: 25, buys: 25


In [6]:
model = Sequential()
model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128, input_shape=(train_x.shape[1:])))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

model.compile(loss='sparse_categorical_crossentropy',
             optimizer=opt,
             metrics=['accuracy'])

tensorboard = TensorBoard(log_dir= f'logs/{NAME}')

filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"
checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max'))

history = model.fit(train_x, train_y, batch_size=BATCH_SIZE,
                    epochs=EPOCHS, validation_data=(validate_x, validate_y),
                   callbacks=[tensorboard, checkpoint])



Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Train on 620 samples, validate on 50 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [8]:
np.argmax(model.predict(validate_x), axis=1)

array([0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0], dtype=int64)

In [11]:
np.array(validate_y).ravel()

array([1., 1., 1., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 1., 1., 0.,
       1., 1., 0., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0., 0.,
       0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 1., 0., 1., 1., 0., 0.])