In [1]:
import pandas as pd
import os
from sklearn import preprocessing
from collections import deque
import random
import numpy as np
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

In [2]:
# Function to classify predictions as boolean values
def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0

# Preprocess values and remove future predictions column
def preprocess_df(df):
    df = df.drop('future', 1)
    
    # Drop non-numeric rows and change prices to percentage change values, then scale each column
    for col in df.columns:
        if col != "target":
            df[col] = df[col].pct_change()
            df.dropna(inplace=True)
            df[col] = preprocessing.scale(df[col].values)
    
    df.dropna(inplace=True)
    
    # Sort values into sequential chunks before randomizing for more accurate model predictions
    sequential_data = []
    prev_days = deque(maxlen=SEQ_LEN)
    
    for i in df.values:
        prev_days.append([n for n in i[:-1]])
        if len(prev_days) == SEQ_LEN:
            sequential_data.append([np.array(prev_days), i[-1]])
            
    # Randomize the sequence
    random.shuffle(sequential_data)
    
    buys = []
    sells = []
    
    # If future price is greater than current price, buy, if future price less than current price, sell
    for seq, target in sequential_data:
        if target == 0:
            sells.append([seq, target])
        elif target == 1:
            buys.append([seq, target])
    
    # Randomize each buy or sell
    random.shuffle(buys)
    random.shuffle(sells)
    
    # Balance buys/sells by creating equal length lists for buys/sells
    # Use the lowest length value of the two lists as constraints
    lower = min(len(buys), len(sells))
    
    buys = buys[:lower]
    sells = sells[:lower]
    
    
    sequential_data = buys+sells
    random.shuffle(sequential_data)
    
    
    X = []
    y = []
    for seq, target in sequential_data:
        X.append(seq)
        y.append(target)
        
    return np.array(X), y

In [5]:
SEQ_LEN = 7                   # 1 week of data
FUTURE_PERIOD_PREDICT = 2     # 2 days in the future
EPOCHS = 10
BATCH_SIZE = 64


data_df = pd.DataFrame()
main_df = pd.DataFrame()
assets = ["BTC", "LTC", "ETH", "AMZN", "GOOGL", "FB"]

# Main loop to plug each asset into the machine learning model
for asset in assets:
    NAME = f"{asset}-{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"
    
    # Sub-loop all data to build dataframe; add csv columns into main_df
    for asset in assets:
        dataset = f"Data/{asset}.csv"
    
        df = pd.read_csv(dataset)
        df.rename(columns={"close": f"{asset}_close", "volume": f"{asset}_volume"}, inplace=True)
    
        df.set_index('date', inplace=True)
        df = df[[f"{asset}_close", f"{asset}_volume"]]
    
        if len(main_df) == 0:
            main_df = df
        else:
            main_df = main_df.join(df)
        

    main_df['future'] = main_df[f"{asset}_close"].shift(-FUTURE_PERIOD_PREDICT)
    main_df['target'] = list(map(classify, main_df[f"{asset}_close"], main_df["future"]))
    # print(main_df[[f"{COIN_TO_PREDICT}_close","future","target"]].head(10))


    # Sort dataframes by date and set last 10% of dates as a variable
    times = sorted(main_df.index.values)
    last_10pct = times[-int(0.1*len(times))]


    # Split first 90% of data for training data and last 10% as test data
    validate_main_df = main_df[(main_df.index >= last_10pct)]
    main_df = main_df[(main_df.index < last_10pct)]

    train_x, train_y = preprocess_df(main_df)
    validate_x, validate_y = preprocess_df(validate_main_df)

    print(f"train data: {len(train_x)} validation: {len(validate_x)}")
    print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
    print(f"TEST Dont buys: {validate_y.count(0)}, buys: {validate_y.count(1)}")

    # Initialize sequential model and add a few layers for better accuracy
    model = Sequential()
    model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
    model.add(Dropout(0.2))
    model.add(BatchNormalization())

    model.add(LSTM(128, input_shape=(train_x.shape[1:]), return_sequences=True))
    model.add(Dropout(0.1))
    model.add(BatchNormalization())

    model.add(LSTM(128, input_shape=(train_x.shape[1:])))
    model.add(Dropout(0.2))
    model.add(BatchNormalization())

    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))

    # Final layer has 2 output nodes for binary classification
    model.add(Dense(2, activation='softmax'))

    opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

    model.compile(loss='sparse_categorical_crossentropy',
                 optimizer=opt,
                 metrics=['accuracy'])

    # Visualize model training
    tensorboard = TensorBoard(log_dir= f'logs/{NAME}')

    filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"
    checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max'))

    history = model.fit(train_x, train_y, batch_size=BATCH_SIZE,
                        epochs=EPOCHS, validation_data=(validate_x, validate_y),
                       callbacks=[tensorboard, checkpoint])



    # Output predictions: 1=buy, 0=sell     
    predictions = np.argmax(model.predict(validate_x), axis=1)

    # Actual test values to compare against predictions
    actual = np.array(validate_y).ravel()
    
    print(actual)
    print(predictions)
    
#     predictions = pd.DataFrame({f'{asset}': pred_x})
    
#     if len(main_df) == 0:
#         data_df = ({'actual values': actual})
#         data_df.join(predictions)
#     else:
#         data_df = data_df.join(predictions)
    

    # Clears dataframes before next iteration through loop
#     predictions = predictions.iloc[0:0]
    main_df = main_df.iloc[0:0]
    df = df.iloc[0:0]

train data: 268 validation: 22
Dont buys: 134, buys: 134
TEST Dont buys: 11, buys: 11
Train on 268 samples, validate on 22 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[1. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 1. 1. 1.]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
train data: 268 validation: 22
Dont buys: 134, buys: 134
TEST Dont buys: 11, buys: 11
Train on 268 samples, validate on 22 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[0. 1. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 1.]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
train data: 268 validation: 22
Dont buys: 134, buys: 134
TEST Dont buys: 11, buys: 11
Train on 268 samples, validate on 22 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[1. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 0. 

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[1. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1. 1. 0. 1.]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
