In [1]:
import pandas as pd
import os
from sklearn import preprocessing
from collections import deque
import random
import numpy as np

In [2]:
SEQ_LEN = 60 #how long of a preceeding sequence to collect for RNN
FUTURE_PERIOD_PREDICT = 3 # how far into the future are we trying to predict?
RATIO_TO_PREDICT = "LTC-USD"

In [3]:
def classify(current, future):
    if float(future) > float(current):  #if the future price is higher than the current, that's a buy, or a 1
        return 1
    else:
        return 0 # otherwise... it's a 0!

    

In [4]:
#Normalize and scale data
def preprocess_df(df):
    df = df.drop('future', 1) #dnt need this
    for col in df.columns: #go through all the cols
        if col != "target": #normalize all except for target
            df[col] = df[col].pct_change() #pct_change - normalize different currencies with percent 
            df.dropna(inplace=True)  #remove the nan created by pct_change
            df[col] = preprocessing.scale(df[col].values) #scale the data to get between 0 and 1
    df.dropna(inplace=True) #if nan cleanup again
    sequential_data = [] #list that will contain the sequences
    prev_days = deque(maxlen=SEQ_LEN) #These will be our actual sequences. They are made with deque, which keeps the maximum length by popping out older values as new ones come in
    for i in df.values:
        prev_days.append([n for n in i[:-1]]) # store all except the target
        if len(prev_days) == SEQ_LEN:  # make sure we have 60 sequences!
            sequential_data.append([np.array(prev_days) , i[-1]]) #append features and labels
    random.shuffle(sequential_data)  # shuffle for good measure.
    #Balancing sequence data for a perfect split
    buys = [] # list that will store our buy sequences and targets
    sells = [] # list that will store our sell sequences and targets

    for seq, target in sequential_data:
        if target == 0: # if it's a "not buy"
            sells.append([seq, target]) # append to sells list
        elif target == 1: # otherwise if the target is  1
            buys.append([seq, target])  #it's a buy!
    random.shuffle(buys)
    random.shuffle(sells)
    lower = min(len(buys) , len(sells)) #what's the shorter length?
    buys = buys[:lower] # make sure both lists are only up to the shortest length.
    sells = sells[:lower] # make sure both lists are only up to the shortest length.
    sequential_data = buys + sells
    random.shuffle(sequential_data)

    #split to x and y
    X = []
    y = []
    for seq , target in sequential_data: # going over our new sequential data
        X.append(seq)  # X is the sequences
        y.append(target) # y is the targets/labels (buys vs sell/notbuy)

    return np.array(X), y # return X and y...and make X a numpy array! 

    


       

    


In [5]:
#creating empty dataframe
main_df = pd.DataFrame() #empty dataframe

In [6]:
ratios = ["BTC-USD" , "LTC-USD", "ETH-USD", "BCH-USD"]


In [7]:
for ratio in ratios:  #iteration
    print(ratio)
    dataset = f'crypto_data/{ratio}.csv'  # get the full path to the file.
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])  # read in specific file

    # rename volume and close
    df.rename(columns={"close": f"{ratio}_close", "volume": f"{ratio}_volume"}, inplace=True)

    df.set_index("time", inplace=True)  # set time as index
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]  # ignore the other columns besides price and volume

    if len(main_df)==0:  # if the dataframe is empty
        main_df = df  # then it's just the current df
    else:  # otherwise, join this data to the main one
        main_df = main_df.join(df)

main_df.fillna(method="ffill", inplace=True)  # if there are gaps in data, use previously known values
main_df.dropna(inplace=True)

BTC-USD
LTC-USD
ETH-USD
BCH-USD


In [8]:
main_df['future'] = main_df[f'{RATIO_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)
print(main_df.head())

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   
1528968960    6480.000000        1.490900      96.519997       16.991997   

            ETH-USD_close  ETH-USD_volume  BCH-USD_close  BCH-USD_volume  \
time                                                                       
1528968720      486.01001       26.019083     870.859985       26.856577   
1528968780      486.00000        8.449400     870.099976        1.124300   
1528968840      485.75000       26.994646     870.789978        1.749862   
1528968900      486.00000       77.355759     870.000000        1.680500   
1528968960 

In [9]:
main_df['target'] = list(map(classify, main_df[f'{RATIO_TO_PREDICT}_close'], main_df['future']))


In [10]:
print(main_df[[f'{RATIO_TO_PREDICT}_close',"future","target"]].head())

            LTC-USD_close     future  target
time                                        
1528968720      96.660004  96.389999       0
1528968780      96.570000  96.519997       0
1528968840      96.500000  96.440002       0
1528968900      96.389999  96.470001       1
1528968960      96.519997  96.400002       0


In [11]:
times = sorted(main_df.index.values) #time should be in order #get the times

In [12]:
last_5pct = times[-int(0.05*len(times))] #get the last 5percent of times
#print(last_5pct)

In [13]:
#split up the data
validation_main_df = main_df[(main_df.index >= last_5pct)]#make the validation data where the index is in last 5%
main_df = main_df[(main_df.index < last_5pct)] #now the main data is the data upto last 5%    
        

In [14]:
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)
print(f"train data: {len(train_x)} validation: {len(validation_x)}")
print(f"Dont buys: {train_y.count(0)}, buys: {train_y.count(1)}")
print(f"VALIDATION Dont buys: {validation_y.count(0)}, buys: {validation_y.count(1)}")




train data: 77922 validation: 3860
Dont buys: 38961, buys: 38961
VALIDATION Dont buys: 1930, buys: 1930


In [15]:
#Building model with RNN
import time
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint #always save the best one


In [16]:
EPOCHS = 2 #how many times it passes through the data
BATCH_SIZE = 1 #how many batches
NAME = F"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"  # a unique name for the model


In [None]:
model = Sequential()
 #for CPU use LSTM for GPU you can also use  CuDNNLSTM 
model.add(LSTM(128, input_shape=(train_x.shape[1:]),
              return_sequences=True))  
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(LSTM(128, input_shape=(train_x.shape[1:]),
              return_sequences=True))  
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(LSTM(128, input_shape=(train_x.shape[1:]),
              ))  
model.add(Dropout(0.2))
model.add(BatchNormalization()) #normalizes activation outputs, same reason you want to normalize your input data.

model.add(Dense(32, activation='relu')) #just dense layer, not a output
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax')) #2 because its binary #output layer
#Optimizer
optimizer = tf.keras.optimizers.Adam(lr=0.001 , decay=1e-6)
#compile the model
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])
#tensorboard callback
tensorboard = TensorBoard(log_dir=f'logs/{NAME}') #visualize the data #In terminal type tensorboard --logdir=logs

#Modelcheckpoint callback
filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"  # unique file name that will include the epoch and the validation acc for that epoch
checkpoint = ModelCheckpoint("models/{}.model".format(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')) # saves only the best ones

#Train the model
history = model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[tensorboard, checkpoint],
)



W0902 14:28:35.933018 139774631569216 deprecation.py:506] From /home/user/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Train on 77922 samples, validate on 3860 samples


W0902 14:28:37.320781 139774631569216 deprecation.py:323] From /home/user/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/2
 2586/77922 [..............................] - ETA: 20:42:48 - loss: 0.6934 - acc: 0.5027

In [None]:
 #Score model
score = model.evaluate(validation_x, validation_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
model.save("models/{}".format(NAME))