In [1]:
import pandas as pd

#This is one of the csv files in the crypto_data folder. we need to rename the columns
#for the dataset, 
# time= time at which reading is taken, its used as the index column here
# low is the lowest value at for that currency in that time interval(1 min in our case)
# high is the highest value at for that currency in that time interval(1 min in our case)
# open is the starting value of the crypto currency(before 1 min)
# close is the closing value of the crypto currency( after 1 min)
# volume is the part of cryptocurrency we are gonna use

df = pd.read_csv("crypto_data/LTC-USD.csv", names=['time', 'low', 'high', 'open', 'close', 'volume'])

print(df.head())

         time        low       high       open      close      volume
0  1528968660  96.580002  96.589996  96.589996  96.580002    9.647200
1  1528968720  96.449997  96.669998  96.589996  96.660004  314.387024
2  1528968780  96.470001  96.570000  96.570000  96.570000   77.129799
3  1528968840  96.449997  96.570000  96.570000  96.500000    7.216067
4  1528968900  96.279999  96.540001  96.500000  96.389999  524.539978


In [2]:
# Combine all the CSV to one Data Frame

In [3]:
main_df = pd.DataFrame() # begin empty

csvs = ["BTC-USD", "LTC-USD", "BCH-USD", "ETH-USD"]  # the 4 csv files we want to consider
for csv in csvs:  # begin iteration
    print(csv)
    dataset = f'crypto_data/{csv}.csv'  # get the full path to the file.
    df = pd.read_csv(dataset, names=['time', 'low', 'high', 'open', 'close', 'volume'])  # read in specific file

    # rename volume and close to include the ticker so we can still which close/volume is which:
    df.rename(columns={"close": f"{csv}_close", "volume": f"{csv}_volume"}, inplace=True)

    df.set_index("time", inplace=True)  # set time as index so we can join them on this shared time
    df = df[[f"{csv}_close", f"{csv}_volume"]]  # ignore the other columns besides price and volume

    if len(main_df)==0:  # if the dataframe is empty
        main_df = df  # then it's just the current df
    else:  # otherwise, join this data to the main one
        main_df = main_df.join(df)

main_df.fillna(method="ffill", inplace=True)  # if there are gaps in data, use previously known values
main_df.dropna(inplace=True)
print(main_df.head())  

BTC-USD
LTC-USD
BCH-USD
ETH-USD
            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   
1528968960    6480.000000        1.490900      96.519997       16.991997   

            BCH-USD_close  BCH-USD_volume  ETH-USD_close  ETH-USD_volume  
time                                                                      
1528968720     870.859985       26.856577      486.01001       26.019083  
1528968780     870.099976        1.124300      486.00000        8.449400  
1528968840     870.789978        1.749862      485.75000       26.994646  
1528968900     870.000000        1.680500      486.00000    

In [4]:
SEQ_LEN = 60  # how long of a preceeding sequence to collect for RNN
FUTURE_PERIOD_PREDICT = 3  # how far into the future are we trying to predict? in this case we take 3 min
CURRENCY_TO_PREDICT = "LTC-USD" # select any of the four currency 

In [5]:
# This is a simple classification function that we'll use to map in a moment:

def classify(current, future):
    if float(future) > float(current):
        return 1
    else:
        return 0
# This function will take values from 2 columns. 
# If the "future" column is higher,it's a 1 (buy). Otherwise it's a 0 (sell). 

In [6]:
# To do this, first, we need a future column!
main_df['future'] = main_df[f'{CURRENCY_TO_PREDICT}_close'].shift(-FUTURE_PERIOD_PREDICT)

# A .shift will just shift the columns for us, a negative shift will shift them "up." 
# So shifting up 3 will give us the price 3 minutes in the future, and we're just assigning this to a new column.



In [7]:
# Now that we've got the future values, we can use them to make a target using the function we made above.

main_df['target'] = list(map(classify, main_df[f'{CURRENCY_TO_PREDICT}_close'], main_df['future']))

In [8]:
# The map() is used to map a function. The first parameter here is the function we want to map (classify), 
# then the next ones are the parameters to that function. In this case, the current close price, and then the future price.

#The map part is what allows us to do this row-by-row for these columns, but also do it quite fast. 
#The list part converts the end result to a list, which we can just set as a column.


In [9]:
main_df.head() 

Unnamed: 0_level_0,BTC-USD_close,BTC-USD_volume,LTC-USD_close,LTC-USD_volume,BCH-USD_close,BCH-USD_volume,ETH-USD_close,ETH-USD_volume,future,target
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1528968720,6487.379883,7.706374,96.660004,314.387024,870.859985,26.856577,486.01001,26.019083,96.389999,0
1528968780,6479.410156,3.088252,96.57,77.129799,870.099976,1.1243,486.0,8.4494,96.519997,0
1528968840,6479.410156,1.4041,96.5,7.216067,870.789978,1.749862,485.75,26.994646,96.440002,0
1528968900,6479.97998,0.753,96.389999,524.539978,870.0,1.6805,486.0,77.355759,96.470001,1
1528968960,6480.0,1.4909,96.519997,16.991997,869.98999,1.669014,486.0,7.5033,96.400002,0


In [10]:
# We would like to separate out our validation/out of sample data. In the past, all we did was shuffle data, then slice it.

# The problem with that method is, the data is inherently sequential, so taking sequences that don't come in the future 
# is likely a mistake. This is because sequences in our case, for example, 1 minute apart, will be almost identical.

#Chances are, the target is also going to be the same (buy or sell). Because of this, any overfitting is likely to actually 
# pour over into the validation set. 

#Instead, we want to slice our validation while it's still in order. I'd like to take the last 5% of the data. 

#To do that, we'll do:

times = sorted(main_df.index.values)  # get the times
last_5pct = sorted(main_df.index.values)[-int(0.05*len(times))]  # get the last 5% of the times

validation_main_df = main_df[(main_df.index >= last_5pct)]  # make the validation data where the index is in the last 5%
main_df = main_df[(main_df.index < last_5pct)]  # now the main_df is all the data up to the last 5%

In [11]:
# Now we scale the data

from sklearn import preprocessing 
from collections import deque
import numpy as np
import random

def preprocess_df(df):
    df = df.drop("future", 1)  # don't need this anymore.

    for col in df.columns:  # go through all of the columns
        if col != "target":  # normalize all ... except for the target itself!
            df[col] = df[col].pct_change()  # pct change "normalizes" the different currencies #(each crypto coin has vastly diff values, we're really more interested in the other coin's movements)
            
            df.dropna(inplace=True)  # remove the nas created by pct_change
            df[col] = preprocessing.scale(df[col].values)  # scale between 0 and 1.

    df.dropna(inplace=True)  # cleanup again... jic. Those nasty NaNs love to creep in.
    sequential_data = []  # this is a list that will CONTAIN the sequences
    prev_days = deque(maxlen=SEQ_LEN)  # These will be our actual sequences. They are made with deque, which keeps the maximum length by popping out older values as new ones come in

    for i in df.values:  # iterate over the values
        prev_days.append([n for n in i[:-1]])  # store all but the target
        if len(prev_days) == SEQ_LEN:  # make sure we have 60 sequences!
            sequential_data.append([np.array(prev_days), i[-1]])  

    random.shuffle(sequential_data)
    buys = []  # list that will store our buy sequences and targets
    sells = []  # list that will store our sell sequences and targets

    for seq, target in sequential_data:  # iterate over the sequential data
        if target == 0:  # if it's a "not buy"
            sells.append([seq, target])  # append to sells list
        elif target == 1:  # otherwise if the target is a 1...
            buys.append([seq, target])  # it's a buy!

    random.shuffle(buys)  # shuffle the buys
    random.shuffle(sells)  # shuffle the sells!

    lower = min(len(buys), len(sells))  # what's the shorter length?

    buys = buys[:lower]  # make sure both lists are only up to the shortest length.
    sells = sells[:lower]  # make sure both lists are only up to the shortest length.

    sequential_data = buys+sells  # add them together
    random.shuffle(sequential_data)  # another shuffle, so the model doesn't get confused with all 1 class then the other.

    X = []
    y = []

    for seq, target in sequential_data:  # going over our new sequential data
        X.append(seq)  # X is the sequences
        y.append(target)  # y is the targets/labels (buys vs sell/notbuy)

    return np.array(X), y  # return X and y...and make X a numpy array!

In [17]:
print(preprocess_df(main_df))

train_x,train_y=preprocess_df(main_df)
val_x,val_y=preprocess_df(validation_main_df)

(array([[[-3.97056472e-01, -8.15071020e-02,  1.12843043e+00, ...,
         -5.88426830e-03, -1.77821737e+00,  5.84725874e-02],
        [-3.91594204e-01, -8.34862303e-02, -2.76987715e-01, ...,
         -5.86462314e-03, -1.20668640e+00, -5.43553616e-02],
        [ 5.70520491e-01, -8.27031561e-02,  4.25139672e-01, ...,
         -7.58141757e-04,  5.80372911e-01, -2.71297701e-02],
        ...,
        [-6.84848871e-02, -6.00573336e-02,  2.85672309e-01, ...,
         -5.86827616e-03, -2.59036903e-02, -4.40408590e-02],
        [-5.65040428e-01, -4.85471667e-02, -1.37114354e-01, ...,
         -5.33350904e-03,  4.31807924e-03, -4.23170268e-02],
        [ 1.19646779e-01, -6.54154386e-02,  3.79535435e-03, ...,
         -5.83869180e-03, -1.46717582e-01, -4.93801319e-02]],

       [[ 2.61367316e+00,  1.63031060e-01,  1.70586639e+00, ...,
         -4.31145815e-03,  2.39484499e+00, -4.16253642e-02],
        [-8.72118909e-01, -7.03452509e-02, -8.04976830e-01, ...,
         -5.87619875e-03, -1.91871771

In [18]:
# Now we create the model

import time

EPOCHS = 10  # how many passes through our data
BATCH_SIZE = 64  # how many batches? Try smaller batch if you're getting OOM (out of memory) errors.
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"  # a unique name for the model

In [19]:
#ignoring the warnings since tensor flow will give  a lot of them
import warnings
warnings.filterwarnings("ignore")

#import tensorflow
import tensorflow as tf

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, CuDNNLSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import ModelCheckpoint

In [21]:
# creating the model

model = Sequential()
model.add(LSTM(128, input_shape=(train_x.shape[1:]), activation='relu', return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())  #normalizes activation outputs, same reason you want to normalize your input data.

model.add(LSTM(128 , activation='relu', return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

model.add(Dense(2, activation='softmax'))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [22]:
# Model compile settings:

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

# Compile model
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=opt,
    metrics=['accuracy']
)

In [25]:
# Train model
history = model.fit( train_x, train_y,batch_size=BATCH_SIZE, epochs=EPOCHS,validation_data=(val_x,val_y))
   
    
   


Train on 77922 samples, validate on 3860 samples
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
# Score model
score = model.evaluate(val_x,val_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
model.save("RNN_CRYPTO.model")

Test loss: 0.69082361793271
Test accuracy: 0.5266839
