In [9]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from collections import deque
import random
import time
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

In [2]:
# Three constants drive the pipeline: SEQ_LEN is how many minutes of past data the model sees
# for each prediction (here 60), FUTURE_PERIOD_PREDICT is how many minutes ahead we predict
# (here 3), and RATIO_TO_PREDICT is the crypto pair whose price movement we want to forecast.

SEQ_LEN = 60  # number of consecutive rows in each input sequence
FUTURE_PERIOD_PREDICT = 3  # how many rows ahead the "future" price is taken from
RATIO_TO_PREDICT = "LTC-USD"  # pair whose close price defines the target

EPOCHS = 2  # passes over the training data
BATCH_SIZE = 64  # samples per gradient update
# Run name used for the TensorBoard log directory; the timestamp keeps separate runs apart.
NAME = f"{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"

In [3]:
def classify(current, future):
    """Return 1 when the future price is strictly above the current one, else 0.

    Both arguments are passed through float() before the comparison, so
    numeric strings are accepted. Equal prices, and any comparison that
    involves NaN, yield 0.
    """
    return 1 if float(future) > float(current) else 0

In [4]:
# preprocess_df drops the 'future' column (it is only needed to build the target), converts every remaining feature column to percent change and scales it,
# dropping NaN rows along the way. It then builds sequential_data: each item is a sequence of SEQ_LEN consecutive rows paired with the target of the last row in that sequence.
# Finally it balances the data using the buys and sells lists, shuffles it, and splits it into X (the input sequences) and y (the targets).

def preprocess_df(df):
    """Turn a merged price frame into balanced, shuffled training sequences.

    Steps:
      1. Drop the helper 'future' column.
      2. Replace every feature column with its row-to-row percent change,
         drop the rows that became NaN, then standardise the column with
         ``preprocessing.scale``.
      3. Slide a window of SEQ_LEN consecutive rows over the frame; every
         full window is paired with the target of the window's last row.
      4. Balance the classes by truncating the larger of the target==1
         ("buys") and target==0 ("sells") groups, then shuffle.

    Args:
        df: DataFrame with a 'future' column and a 'target' column. After
            'future' is removed, 'target' must be the LAST column, because
            features and label are split by position. The caller's frame is
            not modified (``drop`` returns a new frame).

    Returns:
        tuple: ``(X, y)`` where ``X`` is a numpy array stacking the selected
        sequences and ``y`` is a plain Python list with the matching targets.
    """
    df = df.drop('future', axis=1)
    for col in df.columns:
        if col != "target":
            df[col] = df[col].pct_change()
            # pct_change yields +/-inf when the previous value is 0 (for
            # example a zero-volume row). dropna() does not remove inf and
            # preprocessing.scale refuses non-finite input, so treat inf as
            # missing and let the dropna below discard those rows.
            df[col] = df[col].replace([np.inf, -np.inf], np.nan)
            df.dropna(inplace=True)
            df[col] = preprocessing.scale(df[col].values)
    df.dropna(inplace=True)

    # Rolling window: the deque discards the oldest row once SEQ_LEN is reached.
    sequential_data = []
    prev_days = deque(maxlen=SEQ_LEN)
    for row in df.values:
        prev_days.append(list(row[:-1]))  # every column except the trailing target
        if len(prev_days) == SEQ_LEN:
            sequential_data.append([np.array(prev_days), row[-1]])
    random.shuffle(sequential_data)

    # Split by class so both classes can be cut to the same size.
    buys = [[seq, target] for seq, target in sequential_data if target == 1]
    sells = [[seq, target] for seq, target in sequential_data if target == 0]
    random.shuffle(buys)
    random.shuffle(sells)

    lower = min(len(buys), len(sells))
    sequential_data = buys[:lower] + sells[:lower]
    # Shuffle again, otherwise all buys would precede all sells.
    random.shuffle(sequential_data)

    X = [seq for seq, _ in sequential_data]
    y = [target for _, target in sequential_data]
    return np.array(X), y

In [5]:
# Here we read each pair's .csv, naming its columns via the `names` parameter,
# rename the close and volume columns so they carry the pair's name,
# set "time" as the index so the frames can be joined, keep only close and volume
# (the other columns are not needed), and join everything into one dataframe, main_df.

# Directory holding the per-pair CSV files. It defaults to the original location;
# set the CRYPTO_DATA_DIR environment variable to read the files from elsewhere.
DATA_DIR = os.environ.get("CRYPTO_DATA_DIR", "D:/crypto_data")

main_df = pd.DataFrame()
ratios = ["BTC-USD","LTC-USD","ETH-USD","BCH-USD"]
for ratio in ratios:
    dataset = f"{DATA_DIR}/{ratio}.csv"
    # The CSV columns are named via `names`. Only close and volume are kept,
    # prefixed with the pair so the columns stay distinct after the join.
    df = (
        pd.read_csv(dataset, names=["time", "low","high","open","close","volume"])
        .rename(columns={"close":f"{ratio}_close", "volume":f"{ratio}_volume"})
        .set_index("time")
    )
    df = df[[f"{ratio}_close", f"{ratio}_volume"]]
    # The first frame seeds main_df; later frames are joined on the "time" index
    # (DataFrame.join defaults to a left join, so the first pair's timestamps win).
    if len(main_df)==0:
        main_df = df
    else:
        main_df = main_df.join(df)

print(main_df.head())

            BTC-USD_close  BTC-USD_volume  LTC-USD_close  LTC-USD_volume  \
time                                                                       
1528968660    6489.549805        0.587100      96.580002        9.647200   
1528968720    6487.379883        7.706374      96.660004      314.387024   
1528968780    6479.410156        3.088252      96.570000       77.129799   
1528968840    6479.410156        1.404100      96.500000        7.216067   
1528968900    6479.979980        0.753000      96.389999      524.539978   

            ETH-USD_close  ETH-USD_volume  BCH-USD_close  BCH-USD_volume  
time                                                                      
1528968660            NaN             NaN     871.719971        5.675361  
1528968720      486.01001       26.019083     870.859985       26.856577  
1528968780      486.00000        8.449400     870.099976        1.124300  
1528968840      485.75000       26.994646     870.789978        1.749862  
1528968900      4

In [6]:
# Here we add a 'future' column and derive from it the 'target' column that the model will predict.
# .shift(-3) moves the column up by 3 rows, so each row's 'future' holds the close price from 3 rows (minutes) later.
# map applies classify to each (current close, future) pair, giving 1 when the price went up and 0 otherwise.

close_col = f"{RATIO_TO_PREDICT}_close"
# 'future' is the close price FUTURE_PERIOD_PREDICT rows later.
main_df['future'] = main_df[close_col].shift(-FUTURE_PERIOD_PREDICT)
main_df['target'] = list(map(classify, main_df[close_col], main_df["future"]))
# Rows with no look-ahead price (at least the last FUTURE_PERIOD_PREDICT rows)
# have a NaN 'future'. classify() evaluates any NaN comparison as False, so those
# rows would be labelled 0 with no evidence behind the label. Drop them so every
# target is backed by a real future price. Note that re-running this cell trims
# the tail again, so run it once per fresh main_df.
main_df = main_df[main_df['future'].notna()].copy()
print(main_df[[close_col, "future", "target"]].head(10))

            LTC-USD_close     future  target
time                                        
1528968660      96.580002  96.500000       0
1528968720      96.660004  96.389999       0
1528968780      96.570000  96.519997       0
1528968840      96.500000  96.440002       0
1528968900      96.389999  96.470001       1
1528968960      96.519997  96.400002       0
1528969020      96.440002  96.400002       0
1528969080      96.470001  96.400002       0
1528969140      96.400002  96.400002       0
1528969200      96.400002  96.400002       0


In [7]:
# Here we sort the timestamps, hold out the most recent 5% of rows as validation data, preprocess the training and validation sets separately, and print their sizes and class counts.

# Timestamp that marks the start of the final 5% of rows; everything from it
# onwards is validation data, everything before it stays in main_df for training.
times = sorted(main_df.index.values)
last_5pct = times[-int(0.05*len(times))]

is_validation_row = main_df.index >= last_5pct
is_training_row = main_df.index < last_5pct
validation_main_df = main_df[is_validation_row]
main_df = main_df[is_training_row]

# Each split is preprocessed on its own; y comes back as a plain list,
# which is why .count() can be used in the summary below.
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

print(f"train data : {len(train_x)}, validation {len(validation_x)}")
print(f"don't buys : {train_y.count(0)}, buys {train_y.count(1)}")
print(f"validation don't buys : {validation_y.count(0)}, validation buys {validation_y.count(1)}")

# Convert everything to numpy arrays for model.fit.
train_x, train_y = np.array(train_x), np.array(train_y)
validation_x, validation_y = np.array(validation_x), np.array(validation_y)

train data : 69188, validation 3062
don't buys : 34594, buys 34594
validation don't buys : 1531, validation buys 1531


In [10]:
# Three stacked LSTM blocks (each followed by dropout and batch normalisation),
# a small dense layer, and a 2-way softmax over the two target classes.
model = Sequential()
# Only the first layer declares input_shape; in a Sequential model the later
# layers take their input shape from the previous layer's output.
model.add(LSTM(128, activation="tanh", input_shape=(train_x.shape[1:]), return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, activation="tanh", return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

# The last LSTM returns only its final output, as the dense layers expect.
model.add(LSTM(128, activation="tanh"))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(Dense(32, activation="relu"))
model.add(Dropout(0.2))

model.add(Dense(2, activation="softmax"))

opt=tf.keras.optimizers.Adam(learning_rate=0.001)

# Targets are integer class ids (0/1), hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', 
              optimizer = opt,
              metrics=['accuracy'])

tensorboard = TensorBoard(log_dir=f'logs/{NAME}')

# Specify the directory name for model checkpoints
checkpoint_dir = "model_checkpoints"

# Create the directory if it doesn't exist (exist_ok avoids a separate existence check)
os.makedirs(checkpoint_dir, exist_ok=True)

# Define the filepath for model checkpoints; Keras fills in the epoch number and
# the validation accuracy when it saves.
filepath = os.path.join(checkpoint_dir, "RNN_Final-{epoch:02d}-{val_accuracy:.3f}.model")

# Save a checkpoint only when validation accuracy improves on the best seen so far.
checkpoint = ModelCheckpoint(filepath, monitor="val_accuracy", verbose=1, save_best_only=True, mode="max")

history=model.fit(
    train_x, train_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(validation_x, validation_y),
    callbacks=[tensorboard, checkpoint]
)

Epoch 1/2
Epoch 1: val_accuracy improved from -inf to 0.53919, saving model to model_checkpoints\RNN_Final-01-0.539.model




INFO:tensorflow:Assets written to: model_checkpoints\RNN_Final-01-0.539.model\assets


INFO:tensorflow:Assets written to: model_checkpoints\RNN_Final-01-0.539.model\assets


Epoch 2/2
Epoch 2: val_accuracy improved from 0.53919 to 0.56172, saving model to model_checkpoints\RNN_Final-02-0.562.model




INFO:tensorflow:Assets written to: model_checkpoints\RNN_Final-02-0.562.model\assets


INFO:tensorflow:Assets written to: model_checkpoints\RNN_Final-02-0.562.model\assets


