In [2]:
# Take cryptocurrency and track their price and volume over a period of time.
# Last 60 minutes of data
# We are going to predict whether in next 3 minutes, Litecon price will drop or rise.

# You can take examples of stock market, traffic on websites at time of day, servers are going to heat up or not.

In [55]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from collections import deque
import random
import time

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization #, CuDNNLSTM (If you have GPU version)
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint

# Cuda LSTM is 5 times faster. but this is also fine to use for nwo.
# Cuda LSTM use tanh activation function.
# BatchNormalization - Layer normalization.
# ModelCheckpoint - save the checkpoint

# Dataset stored at - G:\LEARNING\PythonLearning\datasets\crypto_data\crypto_data
# Columns - Timestamp, low, high, open, close (end at 60 sec interval), volumn

# This is all sequential data.
# For any supervised machine learning problem, We need sequences themselves and targets
# Targets - Starting constants

SEQ_LEN = 60 #60 MINS OF PRICING
FUTURE_PERIOD_PREDICT = 3 # Predict for next 3 minutes
RATIO_TO_PREDICT = "LTC-USD"
EPOCHS = 10
BATCH_SIZE = 64
NAME = f"{RATIO_TO_PREDICT}-{SEQ_LEN}-SEQ-{FUTURE_PERIOD_PREDICT}-PRED-{int(time.time())}"
# Create targets
def classify(current, future):
    if float(future) > float(current):
        return 1 # means buy. price goes up
    else:
        return 0 # mean sell or dont buy.

def preprocess_df(df):
    df = df.drop("future" , 1) # we dont need this column now. we needed this to create targets only.
    # scaling apart from target
    for col in df.columns:
        if col != "target":
            df[col] = df[col].pct_change() # pct change helps to normalizes the data.
            df.dropna(inplace = True)
            df[col] = preprocessing.scale( df[col].values ) # scale values between 0 and 1 and you can do in a different way.
    
    df.dropna(inplace=True)

    sequential_data = []
    prev_days = deque( maxlen=SEQ_LEN )
    # Deque - think it as list with max lenght as specified. It keeps appending the items, as it reaches limit, it pops out old items.

    # print(df.head())

    # so the sequences will be sequences of prices and volumes but not the targets

    for i in df.values: # each values in rows of columns
        
        prev_days.append( [ n for n in i[:-1] ] ) # not taking target
        if len(prev_days) == SEQ_LEN: # this what we require
            sequential_data.append( [ np.array(prev_days), i[-1] ] ) # append features and current labels
        
    random.shuffle(sequential_data)
    
    # We need to balance the data. we need to have as many buys and as many sells.
    buys=[]
    sells=[]
    
    for seq, target in sequential_data:
        if target == 0:
            sells.append( [seq, target] )
        elif target == 1:
            buys.append( [seq, target] )
    
    random.shuffle(buys)
    random.shuffle(sells)
    
    lower = min( len(buys), len(sells) )
    
    # balancing upto lower
    buys = buys[:lower]
    sells = sells[:lower]
    
    sequential_data = buys + sells # balanced data
    random.shuffle( sequential_data ) # right now its all buys and then sells so its definitely going to confuse the model. We need to shuffle
    
    # Now we need to split between features and targets
    
    X = []
    y = []
    
    for seq, target in sequential_data:
        X.append(seq)
        y.append(target)
    
    return np.array(X), y
    
main_df = pd.DataFrame()

ratios = ["BTC-USD", "LTC-USD", "ETH-USD", "BCH-USD"]
for ratio in ratios:
    dataset = f"G:\LEARNING\PythonLearning\datasets\crypto_data\crypto_data\{ratio}.csv"
    
    df = pd.read_csv(dataset, names=["time", "low", "high", "open", "close", "volume"])
    df.rename(columns={"close":f"{ratio}_close", "volume":f"{ratio}_volume"}, inplace=True)
    df.set_index("time", inplace=True)
    df = df[[ f"{ratio}_close", f"{ratio}_volume" ]]
    
    if len(main_df) == 0:
        main_df = df
    else:
        main_df = main_df.join(df)

# we need to get future price
main_df["future"] = main_df[ f"{RATIO_TO_PREDICT}_close" ].shift(-FUTURE_PERIOD_PREDICT)
main_df["target"] = list( map( classify, main_df[f"{RATIO_TO_PREDICT}_close"], main_df["future"] ) )

#print( main_df[[f"{RATIO_TO_PREDICT}_close", "future", "target"]].head() )

# For timeseries data we are going to take last 5% of historical data and separate that out as a out-of-sample data.

times = sorted(main_df.index.values) # this will return numpy array
# Need to find out whats the actual threshold which separates out 5% in unix timestamp.
last_5pct = times[ -int(0.05*len(times)) ]

#print(last_5pct) #1534922100

validation_main_df = main_df[ (main_df.index >= last_5pct) ]
main_df = main_df[ (main_df.index < last_5pct) ]

#preprocess_df(main_df)

# Create sequences and scaling of data.
train_x, train_y = preprocess_df(main_df)
validation_x, validation_y = preprocess_df(validation_main_df)

print( f"train data: {len(train_x)} validation: {len(validation_x)}" )
print( f"Dont buys: {train_y.count(0)} , buys: {train_y.count(1)}" )
print( f"Validation Dont buys: {validation_y.count(0)} , buys: {validation_y.count(1)}" )

model = Sequential()
model.add(LSTM(128, input_shape=(train_x.shape[1:]), activation="relu", return_sequences=True))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add(LSTM(128, input_shape=(train_x.shape[1:]), activation="relu", return_sequences=True))
model.add(Dropout(0.1))
model.add(BatchNormalization())

model.add(LSTM(128, input_shape=(train_x.shape[1:]), activation="relu"))
model.add(Dropout(0.2))
model.add(BatchNormalization())

model.add( Dense(32, activation="relu") )
model.add(Dropout(0.2))

model.add( Dense(2, activation="softmax") )

opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)

model.compile( loss="sparse_categorical_crossentropy", optimizer = opt, metrics=["accuracy"] )

tensorboard = TensorBoard(log_dir=f"logs\{NAME}")

filepath = "RNN_Final-{epoch:02d}-{val_acc:.3f}"
checkpoint = ModelCheckpoint( "models\{}.model".format(filepath, monitor="val_acc", verbose=1, save_best_only=True, mode="max") )

train_y = np.asarray(train_y)
validation_y = np.asarray(validation_y)

history = model.fit(
                    train_x, train_y, 
                    batch_size = BATCH_SIZE, 
                    epochs = EPOCHS, 
                    validation_data=(validation_x, validation_y),
                    callbacks=[tensorboard, checkpoint],
                    )

# Score model
score = model.evaluate(validation_x, validation_y, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])
# Save model
model.save("models/{}".format(NAME))

train data: 69188 validation: 3062
Dont buys: 34594 , buys: 34594
Validation Dont buys: 1531 , buys: 1531
Train on 69188 samples, validate on 3062 samples
Epoch 1/10

KeyError: 'val_acc'