**Cryptocurrency prediction using DNN models - one-hot encoding approach**

Import necessary libraries

In [0]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt

from keras import Sequential
from keras.layers import Dense, LSTM, CuDNNLSTM
from sklearn.preprocessing import MinMaxScaler, StandardScaler

Load the data from a CSV file into a pandas DataFrame. The closing price (the `close` column, used here as the bid value) is extracted as a single-column float array and then standardized with scikit-learn's `StandardScaler`.

In [0]:
def get_data(filename):
  """Read the price CSV and return the raw frame plus the standardized close prices.

  Column names are supplied explicitly as time, low, high, open, close
  and volume, in that order.

  Args:
    filename: Path of the CSV file to read.

  Returns:
    A tuple (df, data). df is the DataFrame as read; data is a
    single-column 2-D float array holding the 'close' values after
    StandardScaler.fit_transform.
  """
  column_names = ['time', 'low', 'high', 'open', 'close', 'volume']
  df = pd.read_csv(filename, names=column_names)
  # Position 4 is 'close'; the 4:5 slice keeps the result two-dimensional,
  # which is the layout the scaler is fed.
  close_prices = df.iloc[:, 4:5].astype('float').values
  data = StandardScaler().fit_transform(close_prices)
  return df, data

Peak detection based on a delta threshold. It determines the local minima and maxima of the cryptocurrency price series.

In [0]:
def peakdet(v, delta, x = None):
    """Find alternating local maxima and minima of a series.

    The scan starts by looking for a maximum. A candidate maximum is
    confirmed once a later sample is more than ``delta`` below it; the scan
    then looks for a minimum, which is confirmed once a later sample is more
    than ``delta`` above it, and so on.

    Args:
        v: Sequence of values. Multi-dimensional input (for example an
            (n, 1) column vector) is flattened before scanning.
        delta: Minimum reversal needed to confirm a peak.
        x: Optional positions, one per sample; ``x[i]`` is reported as the
            position of sample ``i``. Defaults to ``0 .. len(v) - 1``.

    Returns:
        A tuple ``(maxima, minima)`` of arrays with one ``(position, value)``
        row per confirmed peak. Both always have two columns, so an input
        without peaks yields arrays of shape (0, 2).
    """
    maximum = []
    minimum = []
    # Flatten so every sample is a scalar; otherwise an (n, 1) input would
    # store 1-element arrays as peak values and produce ragged rows.
    v = np.asarray(v).ravel()
    if x is None:
        x = np.arange(len(v))
    # Lower-case constants: the np.Inf / np.NaN aliases were removed in NumPy 2.0.
    min_val, max_val = np.inf, -np.inf
    min_pos, max_pos = np.nan, np.nan
    look_for_max = True
    for i, this in enumerate(v):
        # Track the running extremes since the last confirmed peak.
        if this > max_val:
            max_val = this
            max_pos = x[i]
        if this < min_val:
            min_val = this
            min_pos = x[i]

        if look_for_max:
            if this < max_val - delta:
                maximum.append((max_pos, max_val))
                # Restart the minimum search from the current sample.
                min_val = this
                min_pos = x[i]
                look_for_max = False
        else:
            if this > min_val + delta:
                minimum.append((min_pos, min_val))
                # Restart the maximum search from the current sample.
                max_val = this
                max_pos = x[i]
                look_for_max = True

    # reshape keeps the two-column layout even when a list is empty, so
    # callers can always index [:, 0] and [:, 1].
    return np.array(maximum).reshape(-1, 2), np.array(minimum).reshape(-1, 2)

The standardized price of the cryptocurrency is plotted, with the detected maxima (when to sell) and minima (when to buy) marked; every other point is a "wait".

In [0]:
def plot_peak(peaksmax, peaksmin, data):
  """Plot the series with the detected peaks overlaid and show the figure.

  Args:
    peaksmax: 2-D array indexed as [:, 0] for x positions and [:, 1] for
      y values; drawn as red dots labelled "Max peaks".
    peaksmin: Same layout; drawn as green dots labelled "Minimum peaks".
    data: Series drawn as a line labelled "Bid".
  """
  plt.figure(figsize=(21, 7))

  max_x, max_y = peaksmax[:, 0], peaksmax[:, 1]
  plt.plot(max_x, max_y, 'ro', label="Max peaks")

  min_x, min_y = peaksmin[:, 0], peaksmin[:, 1]
  plt.plot(min_x, min_y, 'go', label="Minimum peaks")

  plt.plot(data, label="Bid")

  plt.title("Peaks detection")
  plt.grid()
  plt.legend()
  plt.show()

The `low`, `high`, `open` and `close` columns of the dataframe are each standardized. The `time` column is dropped, and `volume` is carried over unscaled.

In [0]:
def get_NDF(df):
  """Return a frame without 'time' whose four price columns are standardized.

  The input frame is not modified: the result is built from
  ``df.drop(...)``. Only 'low', 'high', 'open' and 'close' are rewritten;
  any other remaining column is carried over unchanged.

  Args:
    df: Frame whose columns at positions 1..4 hold the low, high, open and
      close values (position 0 being 'time').

  Returns:
    The new DataFrame with the standardized price columns.
  """
  NDF = df.drop(columns=['time'])
  scaler = StandardScaler()
  # Positions index the ORIGINAL frame, which still has 'time' at position 0.
  # The scaler is refit for every column, so each one is scaled independently.
  for position, column in enumerate(['low', 'high', 'open', 'close'], start=1):
    raw_values = df.iloc[:, position:position + 1].astype('float').values
    NDF[column] = scaler.fit_transform(raw_values)
  return NDF

Three one-hot label columns are added to the dataset: `wait`, `sell` and `buy`. With these labels, the prediction task becomes a classification problem instead of a regression problem.

In [0]:
def setSell(idx, peaksmax, peaksmin):
    """Return 1.0 when idx is the position of a maximum peak, otherwise 0.0.

    Args:
        idx: Position to classify.
        peaksmax: Iterable of peaks whose first element is the position.
        peaksmin: Unused; kept so the signature matches setWait and setBuy.
            A minimum at idx never changes the result (it is 0.0 either
            way), so the minima are no longer scanned.

    Returns:
        1.0 for a "sell" position, 0.0 otherwise.
    """
    is_maximum = any(peak[0] == idx for peak in peaksmax)
    return 1.0 if is_maximum else 0.0

def setWait(idx, peaksmax, peaksmin):
    """Return 1.0 when idx is neither a maximum nor a minimum position, else 0.0.

    Args:
        idx: Position to classify.
        peaksmax: Iterable of peaks whose first element is the position.
        peaksmin: Iterable of peaks whose first element is the position.

    Returns:
        1.0 for a "wait" position, 0.0 when idx matches any peak.
    """
    # Maxima are scanned first, then minima; either match means "not wait".
    is_peak = (
        any(peak[0] == idx for peak in peaksmax)
        or any(peak[0] == idx for peak in peaksmin)
    )
    return 0.0 if is_peak else 1.0

def setBuy(idx, peaksmax, peaksmin):
    """Return 1.0 when idx is the position of a minimum peak, otherwise 0.0.

    A position that appears among the maxima is never a "buy", even if it
    also appears among the minima.

    Args:
        idx: Position to classify.
        peaksmax: Iterable of peaks whose first element is the position.
        peaksmin: Iterable of peaks whose first element is the position.

    Returns:
        1.0 for a "buy" position, 0.0 otherwise.
    """
    # Maxima take precedence, so rule them out before looking at minima.
    if any(peak[0] == idx for peak in peaksmax):
        return 0.0
    is_minimum = any(peak[0] == idx for peak in peaksmin)
    return 1.0 if is_minimum else 0.0

In [0]:
def frame_labelization(frame_base, maxp, minp, data):
    """Add one-hot "wait", "sell" and "buy" label columns to frame_base.

    Row number ``d`` (0 .. len(data) - 1) is labelled "sell" when ``d`` is
    the position of a maximum, "buy" when it is the position of a minimum
    (and not of a maximum), and "wait" otherwise.

    Args:
        frame_base: DataFrame that receives the three columns. It is
            modified in place and also returned.
        maxp: Iterable of maximum peaks whose first element is the position.
        minp: Iterable of minimum peaks whose first element is the position.
        data: Only its length is used; it sets how many rows are labelled.

    Returns:
        frame_base, with the "wait", "sell" and "buy" columns added.
    """
    # Collect peak positions once so each row is an O(1) set lookup instead
    # of three full scans over both peak lists.
    max_positions = {peak[0] for peak in maxp}
    min_positions = {peak[0] for peak in minp}

    wait, sell, buy = [], [], []
    for row in range(len(data)):
        is_max = row in max_positions
        # A maximum takes precedence over a minimum at the same position.
        is_min = (not is_max) and row in min_positions
        sell.append(1.0 if is_max else 0.0)
        buy.append(1.0 if is_min else 0.0)
        wait.append(0.0 if (is_max or is_min) else 1.0)

    frame_base["wait"] = wait
    frame_base["sell"] = sell
    frame_base["buy"] = buy
    return frame_base

The dataset is cut into sliding windows and then split into training and testing data, with 70% and 30% of the windows respectively.

In [0]:
def generate_data(dataset, timestep, xcols, ycols):
    """Cut the frame into sliding windows with one label row per window.

    Window ``i`` covers rows ``i .. i + timestep - 1`` of ``xcols``; its
    label is the ``ycols`` values of the window's last row.

    Args:
        dataset: DataFrame holding both feature and label columns.
        timestep: Number of consecutive rows per window.
        xcols: Feature column name(s).
        ycols: Label column name(s).

    Returns:
        A tuple ``(dx, dy)`` of arrays. With list-valued ``xcols`` and
        ``ycols``, dx has shape (n_windows, timestep, len(xcols)) and dy
        has shape (n_windows, len(ycols)). Both are empty when the frame
        has no more than ``timestep`` rows.
    """
    # Select the columns once, outside the loop: slicing plain arrays is far
    # cheaper than a pandas iloc plus column lookup for every window.
    features = dataset[xcols].values
    labels = dataset[ycols].values

    # NOTE(review): this emits len(dataset) - timestep windows, so the window
    # ending on the very last row is not produced. Kept as-is so the number
    # of samples does not change.
    n_windows = len(dataset) - timestep
    dx = [features[start:start + timestep] for start in range(n_windows)]
    dy = [labels[start + timestep - 1] for start in range(n_windows)]
    return np.array(dx), np.array(dy)

def split_train(frame, train_per, test_per, timestep, xcols, ycols, balance=False):
    """Window the frame and split the windows into consecutive train and test parts.

    Args:
        frame: DataFrame passed to generate_data.
        train_per: Fraction of the windows used for training.
        test_per: Fraction of the windows used for testing; the test part
            starts right after the training part.
        timestep: Window length passed to generate_data.
        xcols: Feature column names.
        ycols: Label column names.
        balance: When exactly True, the windows are first passed through
            balancelabelisation.

    Returns:
        A tuple (x_train, y_train, x_test, y_test).
    """
    windows, labels = generate_data(frame, timestep, xcols, ycols)
    if balance is True:
        windows, labels = balancelabelisation(windows, labels)

    total = windows.shape[0]
    train_end = int(total * train_per)
    test_end = train_end + int(total * test_per)

    # The split is positional: the first block trains, the next block tests.
    train_part = slice(None, train_end)
    test_part = slice(train_end, test_end)
    return (
        windows[train_part],
        labels[train_part],
        windows[test_part],
        labels[test_part],
    )
  
def balancelabelisation(frame, label):
    """Randomly drop "wait" samples until they match the rarer of buy and sell.

    The label columns are read positionally: column 0 is counted as "buy",
    column 1 as "sell" and column 2 as "wait". Callers must pass one-hot
    labels in that order.

    Args:
        frame: Array of samples; rows are removed along axis 0.
        label: 2-D one-hot label array aligned with frame.

    Returns:
        A tuple (final_frame, final_label) of new arrays with
        ``wait - min(sell, buy)`` randomly chosen "wait" rows removed. When
        "wait" is already no more frequent than the rarer of the other two
        classes, unchanged copies are returned.
    """
    buy = int(label[:, 0].sum())
    sell = int(label[:, 1].sum())
    wait = int(label[:, 2].sum())
    need_delete = wait - min(sell, buy)
    if need_delete <= 0:
        # Nothing to trim. Without this guard a negative count is passed to
        # np.random.choice as the sample size, which raises ValueError.
        return np.array(frame), np.array(label)

    wait_rows = np.flatnonzero(label[:, 2] == 1)
    # Sample without replacement so exactly need_delete distinct rows go.
    rand_delete = np.random.choice(wait_rows, need_delete, replace=False)
    final_frame = np.delete(frame, rand_delete, axis=0)
    final_label = np.delete(label, rand_delete, axis=0)
    return final_frame, final_label

Model: a simple three-layer network made of two CuDNNLSTM layers followed by a dense softmax output layer. CuDNNLSTM must be run on a GPU and does not support dropout. The model is compiled with the Adam optimizer, mean squared error loss and the accuracy metric.

In [0]:
def get_model(x_train, y_train):
  """Build, compile and summarize the classifier.

  The network is two 256-unit CuDNNLSTM layers followed by a softmax Dense
  layer. It is compiled with the "adam" optimizer, "mse" loss and the
  accuracy metric, and its summary is printed.

  Args:
    x_train: Training windows; dimensions 1 and 2 of its shape give the
      input shape of the first layer.
    y_train: Training labels; dimension 1 of its shape gives the number of
      output units.

  Returns:
    The compiled Sequential model.
  """
  window_shape = (x_train.shape[1], x_train.shape[2])
  n_outputs = y_train.shape[1]

  layers = [
    # return_sequences=True passes the full sequence on to the second LSTM.
    CuDNNLSTM(256, input_shape=window_shape, return_sequences=True),
    CuDNNLSTM(256),
    Dense(n_outputs, activation='softmax'),
  ]
  model = Sequential(layers)

  model.compile(optimizer="adam", loss="mse", metrics=['accuracy'])
  model.summary()
  return model

The model is fit with train data with a validation split of 10%. It is then evaluated on the test dataset and the final loss and accuracy are displayed.

In [0]:
def evaluate_model(model, x_train, y_train, x_test, y_test,
                   epochs=3, batch_size=16, validation_split=0.1):
  """Fit the model on the training data, evaluate it on the test data and print the result.

  Args:
    model: Object exposing ``fit`` and ``evaluate``, such as the model
      built by get_model.
    x_train: Training inputs.
    y_train: Training labels.
    x_test: Test inputs.
    y_test: Test labels.
    epochs: Number of training epochs (default 3).
    batch_size: Batch size used for training (default 16).
    validation_split: Fraction of the training data passed to ``fit`` as
      validation split (default 0.1).

  Returns:
    A tuple (history, score, acc): the object returned by ``model.fit`` and
    the two values returned by ``model.evaluate``. Earlier versions
    returned nothing, so existing callers that ignore the result are
    unaffected.
  """
  history = model.fit(x_train, y_train,
                      epochs=epochs,
                      batch_size=batch_size,
                      validation_split=validation_split,
                      verbose=1)

  # evaluate is expected to yield exactly two values, printed as score and acc.
  score, acc = model.evaluate(x_test, y_test)

  print("score: {}".format(score))
  print("acc: {}".format(acc))
  return history, score, acc
  


Main function

In [0]:
def main():
  """Run the whole pipeline: load prices, label peaks, build windows, train and evaluate."""
  df, data = get_data('LTC-USD.csv')
  # get_data returns a single-column 2-D array; flatten it so peak detection
  # compares scalar prices and stores scalar peak values.
  prices = data.ravel()
  peaksmax, peaksmin = peakdet(prices, 0.0001)
  plot_peak(peaksmax, peaksmin, prices)

  # Only the first 1000 rows are used for training. The slice is copied
  # because frame_labelization adds columns to it in place, and writing into
  # an uncopied slice triggers pandas' SettingWithCopyWarning.
  NDF = get_NDF(df)[:1000].copy()
  df = df[:1000]
  normalize_data_tmp = frame_labelization(NDF, peaksmax, peaksmin, df)

  labelisation_features_name = ["low", "high", "open", "close", "volume"]
  # The order matters: balancelabelisation reads label column 0 as "buy",
  # column 1 as "sell" and column 2 as "wait". With "wait" listed first it
  # counted and trimmed the wrong class.
  labelisation_labels_name = ["buy", "sell", "wait"]

  x_train, y_train, x_test, y_test = split_train(normalize_data_tmp, 0.7, 0.3, 50,
                                                 labelisation_features_name,
                                                 labelisation_labels_name,
                                                 balance=True)

  model = get_model(x_train, y_train)
  evaluate_model(model, x_train, y_train, x_test, y_test)


In [0]:
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
  main()