In [1]:
import os
import numpy as np
import pandas as pd
import pickle
import quandl
from datetime import datetime
import tensorflow as tf;
from six.moves import cPickle as pickle;
from sklearn.preprocessing import MinMaxScaler
import plotly.offline as py
import plotly.graph_objs as go
import plotly.figure_factory as ff
import matplotlib.pyplot as plt
from pandas import read_csv, DataFrame
import math
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.callbacks import Callback
from sklearn.preprocessing import MinMaxScaler
from matplotlib import pyplot as plt
%matplotlib inline
py.init_notebook_mode(connected=True)

Using TensorFlow backend.


In [10]:
def get_quandl_data(quandl_id):
    '''Download and cache Quandl dataseries'''
    cache_path = '{}.pkl'.format(quandl_id).replace('/','-')
    try:
        f = open(cache_path, 'rb')
        df = pickle.load(f)   
        print('Loaded {} from cache'.format(quandl_id))
    except (OSError, IOError) as e:
        print('Downloading {} from Quandl'.format(quandl_id))
        df = quandl.get(quandl_id, authtoken="rGwyAH1yyw29yX8E1LQJ", returns="pandas")
        df.to_pickle(cache_path)
        print('Cached {} at {}'.format(quandl_id, cache_path))
    return df
    
def merge_dfs_on_column(dataframes, labels, col):
    '''Merge a single column of each dataframe into a new combined dataframe'''
    series_dict = {}
    for index in range(len(dataframes)):
        series_dict[labels[index]] = dataframes[index][col]
        
    return pd.DataFrame(series_dict)

In [11]:
exchanges = ['KRAKEN', 'COINBASE', 'BITFINEX']

exchange_data = {}

for exchange in exchanges:
    exchange_code = 'BCHARTS/{}USD'.format(exchange)
    btc_exchange_df = get_quandl_data(exchange_code)
    btc_exchange_df = btc_exchange_df.replace(0, np.NaN) # Wack Kraken values ˜
    exchange_data[exchange] = btc_exchange_df

print(exchange_data)

btc_usd_datasets = merge_dfs_on_column(list(exchange_data.values()), list(exchange_data.keys()), 'Weighted Price')
print(list(exchange_data['COINBASE'].columns))

Downloading BCHARTS/KRAKENUSD from Quandl
Cached BCHARTS/KRAKENUSD at BCHARTS-KRAKENUSD.pkl
Downloading BCHARTS/COINBASEUSD from Quandl
Cached BCHARTS/COINBASEUSD at BCHARTS-COINBASEUSD.pkl
Downloading BCHARTS/BITFINEXUSD from Quandl
Cached BCHARTS/BITFINEXUSD at BCHARTS-BITFINEXUSD.pkl
{'KRAKEN':                   Open        High         Low       Close  Volume (BTC)  \
Date                                                                       
2014-01-07   874.67040   892.06753   810.00000   810.00000     15.622378   
2014-01-08   810.00000   899.84281   788.00000   824.98287     19.182756   
2014-01-09   825.56345   870.00000   807.42084   841.86934      8.158335   
2014-01-10   839.99000   857.34056   817.00000   857.33056      8.024510   
2014-01-11   858.20000   918.05471   857.16554   899.84105     18.748285   
2014-01-12   899.96114   900.93989   833.00001   860.00000     25.429433   
2014-01-13   847.32152   859.99999   815.00000   835.00000     25.869127   
2014-01-14   835.

In [4]:
# Merging data together to create one consistent set 
# Basically does a left join on all three data sets and gets the mean of all values
# Prevents weird discrepencies in the data 

new = pd.merge(exchange_data['KRAKEN'], exchange_data['COINBASE'], how='outer', left_index=True, right_index=True)
new = pd.merge(new, exchange_data['BITFINEX'], how='outer', left_index=True, right_index=True)
new['new_open'] = new[['Open', 'Open_x', 'Open_y']].mean(axis=1)
new['new_high'] = new[['High_x', 'High_y', 'High']].mean(axis=1)
new['new_low'] = new[['Low_x', 'Low_y', 'Low']].mean(axis=1)
new['new_close'] = new[['Close_x', 'Close_y', 'Close']].mean(axis=1)
new['new_btc_volume'] = new[['Volume (BTC)_x', 'Volume (BTC)_y', 'Volume (BTC)']].mean(axis=1)
new['new_currency_volume'] = new[['Volume (Currency)_x', 'Volume (Currency)_y', 'Volume (Currency)']].mean(axis=1)
new['new_weighted_price'] = new[['Weighted Price_x', 'Weighted Price_y', 'Weighted Price']].mean(axis=1)

df = new[['new_open', 'new_high', 'new_low', 'new_close', 'new_btc_volume', 'new_currency_volume', 'new_weighted_price']]

print(df.head())

              new_open    new_high    new_low   new_close  new_btc_volume  \
Date                                                                        
2013-03-31   93.250000  100.000000   93.03000   93.100000      390.827224   
2013-04-01   93.170000  105.900000   92.49999  102.370000     4919.654127   
2013-04-02  102.800000  118.388067   99.00000  117.989990     9084.832816   
2013-04-03  116.579097  146.880000  101.51088  134.952969    12909.402178   
2013-04-04  131.779686  143.000000  119.00000  132.681000     6910.100414   

            new_currency_volume  new_weighted_price  
Date                                                 
2013-03-31         3.708933e+04           94.899563  
2013-04-01         4.921010e+05          100.027564  
2013-04-02         9.874119e+05          108.687953  
2013-04-03         1.652092e+06          127.975861  
2013-04-04         9.094385e+05          131.610029  


In [5]:
btc_trace = go.Scatter(x=df.index, y=df['new_weighted_price'])
py.iplot([btc_trace])

In [6]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
    """
    Frame a time series as a supervised learning dataset.
    Arguments:
        data: Sequence of observations as a list or NumPy array.
        n_in: Number of lag observations as input (X).
        n_out: Number of observations as output (y).
        dropnan: Boolean whether or not to drop rows with NaN values.
    Returns:
        Pandas DataFrame of series framed for supervised learning.
    """
    n_vars = 1 if type(data) is list else data.shape[1]
    df = pd.DataFrame(data)
    cols, names = list(), list()
    
    # input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [('time_%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
        
    # forecast sequence (t, t+1, ... t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('time_%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('time_%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
            
    # put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names
    # drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)
    return agg

def create_rnn_set(data, look_back=1):
    df = pd.DataFrame(data)
    columns = [df.shift(i) for i in range(1, look_back + 1)]
    columns.append(df)
    df = pd.concat(columns, axis=1)
    df.fillna(0, inplace=True)
    return df

def create_dataset(dataset, look_back=1):
    dataX, dataY = [], []
    for i in range(len(dataset) - look_back):
        a = dataset[i:(i + look_back), 0]
        dataX.append(a)
        dataY.append(dataset[i + look_back, 0])
    print(len(dataY))
    return np.array(dataX), np.array(dataY)

In [129]:
train_size = int(len(values) * 0.7)
test_size = len(values) - train_size
train, test = values[0:train_size,:], values[train_size:len(values),:]
look_back = 1
values = df['new_weighted_price'].values.reshape(-1,1)
values = values.astype('float32')
x, y = create_dataset(train)
val = series_to_supervised(train, look_back).values
trainX, trainY = val[:,0:look_back], val[:, look_back]
print(x, y)
print(trainX, trainY)

1175
[[  94.89956665]
 [ 100.027565  ]
 [ 108.68795013]
 ..., 
 [ 686.42138672]
 [ 741.94622803]
 [ 745.82977295]] [ 100.027565    108.68795013  127.9758606  ...,  741.94622803  745.82977295
  759.06738281]
[[  94.89956665]
 [ 100.027565  ]
 [ 108.68795013]
 ..., 
 [ 686.42138672]
 [ 741.94622803]
 [ 745.82977295]] [ 100.027565    108.68795013  127.9758606  ...,  741.94622803  745.82977295
  759.06738281]


In [153]:
def create_model(trainX, trainY, testX, testY, neurons):
    '''
    Create a model based on 'weighed_average' variable. 
    This model will not be stateful. 
    ''' 

    model = Sequential()
    model.add(LSTM(neurons, input_shape=(trainX.shape[1], trainX.shape[2]), return_state=False))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')
    history = model.fit(trainX, trainY, epochs=300, batch_size=50, validation_data=(testX, testY), verbose=1, shuffle=False)

    return model 

# Create the values and feed them into the model
def create_data(df):
    values = df['new_weighted_price'].values.reshape(-1,1)
    values = values.astype('float32')
    scaler = MinMaxScaler(feature_range=(0, 1))
    values = scaler.fit_transform(values)

    train_size = int(len(values) * 0.7)
    test_size = len(values) - train_size
    train, test = values[0:train_size,:], values[train_size:len(values),:]

    val = series_to_supervised(train, look_back).values
    trainX, trainY = val[:,0:look_back], val[:, look_back]
    trainX = trainX.reshape(trainX.shape[0], look_back, trainX.shape[1])

    val = series_to_supervised(test, look_back).values
    testX, testY = val[:,0:look_back], val[:, look_back]
    testX = testX.reshape(testX.shape[0], look_back, testX.shape[1])
    
    return trainX, trainY, testX, testY, scaler

trainX, trainY, testX, testY, scaler = create_data(df)
# Testing multiple neurons, 150 had the best MSE over 200, 175, 100, 50
model = create_model(trainX, trainY, testX, testY, neurons=150)

expected_test = model.predict(testX)
yhat_inverse = scaler.inverse_transform(expected_test.reshape(-1, 1))
testY_inverse = scaler.inverse_transform(testY.reshape(-1, 1))
print('MSE: ', math.sqrt(mean_squared_error(testY_inverse, yhat_inverse)))

predictDates = df.tail(len(testX)).index
testY_reshape = testY_inverse.reshape(len(testY_inverse))
yhat_reshape = yhat_inverse.reshape(len(yhat_inverse))

actual_chart = go.Scatter(x=predictDates, y=testY_reshape, name= 'Actual Price')
predict_chart = go.Scatter(x=predictDates, y=yhat_reshape, name= 'Predict Price')
py.iplot([predict_chart, actual_chart])

Train on 1175 samples, validate on 504 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300


Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300
Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 222/300
Epoch 223/300
Epoch 224/300
Epoch 225/300
Epoch 226/300
Epoch 227/300
Epoch 228/300
Epoch 229/300
Epoch 230/300
Epoch 231/300
Epoch 232/300
Epoch 233/300
Epoch 234/300
Epoch 235/300
Epoch 236/300
Epoch 237/300
Epoch 238/300
Epoch 239/300
Epoch 240/300
Epoch 241/300
Epoch 242/300
Epoch 

array([[ 0.09338189],
       [ 0.09159265],
       [ 0.0824488 ],
       [ 0.07764318],
       [ 0.07197552],
       [ 0.08102166],
       [ 0.08269944],
       [ 0.07802531],
       [ 0.07844409],
       [ 0.07926214],
       [ 0.07728963],
       [ 0.08073851],
       [ 0.08281983],
       [ 0.08459263],
       [ 0.0823387 ],
       [ 0.08191197],
       [ 0.08202667],
       [ 0.08239052],
       [ 0.07744318],
       [ 0.07930545],
       [ 0.07904058],
       [ 0.07895064],
       [ 0.07944147],
       [ 0.08082693],
       [ 0.08088232],
       [ 0.08040419],
       [ 0.08117583],
       [ 0.0812064 ],
       [ 0.08256585],
       [ 0.08278746],
       [ 0.08203808],
       [ 0.08164183],
       [ 0.081123  ],
       [ 0.0801763 ],
       [ 0.07981344],
       [ 0.08042608],
       [ 0.08002944],
       [ 0.07951055],
       [ 0.07985017],
       [ 0.08008399],
       [ 0.08004   ],
       [ 0.07995165],
       [ 0.0772102 ],
       [ 0.0749157 ],
       [ 0.06997344],
       [ 0

MSE:  97.57892438623722


In [207]:
def model_2(trainX, trainY, testX, testY):
    '''
    Create a model based on 'weighed_average' variable. 
    This model will not be stateful. 
    ''' 

    model = Sequential()
    model.add(LSTM(175, input_shape=(trainX.shape[1], trainX.shape[2])))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')
    history = model.fit(trainX, trainY, epochs=300, batch_size=50, validation_data=(testX, testY), verbose=2, shuffle=False)

    return model 

def create_data(df, look_back):
    '''
    This function specifically passes in a lookback of 2. We want to consider two previous values for this instance
    '''
    values = df['new_weighted_price'].values.reshape(-1,1)
    values = values.astype('float32')
    scaler = MinMaxScaler(feature_range=(0, 1))
    values = scaler.fit_transform(values)

    train_size = int(len(values) * 0.7)
    test_size = len(values) - train_size
    train, test = values[0:train_size,:], values[train_size:len(values),:]

    val = series_to_supervised(data=train, n_out=look_back).values
    trainX, trainY = val[:,0:look_back], val[:, look_back]
    trainX = trainX.reshape(trainX.shape[0], look_back, 1)
    print(trainX)

    val = series_to_supervised(data=test, n_out=look_back).values
    testX, testY = val[:,0:look_back], val[:, look_back]
    testX = testX.reshape(testX.shape[0], look_back, 1)
    
    return trainX, trainY, testX, testY, scaler

'''
Increasing the lookback increases the MSE
'''
trainX, trainY, testX, testY, scaler = create_data(df, 4)
model_2 = model_2(trainX, trainY, testX, testY)

# Output
expected_test = model_2.predict(testX)
yhat_inverse = scaler.inverse_transform(expected_test.reshape(-1, 1))
testY_inverse = scaler.inverse_transform(testY.reshape(-1, 1))
print('MSE: ', math.sqrt(mean_squared_error(testY_inverse, yhat_inverse)))

predictDates = df.tail(len(testX)).index
testY_reshape = testY_inverse.reshape(len(testY_inverse))
yhat_reshape = yhat_inverse.reshape(len(yhat_inverse))

actual_chart = go.Scatter(x=predictDates, y=testY_reshape, name= 'Actual Price')
predict_chart = go.Scatter(x=predictDates, y=yhat_reshape, name= 'Predict Price')
py.iplot([predict_chart, actual_chart])

[[[ 0.00379896]
  [ 0.00449075]
  [ 0.00565906]
  [ 0.00826106]]

 [[ 0.00449075]
  [ 0.00565906]
  [ 0.00826106]
  [ 0.00875132]]

 [[ 0.00565906]
  [ 0.00826106]
  [ 0.00875132]
  [ 0.00966141]]

 ..., 
 [[ 0.07795587]
  [ 0.08428156]
  [ 0.08296047]
  [ 0.08359704]]

 [[ 0.08428156]
  [ 0.08296047]
  [ 0.08359704]
  [ 0.09108751]]

 [[ 0.08296047]
  [ 0.08359704]
  [ 0.09108751]
  [ 0.09161142]]]
Train on 1172 samples, validate on 501 samples
Epoch 1/300
5s - loss: 0.0252 - val_loss: 0.1416
Epoch 2/300
0s - loss: 0.0173 - val_loss: 0.0837
Epoch 3/300
0s - loss: 0.0104 - val_loss: 0.0167
Epoch 4/300
0s - loss: 0.0044 - val_loss: 0.0429
Epoch 5/300
0s - loss: 0.0068 - val_loss: 0.0333
Epoch 6/300
0s - loss: 0.0047 - val_loss: 0.0151
Epoch 7/300
0s - loss: 0.0034 - val_loss: 0.0107
Epoch 8/300
0s - loss: 0.0025 - val_loss: 0.0130
Epoch 9/300
0s - loss: 0.0034 - val_loss: 0.0107
Epoch 10/300
0s - loss: 0.0028 - val_loss: 0.0113
Epoch 11/300
0s - loss: 0.0032 - val_loss: 0.0122
Epoch 12/

0s - loss: 0.0016 - val_loss: 0.0420
Epoch 156/300
0s - loss: 0.0022 - val_loss: 0.0274
Epoch 157/300
0s - loss: 0.0040 - val_loss: 0.0610
Epoch 158/300
0s - loss: 0.0056 - val_loss: 0.0462
Epoch 159/300
0s - loss: 0.0021 - val_loss: 0.0304
Epoch 160/300
0s - loss: 0.0037 - val_loss: 0.0631
Epoch 161/300
0s - loss: 0.0053 - val_loss: 0.0394
Epoch 162/300
0s - loss: 0.0033 - val_loss: 0.0368
Epoch 163/300
0s - loss: 0.0027 - val_loss: 0.0544
Epoch 164/300
0s - loss: 0.0038 - val_loss: 0.0338
Epoch 165/300
0s - loss: 0.0016 - val_loss: 0.0338
Epoch 166/300
0s - loss: 0.0020 - val_loss: 0.0362
Epoch 167/300
0s - loss: 0.0021 - val_loss: 0.0327
Epoch 168/300
0s - loss: 0.0034 - val_loss: 0.0607
Epoch 169/300
0s - loss: 0.0048 - val_loss: 0.0451
Epoch 170/300
0s - loss: 0.0026 - val_loss: 0.0361
Epoch 171/300
0s - loss: 0.0024 - val_loss: 0.0358
Epoch 172/300
0s - loss: 0.0027 - val_loss: 0.0565
Epoch 173/300
0s - loss: 0.0036 - val_loss: 0.0466
Epoch 174/300
0s - loss: 0.0024 - val_loss: 0

In [206]:
math.sqrt(mean_squared_error(testY_inverse, yhat_inverse))

794.5944327139475