In [0]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# 1. Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)



In [0]:
# Test if GPU is active
import tensorflow as tf
tf.test.gpu_device_name()

In [3]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
import statsmodels.api as sm
from time import time
from scipy import stats
from sklearn.metrics import mean_squared_error
from math import sqrt
from random import randint
from keras.models import Sequential
from keras.layers import Dense, LSTM, GRU, Dropout
from keras.optimizers import SGD, RMSprop, Adam
from keras.callbacks import EarlyStopping
from keras import initializers
from matplotlib import pyplot
from datetime import datetime
from matplotlib import pyplot as plt
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
%matplotlib inline

Using TensorFlow backend.


##**DATA PREPROCESSING **

In [4]:
# PyDrive reference:
# https://gsuitedevs.github.io/PyDrive/docs/build/html/index.html

# Read data
dataset_file = drive.CreateFile({'id': '1U4t7lcxdU9xkLswd4ZttFQqhbf3yKRPv'})
dataset_file.GetContentFile('bitcoin_train.csv')
# Read the training file and record the running time
time_1 = time()
data = pd.read_csv('bitcoin_train.csv', sep = ',', parse_dates=[0], dayfirst = True)
time_2 = time()
print('read data cost '+ str(time_2 - time_1)+' second')

# Clean data
col_n = ['date','close']
data_select = pd.DataFrame(data,columns = col_n)
data_image = data_select.dropna() # Drop all Nans
data_image = data_select.values
data_select['date'] = pd.to_datetime(data_select['date'])
data_select.sort_values('date', inplace = True)

# Split data
training_data = data_select[:-46]
testing_data = data_select[-46:]

read data cost 0.09872078895568848 second


##**PREPARE DATA FOR MODEL**

In [0]:
def create_lookback(dataset, look_back=1):
    X, Y = [], []
    for i in range(len(dataset) - look_back):
        a = dataset[i:(i + look_back), 0]
        X.append(a)
        Y.append(dataset[i + look_back, 0])
    return np.array(X), np.array(Y)
  
scaler = MinMaxScaler()

#Reshape and scale datasets
training_set = training_data.values
price_set_1 = training_set[:,1]
training_set = np.reshape(price_set_1, (len(price_set_1), 1))
train_set = scaler.fit_transform(training_set)

testing_set = testing_data.values
price_set_2 = testing_set[:,1]
testing_set = np.reshape(price_set_2, (len(price_set_2), 1))
test_set = scaler.fit_transform(testing_set)

# create datasets which are suitable for time series forecasting
look_back = 1
X_train, Y_train = create_lookback(train_set, look_back)
X_test, Y_test = create_lookback(test_set, look_back)

# reshape datasets so that they will be ok for the requirements of the LSTM model in Keras
X_train = np.reshape(X_train, (len(X_train), X_train.shape[1], 1))
X_test = np.reshape(X_test, (len(X_test), X_test.shape[1], 1))


##**DEFINE FUNCTIONS**

In [0]:
### DISPLAY PLOTLY GRAPHS IN COLAB
def configure_plotly_browser_state():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              plotly: 'https://cdn.plot.ly/plotly-latest.min.js?noext',
            },
          });
        </script>
        '''))
  
### PREDICTION USING TEST DATA
def predict(model):
  global X_test, Y_test
  X_test = np.reshape(X_test, (len(X_test), 1, 1))
  # get predictions and then make some transformations to be able to calculate RMSE properly in USD
  prediction = model.predict(X_test) 
  prediction_inverse = scaler.inverse_transform(prediction.reshape(-1, 1)) 
  Y_test_inverse = scaler.inverse_transform(Y_test.reshape(-1, 1)) 
  prediction2_inverse = np.array(prediction_inverse[:,0][:]) 
  Y_test2_inverse = np.array(Y_test_inverse[:,0]) 
  RMSE = sqrt(mean_squared_error(Y_test2_inverse, prediction2_inverse))
  #print('Test RMSE: %.3f' % RMSE)
  return RMSE

### TRAIN LSTM MODEL:
def train_model(X_train, Y_train, X_test, Y_test, optimizer, n_neurons, n_layers, batch_size, dropout_rate):
  model = Sequential()
  for i in range(0,n_layers):
    if i == 0:
      if n_layers == 1: model.add(LSTM(n_neurons, input_shape=(X_train.shape[1], X_train.shape[2])))
      elif n_layers > 1: model.add(LSTM(n_neurons, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
    if i > 0:
      if i < n_layers-1: model.add(LSTM(n_neurons, return_sequences=True))
      elif i == n_layers-1: model.add(LSTM(n_neurons))
  if dropout_rate > 0:
    model.add(Dropout(dropout_rate))
  model.add(Dense(1))

  time_3 = time()

  # compile and fit the model
  model.compile(loss='mean_squared_error', optimizer=optimizer)
  history = model.fit(X_train, Y_train, epochs=50, batch_size=batch_size, shuffle=False,
                      validation_data=(X_test, Y_test), verbose = 0)
                      #callbacks = [EarlyStopping(monitor='val_loss', min_delta=5e-5, patience=20, verbose=0)], verbose=0)
  #time_4 = time()
  #print('Traing cost '+ str(time_4 - time_3)+' second')
  return history

### PLOT LOSS GRAPHS
def plot_loss(criteria, title, label):

  size = len(loss[0])
  colors = ['blue','red','green','orange','grey']
  data = []
  for i in range(0,len(loss)):
    trace = go.Scatter(
        x = np.arange(0, size, 1),
        y = loss[i],
        mode = 'lines',
        name = label + ' = ' + str(criteria[i]),
        line = dict(color=colors[i], width=2)
    )
    data.append(trace)
  layout = dict(title = 'Test Loss of different ' + title,
                xaxis = dict(title = 'Epoch number'), yaxis = dict(title = 'Loss'))
  fig = dict(data=data, layout=layout)
  py.iplot(fig, filename='training_process')

##**OPTIMIZER FINE-TUNING**

In [81]:
# Adam
learning_rate = [0.00001,0.0001,0.001,0.01,0.1]
loss = []
for lr in learning_rate:
  optimizer = Adam(lr=lr, decay=1e-6)
  history = train_model(X_train, Y_train, X_test, Y_test, optimizer, 128, 1, 16, 0)
  loss.append(history.history['val_loss'])
  
configure_plotly_browser_state()
plot_loss(learning_rate, 'Adam learnig rates', 'lr')
print('Best learning rate:', str(learning_rate[np.argmin([loss[0][-1],loss[1][-1],loss[2][-1],loss[3][-1],loss[4][-1]])]))
print('Best loss:', str(np.min([loss[0][-1],loss[1][-1],loss[2][-1],loss[3][-1],loss[4][-1]])))

Best learning rate: 0.001
Best loss: 0.008082787257929642


In [82]:
# RMSprop 
#[1046.90699647584, 328.6150799042895, 227.3251100927995, 252.9473386128843, 259.9733161862902]
loss = []
for lr in learning_rate:
  optimizer = RMSprop(lr=lr, decay=1e-6)
  history = train_model(X_train, Y_train, X_test, Y_test, optimizer, 128, 1, 16, 0)
  loss.append(history.history['val_loss'])

configure_plotly_browser_state()
plot_loss(learning_rate, 'RMSprop learnig rates', 'lr')
print('Best learning rate:', str(learning_rate[np.argmin([loss[0][-1],loss[1][-1],loss[2][-1],loss[3][-1],loss[4][-1]])]))
print('Best loss:', str(np.min([loss[0][-1],loss[1][-1],loss[2][-1],loss[3][-1],loss[4][-1]])))

Best learning rate: 0.001
Best loss: 0.007565635152988964


In [85]:
# SGD
# [1098.0822372525574, 846.0444633275769, 382.90657030517235, 232.171269787306, 226.463277492128]
loss = []
for lr in learning_rate:
  optimizer = SGD(lr=lr, momentum=0.9, decay=1e-6, nesterov=True)
  history = train_model(X_train, Y_train, X_test, Y_test, optimizer, 128, 1, 16, 0)
  loss.append(history.history['val_loss'])

configure_plotly_browser_state()
plot_loss(learning_rate, 'SGD learnig rates', 'lr')
print('Best learning rate:', str(learning_rate[np.argmin([loss[0][-1],loss[1][-1],loss[2][-1],loss[3][-1],loss[4][-1]])]))
print('Best loss:', str(np.min([loss[0][-1],loss[1][-1],loss[2][-1],loss[3][-1],loss[4][-1]])))

Best learning rate: 0.01
Best loss: 0.007890996895730496


## **FINAL TEST DATA VISUALIZATION**

In [12]:
configure_plotly_browser_state()

trace1 = go.Scatter(
    x = np.arange(0, len(prediction2_inverse), 1),
    y = prediction2_inverse,
    mode = 'lines',
    name = 'Predicted labels',
    line = dict(color=('rgb(244, 146, 65)'), width=2)
)
trace2 = go.Scatter(
    x = np.arange(0, len(Y_test2_inverse), 1),
    y = Y_test2_inverse,
    mode = 'lines',
    name = 'True labels',
    line = dict(color=('rgb(66, 244, 155)'), width=2)
)

data = [trace1, trace2]
layout = dict(title = 'Comparison of true prices (on the test dataset) with prices our model predicted',
             xaxis = dict(title = 'Day number'), yaxis = dict(title = 'Price, USD'))
fig = dict(data=data, layout=layout)
py.iplot(fig, filename='results_demonstrating0')

RMSE = sqrt(mean_squared_error(Y_test2_inverse, prediction2_inverse))
print('Test RMSE: %.3f' % RMSE)

Test RMSE: 233.858
