## Train a recurrent neural network (TensorFlow 1.x) to classify sliding windows of AAPL price-level data into three classes

# Do Imports

In [None]:
import math
import os
import time

# TensorFlow's native logger reads TF_CPP_MIN_LOG_LEVEL once, the first time
# it logs, so the variable has to be in the environment *before* TensorFlow is
# imported to be sure it takes effect. '2' hides INFO and WARNING messages.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from numpy.lib.stride_tricks import as_strided
from sklearn.preprocessing import StandardScaler
#tf.enable_eager_execution()

In [None]:
# Show the current working directory; the relative paths used elsewhere in
# this file ('../data/...', '../tf_graphs/') are resolved against it.
os.getcwd()

# Read data

In [None]:
# Load the pickled DataFrame; the path is relative to the working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load files
# from a trusted source.

#os.getcwd()
df = pd.read_pickle('../data/price_level_total_view_2017-01-03_AAPL_grouped_2')
# NOTE: when run as a notebook cell only the last expression is displayed,
# so the head() result below is computed and then discarded.
df.head()
df.tail()


# Define functions

In [None]:
def features_2d_to_3d(data, labels, window):
    """Build overlapping sliding windows over ``data`` and ``labels``.

    Window ``i`` covers rows ``i`` .. ``i + window - 1``. Exactly
    ``data.shape[0] - window`` windows are produced.

    Args:
        data: 2-D NumPy array; windows are taken along the first axis.
        labels: NumPy array indexed along its first axis in step with
            the rows of ``data``.
        window: Number of consecutive rows in each window.

    Returns:
        ``(data3d, labels3d)`` with shapes ``(n - window, window, width)``
        and ``(n - window, window)``. Both are strided views that share
        memory with the inputs, so they must not be written to.
    """
    n_rows, n_cols = data.shape
    row_step, col_step = data.strides
    label_step = labels.strides[0]
    n_windows = n_rows - window

    # Re-using the row stride for the new leading axis makes window i start
    # at row i, which gives the overlap without copying any data.
    data_view = as_strided(
        data,
        shape=(n_windows, window, n_cols),
        strides=(row_step, row_step, col_step),
    )
    label_view = as_strided(
        labels,
        shape=(n_windows, window),
        strides=(label_step, label_step),
    )
    return data_view, label_view

def flatten_3d(data):
    """Collapse the second and third axes of a 3-D array into one.

    Args:
        data: Array of shape ``(n, a, b)``.

    Returns:
        Array of shape ``(n, a * b)`` holding the same elements.
    """
    flat_shape = (data.shape[0], data.shape[1] * data.shape[2])
    return np.reshape(data, flat_shape)
    
def split_data(dfX, dfy, train_frac):
    """Split features and labels by row position and standardise the features.

    The first ``floor(n * train_frac)`` rows form the training set and the
    remaining rows form the test set; rows are not shuffled. The scaler is
    fitted on the training rows only and then applied to both parts.

    Args:
        dfX: Feature table supporting ``.shape`` and ``.iloc``.
        dfy: Label container supporting ``.iloc`` and ``.values``.
        train_frac: Fraction of rows assigned to the training set.

    Returns:
        ``(X_train, y_train, X_test, y_test)``: the scaled feature arrays
        and the flattened label arrays.
    """
    n_rows = dfX.shape[0]
    split_at = np.floor(n_rows * train_frac).astype(int)

    train_rows = dfX.iloc[:split_at, :]
    test_rows = dfX.iloc[split_at:, :]
    y_train = dfy.iloc[:split_at].values.ravel()
    y_test = dfy.iloc[split_at:].values.ravel()

    # Fit on the training rows only so test statistics never leak into scaling.
    scaler = StandardScaler()
    scaler.fit(train_rows)
    X_train = scaler.transform(train_rows)
    X_test = scaler.transform(test_rows)

    return X_train, y_train, X_test, y_test


def batch_data(data, labels, batch_size, n_steps):
    """Window the data and group the windows into fixed-size batches.

    Windows are built with ``features_2d_to_3d``; trailing windows that do
    not fill a complete batch are dropped. The target for each window is the
    label at the window's last time step.

    Args:
        data: 2-D array of features, one row per time step.
        labels: 1-D array of labels aligned with the rows of ``data``.
        batch_size: Number of windows per batch.
        n_steps: Number of time steps per window.

    Returns:
        ``(x_batches, y_batches, n_batches, new_len)`` where ``x_batches``
        has shape ``(n_batches, batch_size, n_steps, width)``, ``y_batches``
        has shape ``(n_batches, batch_size)``, and ``new_len`` is the number
        of windows kept (``n_batches * batch_size``).
    """
    windowed_x, windowed_y = features_2d_to_3d(data, labels, n_steps)

    width = data.shape[1]

    # Count batches from the windows actually produced. Windowing yields fewer
    # windows than there are rows, so deriving the count from the row total
    # can ask for more windows than exist and make the reshape below fail.
    n_windows = windowed_x.shape[0]
    n_batches = n_windows // batch_size
    new_len = n_batches * batch_size

    x_batches = np.reshape(windowed_x[:new_len],
                           [n_batches, batch_size, n_steps, width])
    # Keep only the label of the final step in each window as its target.
    y_batches = np.reshape(windowed_y[:new_len, n_steps - 1],
                           [n_batches, batch_size])

    print(x_batches.shape)
    print(y_batches.shape)

    return x_batches, y_batches, n_batches, new_len


# Define model

In [None]:
def create_rnn_model(n_steps, batch_size, n_inputs=14, n_neurons=50,
                     n_outputs=3, learning_rate=0.001):
    """Build the TF1 graph for a single-layer RNN classifier.

    The default graph is reset first, so anything built in it before this
    call is discarded.

    Args:
        n_steps: Number of time steps in each input window.
        batch_size: Number of windows per batch. It is baked into the
            placeholder shapes, so every batch fed must have this size.
        n_inputs: Number of features per time step.
        n_neurons: Number of units in the recurrent cell.
        n_outputs: Number of target classes.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A 13-tuple ``(training_op, loss, accuracy, init, X, labels, Y_proba,
        value_, tl, ta, vl, va, is_training)``:
        ``training_op`` runs one optimisation step; ``loss`` is the mean
        cross-entropy of the batch; ``accuracy`` is the *count* of correct
        top-1 predictions in the batch (not a fraction); ``init`` initialises
        the variables; ``X`` / ``labels`` / ``is_training`` are the input
        placeholders; ``Y_proba`` is the softmax output; ``value_`` is a
        scalar placeholder feeding the four summary ops ``tl``, ``ta``,
        ``vl``, ``va`` (train/val loss and accuracy).
    """
    tf.reset_default_graph()

    # Set up placeholders for input data
    X = tf.placeholder(tf.float32, [batch_size, n_steps, n_inputs], name="X")
    labels = tf.placeholder(tf.int32, [batch_size], name="y")
    # Defaults to False so evaluation/prediction runs without dropout.
    is_training = tf.placeholder_with_default(False, shape=[], name='training')

    with tf.name_scope("rnn"):

        cell = tf.contrib.rnn.OutputProjectionWrapper(
            tf.nn.rnn_cell.BasicRNNCell(num_units=n_neurons, activation=tf.nn.relu), output_size=n_outputs)
        # NOTE: the projected per-step `outputs` are not used below; only the
        # final state feeds the dense head.
        outputs, final_state = tf.nn.dynamic_rnn(cell, X, dtype=tf.float32)
        drop1 = tf.layers.dropout(final_state, training=is_training, rate=0.2)

    with tf.name_scope("output"):
        hidden1 = tf.layers.dense(drop1, 10, name="hidden1", activation=tf.nn.relu)
        logits = tf.layers.dense(hidden1, n_outputs, name="output")
        Y_proba = tf.nn.softmax(logits, name="Y_proba")

        print(logits.shape)
        print(labels.shape)

    # Define the loss and the optimizer that minimises it
    with tf.name_scope("train"):
        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
        loss = tf.reduce_mean(xentropy)
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        training_op = optimizer.minimize(loss)

    # Define the evaluation metric: number of correct predictions in the batch
    with tf.name_scope("eval"):
        correct = tf.nn.in_top_k(logits, labels, 1)
        accuracy = tf.reduce_sum(tf.cast(correct, tf.float32))

    # Create the initialiser and saver ops
    with tf.name_scope("init_and_save"):
        init = tf.global_variables_initializer()
        # Not returned; kept so the set of ops added to the graph is unchanged.
        saver = tf.train.Saver()

    # Summaries: one shared scalar placeholder, fed with the value to log
    with tf.name_scope("summaries"):
        value_ = tf.placeholder(shape=[], name='summary_placeholder', dtype=tf.float32)
        tl = tf.summary.scalar('train_loss', value_)
        ta = tf.summary.scalar('train_accuracy', value_)
        vl = tf.summary.scalar('val_loss', value_)
        va = tf.summary.scalar('val_accuracy', value_)

    return training_op, loss, accuracy, init, X, labels, Y_proba, value_, tl, ta, vl, va, is_training

# Create Dataset batches

In [None]:
# Select feature columns by position and scale two of them.
# dfX: every column from position 5 onward once the index is turned into columns.
dfX = df.reset_index().iloc[:,5:]  #np.delete(np.arange(16), [0,2])
# dfX2: the columns at positions 2 and 4 - presumably mid_price_log and
# trade_volume_differential, given the names assigned below (TODO confirm).
dfX2 = df.reset_index().iloc[:,[2,4]]

# NOTE(review): this scaler is fitted on all rows, including those that later
# become the test set. split_data() standardises every feature column again
# using training rows only, so this step looks redundant - confirm and drop.
ss = StandardScaler()
ss.fit(dfX2)
dfX2 = ss.transform(dfX2)

# transform() returns a plain array, so rebuild a DataFrame and join it back
# to the remaining features on the row index.
dfX2 = pd.DataFrame(data=dfX2, columns= ['mid_price_log', 'trade_volume_differential'])
dfX = pd.merge(dfX2, dfX, left_index=True, right_index=True)

# Labels: the column at position 3 after reset_index().
dfy = df.reset_index().iloc[:,3]

# Create train/test sets: first 80% of rows for training, the rest for testing
trainarray,labelarray,testarray,testlabelarray = split_data(dfX, dfy, 0.8)


In [None]:
# Turn the train and test arrays into batches of sliding windows
from numpy.lib.stride_tricks import as_strided

# Number of consecutive rows in each window
window_size = 20
# Number of windows per batch; the model placeholders are built with this size
batch_size = 400
# n_train / n_test are the numbers of windows kept after dropping the partial batch
x, y, n_batches, n_train = batch_data(trainarray, labelarray, batch_size, window_size)
x_val, y_val, n_batches_val, n_test  = batch_data(testarray, testlabelarray, batch_size, window_size)

In [None]:
# Count how often each label value occurs in the training labels
pd.Series(labelarray).value_counts()

In [None]:
# Optional sanity check: uncomment to replace the labels with uniformly random classes (0-2) and confirm that accuracy drops to chance level

# labelarray = np.asarray([np.random.randint(0,3) for i in np.arange(labelarray.shape[0])])
# testlabelarray = np.asarray([np.random.randint(0,3) for i in np.arange(testlabelarray.shape[0])])

In [None]:
# Build the model graph and unpack the ops and placeholders used by the
# training and prediction cells

training_op, loss, accuracy, init, X, labels, Y_proba, value_, tl, ta, vl, va, is_training = create_rnn_model(window_size, batch_size)

#temp = create_datasets(trainarray, labelarray, testarray, testlabelarray, window_size)

# Train model

In [None]:
# Train for a fixed number of epochs, logging train/validation metrics to
# TensorBoard after each epoch and saving a checkpoint at the end.
n_epochs = 50
saver = tf.train.Saver()
with tf.Session() as sess:
    writer = tf.summary.FileWriter('../tf_graphs/', tf.get_default_graph())
    # try/finally so the event file is flushed and closed even if training fails.
    try:
        init.run()
        for epoch in range(n_epochs):

            # Training pass (dropout enabled through is_training)
            train_loss, train_accuracy = 0, 0
            for b in range(n_batches):
                X_batch, y_batch = x[b], y[b]
                _, loss_value, acc_value = sess.run(
                    [training_op, loss, accuracy],
                    feed_dict={X: X_batch, labels: y_batch, is_training: True})
                train_loss += loss_value
                train_accuracy += acc_value
            # `accuracy` is a per-batch count of correct predictions, so divide
            # by the number of samples seen to get a fraction.
            train_accuracy = train_accuracy / n_train
            # `loss` is already a per-batch mean; average over batches so the
            # value does not scale with the number of batches and can be
            # compared with the validation loss.
            train_loss = train_loss / n_batches

            s = sess.run(ta, feed_dict={value_: train_accuracy})
            writer.add_summary(s, epoch)
            s = sess.run(tl, feed_dict={value_: train_loss})
            writer.add_summary(s, epoch)

            # Validation pass (is_training keeps its default of False)
            val_loss, val_accuracy = 0, 0
            for b in range(n_batches_val):
                X_batch, y_batch = x_val[b], y_val[b]
                loss_value, acc_value = sess.run(
                    [loss, accuracy], feed_dict={X: X_batch, labels: y_batch})
                val_loss += loss_value
                val_accuracy += acc_value
            val_accuracy = val_accuracy / n_test
            val_loss = val_loss / n_batches_val

            s = sess.run(va, feed_dict={value_: val_accuracy})
            writer.add_summary(s, epoch)
            s = sess.run(vl, feed_dict={value_: val_loss})
            writer.add_summary(s, epoch)

            print("Epoch: {}, Train acc: {:.4f}, Val acc: {:.4f}, Train loss: {:.4f}, Val loss: {:.4f}".format(epoch, train_accuracy, val_accuracy, train_loss, val_loss))

        saver.save(sess, "./my_time_series_model")
    finally:
        writer.close()


In [None]:
# Restore the checkpoint written by the training cell and predict class
# probabilities for the first validation batch only.
with tf.Session() as sess:                          
    saver.restore(sess, "./my_time_series_model")

    X_new = x_val
    # Only X_new[0] is fed: a single batch of validation windows.
    y_pred = sess.run(Y_proba, feed_dict={X: X_new[0]})
    
# NOTE: when run as a notebook cell only the last expression is displayed,
# so the bare `y_pred` below shows nothing.
y_pred
# Index of the highest-probability class for each window in the batch
np.argmax(y_pred, axis=1)