##  This notebook trains a small dense neural network (TensorFlow 1.x) to classify windowed price-level data

# Do Imports

In [None]:
import tensorflow as tf
import os
import numpy as np
import pandas as pd

os.environ['TF_CPP_MIN_LOG_LEVEL']='2'
import time

import matplotlib.pyplot as plt

# Read data and create Dataset iterators

In [None]:
# Load the pickled object from a path relative to the current working
# directory; it is used as a DataFrame below (head/tail).
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.

#os.getcwd()
df = pd.read_pickle('../data/price_level_total_view_2017-01-03_AAPL_grouped')
# Quick visual sanity checks. In a notebook cell only the last bare
# expression (df.tail()) is rendered; the head() result is discarded.
df.head()
df.tail()


In [None]:
# Zero-filled stand-ins to develop the pipeline against.
# Feature arrays are laid out as [n_rows, window_size, features];
# label arrays are laid out as [n_rows, output_classes].
trainarray = np.zeros((10, 5, 2), dtype=np.float64)
labelarray = np.zeros((10, 2), dtype=np.float64)

# The test placeholders are independent arrays with the same shape and dtype.
testarray = np.zeros_like(trainarray)
testlabelarray = np.zeros_like(labelarray)


# Define functions

In [None]:
def features_2d_to_3d(data, labels, window):
    """Turn a 2-D feature matrix into overlapping fixed-length windows.

    Window ``i`` covers rows ``i .. i + window - 1`` of ``data`` and
    ``data.shape[0] - window`` windows are produced.  ``labels`` is cut down
    to its first ``len(labels) - window`` entries, so window ``i`` is paired
    with ``labels[i]``.

    Args:
        data: 2-D NumPy array of shape ``(n_rows, n_features)``.
        labels: Array or sequence of labels, sliced along its first axis.
        window: Number of consecutive rows in each window.

    Returns:
        Tuple ``(data3d, trimmed_labels)``.  ``data3d`` has shape
        ``(n_rows - window, window, n_features)`` and is a strided view that
        shares memory with ``data`` - copy it before writing to it.

    Raises:
        ValueError: If ``data`` is not 2-D, or ``window`` is negative or
            larger than the number of rows in ``data``.
    """
    # Imported locally so the function does not rely on some other notebook
    # cell having imported ``as_strided`` before it is called.
    from numpy.lib.stride_tricks import as_strided

    data_n, data_w = data.shape
    if not 0 <= window <= data_n:
        raise ValueError(
            "window must be between 0 and the number of rows ({}), got {}".format(
                data_n, window
            )
        )

    row_stride, col_stride = data.strides
    new_len = data_n - window
    # Stepping by one row stride on both the first and second axis makes
    # consecutive windows overlap by ``window - 1`` rows without copying.
    data3d = as_strided(
        data,
        shape=(new_len, window, data_w),
        strides=(row_stride, row_stride, col_stride),
    )
    return data3d, labels[:len(labels) - window]

def flatten_3d(data):
    """Merge the second and third axes of a 3-D array into a single axis.

    Args:
        data: Array whose first three dimensions are read via ``data.shape``.

    Returns:
        Array of shape ``(data.shape[0], data.shape[1] * data.shape[2])``.
    """
    # TODO: flesh this function out
    n_rows, n_window, n_feat = data.shape[0], data.shape[1], data.shape[2]
    return np.reshape(data, (n_rows, n_window * n_feat))
    
def split_data(df, train_frac):
    """Split features and labels by row position into train and test parts.

    The first ``int(n * train_frac)`` rows go to the training set and the
    remaining rows to the test set; rows are not shuffled.

    NOTE(review): features and labels are read from the module-level names
    ``X`` and ``y`` rather than from ``df``, which is accepted but not used.
    Both must be defined before this is called - confirm where they come
    from, or derive them from ``df`` once the column layout is known.

    Args:
        df: Currently unused (see note above).
        train_frac: Fraction of rows, between 0 and 1 inclusive, assigned to
            the training set.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``.  The ``X_*`` parts are
        ``iloc`` slices of ``X``; the ``y_*`` parts are flattened 1-D arrays.

    Raises:
        ValueError: If ``train_frac`` is outside ``[0, 1]``.
    """
    if not 0 <= train_frac <= 1:
        raise ValueError(
            "train_frac must be between 0 and 1, got {!r}".format(train_frac)
        )

    n = X.shape[0]
    # Honour the requested fraction instead of a hard-coded 7/8 split.
    cutoff = int(n * train_frac)

    X_train, X_test = X.iloc[:cutoff, :], X.iloc[cutoff:, :]

    y_train = y.iloc[:cutoff, :].values.ravel()
    y_test = y.iloc[cutoff:, :].values.ravel()

    return X_train, y_train, X_test, y_test


In [None]:
def create_datasets(trainarray, labelarray, testarray, testlabelarray, batch_size):
    """Build shuffled, batched train and test datasets behind one iterator.

    The default TensorFlow graph is reset first, so any ops created earlier
    are discarded.  A single re-initialisable iterator serves both datasets;
    running ``train_init`` or ``test_init`` selects which one it reads from.

    Args:
        trainarray: Training features (cast to float32).
        labelarray: Training labels (cast to int32).
        testarray: Test features (cast to float32).
        testlabelarray: Test labels (cast to int32).
        batch_size: Number of rows per batch for both datasets.

    Returns:
        Tuple ``(features, labels, train_init, test_init, n_inputs)`` where
        ``features``/``labels`` are the iterator's next-batch tensors and
        ``n_inputs`` is ``trainarray.shape[1]``.
    """
    tf.reset_default_graph()

    n_train_rows = trainarray.shape[0]
    n_test_rows = testarray.shape[0]

    with tf.name_scope("dataset"):
        train_pair = (
            tf.cast(trainarray, tf.float32),
            tf.cast(labelarray, tf.int32),
        )
        training_dataset = tf.data.Dataset.from_tensor_slices(train_pair)
        # Buffer is twice the row count (multiply by 2 if using accuracy calc).
        training_dataset = training_dataset.shuffle(buffer_size=2 * n_train_rows)
        training_dataset = training_dataset.batch(batch_size)

        test_pair = (
            tf.cast(testarray, tf.float32),
            tf.cast(testlabelarray, tf.int32),
        )
        test_dataset = tf.data.Dataset.from_tensor_slices(test_pair)
        test_dataset = test_dataset.shuffle(buffer_size=2 * n_test_rows)
        test_dataset = test_dataset.batch(batch_size)

    with tf.name_scope("iterator"):
        # The iterator is defined by structure only, so either dataset can be
        # plugged into it through its initializer op.
        iterator = tf.data.Iterator.from_structure(
            training_dataset.output_types,
            training_dataset.output_shapes,
        )
        features, labels = iterator.get_next()
        train_init = iterator.make_initializer(training_dataset)  # switch to train data
        test_init = iterator.make_initializer(test_dataset)  # switch to test data

    return features, labels, train_init, test_init, trainarray.shape[1]

# Define model

In [None]:
def create_model(features, labels, n_inputs, n_outputs=3):
    """Assemble a two-hidden-layer dense classifier plus its train/eval ops.

    Args:
        features: Input tensor fed to the first dense layer.
        labels: Label tensor; presumably integer class indices, as
            ``sparse_softmax_cross_entropy_with_logits`` expects - confirm.
        n_inputs: Width of each hidden layer.
        n_outputs: Number of output logits.

    Returns:
        Tuple ``(training_op, output, loss, accuracy)``:
        ``training_op`` runs one Adam step on ``loss``; ``output`` is the
        softmax of the logits; ``loss`` is the mean cross-entropy of the
        batch; ``accuracy`` is the *count* of correct top-1 predictions in
        the batch (a sum, not a ratio).
    """
    # Reserved for dropout; no active layer consumes it at the moment.
    keep_prob = tf.placeholder(tf.float32)

    with tf.name_scope("dnn"):
        net = features
        for layer_name in ("hidden1", "hidden2"):
            net = tf.layers.dense(net, n_inputs, name=layer_name,
                                  activation=tf.nn.elu)

        # Disabled experiments that used to live here: kernel regularizers on
        # every dense layer, a maxout layer after hidden1, and a deeper ReLU
        # stack (hidden2..hidden7) with tf.nn.dropout(keep_prob) after each
        # hidden layer.
        logits = tf.layers.dense(net, n_outputs, name="outputs")

    with tf.name_scope("loss"):
        per_example = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels, logits=logits)
        loss = tf.reduce_mean(per_example, name="loss")
        # An L2 regularization term was previously added here (disabled).
        tf.summary.scalar('log_loss', loss)

    learning_rate = 0.002

    with tf.name_scope("train"):
        # Plain gradient descent was tried as an alternative to Adam.
        training_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    with tf.name_scope("eval"):
        hits = tf.nn.in_top_k(logits, labels, 1)
        accuracy = tf.reduce_sum(tf.cast(hits, tf.float32))
        tf.summary.scalar('accuracy', accuracy)

    with tf.name_scope("predict"):
        output = tf.nn.softmax(logits)

    # Merged summary op is created in the graph but not returned.
    merged = tf.summary.merge_all()

    return training_op, output, loss, accuracy




# Train model

In [None]:
# Create train/test sets
# split_data returns (X_train, y_train, X_test, y_test), so the names below
# hold train features, train labels, test features and test labels in that
# order. This overwrites the zero-filled development placeholders.
trainarray,labelarray,testarray,testlabelarray = split_data(df, 0.8)

In [None]:
# Re-shape each split into sliding windows of 5 rows, then flatten every
# window into a single feature vector.
from numpy.lib.stride_tricks import as_strided

# Training split
trainarray, labelarray = features_2d_to_3d(np.array(trainarray), np.array(labelarray), 5)
trainarray = flatten_3d(trainarray)

# Test split
testarray, testlabelarray = features_2d_to_3d(np.array(testarray), np.array(testlabelarray), 5)
testarray = flatten_3d(testarray)

In [None]:
# Create datasets & model

# batch_size has to exist before create_datasets is called. It is also
# assigned in the training-setup cell further down; keep the two values in
# sync (running the notebook top to bottom used to raise NameError here).
batch_size = 100

# create_datasets returns trainarray.shape[1] as its last element, so
# n_inputs is taken from there instead of being computed separately first.
features, labels, tr_init, te_init, n_inputs = create_datasets(
    trainarray, labelarray, testarray, testlabelarray, batch_size
)
training_op, output, loss, accuracy = create_model(features, labels, n_inputs, n_outputs=3)

In [None]:
# Do training
# Batch bookkeeping used by the training loop.
batch_size = 100
train_n, test_n = trainarray.shape[0], testarray.shape[0]
# Floor division: a trailing partial batch is not counted in either total.
n_batches, n_batches_test = train_n // batch_size, test_n // batch_size

In [None]:
n_epochs = 10

# One row per epoch. Values are written in exactly this column order.
columns = ['t-plus', 'loss', 'accuracy', 'test_loss', 'test_accuracy']
summaries = pd.DataFrame(np.zeros([n_epochs, len(columns)], dtype=float), columns=columns)
run_name = 'model1'

with tf.Session() as sess:
    start_time = time.time()
    #writer.add_graph(sess.graph)
    sess.run(tf.global_variables_initializer())
    tot_batches_run = 0
    for epoch in range(n_epochs):
        # Training pass: one optimizer step per batch, summing the batch losses.
        sess.run(tr_init)  # point the iterator at the training data
        tot_loss = 0
        for i in range(n_batches):
            try:
                # For dropout, feed keep_prob here (e.g. 0.75).
                _, loss_value = sess.run([training_op, loss])
                tot_loss += loss_value
            except tf.errors.OutOfRangeError:
                print("out of range on iter {}".format(i))
                break

        # Training accuracy: `accuracy` is a per-batch count of correct
        # predictions, so sum it over every batch and divide by the row count.
        sess.run(tr_init)  # re-initialise on the training data
        total_correct_preds = 0
        try:
            while True:
                # For dropout, feed keep_prob = 1 here.
                total_correct_preds += sess.run(accuracy)
        except tf.errors.OutOfRangeError:
            # Expected: signals that the dataset has been fully consumed.
            pass
        tr_acc = total_correct_preds / train_n

        # Test loss: summed batch losses, no optimizer step.
        sess.run(te_init)  # point the iterator at the test data
        test_tot_loss = 0
        for i in range(n_batches_test):
            try:
                test_tot_loss += sess.run(loss)
            except tf.errors.OutOfRangeError:
                print("out of range on iter {}".format(i))
                break

        # Test accuracy, computed the same way as the training accuracy.
        sess.run(te_init)  # re-initialise on the test data
        total_correct_preds = 0
        try:
            while True:
                total_correct_preds += sess.run(accuracy)
        except tf.errors.OutOfRangeError:
            # Expected: signals that the dataset has been fully consumed.
            pass
        te_acc = total_correct_preds / test_n

        epoch_time = time.time()
        print("Epoch: {}, Train_Loss: {:.4f}, Test_Loss: {:.4f}, Train_Accuracy: {:.4f}, Test_Accuracy: {:.4f}"\
              .format(epoch, tot_loss, test_tot_loss, tr_acc, te_acc))
        cum_time = epoch_time - start_time
        # Order must match `columns`: t-plus, loss, accuracy, test_loss,
        # test_accuracy. Previously test loss and train accuracy were swapped.
        summaries.iloc[epoch, :] = [cum_time, tot_loss, tr_acc, test_tot_loss, te_acc]
     

# Closing remarks