In [2]:
import os
import sys
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import tensorflow as tf
import numpy as np
import scipy as sp
import sklearn as sk

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [3]:
STOCK_NAMES = ['AAPL','MSFT','AMZN','GOOG','BRKB',
              'FB','JNJ','JPM','XOM','V','WMT','BAC',
              'UNH','PFE','WFC','VZ','PG','CVX','T',
              'INTC','CSCO','HD','MA','KO','BA',
              'MRK','ORCL','DIS','CMCSA','PEP','C',
              'MCD','PM','DWDP','ABBV','NFLX','MDT',
              'ABT','NVDA','AMGN','NKE','ADBE','MO','LLY',
              'MMM','IBM','HON','UNP','ACN',
              'UTX']

PROJECT_PATH = '/pine/scr/s/i/siyangj/DeepStock/FinalProject/'
DATA_PATH = os.path.join(PROJECT_PATH,'Data/')

VOLUME_NORMALIZATION_ID = 'VOLUME_NORMALIZATION'

DATA_XY = os.path.join(DATA_PATH,'XY_sequence.h5')
X_ID = 'X'
Y_ID = 'Y'

NUM_DAYS = 5
NUM_PER_DAY = 388
NUM_STOCKS = 50
NUM_PER_STOCK = 5

In [4]:
DATA_XY

'/pine/scr/s/i/siyangj/DeepStock/FinalProject/Data/XY_sequence.h5'

In [56]:
#set FLAG and get data
class BaseConfig(object):
    seq_length=NUM_PER_DAY      #seq lenght
    batch_size=NUM_DAYS-1      #batch_size
    feature_num=NUM_STOCKS*NUM_PER_STOCK      #dim of a seq
    y_size=NUM_STOCKS
    lstm_size=64   #hidden layer units
    lstm_layers=6
    keep_prob=0.8
    lr=0.0001        #learn rate
    sep=0.8         #train and test sep
    epoch_size=10000 #train number
    save_path='./ckpt/'
    
config=BaseConfig()

In [9]:
data_X = pd.read_hdf(DATA_XY,X_ID).values
data_Y = pd.read_hdf(DATA_XY,Y_ID).values

In [25]:
int(data_X.shape[0] / config.seq_length)

97

In [21]:
a = np.array([[1,2,3],[4,5,6],[7,8,9],[10,11,12]])
a.reshape([2,2,3])[0]

array([[1, 2, 3],
       [4, 5, 6]])

In [31]:
def PrepareData(data_X,data_Y,config):
    
    X,y = [],[]
    total_batch = int(data_X.shape[0] / config.seq_length)
    for i in range(total_batch):
        X.append(data_X[i*config.seq_length:(i+1)*config.seq_length,:])
        y.append(data_Y[i*config.seq_length:(i+1)*config.seq_length,:])
    
    train_size=int(config.sep*len(X))
    split_index=[1]*train_size
    split_index.extend([0] * (len(X) - train_size))
    np.random.shuffle(split_index)

    #division all_data into train and test data
    train_X,train_y,test_X,test_y=[],[],[],[]
    for i,v in enumerate(split_index):
        if v==0:
            test_X.append(X[i])
            test_y.append(y[i])
        else:
            train_X.append(X[i])
            train_y.append(y[i])
    train_X=np.array(train_X).astype('float32')
    train_y=np.array(train_y).astype('float32')
    test_X=np.array(test_X).astype('float32')
    test_y=np.array(test_y).astype('float32')
    return train_X,train_y,test_X,test_y

In [32]:
train_X,train_y,test_X,test_y=PrepareData(data_X,data_Y,config)

In [54]:
#general W
def W_var(in_dim,out_dim):
    return tf.Variable(tf.random_normal([in_dim,out_dim]),tf.float32)

#general b
def b_var(out_dim):
    return tf.Variable(tf.random_normal([out_dim,]),tf.float32)

#lstm : 64 lstm_size, 2 lstm_layer

def lstm_cell(config,keep_prob):
    temp=tf.contrib.rnn.BasicLSTMCell(config.lstm_size,reuse=False)
    drop = tf.nn.rnn_cell.DropoutWrapper(temp, output_keep_prob=keep_prob)
    return drop

def lstm_layers(config,X,keep_prod):
    #input
    
    stacked_lstm = tf.contrib.rnn.MultiRNNCell(
    [lstm_cell(config,keep_prod) for _ in range(config.lstm_layers)])
    initial_state = stacked_lstm.zero_state(config.batch_size, tf.float32)
    
    outputs, final_state = tf.nn.dynamic_rnn(stacked_lstm, X, 
          initial_state=initial_state)
    return outputs,final_state
        
def output_layers(config,output_lstm):
    in_size=output_lstm.get_shape()[-1].value
    #output_lstm=output_lstm[:,-1,:]
    output_lstm=tf.reshape(output_lstm,[-1,in_size])
    W=W_var(in_size,config.y_size)
    b=b_var(config.y_size)
    ###################################
    ## TODO
    ## Should change to tf.nn.xw_plus_b
    #output_final=tf.add(tf.matmul(output_lstm,W),b)
    output_final=tf.nn.xw_plus_b(output_lstm,W,b)
    output_final = tf.reshape(output_final,[config.batch_size,config.seq_length,config.y_size])
    return output_final

def loss_function(config,pred,Y):
    
    '''
    Not for regression. Damn.
    loss = tf.contrib.seq2seq.sequence_loss(
        pred,
        Y,
        tf.ones([config.batch_size, config.seq_length]),
        average_across_timesteps=True,
        average_across_batch=True)
    '''

    # Update the cost
    # cost = tf.reduce_sum(loss)
    # print(pred.shape,Y.shape)
    cost=tf.reduce_mean(tf.square(pred-Y))
    return cost

def optimizer_function(config,loss):
    opt=tf.train.AdamOptimizer(config.lr).minimize(loss)
    return opt


class train_body:

    def __init__(self):
        self.X_placehold=tf.placeholder(tf.float32, [config.batch_size,config.seq_length,config.feature_num])
        self.Y_placehold=tf.placeholder(tf.float32, [config.batch_size,config.seq_length,config.y_size])
        self.keep_prod=tf.placeholder(tf.float32)
        self.output_lstm,_=lstm_layers(config,self.X_placehold,self.keep_prod)

        self.output_final=output_layers(config,self.output_lstm)

        self.loss=loss_function(config,self.output_final,self.Y_placehold)

        self.opt=optimizer_function(config,self.loss)

        

In [60]:
def myrun():
    
    tb=train_body()

    #save model

    saver=tf.train.Saver(tf.global_variables())
    with tf.Session() as sess:
        
        tf.global_variables_initializer().run()
        
        range(int(len(train_X)/config.batch_size))

        for e in range(config.epoch_size):
            loss_values1=np.array([])
            for i in range(int(len(train_X)/config.batch_size)):
                
                tempx=train_X[i*config.batch_size:i*config.batch_size+config.batch_size]
                tempy=train_y[i*config.batch_size:i*config.batch_size+config.batch_size]
                #print(tempx)
                tmp_loss_value,_=sess.run([tb.loss,tb.opt],feed_dict={tb.X_placehold:tempx,
                                                                      tb.Y_placehold:tempy,
                                                                      tb.keep_prod:config.keep_prob})
                loss_values1=np.append(loss_values1,tmp_loss_value)
                
            if e%10==0:
                loss_values2=np.array([])
                for i in range(int(len(test_X)/config.batch_size)):

                    tempx=test_X[i*config.batch_size:i*config.batch_size+config.batch_size]
                    tempy=test_y[i*config.batch_size:i*config.batch_size+config.batch_size]
                    #print(tempx)
                    tmp_loss_value=sess.run([tb.loss],feed_dict={tb.X_placehold:tempx,
                                                                 tb.Y_placehold:tempy,
                                                                 tb.keep_prod:1})
                    loss_values2=np.append(loss_values2,tmp_loss_value)
               
                print('std is: ',train_y.std())
                print('ephoch: '+ str(e)+'\ntrain loss is: '+str(loss_values1.mean())
                      +'; test loss is: ' + str(loss_values2.mean()))
                #print('ephoch: '+ str(e)+'\ntrain loss is: '+str(loss_values1.mean()))
                print ("save model:",saver.save(sess,config.save_path+'ckpt-{:d}'.format(e)))
myrun()

std is:  0.0006764979
ephoch: 0
train loss is: 0.8965866565704346; test loss is: nan
save model: ./ckpt/ckpt-0


  ret = ret.dtype.type(ret / rcount)


std is:  0.0006764979
ephoch: 10
train loss is: 0.819682776927948; test loss is: nan
save model: ./ckpt/ckpt-10
std is:  0.0006764979
ephoch: 20
train loss is: 0.7191542387008667; test loss is: nan
save model: ./ckpt/ckpt-20
std is:  0.0006764979
ephoch: 30
train loss is: 0.6048129200935364; test loss is: nan
save model: ./ckpt/ckpt-30
std is:  0.0006764979
ephoch: 40
train loss is: 0.5039969682693481; test loss is: nan
save model: ./ckpt/ckpt-40
std is:  0.0006764979
ephoch: 50
train loss is: 0.4146208167076111; test loss is: nan
save model: ./ckpt/ckpt-50
std is:  0.0006764979
ephoch: 60
train loss is: 0.3547157347202301; test loss is: nan
save model: ./ckpt/ckpt-60
std is:  0.0006764979
ephoch: 70
train loss is: 0.3207405209541321; test loss is: nan
save model: ./ckpt/ckpt-70
std is:  0.0006764979
ephoch: 80
train loss is: 0.3050563335418701; test loss is: nan
save model: ./ckpt/ckpt-80


KeyboardInterrupt: 

In [59]:
tf.reset_default_graph()