In [None]:
%matplotlib inline
import pandas as pd
import tensorflow as tf
import numpy as np
from statsmodels.tsa.stattools import coint
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load each ticker's price history into the variable named after it.
# The merge cell puts `ko` on the left, so with this assignment the `_x`
# columns are Coca-Cola (KO) and the `_y` columns are PepsiCo (PEP).
ko = pd.read_csv('data/market_data/KO.csv')
pep = pd.read_csv('data/market_data/PEP.csv')

In [None]:
# Join the two price tables on their shared `dt` column. Overlapping column
# names get the `_x` suffix for the left table and `_y` for the right one.
df = ko.merge(right=pep, left_on='dt', right_on='dt')

In [None]:
# Closing-price ratio of the left (_x) instrument over the right (_y) one.
df['ratio'] = df['close_x'].div(df['close_y'])

In [None]:
# Convert the `dt` column to datetimes so the `.dt` accessor works below.
df['dt'] = pd.to_datetime(df['dt'])

In [None]:
# Derive calendar features from the timestamp; they are one-hot encoded in
# the next cell.
df = df.assign(
    month=df['dt'].dt.month,
    year=df['dt'].dt.year,
    dow=df['dt'].dt.dayofweek,
    hour=df['dt'].dt.hour,
)

In [None]:
# Replace the calendar columns with one indicator column per distinct value
# (named month_<n>, year_<n>, dow_<n>, hour_<n>).
df = pd.get_dummies(
    data=df,
    columns=['month', 'year', 'dow', 'hour'],
)

In [None]:
# Row-over-row growth factor of the price ratio: ratio[t] / ratio[t-1].
# The first row has no predecessor, so it keeps the placeholder value 1.0.
# The whole column is built first and assigned once, avoiding the chained
# `df[col].iloc[...] = ...` assignment (unreliable, and a silent no-op under
# pandas Copy-on-Write) and the int column that was then filled with floats.
ratio_values = np.asarray(df['ratio'], dtype=float)
growth = np.ones(len(ratio_values))
growth[1:] = ratio_values[1:] / ratio_values[:-1]
df['ratio_grouth'] = growth

In [None]:
# Drop the first row, whose ratio_grouth is only a placeholder (it has no
# previous row to compare against). `.copy()` makes the result an independent
# frame, so later column assignments do not write into a slice of the old one
# (which triggers SettingWithCopyWarning).
df = df.iloc[1:].copy()

In [None]:
# Line chart of the growth factor over time.
df.plot.line(x='dt', y='ratio_grouth')

In [None]:
# Cointegration test between the two opening-price series, restricted to
# rows dated after 2013. Only the first two returned values are kept.
recent = df[df['dt'].dt.year > 2013]
score, value, _ = coint(recent['open_y'], recent['open_x'])

In [None]:
def build_inputs(batch_size,num_steps,input_size):
    """Create the float32 placeholders that feed the network.

    Returns:
        (inputs, targets): `inputs` has shape
        (batch_size, num_steps, input_size) and `targets` has shape
        (batch_size, 1).
    """
    sequence_shape = (batch_size, num_steps, input_size)
    target_shape = (batch_size, 1)

    inputs = tf.placeholder(tf.float32, shape=sequence_shape, name='inputs')
    targets = tf.placeholder(tf.float32, shape=target_shape, name='targets')
    return inputs, targets

def build_lstm(lstm_size,num_layers,batch_size,keep_prob=0.7):
    """Build a stack of dropout-wrapped LSTM cells and its zero state.

    Args:
        lstm_size: number of units in each LSTM layer.
        num_layers: number of stacked LSTM layers.
        batch_size: batch size used to size the initial state.
        keep_prob: output keep probability of the dropout wrapper around
            each layer. Defaults to 0.7, the value that used to be
            hard-coded.

    Returns:
        (cell, initial_state): the MultiRNNCell and its float32 zero state.
    """
    # Each layer needs its own cell instance, so build them one by one.
    stacked_rnn = [
        tf.contrib.rnn.DropoutWrapper(
            tf.contrib.rnn.BasicLSTMCell(lstm_size),
            output_keep_prob=keep_prob)
        for _ in range(num_layers)
    ]

    cell = tf.contrib.rnn.MultiRNNCell(cells=stacked_rnn, state_is_tuple=True)
    initial_state = cell.zero_state(batch_size,dtype=tf.float32)

    return cell, initial_state

def build_output(lstm_output,in_size,out_size=1):
    """Map the LSTM outputs to `out_size` regression outputs.

    The LSTM output is reshaped to rows of width `in_size`, passed through
    three dense stages (fully connected -> batch normalization ->
    leaky activation with slope 0.2 -> dropout) and finally through a linear
    layer.

    Args:
        lstm_output: tensor produced by the recurrent layers.
        in_size: width each row is reshaped to; also the width of the last
            dense stage.
        out_size: number of output values per row.

    Returns:
        Tensor of shape (-1, out_size).
    """
    hidden = tf.reshape(tensor=lstm_output, shape=(-1, in_size))

    # (units, dropout keep probability) for each dense stage, in order.
    layer_specs = (
        (int(in_size*1.5), 0.8),
        (int(in_size*1.3), 0.8),
        (in_size, 0.6),
    )
    for units, keep_prob in layer_specs:
        hidden = tf.contrib.layers.fully_connected(hidden, units)
        # NOTE(review): batch_normalization is called with its default
        # `training` argument and dropout is applied unconditionally -
        # confirm this is intended for both training and inference.
        hidden = tf.layers.batch_normalization(hidden)
        hidden = tf.maximum(hidden, hidden*0.2)
        hidden = tf.nn.dropout(hidden, keep_prob)

    with tf.variable_scope('logits'):
        logits_w = tf.Variable(tf.truncated_normal((in_size,out_size),stddev=0.1))
        logits_b = tf.Variable(tf.zeros(out_size))

    return tf.add(tf.matmul(hidden,logits_w),logits_b)

def build_loss(logits,targets):
    """Return the mean squared error between `logits` and `targets`."""
    squared_errors = tf.squared_difference(logits, targets)
    return tf.reduce_mean(squared_errors)

def build_optimizer(loss, grad_clip, learning_rate=0.000001):
    """Create the training op: Adam with global-norm gradient clipping.

    Args:
        loss: scalar loss tensor to minimise.
        grad_clip: maximum global norm the gradients are clipped to, used to
            control exploding gradients.
        learning_rate: Adam learning rate. Defaults to 0.000001, the value
            that used to be hard-coded.

    Returns:
        The op that applies the clipped gradients to all trainable variables.
    """
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss, tvars), grad_clip)

    adam = tf.train.AdamOptimizer(learning_rate)
    return adam.apply_gradients(zip(grads, tvars))

class Network:
    """Recurrent regression model assembled from the build_* helpers.

    Attributes set by the constructor: `inputs`, `targets`, `initial_state`,
    `final_state`, `logits`, `loss` and `opt`.
    """

    def __init__(self,batch_size,num_steps,lstm_size,num_layers,input_size,grad_clip=5):
        # Start from an empty default graph so the notebook cell can be re-run.
        tf.reset_default_graph()

        self.inputs, self.targets = build_inputs(batch_size, num_steps, input_size)

        cell, self.initial_state = build_lstm(lstm_size, num_layers, batch_size)

        outputs, self.final_state = tf.nn.dynamic_rnn(
            cell=cell,
            inputs=self.inputs,
            initial_state=self.initial_state,
        )

        # The outputs of all time steps are flattened into one row per
        # sequence before the dense head.
        flat_size = num_steps*lstm_size
        self.logits = build_output(outputs, flat_size, 1)

        self.loss = build_loss(self.logits, self.targets)
        self.opt = build_optimizer(self.loss, grad_clip)
        

In [None]:
# Feature columns fed to the network, in order: daily aggregates, scaled
# price/volume columns, calendar indicator columns and the growth factor.
cols = (
    ['d_ratio']
    + ['d_dow_%d' % day for day in range(5)]
    + ['high_x', 'low_x', 'volume_x', 'high_y', 'low_y', 'volume_y']
    + ['month_%d' % month for month in range(1, 13)]
    + ['year_%d' % year for year in range(2010, 2018)]
    + ['dow_%d' % day for day in range(5)]
    + ['hour_%d' % hour for hour in range(9, 17)]
    + ['ratio_grouth']
)

In [None]:
# Rescale the price and volume columns with a MinMaxScaler fitted on the
# whole frame.
scaled_cols = ['high_x','low_x','volume_x','high_y','low_y','volume_y']
scaler = MinMaxScaler()
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])

In [None]:
# Per-calendar-day means of every column; the daily mean of `ratio` is
# exposed as `d_ratio`.
by_day = (
    df.groupby(df['dt'].dt.date)
      .mean()
      .rename(columns={'ratio': 'd_ratio'})
)

# One-hot encode each day's weekday as d_dow_<n> columns.
by_day['d_dow'] = pd.to_datetime(by_day.index).dayofweek
by_day = pd.get_dummies(by_day, columns=['d_dow'])

In [None]:
# Hyper-parameters: sequences per batch, rows per sequence, features per row.
batch_size = 64
input_length = 30
input_size = len(cols)

# Keep only the most recent rows: a whole number of batches plus two extra.
rows_per_batch = batch_size*input_length
n_batches = len(df)//rows_per_batch
size = n_batches*rows_per_batch

df = df[-(size + 2):]

model = Network(
    batch_size=batch_size,
    num_steps=input_length,
    lstm_size=64,
    num_layers=3,
    input_size=input_size,
)

In [None]:
def get_batch():
    """Yield (x, y) training batches from sliding windows over `df`.

    Reads the notebook globals `df`, `by_day`, `cols`, `size`, `batch_size`,
    `input_length` and `input_size`.

    Each window holds batch_size*input_length+1 consecutive rows of `df`. It
    is split into `batch_size` sequences of `input_length` rows, and every
    sequence is paired row by row with the last `input_length` daily rows of
    `by_day` up to that sequence's final date.

    Yields:
        x: array of shape (batch_size, input_length, input_size) holding the
           `cols` features.
        y: array of shape (batch_size, 1) holding `ratio_grouth` of the row
           that immediately follows each sequence.
    """
    daily_cols = ['d_ratio','d_dow_0', 'd_dow_1', 'd_dow_2', 'd_dow_3', 'd_dow_4']
    window = input_length*batch_size

    # NOTE(review): 841 is an unexplained start offset - presumably it skips
    # rows until enough daily history exists; confirm.
    for i in range(841,size-window-1):
        batch = df.iloc[i:i+window+1,1:]

        # Collect the daily slices in a list and concatenate once, instead of
        # growing a DataFrame inside the loop.
        daily_parts = []
        for u in range(0,len(batch)-input_length,input_length):
            # NOTE(review): no visible cell creates a `date` column on df -
            # confirm where it comes from.
            to_ = batch.date.iloc[u+input_length-1]
            # The daily window must be input_length rows long so the daily
            # features line up one-to-one with the rows of the sequence.
            daily_parts.append(by_day[by_day.index<=to_].iloc[-input_length:][daily_cols])

        # Zero padding row for the extra (target-only) last row of the window.
        daily_parts.append(pd.DataFrame([[0]*len(daily_cols)], columns=daily_cols))
        temp = pd.concat(daily_parts,axis=0,ignore_index=True)

        # Positional index so the column-wise concat aligns row by row.
        batch = batch.reset_index()
        batch = pd.concat([batch,temp],axis=1)

        x = np.reshape(np.array(batch[cols].iloc[:-1]), (batch_size,input_length,input_size) )

        # Shift by one row and keep the last step of every sequence: the
        # target is the growth factor of the row right after the sequence.
        y = np.reshape(np.array(batch['ratio_grouth'].iloc[1:]), (batch_size,input_length,1) )[:,-1,:]

        yield x,y

In [None]:
epochs = 4
# NOTE(review): the saver is created but nothing in this cell writes a
# checkpoint - confirm whether saving was intended.
saver = tf.train.Saver(max_to_keep=100)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for e in range(epochs):
        total_loss=0
        # Each epoch starts from the zero state; the state is then carried
        # from one batch to the next.
        new_state = sess.run(model.initial_state)
        n_batches=0
        for x, y in get_batch():
            feed = {model.inputs: x,
                    model.targets: y,
                    model.initial_state:new_state
                    }

            batch_loss, new_state, y_hat, _ = sess.run([model.loss,
                                                 model.final_state,
                                                 model.logits,
                                                 model.opt],
                                                 feed_dict=feed)

            # Periodically print one sample prediction next to its target.
            if n_batches % 200 == 0:
                print('e',e,'n',n_batches,'pred',y_hat[3],'y',y[3])

            # Accumulate the per-batch RMSE (square root of the MSE loss).
            total_loss+= batch_loss**0.5
            n_batches+=1

        # Guard against an empty generator, which would otherwise raise
        # ZeroDivisionError here.
        if n_batches:
            print('mean',total_loss/n_batches)
        else:
            print('mean','n/a (get_batch yielded no batches)')

