In [1]:
%matplotlib inline
import pandas as pd
import tensorflow as tf
import numpy as np
from statsmodels.tsa.stattools import coint
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns

In [2]:
#Loading Pepsi and Coca Cola datasets
pep = pd.read_csv('data/market_data/KO.csv')
ko = pd.read_csv('data/market_data/PEP.csv')

In [3]:
#Merging the datasets
df = pd.merge(left=ko,right=pep,left_on='dt',right_on='dt')

In [4]:
#Calculating difference between lowest and highest price at that timestep for each company separately
df['x_low_high_diff'] = df.high_x - df.low_x
df['y_low_high_diff'] = df.high_y - df.low_y

In [5]:
#If I try predicting each company's stock price separately will face the problem of seasonality. 
#Thats's why I calculate ratio of a few variables between the 2 similar companies and later will try to predict the ratio's grouth

df['close_ratio'] = df.close_x/df.close_y
df['open_ratio'] = df.open_x/df.open_y
df['volume_ratio'] = df.volume_x/df.volume_y
df['diff_ratio'] = df.x_low_high_diff/df.y_low_high_diff

In [6]:
df['dt'] = pd.to_datetime(df.dt)

df['month'] = df.dt.dt.month
df['year'] = df.dt.dt.year
df['dow'] = df.dt.dt.dayofweek
df['hour'] = df.dt.dt.hour

In [7]:
df['open_ratio_grouth'] = 1
df['close_ratio_grouth'] = 1
df['open_close_ratio_grouth'] = 1
df['volume_ratio_grouth'] = 1
df['diff_ratio_grouth'] = 1 

df.loc[1:,'open_ratio_grouth'] = np.array(df['open_ratio'].iloc[1:])/np.array(df['open_ratio'].iloc[:-1])
df.loc[1:,'close_ratio_grouth'] = np.array(df['close_ratio'].iloc[1:])/np.array(df['close_ratio'].iloc[:-1])
df.loc[1:,'volume_ratio_grouth'] = np.array(df['volume_ratio'].iloc[1:])/np.array(df['volume_ratio'].iloc[:-1])
df.loc[1:,'diff_ratio_grouth'] = np.array(df['diff_ratio'].iloc[1:])/np.array(df['diff_ratio'].iloc[:-1])


df.loc[:,'open_close_ratio_grouth'] = df['close_ratio']/df['open_ratio']
df.loc[:,'open_close_ratio_grouth'] = df['close_ratio']/df['open_ratio']


df.drop(df.index[0],inplace=True,axis=0)

  # Remove the CWD from sys.path while we load stuff.
  # Remove the CWD from sys.path while we load stuff.


In [8]:
df.head()

Unnamed: 0,dt,open_x,high_x,low_x,close_x,volume_x,open_y,high_y,low_y,close_y,...,diff_ratio,month,year,dow,hour,open_ratio_grouth,close_ratio_grouth,open_close_ratio_grouth,volume_ratio_grouth,diff_ratio_grouth
1,2010-01-04 10:00:00,60.71,60.98,60.7,60.951,832628,57.0,57.11,56.9,57.01,...,1.333333,1,2010,0,10,0.994941,1.003959,1.003794,1.333854,0.751515
2,2010-01-04 10:15:00,60.95,61.099,60.87,61.055,1123591,57.005,57.16,56.95,57.16,...,1.090476,1,2010,0,10,1.003865,0.999078,0.999006,0.986506,0.817857
3,2010-01-04 10:30:00,61.055,61.17,61.02,61.14,1387369,57.16,57.22,57.11,57.21,...,1.363636,1,2010,0,10,0.999006,1.000517,1.000517,1.004114,1.250496
4,2010-01-04 10:45:00,61.14,61.18,61.04,61.062,1644542,57.2,57.22,57.11,57.14,...,1.272727,1,2010,0,10,1.000692,0.999948,0.999773,1.036124,0.933333
5,2010-01-04 11:00:00,61.07,61.14,61.0,61.01,2001837,57.14,57.1685,57.05,57.09,...,1.181435,1,2010,0,11,0.999904,1.000023,0.999892,1.084119,0.92827


In [9]:
#subtract 1 from ratio_grouth, so if ratio decreases, ratio_grouth be negative. 
#It makes loss calculation easier and debugging, exploratory analysis more clear.  
df.loc[:,['open_ratio_grouth','close_ratio_grouth','open_close_ratio_grouth']]-=1
#because initial weights of tensorflow fully connected layers are a bit high for our target value,
#in stead of setting custom smaller weights I decided to make the target a bit larger by multiplying with 10
df.loc[:,'close_ratio_grouth']*=10

#exploring data after artificial grouth
ea_df = df[df.open_ratio>2]

In [10]:
df = pd.get_dummies(df,columns=['month','year','dow','hour'])

In [11]:
#Because open and close ratios can have special behaviour in the beginning of day or week,
#I multiply ratio grouths with respective dummy variables
df['org_9'] = df['open_ratio_grouth']*df['hour_9'] 
df['org_0_9'] = df['open_ratio_grouth']*df['hour_9']*df['dow_0'] 
df['crg_4_16'] = df['close_ratio_grouth']*df['hour_16']*df['dow_4']
df['crg_16'] = df['close_ratio_grouth']*df['hour_16']

In [12]:
scaler = MinMaxScaler(feature_range=(-0.5,0.5))
df[['org_9','org_0_9','high_x','low_x','high_y','low_y','close_ratio','open_close_ratio_grouth','volume_ratio_grouth','open_ratio','open_ratio_grouth']] = scaler.fit_transform(df[['org_9','org_0_9','high_x','low_x','high_y','low_y','close_ratio','open_close_ratio_grouth','volume_ratio_grouth','open_ratio','open_ratio_grouth']])

In [13]:
#All the features to be used to make prediction 
cols = ['org_9','org_0_9','crg_4_16','crg_16','high_x','low_x','high_y','low_y', 'month_1', 'month_2',
       'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8',
       'month_9', 'month_10', 'month_11', 'month_12', 'year_2010', 'year_2011',
       'year_2012', 'year_2013', 'year_2014', 'year_2015', 'year_2016',
       'year_2017', 'dow_0', 'dow_1', 'dow_2', 'dow_3', 'dow_4', 'hour_9',
       'hour_10', 'hour_11', 'hour_12', 'hour_13', 'hour_14', 'hour_15',
       'hour_16','close_ratio','open_close_ratio_grouth','volume_ratio_grouth','open_ratio','open_ratio_grouth','close_ratio_grouth']

In [36]:
batch_size = 32
#number of features
input_size = len(cols)
#number of features to be used in the time step of the target value
#As our goal is predicting close ratio grouth I also use the open_ratio and open_ratio_grouth of the target's time time step
last_input_size = 2
#how many timesteps to look back
num_steps = 140
#how many timesteps forward to predict - 26 is nearly a day ahead
max_iter = 26
num_lstm_layers = 2
lstm_size = 68
learning_rate = 0.00001
epochs = 4

data = df[cols].values

In [37]:
def build_inputs(batch_size,num_steps,input_size,last_input_size):
    inputs = tf.placeholder(shape=(batch_size,num_steps,input_size),dtype=tf.float32,name='inputs')
    opens = tf.placeholder(shape=(batch_size,last_input_size),dtype=tf.float32,name='opens')
    targets = tf.placeholder(shape=(batch_size,max_iter,1),dtype=tf.float32,name='targets')
    is_training = tf.placeholder(dtype=tf.bool,name='is_training')
    
    return inputs,opens,targets

In [38]:
def build_lstm(lstm_size,num_layers,batch_size):
    
    stacked_rnn = []
    for layer in range(num_layers):
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
        drop = tf.contrib.rnn.DropoutWrapper(lstm,output_keep_prob=0.7)
        stacked_rnn.append(drop)
    
    cell = tf.contrib.rnn.MultiRNNCell(cells=stacked_rnn, state_is_tuple=True)
    
    initial_state = cell.zero_state(batch_size,dtype=tf.float32)
    
    return cell, initial_state

In [39]:
def build_output(d_outputs,opens,in_size,out_size=1):
    
    d_outputs = tf.reshape(d_outputs,(-1,in_size))
    
    #MLP
    x1 = tf.contrib.layers.fully_connected(d_outputs,int(1.4*in_size),activation_fn=None)
    x1 = tf.layers.batch_normalization(x1)
    #leaky rely
    x1 = tf.maximum(x1,x1*0.2)
    x1 = tf.nn.dropout(x1,0.7)
    
    #in this step we concat the timeseries output we target timestep's values
    x2 = tf.contrib.layers.fully_connected(x1,in_size,activation_fn=None)
    x2 = tf.layers.batch_normalization(x2)
    x2 = tf.maximum(x2,x2*0.2)
    x2 = tf.nn.dropout(x2,0.7)
    
    x3 = tf.contrib.layers.fully_connected(x2,int(0.5*in_size),activation_fn=None)
    x3 = tf.layers.batch_normalization(x3)
    x3 = tf.maximum(x3,x3*0.2)
    x3 = tf.nn.dropout(x3,0.7)
    
    outputs = tf.contrib.layers.fully_connected(x3,out_size,activation_fn=None)
    
    return outputs

In [40]:
def build_loss(outputs, targets, out_size=1):
    
    targets = tf.reshape(targets, tf.shape(outputs))
    #simply calculating  squared difference
    loss1 = tf.reduce_mean(tf.squared_difference(outputs,targets))
    
    #calculate number of deals where predicted grouth but we had decrease and vice-versa
    z = tf.reshape(tf.multiply(outputs,targets),(-1,))
    x=tf.zeros((tf.shape(outputs)[0],))
    y=tf.ones((tf.shape(outputs)[0],))
    bad_deals = tf.where(z > 0, x, y)

    loss2 = tf.multiply(tf.reduce_mean(bad_deals),0.09)
    
    
    loss = tf.add(loss1,loss2)
    
    return loss

In [41]:
def build_decoder(state,targets):
    return NotImplemented

In [44]:
class Network:
    def __init__(self,batch_size,num_steps,lstm_size,num_lstm_layers,last_input_size,input_size,learning_rate):
        tf.reset_default_graph()
        
        self.inputs,self.opens,self.targets = build_inputs(batch_size,num_steps,input_size,last_input_size)
        
        cell, self.initial_state = build_lstm(lstm_size,num_lstm_layers,batch_size)
        
        _, state = tf.nn.dynamic_rnn(cell=cell,initial_state=self.initial_state,inputs=self.inputs)
        
        self.final_state = state
        print(state[-1])
        self.d_outputs, _ = build_decoder(state[-1],self.targets)
        
        self.outputs = build_output(self.d_outputs,self.opens,lstm_size)
        
        self.loss = build_loss(self.outputs,self.targets,batch_size)
        
        self.opt = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

In [45]:
model = Network(batch_size=batch_size, num_steps=num_steps,
                lstm_size=lstm_size, num_lstm_layers=num_lstm_layers, 
                last_input_size = last_input_size, input_size=input_size,learning_rate=learning_rate)

LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_5:0' shape=(32, 68) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_6:0' shape=(32, 68) dtype=float32>)


TypeError: 'NotImplementedType' object is not iterable

In [None]:
def get_batch():
    l=(len(data)-num_steps*batch_size-max_iter)//4
    #passing thru all timesteps
    for i in range(l):
                
        #collecting batch from 4 different time fields
        j = len(data)//4
        ids = list(range(i,(i+(max_iter+num_steps)*batch_size//4))) + list(range(j,(j+(max_iter+num_steps)*batch_size//4))) + list(range(-1*(j+(max_iter+num_steps)*batch_size//4)-1,-j-1)) + list(range(-1*(i+(max_iter+num_steps)*batch_size//4)-1,-i-1))
        
        batch = np.reshape(data[ids],(batch_size,num_steps+max_iter,input_size))

        yield batch[:,:-max_iter],batch[:,-max_iter:,-4:-2],batch[:,-max_iter:,-1:],(l-i)<=100#x,x_open,y,test_batch(last 100 batches are used for testing)

In [None]:
#calculating proportion of deals where we managed to correctly predict if the ratio will rise or fall
def won_deal_prop(y,y_hat):
    y = np.reshape(y,(-1,1))
    win_deals = sum((y_hat[:,0]*y[:,0])>0)
    return win_deals,win_deals/len(y)

In [None]:
saver = tf.train.Saver(max_to_keep=100)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
   
    for e in range(epochs):
        
        total_train_loss,total_test_loss=0,0
        total_train_won_deals,total_test_won_deals=0,0
        
        n_train_batches=0
        n_test_batches=0
        new_state = sess.run(model.initial_state)
        
        for x, x_open, y, test_batch in get_batch():

            feed = {model.inputs: x,model.opens:x_open,
                    model.targets: y,model.initial_state:new_state
                    }
                   
            fetch = [model.loss,model.final_state,model.outputs,model.opt,model.is_training:True]
            if test_batch:
                #Last batches must be only used for testing so setting another value just to escape from optimizing 
                fetch[-1]=model.d_outputs
                feed[model.is_training]=False
                
            batch_loss, new_state, y_hat, _ = sess.run(fetch,feed_dict=feed)
            batch_loss = batch_loss**0.5
            win_deals = sum((y_hat[:,0]*y[:,0])>0)
            win_deal_prop = win_deals/len(y)
                         
            if n_train_batches % 200 == 0 and not test_batch:
                print(e,n_train_batches,'loss',batch_loss,'won deals',win_deal_prop)
            
            if test_batch:
                total_test_loss+= batch_loss
                total_test_won_deals+=win_deals
                n_test_batches+=1  
            else:
                total_train_loss+= batch_loss
                total_train_won_deals+=win_deals
                n_train_batches+=1  
                     
        mean_test_loss=total_test_loss/n_test_batches    
        print('Train loss',total_train_loss/n_train_batches, 'Train won deals prop',total_train_won_deals/(n_train_batches*len(y)))
        print('Test loss',mean_test_loss, 'Test won deals prop',total_test_won_deals/(n_test_batches*len(y)))
        path = 'checkpoints/long_run_e{0}_loss{1}'.format(e,mean_test_loss)
        saver.save(sess,save_path=path)

In [None]:
#draft
seq_length =  [max_iter for _ in range(batch_size)]
    
cell = tf.contrib.rnn.BasicLSTMCell(in_size)

helper = tf.contrib.seq2seq.TrainingHelper(targets,seq_length)
    
decoder = tf.contrib.seq2seq.BasicDecoder(cell=cell,helper=helper,initial_state=state[-1])
d_outputs,_,_ = tf.contrib.seq2seq.dynamic_decode(decoder,max_iter)