In [1]:
#1st step - Install all required libraries
"""
!pip install --upgrade pandas
!pip install --upgrade numpy
!pip install --upgrade matplotlib
!pip install --upgrade seaborn
!pip install --upgrade scipy
!pip install --upgrade tensorflow
!pip install --upgrade keras
!pip install --upgrade sklearn
"""

'\n!pip install --upgrade pandas\n!pip install --upgrade numpy\n!pip install --upgrade matplotlib\n!pip install --upgrade seaborn\n!pip install --upgrade scipy\n!pip install --upgrade tensorflow\n!pip install --upgrade keras\n!pip install --upgrade sklearn\n'

In [2]:
import scipy.signal as sci
import scipy.stats as scp
import pandas as pd
import matplotlib
matplotlib.use('nbagg')
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from math import sqrt
import math
import os
import json
import time
import seaborn as sns
from core.data_processor import DataLoader
from core.model import Model
import datetime
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils


Using TensorFlow backend.


In [3]:
#Parameters 
class Parameters(object):
    pass

param = Parameters()
param.tickSize = 0.5 #tick size is 0.5 GBp i.e. 0.005 GBP

param.fileDirectory = './intraday/'
param.trade_InSampleFile = 'trade_in.csv'
param.quote_InSampleFile = 'quote_in.csv'

param.trade_OutSampleFile = 'trade_out.csv'
param.quote_OutSampleFile = 'quote_out.csv'

In [4]:
param.fileDirectory

'./intraday/'

In [5]:
#Identify future mid prices - 30 seconds duration
def IdentifyFutureMidPrices(df, predictionDuration = 30):
    futDat = df[['datetime', 'mid']].rename(columns={'mid':'futMid'})
    futDat['datetime'] = futDat['datetime'] - pd.offsets.timedelta(seconds=int(predictionDuration))
    return pd.merge_asof(df, futDat, on='datetime', direction='backward')

def ReadCSV(file):
    print('Loading file - ' + file)
    df = pd.read_csv(file)
    df['datetime'] = pd.to_datetime(df['datetime'], format="%Y-%m-%dD%H:%M:%S.%f")
    return df

#Load data
def LoadData(path, tradeFile, quoteFile):
    tradeFile = os.path.join(path, tradeFile)
    quoteFile = os.path.join(path, quoteFile)

    trade_df = ReadCSV(tradeFile)
    quote_df = ReadCSV(quoteFile)
    
    quote_df['mid'] = 0.5*(quote_df['bid'].copy() + quote_df['ask'].copy())
    quote_df['midChangeGroup'] = quote_df['mid'].diff().ne(0).cumsum()
    quote_df = IdentifyFutureMidPrices(quote_df)
    return trade_df, quote_df

#Evaluation function
#df should contain columns - datetime, sym, bsize, bid, ask, asize, predMid (model predicted mid-price)
#Function to evaluate results and return RMS if needed
def RMS(df, ret = False):
    df = df.groupby(['midChangeGroup']).first().reset_index()
    tmp = df.dropna(subset=['predMid', 'futMid'])
    rms = sqrt(mean_squared_error(tmp['futMid'], tmp['predMid']))
    predCount = len(tmp['predMid'])
    print('RMS = %.4f. #Predictions = %s' % (rms, predCount))
    if ret == True:
        return rms

In [6]:
df_tradein,df_quotein   = LoadData(param.fileDirectory,param.trade_InSampleFile,param.quote_InSampleFile)
df_tradeout,df_quoteout = LoadData(param.fileDirectory,param.trade_OutSampleFile,param.quote_OutSampleFile)

Loading file - ./intraday/trade_in.csv
Loading file - ./intraday/quote_in.csv
Loading file - ./intraday/trade_out.csv
Loading file - ./intraday/quote_out.csv


In [7]:
df_quotein.head()

Unnamed: 0,datetime,sym,bsize,bid,ask,asize,mid,midChangeGroup,futMid
0,2018-02-01 05:00:09.686,TEST.L,11850,4800.0,5100.0,8450,4950.0,1,4950.0
1,2018-02-01 07:50:00.048,TEST.L,11850,4800.0,4884.0,237,4842.0,2,4842.0
2,2018-02-01 07:50:00.077,TEST.L,11965,4800.0,4884.0,237,4842.0,2,4842.0
3,2018-02-01 07:50:00.078,TEST.L,12084,4800.0,4884.0,237,4842.0,2,4842.0
4,2018-02-01 07:50:00.095,TEST.L,12384,4800.0,4884.0,237,4842.0,2,4842.0


In [8]:
df_tradein.head()

Unnamed: 0,datetime,sym,price,size
0,2018-02-01 07:50:00.077,TEST.L,4884.0,115
1,2018-02-01 07:50:00.078,TEST.L,4884.0,234
2,2018-02-01 07:50:00.095,TEST.L,4884.0,534
3,2018-02-01 07:50:00.177,TEST.L,4884.0,534
4,2018-02-01 07:50:00.179,TEST.L,4884.0,534


In [9]:
#Feature Engineering - contains all the important features to be used 
def feature_engg(df_quote,df_trade):
    df1 = df_quote[['datetime', 'bid', 'ask', 'bsize', 'asize','midChangeGroup','futMid','mid']]
    df2 = df_trade[['datetime', 'price', 'size']]

    # To find when the market opens 
    trade_rate = df2['size'].shift(-1)-df2['size']
    peaks = sci.find_peaks(x=-trade_rate, threshold=10000)[0]
    start = peaks[0]+1

    # Discarding data before market opening
    trade = df2[start:len(df2)]
    quote = df1[df1['datetime']>=df2.loc[start,'datetime']]
    
    # Finding change in bid_size and ask_size to spot sell orders and buy orders
    quote['b_change'] = quote['bsize']-quote['bsize'].shift(1)
    quote['a_change'] = quote['asize']-quote['asize'].shift(1)
    
    # Finding arrival rate
    quote['del_t'] = (quote['datetime'] - quote['datetime'].shift(1)).dt.total_seconds()
    quote.loc[quote['del_t'] == 0, 'del_t'] = np.nan
    quote['del_t'].fillna(method = 'ffill', inplace = True)
    quote['order_change'] = 0
    quote.loc[quote['b_change'] != 0, 'order_change'] = quote['b_change']
    quote.loc[quote['a_change'] != 0, 'order_change'] = quote['a_change']
    
    # VARIABLE 1 - arrival_rate = d(order_size)/(dt)
    quote['arrival_rate'] = quote['order_change']/quote['del_t']
    
    # Create a column 'merger' to match trade with quote based on trade price matching 
    # either the ask price or bid price (decided by change in bsize or asize) 
    quote['merger'] = quote['datetime']
    quote.reset_index(drop = True, inplace = True)
    quote.loc[quote[quote['b_change'] != 0].drop(0, axis = 0).index, 'merger'] = quote.loc[quote['b_change'] != 0, 'datetime'].drop(0, axis = 0).apply(str).values + '-' + quote.loc[quote['b_change'].shift(-1) != 0, 'bid'].apply(str).values[:-1]
    quote.loc[quote[quote['a_change'] != 0].drop(0, axis = 0).index, 'merger'] = quote.loc[quote['a_change'] != 0, 'datetime'].drop(0, axis = 0).apply(str).values + '-' + quote.loc[quote['a_change'].shift(-1) != 0, 'ask'].apply(str).values[:-1]

    # Cumulative count label to handle multiple trades and quotes at the same time instant
    quote['cum_count'] = quote.groupby('merger').cumcount()
    quote['merger'] = quote['cum_count'].apply(str) + ':' + quote['merger'].apply(str)

    # Adding cumulative count to merger to create unique values in merge column'
    trade['merger'] = trade['datetime'].apply(str) + '-' + trade['price'].apply(str)
    trade['cum_count'] = trade.groupby('merger').cumcount()
    trade['merger'] = trade['cum_count'].apply(str) + ':' + trade['merger'].apply(str)

    # Drop unnecessary column
    trade.drop('cum_count', axis = 1, inplace = True)
    quote.drop('cum_count', axis = 1, inplace = True)

    # Merge Quote and Trade based on the 'merger' column
    df = quote.merge(trade, how = 'left', on='merger')
    
    # Drop unnecessary columns
    df.drop(['datetime_y', 'a_change', 'b_change', 'merger', 'del_t', 'order_change'], inplace = True, axis = 1)

    # Finding accumalated imbalance
    df.set_index('datetime_x', inplace = True)
    df['Accum_Bid'] = df.loc[:,['bsize']].rolling('1s').sum()
    df['Accum_Ask'] = df.loc[:,['asize']].rolling('1s').sum()
    
    # VARIABLE 2 - integrated imbalance = (total bsize - total asize)/(total bsize + total asize) in 1s
    df['imbalance'] = (df['Accum_Bid'] - df['Accum_Ask'])/(df['Accum_Bid'] + df['Accum_Ask'])
    df.reset_index(inplace = True)
    
    # Trade size and Trade Price resolving NAs
    # VARIABLE 4 - size of Trade
    df['size'].fillna(0, inplace = True)
    
    # VARIABLE 5 - price of Trade
    df['price'].fillna(0, inplace = True)
        

    # Identifying buy/sell Trades and no Trades
    # VARIABLE 3 - Trade = +1 (Buy/Sell), 0 (No Trade)
    df['Trade'] = 0
    df.loc[df['price']!=0, 'Trade'] = 1
    
    # VARIABLE 7
    df['Buy'] = 0
    df.loc[df['price'] == df['ask'].shift(1), 'Buy'] = 1
    df.loc[df['price'] == df['bid'].shift(1), 'Buy'] = 0


    # VARIABLE 6 - spread = bid - ask
    df['spread'] = df['ask'] - df['bid']
    
    # Drop unnecessary columns
    df.drop(['bsize', 'asize', 'ask', 'bid', 'Accum_Bid', 'Accum_Ask'], inplace = True, axis = 1)
    
    return df.dropna().reset_index(drop = True)

In [10]:
df = feature_engg(df_quotein,df_tradein)

In [11]:
df.head()

Unnamed: 0,datetime_x,midChangeGroup,futMid,mid,arrival_rate,price,size,imbalance,Trade,Buy,spread
0,2018-02-01 08:00:18.248,7,4839.5,4832.75,458000.0,0.0,0.0,-0.273973,0,0,8.5
1,2018-02-01 08:00:18.248,8,4839.5,4834.5,-90500.0,0.0,0.0,-0.509378,0,0,5.0
2,2018-02-01 08:00:18.248,8,4839.5,4834.5,15500.0,0.0,0.0,-0.607748,0,0,5.0
3,2018-02-01 08:00:18.248,8,4839.5,4834.5,7500.0,0.0,0.0,-0.659197,0,0,5.0
4,2018-02-01 08:00:18.249,8,4839.5,4834.5,-31000.0,0.0,0.0,-0.701681,0,0,5.0


In [12]:
# Figure out timesteps of 30sec from a particular indice i
def timestep(X,indice):
    start = X.loc[X['datetime_x'] <= X.loc[indice,'datetime_x']+datetime.timedelta(seconds = 30)].index[-1]
    timestep = start - indice
    return timestep

In [13]:
# This snippet tries to find the number of timestep to 30seconds for the first 10000 data points
windowsize_finder = []
for i in range(len(df.iloc[:10000])):
    windowsize_finder.append(timestep(df.iloc[:10000],i))
#Plot the histrogram of the resultant distribution
sns.distplot(windowsize_finder)
plt.show()

<IPython.core.display.Javascript object>

In [14]:
# Smoothing and Label Creator, creates labels +1,0,-1 signifying the tick that would most likely occur 30sec later
# By looking at the graph we theorize that a rolling mean window size of 300 works the best for us
def target_creator(df,alpha=0.0):
    w_size = 300
    mm = df['mid'].rolling(w_size).mean()
    mp = df['mid'].iloc[::-1].rolling(w_size).mean()[::-1].shift(-1)
    df['dec'] = (mm.iloc[w_size:-w_size] > mp.iloc[w_size:-w_size]*(1+alpha))
    df['inc'] = (mm.iloc[w_size:-w_size] < mp.iloc[w_size:-w_size]*(1-alpha))
    df['target'] = 0
    df.loc[df['dec'] == True, 'target'] = -1
    df.loc[df['inc'] == True, 'target'] = 1
    return df

In [15]:
df = target_creator(df, 0.0001).drop(['inc', 'dec'], axis = 1)

In [16]:
#To find the distribution
sns.distplot(df.target)
plt.show()

<IPython.core.display.Javascript object>

In [17]:
df.describe()

Unnamed: 0,midChangeGroup,futMid,mid,arrival_rate,price,size,imbalance,Trade,Buy,spread,target
count,1672754.0,1672754.0,1672754.0,1672754.0,1672754.0,1672754.0,1672754.0,1672754.0,1672754.0,1672754.0,1672754.0
mean,188105.8,4584.491,4584.481,794.95,451.8676,17.66611,0.004559139,0.0988119,0.05266285,1.142605,-0.03013593
std,110616.7,126.3781,126.3525,212758.2,1365.207,2607.61,0.3176332,0.2984094,0.2233596,2.115517,0.8715804
min,7.0,4378.75,4378.5,-49950000.0,0.0,0.0,-0.9983285,0.0,0.0,0.5,-1.0
25%,83602.0,4481.0,4481.0,-12500.0,0.0,0.0,-0.2068147,0.0,0.0,1.0,-1.0
50%,198033.0,4558.0,4558.0,-21.06125,0.0,0.0,0.005764021,0.0,0.0,1.0,0.0
75%,285155.8,4668.0,4668.0,12000.0,0.0,0.0,0.2161149,0.0,0.0,1.5,1.0
max,370697.0,4864.25,4864.75,42400000.0,4889.0,1719464.0,0.999216,1.0,1.0,300.0,1.0


In [18]:
#Normalizing the code before inputing to the model
def normalize(df, columns, binary_columns):
    df[binary_columns] = df[binary_columns].replace({0: np.nan})
    mean = df[columns].mean()
    std = df[columns].std()
    df[columns] = (df[columns] - mean)/std
    df[binary_columns] = df[binary_columns].fillna(0)
    return [df, mean, std]

In [19]:
columns = ['imbalance', 'spread', 'arrival_rate', 'mid', 'price', 'size' ]
binary_columns = ['price', 'size']
[df_std, mean, std] = normalize(df, columns, binary_columns)
print('Saving as CSV...')
df_std.to_csv('merged.csv')
df_std.info()

Saving as CSV...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1672754 entries, 0 to 1672753
Data columns (total 12 columns):
datetime_x        1672754 non-null datetime64[ns]
midChangeGroup    1672754 non-null int64
futMid            1672754 non-null float64
mid               1672754 non-null float64
arrival_rate      1672754 non-null float64
price             1672754 non-null float64
size              1672754 non-null float64
imbalance         1672754 non-null float64
Trade             1672754 non-null int64
Buy               1672754 non-null int64
spread            1672754 non-null float64
target            1672754 non-null int64
dtypes: datetime64[ns](1), float64(7), int64(4)
memory usage: 153.1 MB


In [20]:
print('Mean = \n',mean, 'Std = \n', std)

('Mean = \n', imbalance          0.004559
spread             1.142605
arrival_rate     794.949975
mid             4584.481069
price           4573.008394
size             178.785205
dtype: float64, 'Std = \n', imbalance            0.317633
spread               2.115517
arrival_rate    212758.241086
mid                126.352507
price              126.422281
size              8293.698866
dtype: float64)


In [21]:
df_std.describe()

Unnamed: 0,midChangeGroup,futMid,mid,arrival_rate,price,size,imbalance,Trade,Buy,spread,target
count,1672754.0,1672754.0,1672754.0,1672754.0,1672754.0,1672754.0,1672754.0,1672754.0,1672754.0,1672754.0,1672754.0
mean,188105.8,4584.491,2.578549e-15,-9.450801000000001e-17,-1.821772e-16,1.3592769999999998e-19,-3.438292e-16,0.0988119,0.05266285,2.2428080000000002e-17,-0.03013593
std,110616.7,126.3781,1.0,1.0,0.3143427,0.3143427,1.0,0.2984094,0.2233596,1.0,0.8715804
min,7.0,4378.75,-1.63021,-234.7773,-1.534606,-0.02143618,-3.157377,0.0,0.0,-0.3037577,-1.0
25%,83602.0,4481.0,-0.8189871,-0.06248853,0.0,0.0,-0.6654653,0.0,0.0,-0.06740889,-1.0
50%,198033.0,4558.0,-0.2095809,-0.003835392,0.0,0.0,0.003793312,0.0,0.0,-0.06740889,0.0
75%,285155.8,4668.0,0.6609994,0.05266565,0.0,0.0,0.666038,0.0,0.0,0.1689399,1.0
max,370697.0,4864.25,2.218151,199.2835,2.499493,207.3002,3.131464,1.0,1.0,141.2692,1.0


In [55]:
# LSTM model in ./core/model.py
# Data loader class in ./core/data_loader.py
# config contains all required parameters for the model 

def model_run():
    configs = json.load(open('config.json', 'r'))
    if not os.path.exists(configs['model']['save_dir']): os.makedirs(configs['model']['save_dir'])

    data = DataLoader(
        configs['data']['filename'],
        configs['data']['train_test_split'],
        configs['data']['columns']
    )

    model = Model()
    model.build_model(configs)
    model.load_model('./saved_models/18012019-162559-e40.h5')

    # out-of memory generative training
    steps_per_epoch = math.ceil((data.len_train - configs['data']['sequence_length']) / configs['training']['batch_size'])
    model.train_generator(
        data_gen=data.generate_train_batch(
            seq_len=configs['data']['sequence_length'],
            batch_size=configs['training']['batch_size'],
            normalise=configs['data']['normalise']
        ),
        epochs=configs['training']['epochs'],
        batch_size=configs['training']['batch_size'],
        steps_per_epoch=steps_per_epoch,
        save_dir=configs['model']['save_dir']
    )

    return model


In [57]:
# model = model_run()

In [49]:
#Loading saved model
from keras.models import load_model

# returns a compiled model
#To run if not trained the model
configs = json.load(open('config.json', 'r'))
model = Model()
model.build_model(configs)
model.load_model('./saved_models/18012019-162559-e40.h5')


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_34 (LSTM)               (None, 35, 30)            4680      
_________________________________________________________________
dropout_23 (Dropout)         (None, 35, 30)            0         
_________________________________________________________________
lstm_35 (LSTM)               (None, 35, 40)            11360     
_________________________________________________________________
lstm_36 (LSTM)               (None, 40)                12960     
_________________________________________________________________
dropout_24 (Dropout)         (None, 40)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 3)                 123       
Total params: 29,123
Trainable params: 29,123
Non-trainable params: 0
_________________________________________________________________
[Model

In [29]:
#Getting the test data ready 
df_test = feature_engg(df_quoteout,df_tradeout)

In [24]:
# Normalise test data
def norm_test(df_test, mean, std, binary_cols):
    columns = list(mean.index)
    df_test[binary_columns].replace({0: np.nan})
    df_test[columns] = (df_test[columns] - mean)/std
    df_test[binary_columns] = df_test[binary_columns].fillna(0)
    return df_test

# Normalizing test data using the mean and std obtained from the training data
df_test_std = norm_test(df_test,mean,std,binary_columns)
df_test_std.reset_index(inplace = True, drop = True)

# Saving as CSV
print('Saving as CSV...')
df_test_std.to_csv('merged_test.csv')
df_test_std.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207256 entries, 0 to 207255
Data columns (total 11 columns):
datetime_x        207256 non-null datetime64[ns]
midChangeGroup    207256 non-null int64
futMid            207256 non-null float64
mid               207256 non-null float64
arrival_rate      207256 non-null float64
price             207256 non-null float64
size              207256 non-null float64
imbalance         207256 non-null float64
Trade             207256 non-null int64
Buy               207256 non-null int64
spread            207256 non-null float64
dtypes: datetime64[ns](1), float64(7), int64(3)
memory usage: 17.4 MB


In [25]:
df_test_std.head()

Unnamed: 0,datetime_x,midChangeGroup,futMid,mid,arrival_rate,price,size,imbalance,Trade,Buy,spread
0,2018-02-14 08:00:22.710,8,4435.5,-1.224598,1.105504,-36.172488,-0.021557,2.003214,0,0,7.732103
1,2018-02-14 08:00:22.710,8,4435.5,-1.224598,3.690598,-36.172488,-0.021557,2.008645,0,0,7.732103
2,2018-02-14 08:00:22.710,9,4435.5,-1.198877,-4.299692,-36.172488,-0.021557,1.864455,0,0,4.659568
3,2018-02-14 08:00:22.710,9,4435.5,-1.198877,3.897405,-36.172488,-0.021557,1.878608,0,0,4.659568
4,2018-02-14 08:00:22.710,9,4435.5,-1.198877,3.897405,-36.172488,-0.021557,1.482022,0,0,4.659568


In [26]:
#For testing the data, we are testing it on cross validation data
columns = [ "Buy","Buy","spread","price","size","arrival_rate","Trade","imbalance","mid"]
data_test = DataLoader(configs['data']['test_filename'],0,columns)
x_test = data_test.get_test_data(seq_len=configs['data']['sequence_length'],normalise=configs['data']['normalise'])

In [52]:
Num_pred = 100000 # To get reasonable amount of predictions (#24000) 
df_test['predMid'] = np.NaN
pred_train = model.predict_point_by_point(x_test[:Num_pred,:,:])
df_test['predMid'].iloc[:Num_pred] = df_test.mid.iloc[:Num_pred] + [param.tickSize*i for i in pred_train]

[Model] Predicting Point-by-Point...


In [53]:
#RMS error
RMS(df_test[:Num_pred])

RMS = 2.8937. #Predictions = 24003
