In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge

In [2]:
#[DONOTCHANGE]
#3rd step - all parameters
class Parameters(object):
    """Empty attribute container; settings are attached to an instance as plain attributes."""
    pass

# Single shared settings object read by the cells below.
param = Parameters()
param.tickSize = 0.5 #tick size is 0.5 GBp i.e. 0.005 GBP

# Directory that LoadData joins with the file names below.
param.fileDirectory = './intraday'

# In-sample (training) trade and quote files.
param.trade_InSampleFile = 'trade_in.csv'
param.quote_InSampleFile = 'quote_in.csv'

# Out-sample (evaluation) trade and quote files.
param.trade_OutSampleFile = 'trade_out.csv'
param.quote_OutSampleFile = 'quote_out.csv'

#4th step - Model specific parameters
# Absolute book-imbalance level above which InSamplePredictionModel flags an up/down tick.
param.imbalanceThreshold = 0.7
# NOTE(review): not read anywhere in the visible code; the 30-second horizon is
# passed separately as the '30S' default of IdentifyFutureMidPrices - confirm.
param.timeDuration = 30 #30 seconds

In [11]:
#Initialise libraries and functions
from sklearn.metrics import mean_squared_error
from math import sqrt

import os

#Disable pandas' chained-assignment (SettingWithCopy) warning for the whole session
pd.options.mode.chained_assignment = None


#Identify future mid prices (default horizon: 30 seconds)
#This function was modified so it can also be used to extract features; the target variable was renamed from "futMid" to "futMid30S"
def IdentifyFutureMidPrices(df, predictionDuration = '30S'):
    """Attach the first mid price of the *next* time bin to every row of ``df``.

    The rows are bucketed into consecutive bins of length ``predictionDuration``
    on the 'datetime' column, the first non-null 'mid' of each bin is taken, and
    the series is shifted back by one bin. Each row therefore receives the first
    mid observed in the bin that follows its own bin (NaN when that bin is empty
    or does not exist).

    Parameters
    ----------
    df : DataFrame
        Must contain 'datetime' and 'mid' columns. ``pd.merge_asof`` requires
        the frame to be sorted by 'datetime'.
    predictionDuration : str
        pandas frequency alias for the bin length, e.g. '30S'. It is also used
        verbatim as the suffix of the new column name.

    Returns
    -------
    DataFrame
        A new frame with every column of ``df`` plus 'futMid<predictionDuration>'.
    """
    # pandas >= 2.2 deprecates the upper-case seconds alias ('30S') in favour of
    # '30s'. Normalise it for the resample rule only, so the column name (and
    # therefore every caller) stays unchanged.
    rule = predictionDuration
    if isinstance(rule, str) and rule.endswith('S') and (rule == 'S' or rule[:-1].isdigit()):
        rule = rule[:-1] + 's'

    targetColumn = 'futMid' + predictionDuration

    # Aggregate only the 'mid' column. The previous version aggregated every
    # column and then dropped a hard-coded list, which raised KeyError as soon
    # as one of those columns was absent from the resampled frame.
    firstMidPerBin = df.resample(rule, on = 'datetime')['mid'].first()

    # shift(-1): the value stored against a bin is the first mid of the following bin.
    futureData = firstMidPerBin.shift(periods=-1).rename(targetColumn).reset_index()

    # Backward as-of join: each row picks up the value of the bin it falls into.
    return pd.merge_asof(df, futureData[['datetime', targetColumn]], on='datetime')
    

def ReadCSV(file):
    """Read a CSV file and parse its 'datetime' column.

    The 'datetime' strings are expected in the form
    ``YYYY-MM-DD`` + literal ``D`` + ``HH:MM:SS.ffffff``.

    Returns the loaded DataFrame with 'datetime' converted to timestamps.
    """
    print('Loading file - ' + file)
    raw = pd.read_csv(file)
    timestamps = pd.to_datetime(raw['datetime'], format="%Y-%m-%dD%H:%M:%S.%f")
    return raw.assign(datetime=timestamps)

#Load data
def LoadData(path, tradeFile, quoteFile):
    """Load the trade and quote CSVs found under ``path``.

    The quote frame is enriched with:
      * 'mid'            - average of 'bid' and 'ask';
      * 'midChangeGroup' - running counter that increases on every row whose
                           mid differs from the previous row's (the first row,
                           whose diff is NaN, also counts as a change);
      * 'futMid30S'      - added by IdentifyFutureMidPrices with its default horizon.

    Returns ``(trade_df, quote_df)``.
    """
    tradePath = os.path.join(path, tradeFile)
    quotePath = os.path.join(path, quoteFile)

    trades = ReadCSV(tradePath)
    quotes = ReadCSV(quotePath)

    quotes['mid'] = (quotes['bid'] + quotes['ask']) * 0.5
    midMoved = quotes['mid'].diff().ne(0)
    quotes['midChangeGroup'] = midMoved.cumsum()
    quotes = IdentifyFutureMidPrices(quotes)

    print('Files loaded')
    return trades, quotes

#Evaluation function
#df should contain columns - datetime, sym, bsize, bid, ask, asize, predMid (model predicted mid-price)
#Function to evaluate results
def RMS(df):
    """Print and return the root-mean-square error of 'predMid' against 'futMid30S'.

    One observation is scored per 'midChangeGroup': ``groupby(...).first()``
    takes the first non-null value of each column within the group. Groups with
    no prediction or no future mid are excluded.

    Parameters
    ----------
    df : DataFrame
        Must contain 'midChangeGroup', 'futMid30S' and 'predMid'. The frame is
        not modified.

    Returns
    -------
    float
        The RMS error, or NaN when no group has both a prediction and a target.
    """
    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain a 'futMid' column.
    scored = df.assign(futMid=df['futMid30S'])
    scored = scored.groupby(['midChangeGroup']).first().reset_index()
    tmp = scored.dropna(subset=['predMid', 'futMid'])
    predCount = len(tmp['predMid'])

    if predCount == 0:
        # Nothing to score: report NaN instead of raising on an empty sample.
        rms = float('nan')
    else:
        # Same quantity as sqrt(mean_squared_error(futMid, predMid)).
        errors = tmp['futMid'].to_numpy(dtype=float) - tmp['predMid'].to_numpy(dtype=float)
        rms = float(np.sqrt(np.mean(errors ** 2)))

    print('RMS = %.4f. #Predictions = %s' % (rms, predCount))
    # Returning the value lets callers use or display it; previously the
    # function returned None, so print(RMS(df)) printed "None".
    return rms

In [5]:
# Load the in-sample trades and quotes; LoadData returns the quote frame with
# 'mid', 'midChangeGroup' and 'futMid30S' already added.
tradeIndf, quoteIndf = LoadData(param.fileDirectory, param.trade_InSampleFile, param.quote_InSampleFile)

Loading file - ./intraday/trade_in.csv
Loading file - ./intraday/quote_in.csv
Files loaded


In [6]:
#6th Step
#Train the model using in-sample data

#Use a pickle to save the trained data or a function with all required calculations
#For ex: a regression/neural network trained model should be saved to pickle and loaded in PredictionModel
#Another option to define a function for all calculations to predict
#def Predict(quote_df, trade_df):
#    define all required calculations


#Using training data, below model has been designed. You should aim to improve this model
#When mid price changes and 
# imbalance is >  0.7 (param.imbalanceThreshold), predict that mid-price will tick-up
# imbalance is < -0.7 (param.imbalanceThreshold), predict that mid-price will tick-down
def InSamplePredictionModel(quote_df, trade_df):
    """Build the modelling frame (signal columns plus future-mid columns) from quotes.

    Added columns:
      * 'tick', 'predMid'          - initialised to NaN; 'tick' is set to +1/-1
                                     where the mid changed and the imbalance is
                                     beyond +/- param.imbalanceThreshold;
      * 'hour', 'minute', 'second' - components of 'datetime';
      * 'midChanged'               - row-to-row difference of 'mid';
      * 'imbalance'                - (bsize - asize) / (bsize + asize);
      * 'futMid1S' ... 'futMid29S' - produced by IdentifyFutureMidPrices.

    Rows without a 'futMid30S' value are removed.

    Parameters
    ----------
    quote_df : DataFrame
        Quote frame as returned by LoadData. It is not modified; the work is
        done on a copy.
    trade_df : DataFrame
        Unused; kept for the expected call signature.

    Returns
    -------
    DataFrame
        The enriched copy of ``quote_df``.
    """
    #Load pickle to predict or use a function Predict()
    print('Prediction model')

    # Work on a copy so the caller's frame keeps its rows and columns and the
    # calling cell can be re-run with the same result.
    quote_df = quote_df.copy()
    quote_df['tick'] = np.nan
    quote_df['predMid'] = np.nan

    #extracting timestamp features
    quote_df['hour'] = quote_df['datetime'].dt.hour
    quote_df['minute'] = quote_df['datetime'].dt.minute
    quote_df['second'] = quote_df['datetime'].dt.second

    quote_df['midChanged'] = quote_df['mid'].diff()
    quote_df['imbalance'] = (quote_df['bsize']-quote_df['asize'])/(quote_df['bsize']+quote_df['asize'])

    # The first row's diff is NaN, and NaN != 0 is True, so it counts as a change.
    midMoved = quote_df['midChanged'] != 0
    quote_df.loc[midMoved & (quote_df['imbalance'] > param.imbalanceThreshold), 'tick'] = 1
    quote_df.loc[midMoved & (quote_df['imbalance'] < -param.imbalanceThreshold), 'tick'] = -1
    quote_df = quote_df.dropna(subset=['futMid30S'])

    # One future-mid column per horizon from 1 to 29 seconds.
    # NOTE(review): IdentifyFutureMidPrices takes each value from the *next*
    # time bin, so these are look-ahead values rather than lags - confirm they
    # are legitimately available at prediction time.
    for seconds in range(1, 30):
        # The horizon string previously lived in a local named `pd`, shadowing
        # the pandas module inside this function.
        horizon = '%dS' % seconds
        quote_df = IdentifyFutureMidPrices(quote_df, predictionDuration=horizon)

    return quote_df

# Build the in-sample modelling frame: signal columns plus futMid1S..futMid29S.
quote_df = InSamplePredictionModel(quoteIndf,tradeIndf)
#quote_df.to_csv('train.csv',index=False)
#quote_df = pd.read_csv('train.csv')

Prediction model


In [8]:
# Column the model is trained to predict.
target = 'futMid30S'
# Model inputs: the future-mid columns for the listed horizons (in seconds).
features = [
    'futMid%dS' % horizon
    for horizon in (7, 8, 10, 11, 12, 13, 14, 15, 20, 22, 23, 24, 25, 26, 27, 28, 29)
]

def MachineLearningModel(quote_df,trade_df):
    """Fit a Ridge regression (alpha=30000) of ``target`` on ``features``.

    Rows missing any feature or the target are dropped from ``quote_df``
    in place before fitting, so the caller's frame loses those rows too.
    ``trade_df`` is not used.

    Returns the fitted Ridge estimator.
    """
    requiredColumns = features + [target]
    quote_df.dropna(subset=requiredColumns, inplace=True)

    inputs = quote_df[features].to_numpy()
    labels = quote_df[target].to_numpy()

    ridge = Ridge(alpha=30000)
    ridge.fit(inputs, labels)
    return ridge

# Fit the Ridge model on the in-sample frame. MachineLearningModel drops
# incomplete rows from quote_df in place while doing so.
model = MachineLearningModel(quote_df,tradeIndf)

In [9]:
#7th step
#Predict with the trained model using out_sample data

#Load the out-sample csv if not in memory
#Do not change tick frequency for outsample dataframe
dirContents = dir()
# Read the out-sample files only when either frame is missing from the namespace.
if 'tradeOutdf' not in dirContents or 'quoteOutdf' not in dirContents:
    tradeOutdf, quoteOutdf = LoadData(
        param.fileDirectory,
        param.trade_OutSampleFile,
        param.quote_OutSampleFile,
    )

# Apply the same feature construction that was used for the in-sample frame.
quote_test = InSamplePredictionModel(quoteOutdf, tradeOutdf)
#quote_test.to_csv('test1.csv',index=False)
#quote_test = pd.read_csv('test.csv')

def OutSamplePrediction(quote_df, trade_df): 
    """Prepare the out-sample frame for prediction.

    Rows missing any feature or the target are removed from ``quote_df``
    in place; the same (now filtered) frame is returned. ``trade_df`` is
    not used.
    """
    print('Out-sample prediction')

    requiredColumns = [*features, target]
    quote_df.dropna(subset=requiredColumns, inplace=True)
    return quote_df


Loading file - ./intraday/trade_out.csv
Loading file - ./intraday/quote_out.csv
Files loaded
Prediction model


In [None]:
#Printing results

res = OutSamplePrediction(quote_test, tradeOutdf)
# Predict from a plain array, matching how the model was fitted (np.array of
# the feature columns); passing a DataFrame here makes scikit-learn warn that
# the estimator was fitted without feature names.
res['predMid'] = model.predict(res[features].to_numpy())
#res.to_csv('submission.csv',index=False)
# RMS prints its own summary line, so it is not wrapped in print().
RMS(res)