In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from enum import Enum

prefix = "MSFT_2012-06-21_34200000_57600000"

trades_data_file = f"./data/{prefix}_message_5.csv"
orderbook_data_file = f"./data/{prefix}_orderbook_5.csv"


class Direction(Enum):
    BUY = 1
    SELL = -1


class OrderType(Enum):
    SUBMISSION = 1
    CANCELLATION = 2
    DELETION = 3
    EXECUTION_VISIBLE = 4
    EXECUTION_HIDDEN = 5
    TRADING_HALT = 7


trade_df = pd.read_csv(trades_data_file, names=(
    'timestamp',
    'type',
    'orderId',
    'size',
    'trade_price',
    'direction'
))

orderbook_df = pd.read_csv(orderbook_data_file, names=(
    'ask_1_price',
    'ask_1_size',
    'bid_1_price',
    'bid_1_size',

    'ask_2_price',
    'ask_2_size',
    'bid_2_price',
    'bid_2_size',

    'ask_3_price',
    'ask_3_size',
    'bid_3_price',
    'bid_3_size',

    'ask_4_price',
    'ask_4_size',
    'bid_4_price',
    'bid_4_size',

    'ask_5_price',
    'ask_5_size',
    'bid_5_price',
    'bid_5_size',
))

df = pd.concat([trade_df, orderbook_df], axis=1, )
df['timestamp'] = df['timestamp'] - 34200
df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
df['time'] = df['timestamp']
df = df.set_index('timestamp')

df = df.iloc[:20_000]

MAX_ROWS = 250
MAX_COLS = 100

pd.set_option('display.max_rows', MAX_ROWS)
pd.set_option('display.max_columns', MAX_COLS)

# Feature Engineering

- Midprice
- Imbalance
- Sum Trade 1s, 3s, 5s
- (bid/ask) advance time
- last trade time

In [80]:
df['midprice'] = (df['ask_1_price'] + df['bid_1_price']) / 2
df['imbalance'] = df['ask_1_size'] - df['bid_1_size'] / (df['ask_1_size'] + df['bid_1_size'])

df['sum_within_1s'] = df['size'].where(df['type'] == OrderType.EXECUTION_VISIBLE.value).rolling(window=pd.Timedelta(seconds=1)).sum().fillna(0)
df['sum_within_5s'] = df['size'].where(df['type'] == OrderType.EXECUTION_VISIBLE.value).rolling(window=pd.Timedelta(seconds=5)).sum().fillna(0)
df['sum_within_10s'] = df['size'].where(df['type'] == OrderType.EXECUTION_VISIBLE.value).rolling(window=pd.Timedelta(seconds=10)).sum().fillna(0)

# display(df.tail(MAX_ROWS))

# seconds since the bid price increased
bid_price_advances = df['bid_1_price'].diff().fillna(0) > 0
ask_price_advances = df['ask_1_price'].diff().fillna(0) < 0

df['bid_advance_timetamp'] = df[bid_price_advances]['time']
df['ask_advance_timetamp'] = df[ask_price_advances]['time']

df['previous_bid_advance_timestamp'] = df['bid_advance_timetamp'].ffill()
df['previous_ask_advance_timestamp'] = df['ask_advance_timetamp'].ffill()

df['bid_advance_time'] = df['time'] - df['previous_bid_advance_timestamp']
df['ask_advance_time'] = df['time'] - df['previous_ask_advance_timestamp']

# remove bid_advance_timetamp and previous_bid_advance_timestamp
df = df.drop(columns=['bid_advance_timetamp', 'ask_advance_timetamp', 'previous_bid_advance_timestamp', 'previous_ask_advance_timestamp'])

display(df)

Unnamed: 0_level_0,type,orderId,size,trade_price,direction,ask_1_price,ask_1_size,bid_1_price,bid_1_size,ask_2_price,ask_2_size,bid_2_price,bid_2_size,ask_3_price,ask_3_size,bid_3_price,bid_3_size,ask_4_price,ask_4_size,bid_4_price,bid_4_size,ask_5_price,ask_5_size,bid_5_price,bid_5_size,time,midprice,imbalance,sum_within_1s,sum_within_5s,sum_within_10s,bid_advance_time,ask_advance_time
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1
1970-01-01 00:00:00.013994120,3,16085616,100,310400,-1,309900,3788,309500,300,310500,100,309300,3986,310600,100,309200,100,310700,200,309100,300,310800,200,308900,100,1970-01-01 00:00:00.013994120,309700.0,3787.926614,0.0,0.0,0.0,NaT,NaT
1970-01-01 00:00:00.013994120,1,16116348,100,310500,-1,309900,3788,309500,300,310500,200,309300,3986,310600,100,309200,100,310700,200,309100,300,310800,200,308900,100,1970-01-01 00:00:00.013994120,309700.0,3787.926614,0.0,0.0,0.0,NaT,NaT
1970-01-01 00:00:00.015247805,1,16116658,100,310400,-1,309900,3788,309500,300,310400,100,309300,3986,310500,200,309200,100,310600,100,309100,300,310700,200,308900,100,1970-01-01 00:00:00.015247805,309700.0,3787.926614,0.0,0.0,0.0,NaT,NaT
1970-01-01 00:00:00.015442111,1,16116704,100,310500,-1,309900,3788,309500,300,310400,100,309300,3986,310500,300,309200,100,310600,100,309100,300,310700,200,308900,100,1970-01-01 00:00:00.015442111,309700.0,3787.926614,0.0,0.0,0.0,NaT,NaT
1970-01-01 00:00:00.015789148,1,16116752,100,310600,-1,309900,3788,309500,300,310400,100,309300,3986,310500,300,309200,100,310600,200,309100,300,310700,200,308900,100,1970-01-01 00:00:00.015789148,309700.0,3787.926614,0.0,0.0,0.0,NaT,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1970-01-01 00:07:00.186062800,3,25378160,600,310400,1,310600,4900,310400,4460,310700,5400,310300,4400,310800,6300,310200,3700,310900,3933,310100,4600,311000,3500,310000,53400,1970-01-01 00:07:00.186062800,310500.0,4899.523504,2201.0,2201.0,3800.0,0 days 00:00:07.824242495,0 days 00:00:09.892898887
1970-01-01 00:07:00.186065848,1,25378167,600,310400,1,310600,4900,310400,5060,310700,5400,310300,4400,310800,6300,310200,3700,310900,3933,310100,4600,311000,3500,310000,53400,1970-01-01 00:07:00.186065848,310500.0,4899.491968,2201.0,2201.0,3800.0,0 days 00:00:07.824245543,0 days 00:00:09.892901935
1970-01-01 00:07:00.186119702,3,25378163,600,310400,1,310600,4900,310400,4460,310700,5400,310300,4400,310800,6300,310200,3700,310900,3933,310100,4600,311000,3500,310000,53400,1970-01-01 00:07:00.186119702,310500.0,4899.523504,2201.0,2201.0,3800.0,0 days 00:00:07.824299397,0 days 00:00:09.892955789
1970-01-01 00:07:00.186120425,1,25378169,600,310400,1,310600,4900,310400,5060,310700,5400,310300,4400,310800,6300,310200,3700,310900,3933,310100,4600,311000,3500,310000,53400,1970-01-01 00:07:00.186120425,310500.0,4899.491968,2201.0,2201.0,3800.0,0 days 00:00:07.824300120,0 days 00:00:09.892956512


# Random Forest Neural Network

In [81]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import TimeSeriesSplit
from genetic_selection import GeneticSelectionCV
from lightgbm import LGBMClassifier
from evolutionary_search import EvolutionaryAlgorithmSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from scipy.stats import mode


ModuleNotFoundError: No module named 'genetic_selection'