### Loading Libraries and Datasets

In [1]:
import numpy as np
import pandas as pd
import plotly_express as px

from sklearn import set_config
from sklearn.base import clone
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

pd.set_option('display.max_rows', 100)
set_config(transform_output = 'pandas')
pd.options.mode.chained_assignment = None

In [7]:
data = pd.read_csv("train.csv")
seed = 42

In [8]:
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostClassifier

class TestModel : 
    def __init__(self):
        
        self.firstLayerMethods = [
            {
                "type":"LGBMR",
                "model":LGBMRegressor(random_state=seed, objective="mae", verbose=0, n_estimators=100)
            },
            {
                "type":"catboost",
                "model":CatBoostRegressor(random_seed=seed, objective="MAE", n_estimators=100, verbose=0)
            }
        ]
        
        self.firstLayerSelectionEncoder = OneHotEncoder(sparse=False)
        
        self.firstLayerSelector = CatBoostClassifier(random_seed=seed,objective="MultiLogloss", n_estimators=100, verbose=0)
        
        return
    
    def fit(self,X,y):
        firstLayerPredictions = []

        for i,method in enumerate(self.firstLayerMethods):
            type,model = method.values()
            print("Training ",type)
            model.fit(X,y)
            firstLayerPredictions.append(model.predict(X))
        
        firstLayerPredictions = np.array(firstLayerPredictions).T
        
        firstLayerSelection = self.firstLayerSelectionEncoder.fit_transform(np.abs(firstLayerPredictions-np.repeat(y.values[:,np.newaxis],2,1)).argmin(axis=1).reshape(-1,1)).values
        
        print("Training first layer selector")
        
        self.firstLayerSelector.fit(X,firstLayerSelection)
        
        return

    def predict(self,X):
        firstLayerPredictions = []
        for i,method in enumerate(self.firstLayerMethods):
            type,model = method.values()
            firstLayerPredictions.append(model.predict(X))
        
        firstLayerPredictions = np.array(firstLayerPredictions).T
        
        firstLayerSelection = self.firstLayerSelector.predict(X)
        
        return (firstLayerSelection*firstLayerPredictions).sum(axis=1)

In [9]:
data['imbalance_auction'] = data['imbalance_size'] * data['imbalance_buy_sell_flag']
data['imbalance_auction_proportion_matched'] = data['imbalance_size'] / data['matched_size']
data['imbalance_order_book'] = data['bid_size']/(data['bid_size']+data['ask_size'])

data['spread'] = data['ask_price'] - data['bid_price']
data['mid_price'] = (data['ask_price'] + data['bid_price']) / 2

data['bef_300'] = np.where(data['seconds_in_bucket'] <= 300, 1, 0)
data['aft_300'] = np.where(data['seconds_in_bucket'] > 300, 1, 0)

data.drop(columns=['imbalance_size', 'imbalance_buy_sell_flag', 'row_id', 'time_id'], inplace=True)
display(data.head())

Unnamed: 0,stock_id,date_id,seconds_in_bucket,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,imbalance_auction,imbalance_auction_proportion_matched,imbalance_order_book,spread,mid_price,bef_300,aft_300
0,0,0,0,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,3180602.69,0.237708,0.87717,0.000214,0.999919,1,0
1,1,0,0,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,-166603.91,0.101451,0.135625,0.000764,1.000278,1,0
2,2,0,0,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,-302879.87,0.166475,0.666468,0.000895,0.99985,1,0
3,3,0,0,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,-11917682.27,0.648061,0.00483,0.000215,1.000107,1,0
4,4,0,0,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,-447549.96,0.025058,0.974343,0.000622,0.999705,1,0


In [14]:
from sklearn.model_selection import train_test_split
    
X = data[~data.target.isna()]
y = X.pop('target')

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)

X_train_bef = X_train[X_train['bef_300'] == 1].drop(columns=['far_price', 'near_price'])
X_train_aft = X_train[X_train['aft_300'] == 1]

mu_1 = X_train_bef.mean()
mu_2 = X_train_aft.mean()
X_train_bef.fillna(mu_1, inplace=True)
X_train_aft.fillna(mu_2, inplace=True)

y_train_bef = y_train[y_train.index.isin(X_train_bef.index)]
y_train_aft = y_train[y_train.index.isin(X_train_aft.index)]

In [15]:
model_bef = TestModel()
model_bef.fit(X_train_bef, y_train_bef)

Training  LGBMR
Training  catboost




Training first layer selector


In [16]:
model_aft = TestModel()
model_aft.fit(X_train_aft, y_train_aft)

Training  LGBMR
Training  catboost




Training first layer selector


In [20]:
X_test_bef = X_val[X_val['bef_300'] == 1].drop(columns=['far_price', 'near_price'])
X_test_aft = X_val[X_val['aft_300'] == 1]

X_test_bef.fillna(mu_1, inplace=True)
X_test_aft.fillna(mu_2, inplace=True)

pred_bef = model_bef.predict(X_test_bef)
pred_aft = model_aft.predict(X_test_aft)
diff_bef = abs(pred_bef - y_val[y_val.index.isin(X_test_bef.index)])
diff_aft = abs(pred_aft - y_val[y_val.index.isin(X_test_aft.index)])
print("MAE:", (sum(diff_bef) + sum(diff_aft)) / len(X_val))

MAE: 6.263022710259568


In [22]:
diff_aft.mean()

5.555388400862184