### Loading Libraries and Datasets

In [27]:
import numpy as np
import pandas as pd
import plotly_express as px

from sklearn import set_config
from sklearn.base import clone
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from lightgbm import early_stopping,log_evaluation, LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression

pd.set_option('display.max_rows', 100)
set_config(transform_output = 'pandas')
pd.options.mode.chained_assignment = None

In [33]:
data = pd.read_csv("train.csv")
seed = 42

In [37]:
tss = TimeSeriesSplit(10)

def cross_val_score(estimatorConstructor,X,y, cv = tss, label = ''):

    #initiate prediction arrays and score lists
    val_predictions = np.zeros((len(X)))
    #train_predictions = np.zeros((len(sample)))
    train_scores, val_scores = [], []
    
    #training model, predicting prognosis probability, and evaluating metrics   
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        
        model = estimatorConstructor()
        
        #define train set
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        
        #define validation set
        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx]
        
        mu = X_train.mean()
        X_train.fillna(mu)
        X_val.fillna(mu)
        
        #train model
        model.fit(X_train, y_train,(X_val,y_val))
        
        #make predictions
        # train_preds = model.predict(X_train)
        val_preds = model.predict(X_val)
                  
        val_predictions[val_idx] += val_preds
        
        #evaluate model for a fold
        # train_score = mean_absolute_error(y_train, train_preds)
        val_score = mean_absolute_error(y_val, val_preds)
        
        print("Fold",fold,"Val MAE:",val_score)
        
        # append model score for a fold to list
        # train_scores.append(train_score)
        val_scores.append(val_score)
    
    print(f'Val Score: {np.mean(val_scores):.5f} ± {np.std(val_scores):.5f} | {label}')
    
    return val_scores, val_predictions

In [46]:
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostClassifier

class MetaModel: 
    def __init__(self):
        
        self.config = {
            "nb_round_early_stop":100
        }
        
        self.firstLayerMethods = [
            {
                "type":"LGBMR",
                "model":LGBMRegressor(**{
                        'device': "cpu",
                        'objective'         : 'regression_l1',
                        'boosting_type'     : 'gbdt',
                        'random_state'      : 42,
                        'colsample_bytree'  : 0.7,
                        'subsample'         : 0.65,
                        'learning_rate'     : 0.065,
                        'max_depth'         : 6,
                        'n_estimators'      : 500,
                        'num_leaves'        : 150,  
                        'reg_alpha'         : 0.01,
                        'reg_lambda'        : 3.25,
                        'verbose'           : -1,
                       })
            },
            {
                "type":"catboost",
                "model":CatBoostRegressor(**{
                    'task_type': "CPU",
                    'objective'           : "MAE",
                    'eval_metric'         : "MAE",
                    'bagging_temperature' : 0.5,
                    'colsample_bylevel'   : 0.7,
                    'iterations'          : 500,
                    'learning_rate'       : 0.065,
                    'od_wait'             : 25,
                    'max_depth'           : 7,
                    'l2_leaf_reg'         : 1.5,
                    'min_data_in_leaf'    : 1000,
                    'random_strength'     : 0.65, 
                    'verbose'             : 0,
                    'use_best_model'      : True,
                  })
            }
        ]
        
        self.firstLayerSelectionEncoder = OneHotEncoder(sparse_output=False)
        
        self.firstLayerSelector = CatBoostClassifier(random_seed=seed,objective="MultiLogloss", n_estimators=100, verbose=0)
        
        # self.reg = LinearRegression()
        
        return
    
    def fit(self,X,y,eval_set=None):
        firstLayerPredictions = []

        for i,method in enumerate(self.firstLayerMethods):
            type,model = method.values()
            print("Training ",type)
            if type == "LGBMR":
                model.fit(X, y, 
                          eval_set = [eval_set], 
                          #verbose = 0, 
                          eval_metric = "mae",
                          callbacks = [log_evaluation(0,), 
                                       early_stopping(self.config["nb_round_early_stop"], verbose = False)], 
                         )
            elif type == "catboost":
                model.fit(X, y, 
                          eval_set = [eval_set], 
                          verbose = 0, 
                          early_stopping_rounds = self.config["nb_round_early_stop"],
                         ); 

            else:
                model.fit(X, y)
            firstLayerPredictions.append(model.predict(X))
        
        firstLayerPredictions = np.array(firstLayerPredictions).T
        
        firstLayerSelection = self.firstLayerSelectionEncoder.fit_transform(np.abs(firstLayerPredictions-np.repeat(y.values[:,np.newaxis],2,1)).argmin(axis=1).reshape(-1,1)).values
        
        print("Training first layer selector")
        
        self.firstLayerSelector.fit(X,firstLayerSelection)
        
        # self.reg.fit(firstLayerPredictions,y)
        
        return

    def predict(self,X):
        firstLayerPredictions = []
        for i,method in enumerate(self.firstLayerMethods):
            type,model = method.values()
            firstLayerPredictions.append(model.predict(X))
        
        firstLayerPredictions = np.array(firstLayerPredictions).T
        
        firstLayerSelection = self.firstLayerSelector.predict(X)
        
        return (firstLayerSelection*firstLayerPredictions).sum(axis=1)
        
        # return self.reg.predict(firstLayerPredictions)

In [34]:
data['imbalance_auction'] = data['imbalance_size'] * data['imbalance_buy_sell_flag']
data['imbalance_auction_proportion_matched'] = data['imbalance_size'] / data['matched_size']
data['imbalance_order_book'] = data['bid_size']/(data['bid_size']+data['ask_size'])

data['spread'] = data['ask_price'] - data['bid_price']
data['mid_price'] = (data['ask_price'] + data['bid_price']) / 2

data['bef_300'] = np.where(data['seconds_in_bucket'] <= 300, 1, 0)
data['aft_300'] = np.where(data['seconds_in_bucket'] > 300, 1, 0)

data.drop(columns=['imbalance_size', 'imbalance_buy_sell_flag', 'row_id', 'time_id'], inplace=True)
#data.drop(columns=["row_id"],inplace=True)

In [35]:
from sklearn.model_selection import train_test_split
    
X = data[~data.target.isna()]
y = X.pop('target')

# X_bef = X[X['bef_300'] == 1].drop(columns=['far_price', 'near_price'])
# X_aft = X[X['aft_300'] == 1]

# y_bef = y[y.index.isin(X_bef.index)]
# y_aft = y[y.index.isin(X_aft.index)]

In [47]:
cross_val_score(MetaModel,X,y)

Training  LGBMR
Training  catboost
Training first layer selector
Fold 0 Val MAE: 5.879109503910993
Training  LGBMR
Training  catboost
Training first layer selector
Fold 1 Val MAE: 7.276003555246476
Training  LGBMR
Training  catboost
Training first layer selector
Fold 2 Val MAE: 6.895819423809661
Training  LGBMR
Training  catboost


# Catboost selector with bef/aft split

In [20]:
cross_val_score(MetaModel,X_bef,y_bef,label="model_bef")

Training  LGBMR
Training  catboost
Training first layer selector
Fold 0 Val MAE: 6.347937367316545
Training  LGBMR
Training  catboost
Training first layer selector
Fold 1 Val MAE: 7.918597152981582
Training  LGBMR
Training  catboost
Training first layer selector
Fold 2 Val MAE: 7.809680726675798
Training  LGBMR
Training  catboost
Training first layer selector
Fold 3 Val MAE: 7.813109787866368
Training  LGBMR
Training  catboost
Training first layer selector
Fold 4 Val MAE: 6.6265186352020855
Training  LGBMR
Training  catboost
Training first layer selector
Fold 5 Val MAE: 6.460874297012216
Training  LGBMR
Training  catboost
Training first layer selector
Fold 6 Val MAE: 7.200034725962418
Training  LGBMR
Training  catboost
Training first layer selector
Fold 7 Val MAE: 6.927192232374852
Training  LGBMR
Training  catboost
Training first layer selector
Fold 8 Val MAE: 6.6083349174891195
Training  LGBMR
Training  catboost
Training first layer selector
Fold 9 Val MAE: 6.579511262348932
Val Scor

([6.347937367316545,
  7.918597152981582,
  7.809680726675798,
  7.813109787866368,
  6.6265186352020855,
  6.460874297012216,
  7.200034725962418,
  6.927192232374852,
  6.6083349174891195,
  6.579511262348932],
 array([ 0.        ,  0.        ,  0.        , ...,  0.93983515,
        -0.09111884, -0.49082114]))

In [21]:
result_aft = cross_val_score(MetaModel,X_aft,y_aft,label="model_aft")

Training  LGBMR
Training  catboost
Training first layer selector
Fold 0 Val MAE: 5.328784933055805
Training  LGBMR
Training  catboost
Training first layer selector
Fold 1 Val MAE: 6.511028192774758
Training  LGBMR
Training  catboost
Training first layer selector
Fold 2 Val MAE: 6.487829942557861
Training  LGBMR
Training  catboost
Training first layer selector
Fold 3 Val MAE: 6.478314129753692
Training  LGBMR
Training  catboost
Training first layer selector
Fold 4 Val MAE: 5.543599931129056
Training  LGBMR
Training  catboost
Training first layer selector
Fold 5 Val MAE: 5.46387383115327
Training  LGBMR
Training  catboost
Training first layer selector
Fold 6 Val MAE: 5.718727959026511
Training  LGBMR
Training  catboost
Training first layer selector
Fold 7 Val MAE: 5.554874339162555
Training  LGBMR
Training  catboost
Training first layer selector
Fold 8 Val MAE: 5.1658269768812355
Training  LGBMR
Training  catboost
Training first layer selector
Fold 9 Val MAE: 5.019251417090166
Val Score:

# Linear regression selector with bef/aft split

In [30]:
cross_val_score(MetaModel,X_bef,y_bef,label="model_bef")

Training  LGBMR
Training  catboost
Fold 0 Val MAE: 6.931939494240654
Training  LGBMR
Training  catboost
Fold 1 Val MAE: 8.436610870054755
Training  LGBMR
Training  catboost
Fold 2 Val MAE: 10.533642104364167
Training  LGBMR
Training  catboost
Fold 3 Val MAE: 7.981769462337933
Training  LGBMR
Training  catboost
Fold 4 Val MAE: 6.73263052222328
Training  LGBMR
Training  catboost
Fold 5 Val MAE: 6.558399040955649
Training  LGBMR
Training  catboost
Fold 6 Val MAE: 7.244063121948457
Training  LGBMR
Training  catboost
Fold 7 Val MAE: 7.073760209161333
Training  LGBMR
Training  catboost
Fold 8 Val MAE: 6.744928129936018
Training  LGBMR
Training  catboost
Fold 9 Val MAE: 6.69553079590666
Val Score: 7.49333 ± 1.16510 | model_bef


([6.931939494240654,
  8.436610870054755,
  10.533642104364167,
  7.981769462337933,
  6.73263052222328,
  6.558399040955649,
  7.244063121948457,
  7.073760209161333,
  6.744928129936018,
  6.69553079590666],
 array([ 0.        ,  0.        ,  0.        , ...,  4.39555036,
         0.09355896, -0.47424669]))

In [12]:
cross_val_score(MetaModel,X_aft,y_aft,label="model_aft")

Training  LGBMR
Training  catboost
Fold 0 Val MAE: 5.641635805992633
Training  LGBMR
Training  catboost
Fold 1 Val MAE: 6.755317249491351
Training  LGBMR
Training  catboost
Fold 2 Val MAE: 8.344468721725047
Training  LGBMR
Training  catboost
Fold 3 Val MAE: 6.917156508682197
Training  LGBMR
Training  catboost
Fold 4 Val MAE: 5.652181431725709
Training  LGBMR
Training  catboost
Fold 5 Val MAE: 5.5228968653165795
Training  LGBMR
Training  catboost
Fold 6 Val MAE: 5.789824158269713
Training  LGBMR
Training  catboost
Fold 7 Val MAE: 6.024225192062919
Training  LGBMR
Training  catboost
Fold 8 Val MAE: 5.193028676768116
Training  LGBMR
Training  catboost
Fold 9 Val MAE: 5.209143437097536
Val Score: 6.10499 ± 0.92717 | model_aft


([5.641635805992633,
  6.755317249491351,
  8.344468721725047,
  6.917156508682197,
  5.652181431725709,
  5.5228968653165795,
  5.789824158269713,
  6.024225192062919,
  5.193028676768116,
  5.209143437097536],
 array([ 0.        ,  0.        ,  0.        , ..., -0.29073928,
         2.18888491, -5.44373671]))

# Linear regression selector without bef/aft split

In [13]:
cross_val_score(MetaModel,X,y)

Training  LGBMR
Training  catboost
Fold 0 Val MAE: 6.168401439219442
Training  LGBMR
Training  catboost
Fold 1 Val MAE: 7.642852021352264
Training  LGBMR
Training  catboost
Fold 2 Val MAE: 7.333824459341764
Training  LGBMR
Training  catboost
Fold 3 Val MAE: 7.275478509434792
Training  LGBMR
Training  catboost
Fold 4 Val MAE: 6.210999961521283
Training  LGBMR
Training  catboost
Fold 5 Val MAE: 6.078875795156375
Training  LGBMR
Training  catboost
Fold 6 Val MAE: 6.564586806006145
Training  LGBMR
Training  catboost
Fold 7 Val MAE: 6.359768607781469
Training  LGBMR
Training  catboost
Fold 8 Val MAE: 6.057330742338905
Training  LGBMR
Training  catboost
Fold 9 Val MAE: 5.89008601467084
Val Score: 6.55822 ± 0.59426 | 


([6.168401439219442,
  7.642852021352264,
  7.333824459341764,
  7.275478509434792,
  6.210999961521283,
  6.078875795156375,
  6.564586806006145,
  6.359768607781469,
  6.057330742338905,
  5.89008601467084],
 array([ 0.        ,  0.        ,  0.        , ...,  1.07484548,
         1.72683925, -3.59432684]))

# Catboost selector without bef/aft split

In [15]:
cross_val_score(MetaModel,X,y)

Training  LGBMR
Training  catboost
Training first layer selector
Fold 0 Val MAE: 5.894938568205118
Training  LGBMR
Training  catboost
Training first layer selector
Fold 1 Val MAE: 7.3087571695506295
Training  LGBMR
Training  catboost
Training first layer selector
Fold 2 Val MAE: 6.932196830282457
Training  LGBMR
Training  catboost
Training first layer selector
Fold 3 Val MAE: 7.2086890429742825
Training  LGBMR
Training  catboost
Training first layer selector
Fold 4 Val MAE: 6.143983727497267
Training  LGBMR
Training  catboost
Training first layer selector
Fold 5 Val MAE: 6.029022844577131
Training  LGBMR
Training  catboost
Training first layer selector
Fold 6 Val MAE: 6.544286466705665
Training  LGBMR
Training  catboost
Training first layer selector
Fold 7 Val MAE: 6.292429203995975
Training  LGBMR
Training  catboost
Training first layer selector
Fold 8 Val MAE: 5.989033922078409
Training  LGBMR
Training  catboost
Training first layer selector
Fold 9 Val MAE: 5.8627754066657785
Val Sco

([5.894938568205118,
  7.3087571695506295,
  6.932196830282457,
  7.2086890429742825,
  6.143983727497267,
  6.029022844577131,
  6.544286466705665,
  6.292429203995975,
  5.989033922078409,
  5.8627754066657785],
 array([ 0.        ,  0.        ,  0.        , ...,  0.63031969,
         1.07024192, -2.05362575]))

# Catboost selector, without b/a split, without feature engineering

In [28]:
cross_val_score(MetaModel,X,y)

Training  LGBMR
Training  catboost
Training first layer selector
Fold 0 Val MAE: 5.911270404343734
Training  LGBMR
Training  catboost
Training first layer selector
Fold 1 Val MAE: 7.32854370625557
Training  LGBMR
Training  catboost
Training first layer selector
Fold 2 Val MAE: 6.9590646134574365
Training  LGBMR
Training  catboost
Training first layer selector
Fold 3 Val MAE: 7.225255301378903
Training  LGBMR
Training  catboost
Training first layer selector
Fold 4 Val MAE: 6.166512106745125
Training  LGBMR
Training  catboost
Training first layer selector
Fold 5 Val MAE: 6.047953904726102
Training  LGBMR
Training  catboost
Training first layer selector
Fold 6 Val MAE: 6.55426731617956
Training  LGBMR
Training  catboost
Training first layer selector
Fold 7 Val MAE: 6.300287113531595
Training  LGBMR
Training  catboost
Training first layer selector
Fold 8 Val MAE: 6.002972059837637
Training  LGBMR
Training  catboost
Training first layer selector
Fold 9 Val MAE: 5.876055236998791
Val Score: 

([5.911270404343734,
  7.32854370625557,
  6.9590646134574365,
  7.225255301378903,
  6.166512106745125,
  6.047953904726102,
  6.55426731617956,
  6.300287113531595,
  6.002972059837637,
  5.876055236998791],
 array([ 0.        ,  0.        ,  0.        , ...,  0.53036643,
         1.68544934, -2.24955026]))

In [16]:
X_baseline = pd.read_parquet("./Baseline/BaselineData/XTrIntCmpNewFtre.parquet")
y_baseline = pd.read_parquet("./Baseline/BaselineData/Ytrain.parquet")["target"]

In [17]:
cross_val_score(MetaModel,X_baseline,y_baseline)

Training  LGBMR
Training  catboost
Training first layer selector
Fold 0 Val MAE: 5.858464297078814
Training  LGBMR
Training  catboost
Training first layer selector
Fold 1 Val MAE: 7.241476161973192
Training  LGBMR
Training  catboost
Training first layer selector
Fold 2 Val MAE: 6.871425577293039
Training  LGBMR
Training  catboost
Training first layer selector
Fold 3 Val MAE: 7.176109464546702
Training  LGBMR
Training  catboost
Training first layer selector


KeyboardInterrupt: 