### Loading Libraries and Datasets

In [1]:
import numpy as np
import pandas as pd
import plotly_express as px

from sklearn import set_config
from sklearn.base import clone
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Notebook-wide display and framework options
pd.options.display.max_rows = 100                # render up to 100 rows of a frame
set_config(transform_output = 'pandas')          # sklearn transformers return DataFrames
pd.set_option('mode.chained_assignment', None)   # silence SettingWithCopyWarning

In [2]:
# Seed passed to every model below so runs are repeatable
seed = 42

# Training table, read relative to the current working directory
data = pd.read_csv("train.csv")

In [3]:
# 10-fold time-ordered splitter; used as the default `cv` of cross_val_score
tss = TimeSeriesSplit(n_splits=10)

def cross_val_score(estimatorConstructor, X, y, cv = tss):
    """Cross-validate a model on mean-imputed features and report MAE per fold.

    A fresh model is built for every fold by calling ``estimatorConstructor()``.
    Missing feature values are replaced by the column means of that fold's
    training rows, and the same means are applied to the validation rows.

    Parameters
    ----------
    estimatorConstructor : callable
        Zero-argument callable returning an object that exposes
        ``fit(X, y)`` and ``predict(X)``.
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``iloc``.
    y : pandas.Series
        Target values, positionally aligned with ``X``.
    cv : splitter, default ``tss``
        Object whose ``split(X, y)`` yields ``(train_idx, val_idx)`` pairs.

    Returns
    -------
    val_scores : list of float
        Validation mean absolute error of each fold, in fold order.
    val_predictions : numpy.ndarray
        Out-of-fold predictions, one entry per row of ``X``. Rows that never
        appear in a validation fold keep their initial value of 0.
    """
    val_predictions = np.zeros(len(X))
    val_scores = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        model = estimatorConstructor()

        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        # Impute with the training-fold means. fillna returns new frames, so
        # the caller's X is never modified, whether iloc gave a copy or a view.
        mu = X_train.mean()
        X_train = X_train.fillna(mu)
        X_val = X_val.fillna(mu)

        model.fit(X_train, y_train)
        val_preds = model.predict(X_val)
        val_predictions[val_idx] += val_preds

        val_score = mean_absolute_error(y_val, val_preds)
        print("Fold", fold, "Val MAE:", val_score)
        val_scores.append(val_score)

    print(f'Val Score: {np.mean(val_scores):.5f} ± {np.std(val_scores):.5f}')

    return val_scores, val_predictions

In [51]:
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostClassifier

class MetaModel:
    """Blend of two regressors whose per-row mixing weight is itself learned.

    ``fit`` trains every first-layer regressor, derives for each training row
    the weight ``a`` in [0, 1] for which ``a * pred_0 + (1 - a) * pred_1`` is
    closest to the target, and trains ``firstLayerSelector`` to predict that
    weight from the features. ``predict`` blends the first-layer predictions
    with the predicted weight and then shifts the whole batch so that its mean
    equals the mean of the training target.

    The weight derivation and the blend both assume exactly two first-layer
    models (entries 0 and 1 of ``firstLayerMethods``).
    """

    def __init__(self):
        """Create the two first-layer regressors and the weight selector."""
        self.firstLayerMethods = [
            {
                "type":"LGBMR",
                "model":LGBMRegressor(random_state=seed, objective="mae", verbose=0, n_estimators=50, device='GPU')
            },
            {
                "type":"catboost",
                "model":CatBoostRegressor(random_seed=seed, objective="MAE", n_estimators=50, verbose=0)
            }
        ]

        self.firstLayerSelector = LGBMRegressor(
            random_state=seed, objective="mae", learning_rate=0.05, verbose=0,n_estimators=100, device='GPU'
        )

    def fit(self, X, y):
        """Train the first-layer models, then the selector on the ideal weights.

        Parameters
        ----------
        X : feature matrix accepted by the underlying models.
        y : target values; must provide ``.mean()`` (e.g. a pandas Series).

        Returns
        -------
        self
        """
        firstLayerPredictions = []

        for method in self.firstLayerMethods:
            # Look the entries up by key rather than relying on dict order.
            name, model = method["type"], method["model"]
            print("Training ", name)
            model.fit(X, y)
            # In-sample predictions: the selector is trained on the same rows.
            firstLayerPredictions.append(model.predict(X))

        print("Training first layer selector")

        # Solve y = a*p0 + (1-a)*p1 for a:  a = (y - p1) / (p0 - p1)
        pred_0 = np.asarray(firstLayerPredictions[0], dtype=float)
        pred_1 = np.asarray(firstLayerPredictions[1], dtype=float)
        target = np.asarray(y, dtype=float)
        gap = pred_0 - pred_1

        with np.errstate(divide="ignore", invalid="ignore"):
            a = (target - pred_1) / gap
        # When both models predict the same value every weight yields the same
        # blend, so the ratio is undefined (0/0 -> NaN, x/0 -> +/-inf). Use the
        # neutral weight 0.5 there instead of feeding NaN labels to the selector.
        a = np.where(gap == 0, 0.5, a)
        a = np.clip(a, 0, 1)

        self.firstLayerSelector.fit(X, a)

        # Remembered so predict() can re-centre its output.
        self.y_m = y.mean()

        return self

    def predict(self, X):
        """Return the weighted blend of the first-layer predictions for ``X``.

        The blend is shifted by a constant so that the mean over this batch
        equals the training-target mean stored by ``fit``. The value returned
        for a row therefore depends on the other rows in ``X``; a batch of a
        single row always evaluates to the training-target mean.
        """
        firstLayerPredictions = np.array(
            [method["model"].predict(X) for method in self.firstLayerMethods]
        ).T

        a = self.firstLayerSelector.predict(X)

        # Column 0 weights model 0, column 1 weights model 1.
        firstLayerSelection = np.array([a, 1-a]).T

        prediction = (firstLayerPredictions * firstLayerSelection).sum(axis=1)

        # Mean change
        prediction += self.y_m - np.mean(prediction)

        return prediction

In [5]:
# Build every derived feature from the raw columns in one chained expression,
# then drop the columns that are no longer needed. Nothing is mutated in
# place: `data` is rebound to the resulting frame.
# NOTE: this cell consumes 'imbalance_size' and 'imbalance_buy_sell_flag', so
# re-running it without reloading `data` raises a KeyError.
data = (
    data
    .assign(
        # Auction imbalance: signed size, and size relative to the matched volume
        imbalance_auction=data['imbalance_size'] * data['imbalance_buy_sell_flag'],
        imbalance_auction_proportion_matched=data['imbalance_size'] / data['matched_size'],
        # Share of the bid size in the combined bid and ask size
        imbalance_order_book=data['bid_size'] / (data['bid_size'] + data['ask_size']),
        spread=data['ask_price'] - data['bid_price'],
        mid_price=(data['ask_price'] + data['bid_price']) / 2,
        # Complementary 0/1 flags around the 300-second mark of the bucket
        bef_300=np.where(data['seconds_in_bucket'] <= 300, 1, 0),
        aft_300=np.where(data['seconds_in_bucket'] > 300, 1, 0),
    )
    .drop(columns=['imbalance_size', 'imbalance_buy_sell_flag', 'row_id', 'time_id'])
)
display(data.head())

Unnamed: 0,stock_id,date_id,seconds_in_bucket,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,imbalance_auction,imbalance_auction_proportion_matched,imbalance_order_book,spread,mid_price,bef_300,aft_300
0,0,0,0,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,3180602.69,0.237708,0.87717,0.000214,0.999919,1,0
1,1,0,0,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,-166603.91,0.101451,0.135625,0.000764,1.000278,1,0
2,2,0,0,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,-302879.87,0.166475,0.666468,0.000895,0.99985,1,0
3,3,0,0,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,-11917682.27,0.648061,0.00483,0.000215,1.000107,1,0
4,4,0,0,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,-447549.96,0.025058,0.974343,0.000622,0.999705,1,0


In [52]:
from sklearn.model_selection import train_test_split
    
# Keep only the rows that have a target, then split the label off the features
X = data.loc[data['target'].notna()]
y = X.pop('target')

# Cross-validate the blended model with the default time-ordered splitter
cross_val_score(MetaModel, X, y)

Training  LGBMR
Training  catboost
Training first layer selector
Fold 0 Val MAE: 5.886337388205201
Training  LGBMR
Training  catboost
Training first layer selector
Fold 1 Val MAE: 7.273210929739492
Training  LGBMR
Training  catboost
Training first layer selector
Fold 2 Val MAE: 6.8937905441883744
Training  LGBMR
Training  catboost
Training first layer selector
Fold 3 Val MAE: 7.202206586801649
Training  LGBMR
Training  catboost
Training first layer selector
Fold 4 Val MAE: 6.141815477012338
Training  LGBMR
Training  catboost
Training first layer selector
Fold 5 Val MAE: 6.0298463525297334
Training  LGBMR
Training  catboost
Training first layer selector
Fold 6 Val MAE: 6.546272485442427
Training  LGBMR
Training  catboost
Training first layer selector
Fold 7 Val MAE: 6.28645606662051
Training  LGBMR
Training  catboost
Training first layer selector
Fold 8 Val MAE: 5.978952801881253
Training  LGBMR
Training  catboost
Training first layer selector
Fold 9 Val MAE: 5.863586911435755
Val Score

([5.886337388205201,
  7.273210929739492,
  6.8937905441883744,
  7.202206586801649,
  6.141815477012338,
  6.0298463525297334,
  6.546272485442427,
  6.28645606662051,
  5.978952801881253,
  5.863586911435755],
 array([ 0.        ,  0.        ,  0.        , ...,  0.11364601,
         0.7581702 , -2.31720132]))

CV 5 folds (mean validation MAE, lower is better):
- 6.49228 with full features + fillna mean + 50 estimators + mean change (V3)
- 6.49264 with full features + fillna mean + 50 estimators (V2)
- 6.49265 with full features + fillna 0 + 100 estimators (V1)
- 6.49287 with full features + fillna mean + 50 estimators + new features
- 6.49358 with full features + fillna mean + 100 estimators
- 6.49651 with full features + fillna mean + 150 estimators
- 6.50264 with selected features + fillna 0 + 100 estimators

CV 10 folds (mean validation MAE, lower is better):
- 6.41020 with V3 + coef predictions LGBMR 100
- 6.41232 with V3
- 6.41378 with V3 + coef predictions catboost 50
- 6.41446 with V2
- 6.41493 with V3 + mean predictions
- 6.42087 with V1