### Loading Libraries and Datasets

In [1]:
import numpy as np
import pandas as pd
import plotly_express as px

from sklearn import set_config
from sklearn.base import clone
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

# Show up to 100 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)
# Make scikit-learn transformers return pandas objects instead of NumPy arrays.
set_config(transform_output = 'pandas')
# Silence pandas' SettingWithCopyWarning for the whole notebook.
# NOTE(review): this also hides genuine chained-assignment mistakes — confirm it is still wanted.
pd.options.mode.chained_assignment = None

In [22]:
# Read the training rows and a per-stock table from the working directory.
data = pd.read_csv("train.csv")
medianVol = pd.read_csv("medianVolV2.csv")
# Left join on stock_id: every row of `data` is kept and the medianVol columns are attached where the id matches.
# NOTE(review): presumably medianVol holds one row per stock_id; duplicates there would multiply rows — confirm.
data = data.merge(medianVol, how = "left", on = "stock_id")
# Random seed passed to the models defined later in the notebook.
seed = 42

### Cross-Validation

In [38]:
# Default splitter: 10 contiguous folds (KFold does not shuffle unless asked to).
cv = KFold(n_splits=10)

def cross_val_score(estimatorConstructor, X, y, cv = cv, split=False, fitted_models=None):
    """Fit a fresh estimator on every fold and report the validation MAE.

    Parameters
    ----------
    estimatorConstructor : callable
        Zero-argument factory returning an unfitted object that exposes
        ``fit(X, y)`` and ``predict(X)``. It is used for every model trained
        here, including the two per-fold models of split mode.
    X : pandas.DataFrame
        Feature matrix. Must contain a ``near_price`` column when ``split`` is true.
    y : pandas.Series
        Targets, row-aligned with ``X``.
    cv : splitter, optional
        Object exposing ``split(X, y)``. The default is the ``KFold`` defined
        just above (bound when the function is defined).
    split : bool, optional
        When true, each fold trains one model on the rows whose ``near_price``
        is NaN ("before") and one on the remaining rows ("after"); the fold
        score is the row-weighted mean of the two MAEs.
    fitted_models : list, optional
        List that receives every fitted model. When omitted, the notebook-level
        ``models`` list is used, which must then exist before the call.
        In split mode models are appended in (before, after) order per fold,
        so ``[::2]`` are the "before" models and ``[1::2]`` the "after" ones.

    Returns
    -------
    list of float
        One validation MAE per fold. Per-fold scores and their mean ± std are
        also printed.
    """
    if fitted_models is None:
        # Backward compatible default: collect into the notebook-level list.
        fitted_models = models

    def _fit_and_score(X_fit, y_fit, X_eval, y_eval):
        # Train one fresh estimator, record it, and return its MAE on the evaluation rows.
        estimator = estimatorConstructor()
        estimator.fit(X_fit, y_fit)
        fitted_models.append(estimator)
        if len(y_eval) == 0:
            # Nothing to score; the caller weights this part by zero rows.
            return 0.0
        return mean_absolute_error(y_eval, estimator.predict(X_eval))

    val_scores = []

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        if split:
            # Positional masks keep features and labels aligned row by row,
            # independently of what the index labels look like.
            train_is_bef = X_train['near_price'].isna().to_numpy()
            val_is_bef = X_val['near_price'].isna().to_numpy()

            mae_bef = _fit_and_score(X_train[train_is_bef], y_train[train_is_bef],
                                     X_val[val_is_bef], y_val[val_is_bef])
            mae_aft = _fit_and_score(X_train[~train_is_bef], y_train[~train_is_bef],
                                     X_val[~val_is_bef], y_val[~val_is_bef])

            n_bef = int(val_is_bef.sum())
            n_aft = len(y_val) - n_bef
            val_score = (mae_bef * n_bef + mae_aft * n_aft)/len(y_val)
        else:
            val_score = _fit_and_score(X_train, y_train, X_val, y_val)

        print("Fold", fold, "Val MAE:", val_score)
        val_scores.append(val_score)

    print(f'Val Score: {np.mean(val_scores):.5f} ± {np.std(val_scores):.5f}')

    return val_scores

### Model

In [33]:
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostClassifier

class MetaModel :
    """Blend of two regressors with a learned, per-row mixing weight.

    Two first-layer regressors (LightGBM and CatBoost, both with an MAE
    objective) are fitted on the data. A third regressor, the "selector",
    is fitted to predict the weight ``a`` in ``y = a*p0 + (1-a)*p1`` from the
    features, where ``p0``/``p1`` are the first-layer predictions.

    NOTE(review): the selector target is derived from in-sample first-layer
    predictions (the same rows the first layer was fitted on).
    """

    def __init__(self, device='GPU'):
        """Create the unfitted models.

        Parameters
        ----------
        device : str, optional
            Value forwarded as ``device`` to both LightGBM regressors.
            Defaults to ``'GPU'``, the value that used to be hard-coded.
        """
        self.firstLayerMethods = [
            {
                "type":"LGBMR",
                "model":LGBMRegressor(random_state=seed, objective="mae", verbose=0, n_estimators=50, device=device)
            },
            {
                "type":"catboost",
                "model":CatBoostRegressor(random_seed=seed, objective="MAE", n_estimators=50, verbose=0)
            }
        ]

        self.firstLayerSelector = LGBMRegressor(random_state=seed, objective="mae", verbose=0, n_estimators=100, device=device)

    def fit(self, X, y):
        """Fit both first-layer models, then the selector on the ideal mixing weight.

        Parameters
        ----------
        X : feature matrix accepted by the underlying models.
        y : targets; must support ``.mean()`` (e.g. a pandas Series).

        Returns
        -------
        MetaModel
            ``self``, so calls can be chained.
        """
        firstLayerPredictions = []

        for method in self.firstLayerMethods:
            name, model = method["type"], method["model"]
            print("Training ", name)
            model.fit(X, y)
            firstLayerPredictions.append(np.asarray(model.predict(X), dtype=float))

        print("Training first layer selector")

        # The blend assumes exactly two first-layer models.
        pred_a, pred_b = firstLayerPredictions

        # y = a*pred_a + (1-a)*pred_b  ->  a = (y - pred_b) / (pred_a - pred_b)
        numerator = np.asarray(y, dtype=float) - pred_b
        denominator = pred_a - pred_b

        # Where both models agree the weight has no effect on the blend; use a
        # neutral 0.5 there instead of dividing by zero (which produced NaN/inf targets).
        a = np.full(numerator.shape, 0.5)
        np.divide(numerator, denominator, out=a, where=denominator != 0)

        # Only convex combinations are learned: clamp the weight into [0, 1].
        a = np.clip(a, 0, 1)

        self.firstLayerSelector.fit(X, a)

        # Training-target mean, used by predict() to re-centre its output.
        self.y_m = y.mean()

        return self

    def predict(self, X):
        """Return the weighted blend of the first-layer predictions for ``X``.

        The blend is shifted so that its mean over the rows of ``X`` equals the
        training-target mean stored by ``fit``. Consequently the value returned
        for a row depends on the other rows in the same call, and a call with a
        single row always returns the training mean.
        """
        pred_a, pred_b = (
            np.asarray(method["model"].predict(X), dtype=float)
            for method in self.firstLayerMethods
        )

        a = self.firstLayerSelector.predict(X)

        prediction = a * pred_a + (1 - a) * pred_b

        # Mean change
        prediction += self.y_m - np.mean(prediction)

        return prediction

In [26]:
def generate_features(df):
    """Build the model feature matrix from raw order-book columns.

    The input frame is left untouched; all derived columns are computed on the
    side and joined once at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the raw columns listed in ``features`` below (except
        ``imb_s1``/``imb_s2``, which are derived here).

    Returns
    -------
    pandas.DataFrame
        New frame, same index as ``df``, with columns in this order:
        the 17 base features, then one ``{a}_{b}_imb`` column per pair of
        prices, then one ``{a}_{b}_{c}_imb2`` column per triple of prices.

    Notes
    -----
    * Ratios are not guarded against a zero denominator, so inf/NaN values can
      appear (e.g. when the two smallest prices of a triple are equal).
    * Row-wise ``max``/``min``/``sum`` skip NaN by default, so for triples with
      a missing price the "middle" value is not a true median.
    """
    features = ['seconds_in_bucket', 'imbalance_buy_sell_flag',
                'imbalance_size', 'matched_size', 'bid_size', 'ask_size',
                'reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap',
                'imb_s1', 'imb_s2', 'overall_medvol', 'first5min_medvol', 'last5min_medvol']

    # Derived columns are collected here instead of being written into `df`.
    derived = {
        'imb_s1': (df['bid_size'] - df['ask_size']) / (df['bid_size'] + df['ask_size']),
        'imb_s2': (df['imbalance_size'] - df['matched_size']) / (df['matched_size'] + df['imbalance_size']),
    }

    # Working copy of the price columns plus the mid price (used only as an
    # ingredient of the pair/triple features, not returned on its own).
    price_frame = df[['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']].assign(
        mid_price=(df['ask_price'] + df['bid_price']) / 2
    )
    prices = list(price_frame.columns)

    # Pairwise imbalance; loop order fixes the column order (later price first in the name).
    for i in range(len(prices)):
        for j in range(i):
            a, b = prices[i], prices[j]
            name = f'{a}_{b}_imb'
            derived[name] = (price_frame[a] - price_frame[b]) / (price_frame[a] + price_frame[b])
            features.append(name)

    # Triple imbalance: (max - mid) / (mid - min) over each combination of three prices.
    for i in range(len(prices)):
        for j in range(i):
            for k in range(j):
                a, b, c = prices[i], prices[j], prices[k]
                trio = price_frame[[a, b, c]]
                max_ = trio.max(axis=1)
                min_ = trio.min(axis=1)
                mid_ = trio.sum(axis=1) - min_ - max_

                name = f'{a}_{b}_{c}_imb2'
                derived[name] = (max_ - mid_) / (mid_ - min_)
                features.append(name)

    # Take raw columns from the input and everything else from `derived`
    # (a derived column wins if the input already carries one of the same name).
    passthrough = [name for name in features if name not in derived]
    combined = pd.concat([df[passthrough], pd.DataFrame(derived)], axis=1)

    return combined[features]

### Performance

In [27]:
# Keep only the rows that have a target value.
X = data[~data.target.isna()]
# pop() removes the 'target' column from X and returns it as the label Series.
y = X.pop('target')
# Replace X with the engineered feature matrix.
X = generate_features(X)

In [139]:
# Random 67/33 hold-out split, seeded with the notebook-wide `seed`.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)

# Also read by the cross-validation and ensemble cells further down.
split = False

if split:
    # Rows without a near_price go to the "before" model, all others to the "after" model.
    # The same boolean mask selects features and labels, so they stay aligned.
    train_is_bef = X_train['near_price'].isna()
    test_is_bef = X_test['near_price'].isna()

    X_bef, y_bef = X_train[train_is_bef], y_train[train_is_bef]
    model_bef = MetaModel()
    model_bef.fit(X_bef, y_bef)

    X_aft, y_aft = X_train[~train_is_bef], y_train[~train_is_bef]
    model_aft = MetaModel()
    model_aft.fit(X_aft, y_aft)

    X_bef_test, y_bef_test = X_test[test_is_bef], y_test[test_is_bef]
    pred_bef = model_bef.predict(X_bef_test)
    mae_bef = mean_absolute_error(y_bef_test, pred_bef)

    X_aft_test, y_aft_test = X_test[~test_is_bef], y_test[~test_is_bef]
    pred_aft = model_aft.predict(X_aft_test)
    mae_aft = mean_absolute_error(y_aft_test, pred_aft)

    # Row-weighted mean of the two partial MAEs.
    print("MAE:", (mae_bef*len(X_bef_test)+mae_aft*len(X_aft_test))/len(X_test))
else:
    model = MetaModel()
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print("MAE:", mean_absolute_error(y_test, pred))

# Hold-out MAE recorded for earlier experiments
# ("ours" = own baseline; "his" presumably = another author's feature set — TODO confirm).
# 6.286967181367014 ours
# 6.274157105523045 his+[mid_price]
# 6.270861222339846 his+[mid_price]+median_vol_v2
# 6.265982478394327 his+[mid_price]+split_bef_aft
# 6.263266267533804 his+[mid_price]+split_bef_aft+median_vol
# 6.262500336351308 his+[mid_price]+split_NaN+median_vol_v2

Training  LGBMR
Training  catboost
Training first layer selector
MAE: 6.256956064248925


In [86]:
# Start from an empty list: cross_val_score appends every model it fits to the global `models`.
models = []
cross_val_score(MetaModel, X_train, y_train, split=split)

# Cross-validation MAE recorded for earlier experiments
# ("ours" = own baseline; "his" presumably = another author's feature set — TODO confirm).
# 6.29479 ours
# 6.28149 his+[mid_price]
# 6.27809 his+[mid_price]+median_vol_v2
# 6.27396 his+[mid_price]+split_bef_aft
# 6.27060 his+[mid_price]+split_bef_aft+median_vol
# 6.27040 his+[mid_price]+split_NaN+median_vol_v2

Training  LGBMR
Training  catboost
Training first layer selector
Fold 0 Val MAE: 6.19873872787118
Training  LGBMR
Training  catboost
Training first layer selector
Fold 1 Val MAE: 6.221486693212527
Training  LGBMR
Training  catboost
Training first layer selector
Fold 2 Val MAE: 6.207644535256036
Training  LGBMR
Training  catboost
Training first layer selector
Fold 3 Val MAE: 6.214768346421529
Training  LGBMR
Training  catboost
Training first layer selector
Fold 4 Val MAE: 6.213130733897578
Training  LGBMR
Training  catboost
Training first layer selector
Fold 5 Val MAE: 6.204202285868182
Training  LGBMR
Training  catboost
Training first layer selector
Fold 6 Val MAE: 6.198342442735598
Training  LGBMR
Training  catboost
Training first layer selector
Fold 7 Val MAE: 6.181171965348017
Training  LGBMR
Training  catboost
Training first layer selector
Fold 8 Val MAE: 6.229710807304494
Training  LGBMR
Training  catboost
Training first layer selector
Fold 9 Val MAE: 6.202128663262272
Val Score: 

[6.19873872787118,
 6.221486693212527,
 6.207644535256036,
 6.214768346421529,
 6.213130733897578,
 6.204202285868182,
 6.198342442735598,
 6.181171965348017,
 6.229710807304494,
 6.202128663262272]

In [87]:
# Score the average of the fold models (collected in `models`) on the hold-out set.
if split:
    # Partition the test rows with the same rule the fold models were trained with
    # (near_price missing -> "before" model), not by a seconds_in_bucket threshold.
    test_is_bef = X_test['near_price'].isna()

    # cross_val_score appends (before, after) per fold: even slots are "before" models.
    X_bef_test, y_bef_test = X_test[test_is_bef], y_test[test_is_bef]
    pred_bef = np.mean([fold_model.predict(X_bef_test) for fold_model in models[::2]], axis=0)
    mae_bef = mean_absolute_error(y_bef_test, pred_bef)

    # Odd slots are the "after" models.
    X_aft_test, y_aft_test = X_test[~test_is_bef], y_test[~test_is_bef]
    pred_aft = np.mean([fold_model.predict(X_aft_test) for fold_model in models[1::2]], axis=0)
    mae_aft = mean_absolute_error(y_aft_test, pred_aft)

    # Row-weighted mean of the two partial MAEs.
    print("MAE:", (mae_bef*len(X_bef_test)+mae_aft*len(X_aft_test))/len(X_test))

else:
    pred = np.mean([fold_model.predict(X_test) for fold_model in models], axis=0)
    print("MAE:", mean_absolute_error(y_test, pred))

# Ensemble hold-out MAE recorded for earlier experiments
# ("ours" = own baseline; "his" presumably = another author's feature set — TODO confirm).
# 6.286636710886435 ours
# 6.273552818669892 his+[mid_price]
# 6.270087093692114 his+[mid_price]+median_vol_v2
# 6.265754044619724 his+[mid_price]+split_bef_aft
# 6.26409276253301 his+[mid_price]+split_NaN+median_vol_v2
# 6.262009831354247 his+[mid_price]+split_bef_aft+median_vol

MAE: 6.19909868972393
