### Loading Libraries and Datasets

In [1]:
import numpy as np
import pandas as pd

from sklearn import set_config
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

# Show up to 100 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 100)
# Ask scikit-learn transformers to return pandas objects instead of NumPy arrays.
set_config(transform_output = 'pandas')
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None

In [2]:
# Random seed passed to every model constructed further down.
seed = 42

# Volume table keyed by stock_id; the left join keeps every training row.
medianVol = pd.read_csv("medianVolV2.csv")
data = pd.read_csv("train.csv").merge(medianVol, how="left", on="stock_id")

In [3]:
def generate_features(df):
    """Build the model feature matrix from the raw order-book columns.

    The input frame is NOT modified: all derived columns are computed on a
    private copy of the required input columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the raw columns listed in ``features`` below (everything
        except ``imb_s1`` / ``imb_s2``, which are derived here).

    Returns
    -------
    pandas.DataFrame
        A new frame with, in order: the 17 base columns, one ``{a}_{b}_imb``
        column per pair of price columns (21), and one ``{a}_{b}_{c}_imb2``
        column per triple of price columns (35).

    Raises
    ------
    KeyError
        If a required raw column is missing from ``df``.
    """
    features = ['seconds_in_bucket', 'imbalance_buy_sell_flag',
                'imbalance_size', 'matched_size', 'bid_size', 'ask_size',
                'reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap',
                'imb_s1', 'imb_s2', 'overall_medvol', 'first5min_medvol', 'last5min_medvol']

    # Copy only the raw inputs we need so the caller's frame is never mutated.
    raw_columns = [name for name in features if name not in ('imb_s1', 'imb_s2')]
    work = df[raw_columns].copy()

    # Normalised size imbalances: (x - y) / (x + y).
    work['imb_s1'] = work.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    work['imb_s2'] = work.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')

    work['mid_price'] = (work['ask_price'] + work['bid_price']) / 2

    prices = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap', 'mid_price']

    # Pairwise price imbalances. Iterating `prices[:i]` visits exactly the
    # (i, j) pairs with i > j, in the same order as a filtered double loop,
    # so the resulting column order is stable.
    for i, a in enumerate(prices):
        for b in prices[:i]:
            name = f'{a}_{b}_imb'
            work[name] = work.eval(f'({a}-{b})/({a}+{b})')
            features.append(name)

    # Triplet imbalances: (max - mid) / (mid - min) over every i > j > k.
    # NOTE: the ratio is +/-inf or NaN on rows where mid == min; these values
    # are passed through unchanged.
    for i, a in enumerate(prices):
        for j, b in enumerate(prices[:i]):
            for c in prices[:j]:
                trio = work[[a, b, c]]
                max_ = trio.max(axis=1)
                min_ = trio.min(axis=1)
                mid_ = trio.sum(axis=1) - min_ - max_

                name = f'{a}_{b}_{c}_imb2'
                work[name] = (max_ - mid_) / (mid_ - min_)
                features.append(name)

    return work[features]

### Cross-Validation

In [21]:
# Default splitter for the cross-validation helpers below: 10 folds, no shuffling requested.
cv = KFold(n_splits=10)

# Registries that the cross-validation helpers append each fitted fold model to.
models_reg, models_classif = [], []

def cross_val_score_reg(X, y, cv = cv):
    """Cross-validate MetaModelReg and print the per-fold and overall MAE.

    One fresh MetaModelReg is fitted per fold and appended to the module-level
    ``models_reg`` list (calling this function again keeps appending to it).
    Nothing is returned; results are reported through ``print``.

    X, y are indexed positionally with ``.iloc``; ``cv`` must expose
    ``split(X, y)`` yielding (train indices, validation indices).
    """
    fold_maes = []

    for fold_no, (fit_rows, holdout_rows) in enumerate(cv.split(X, y)):
        # Train on the fold's training rows and keep the fitted model.
        regressor = MetaModelReg()
        regressor.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        models_reg.append(regressor)

        # Score on the held-out rows.
        holdout_mae = mean_absolute_error(
            y.iloc[holdout_rows],
            regressor.predict(X.iloc[holdout_rows]),
        )

        print("Fold", fold_no, "Val MAE:", holdout_mae)
        fold_maes.append(holdout_mae)

    print(f'Val Score: {np.mean(fold_maes):.5f} ± {np.std(fold_maes):.5f}')

def cross_val_score_classif(X, y, cv = cv):
    """Cross-validate MetaModelClassif and print sign-prediction statistics.

    One fresh MetaModelClassif is fitted per fold and appended to the
    module-level ``models_classif`` list (repeated calls keep appending).
    For every fold the two ratios returned by ``evaluate`` are printed,
    followed by their means over all folds. Nothing is returned.
    """
    coverage_by_fold = []
    hit_rate_by_fold = []

    for fold_no, (fit_rows, holdout_rows) in enumerate(cv.split(X, y)):
        # Train on the fold's training rows and keep the fitted model.
        classifier = MetaModelClassif()
        classifier.fit(X.iloc[fit_rows], y.iloc[fit_rows])
        models_classif.append(classifier)

        # Share of non-zero predictions, and share of those with the right sign.
        coverage, hit_rate = evaluate(
            classifier.predict(X.iloc[holdout_rows]),
            y.iloc[holdout_rows],
        )

        print("Fold", fold_no, "----------")
        print("Pred != 0:", coverage)
        print("Good Pred | Pred != 0:", hit_rate)
        print("Good Pred:", coverage*hit_rate)

        coverage_by_fold.append(coverage)
        hit_rate_by_fold.append(hit_rate)

    print("--------------------")
    print("Mean Pred != 0:", np.mean(coverage_by_fold))
    print("Mean Good Pred | Pred != 0:", np.mean(hit_rate_by_fold))

### Model Regression

In [6]:
class MetaModelReg:
    """Blend of two regressors with a learned, per-row mixing weight.

    Two first-layer models (LightGBM and CatBoost, both trained with an MAE
    objective) are fitted on the data. A third LightGBM regressor (the
    "selector") is then trained to predict, per row, the weight ``a`` in

        y ~= a * pred_0 + (1 - a) * pred_1

    Only the first two entries of ``firstLayerMethods`` take part in the blend.
    """

    def __init__(self):
        # Each entry: a display name ("type") and the estimator ("model").
        self.firstLayerMethods = [
            {
                "type":"LGBMR",
                "model":LGBMRegressor(random_state=seed, objective="mae", verbose=0, n_estimators=50, device='GPU')
            },
            {
                "type":"catboost",
                "model":CatBoostRegressor(random_seed=seed, objective="MAE", n_estimators=50, verbose=0)
            }
        ]

        # Predicts the per-row blend weight from the same features.
        self.firstLayerSelector = LGBMRegressor(random_state=seed, objective="mae", verbose=0, n_estimators=100, device='GPU')

    def fit(self, X, y):
        """Fit both first-layer models, then the blend-weight selector.

        Parameters
        ----------
        X : feature matrix accepted by the underlying estimators.
        y : target values; must support ``.mean()`` (e.g. a pandas Series).

        Returns
        -------
        self, to allow call chaining.
        """
        firstLayerPredictions = []

        for method in self.firstLayerMethods:
            print("Training ", method["type"])
            method["model"].fit(X, y)
            # In-sample predictions are what the selector's target is built from.
            firstLayerPredictions.append(np.asarray(method["model"].predict(X), dtype=float))

        print("Training first layer selector")

        # y = a*x + (1-a)*z  ->  a = (y-z)/(x-z)
        target = np.asarray(y, dtype=float)
        pred_x, pred_z = firstLayerPredictions[0], firstLayerPredictions[1]
        with np.errstate(divide='ignore', invalid='ignore'):
            a = (target - pred_z) / (pred_x - pred_z)

        # Keep the weight a convex-combination coefficient; +/-inf clips to 1/0.
        a = np.clip(a, 0, 1)
        # 0/0 happens when both models already equal the target: any weight is
        # equally good there, so use the neutral 0.5 instead of feeding NaN
        # labels to the selector.
        a = np.where(np.isnan(a), 0.5, a)

        self.firstLayerSelector.fit(X, a)

        # Training-set target mean, used to re-centre predictions.
        self.y_m = y.mean()

        return self

    def predict(self, X):
        """Return blended predictions for X, re-centred on the training mean.

        NOTE: the final step shifts the whole batch so that its mean equals the
        training-target mean. Predictions therefore depend on which rows are
        predicted together (a single-row batch always yields ``y_m``).
        """
        firstLayerPredictions = [
            np.asarray(method["model"].predict(X), dtype=float)
            for method in self.firstLayerMethods
        ]

        a = self.firstLayerSelector.predict(X)

        # Weighted combination of the two first-layer predictions.
        prediction = a * firstLayerPredictions[0] + (1 - a) * firstLayerPredictions[1]

        # Mean change
        prediction += self.y_m - np.mean(prediction)

        return prediction

### Model Classification

In [40]:
class MetaModelClassif:
    """Two-classifier vote on the sign of the target.

    ``predict`` returns 1 / -1 only when both classifiers agree on the class
    and at least one of them is more confident than ``proba_threshold``;
    otherwise it returns 0 (abstain).

    Parameters
    ----------
    n_estimators : int, default 400
        Number of boosting iterations for both classifiers.
    proba_threshold : float, default 0.57
        Minimum confidence (highest class probability of either classifier)
        required to emit a non-zero prediction.
    """

    def __init__(self, n_estimators=400, proba_threshold=0.57):
        self.proba_threshold = proba_threshold

        # Each entry: a display name ("type") and the estimator ("model").
        self.models = [
            {
                "type": "LGBM Classifier",
                "model": LGBMClassifier(random_state=seed, n_estimators=n_estimators, verbose=0)
            },
            {
                "type": "CatBoost Classifier",
                "model": CatBoostClassifier(random_seed=seed, n_estimators=n_estimators, verbose=0)
            }
        ]

    def fit(self, X, y):
        """Fit both classifiers on the binarised target (1 if y >= 0 else 0).

        Returns self, to allow call chaining.
        """
        labels = np.where(y >= 0, 1, 0)

        for method in self.models:
            print("Training ", method["type"])
            method["model"].fit(X, labels)

        return self

    def predict(self, X):
        """Return an integer array with values in {-1, 0, 1} for the rows of X."""
        models_pred_class = []
        models_pred_proba = []

        for method in self.models:
            model = method["model"]
            models_pred_class.append(model.predict(X))
            # Confidence of a model = probability of its most likely class.
            models_pred_proba.append(np.max(model.predict_proba(X), axis=1))

        # Mean of the 0/1 votes: exactly 1 or 0 only when all models agree.
        pred_class = np.mean(np.array(models_pred_class), axis=0)
        # Highest confidence reported by any model.
        pred_proba = np.max(np.array(models_pred_proba), axis=0)

        confident = pred_proba > self.proba_threshold
        prediction = np.where((pred_class == 1) & confident, 1, 0)
        prediction = np.where((pred_class == 0) & confident, -1, prediction)

        return prediction

In [11]:
def evaluate(pred, value):
    """Score sign predictions against the true values.

    ``pred`` and ``value`` are compared element by element, by position.

    Parameters
    ----------
    pred : array-like of {-1, 0, 1}
        Predicted sign; 0 means "no prediction".
    value : array-like of numbers
        True values; only their sign is used.

    Returns
    -------
    (pred_not_null, good_pred_not_null) : tuple of float
        ``pred_not_null`` is the fraction of rows with a non-zero prediction.
        ``good_pred_not_null`` is, among those rows, the fraction whose
        prediction equals ``sign(value)``. A ratio whose denominator is zero
        (empty input, or no non-zero prediction) is returned as NaN instead of
        raising ZeroDivisionError.
    """
    pred_arr = np.asarray(pred)
    value_arr = np.asarray(value)

    active = pred_arr != 0
    n_total = len(value_arr)
    n_active = int(active.sum())

    pred_not_null = n_active / n_total if n_total else float('nan')

    if n_active:
        n_correct = int((active & (np.sign(value_arr) == pred_arr)).sum())
        good_pred_not_null = n_correct / n_active
    else:
        good_pred_not_null = float('nan')

    return pred_not_null, good_pred_not_null

### Performance

In [8]:
# Keep only the rows that have a target, split the target off, then derive the features.
X = data.loc[data['target'].notna()]
y = X['target']
X = generate_features(X.drop(columns='target'))

#### Without CV

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fill missing values with the TRAINING column means in both splits,
# so the test split contributes nothing to the imputation values.
mu = X_train.mean()
X_train = X_train.fillna(mu)
X_test = X_test.fillna(mu)

In [10]:
# nb_estimators=100 for classifier
# NOTE(review): MetaModelClassif as written in this notebook uses n_estimators=400;
# presumably the class was edited to 100 for this run -- confirm before comparing scores.

# The classifier must never see its own earlier output: if this cell is re-run,
# X_train / X_test already contain 'sign', so exclude it from the classifier inputs.
clf_train = X_train.drop(columns='sign') if 'sign' in X_train else X_train
clf_test = X_test.drop(columns='sign') if 'sign' in X_test else X_test

model_classif = MetaModelClassif()
model_classif.fit(clf_train, y_train)

# Predicted sign becomes an extra feature for the regressor.
X_train['sign'] = model_classif.predict(clf_train)
X_test['sign'] = model_classif.predict(clf_test)

model_reg = MetaModelReg()
model_reg.fit(X_train, y_train)
pred = model_reg.predict(X_test)

print("MAE:", np.mean(np.abs(pred-y_test)))

Training  LGBM Classifier
Training  CatBoost Classifier
Training  LGBMR
Training  catboost
Training first layer selector
MAE: 6.265823436717951


In [42]:
# nb_estimators=400 for classifier

# The classifier must never see its own earlier output: if this cell is re-run,
# X_train / X_test already contain 'sign', so exclude it from the classifier inputs.
clf_train = X_train.drop(columns='sign') if 'sign' in X_train else X_train
clf_test = X_test.drop(columns='sign') if 'sign' in X_test else X_test

model_classif = MetaModelClassif()
model_classif.fit(clf_train, y_train)

# Predicted sign becomes an extra feature for the regressor.
X_train['sign'] = model_classif.predict(clf_train)
X_test['sign'] = model_classif.predict(clf_test)

model_reg = MetaModelReg()
model_reg.fit(X_train, y_train)
pred = model_reg.predict(X_test)

print("MAE:", np.mean(np.abs(pred-y_test)))

Training  LGBM Classifier
Training  CatBoost Classifier
Training  LGBMR
Training  catboost
Training first layer selector
MAE: 6.256174230379108


In [18]:
# Same three statistics for the train split, then the test split, separated by a rule.
for split_no, (signs, targets) in enumerate(((X_train['sign'], y_train), (X_test['sign'], y_test))):
    if split_no:
        print("----------")
    pred_not_null, good_pred_not_null = evaluate(signs, targets)
    print("Pred != 0:", pred_not_null)
    print("Good Pred | Pred != 0:", good_pred_not_null)
    print("Good Pred:", pred_not_null*good_pred_not_null)

Pred != 0: 0.37096478672771055
Good Pred | Pred != 0: 0.6248441651515256
Good Pred: 0.23179518246349007
----------
Pred != 0: 0.37076953783761113
Good Pred | Pred != 0: 0.6213033078110152
Good Pred: 0.23036034029406915


#### With CV on Classifier

In [32]:
# Fresh split from X (same parameters as the earlier split in this notebook).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Fill missing values with the TRAINING column means in both splits.
mu = X_train.mean()
X_train = X_train.fillna(mu)
X_test = X_test.fillna(mu)

In [31]:
# Fits one MetaModelClassif per fold of the training split; each fitted model
# is appended to models_classif and the per-fold statistics are printed.
cross_val_score_classif(X_train, y_train)

In [35]:
# Average the {-1, 0, 1} votes of all fold classifiers, then keep a sign only
# when the average is beyond +/-0.2; everything in between becomes 0.
# The train split is processed first, then the test split.
for frame in (X_train, X_test):
    pred_mean = np.mean([model.predict(frame) for model in models_classif], axis=0)
    pred = np.where(pred_mean > 0.2, 1, 0)
    frame['sign'] = np.where(pred_mean < -0.2, -1, pred)

In [36]:
# Same three statistics for the train split, then the test split, separated by a rule.
for split_no, (signs, targets) in enumerate(((X_train['sign'], y_train), (X_test['sign'], y_test))):
    if split_no:
        print("----------")
    pred_not_null, good_pred_not_null = evaluate(signs, targets)
    print("Pred != 0:", pred_not_null)
    print("Good Pred | Pred != 0:", good_pred_not_null)
    print("Good Pred:", pred_not_null*good_pred_not_null)

Pred != 0: 0.4158509734036172
Good Pred | Pred != 0: 0.6202543268314944
Good Pred: 0.25793336557068225
----------
Pred != 0: 0.41546480918481576
Good Pred | Pred != 0: 0.6165000076587485
Good Pred: 0.2561340580443794


In [39]:
# Single regressor trained on the training split (including the 'sign' feature
# if it has been added), scored on the test split.
model_reg = MetaModelReg()
model_reg.fit(X_train, y_train)

pred = model_reg.predict(X_test)
abs_err = np.abs(pred-y_test)
print("MAE:", np.mean(abs_err))

Training  LGBMR
Training  catboost
Training first layer selector
MAE: 6.263797957983771


#### With CV on Classifier & Regressor

In [37]:
# Fits one MetaModelReg per fold of the training split; each fitted model is
# appended to models_reg and the per-fold validation MAE is printed.
cross_val_score_reg(X_train, y_train)

Training  LGBMR
Training  catboost
Training first layer selector
Fold 0 Val MAE: 6.254954237074795
Training  LGBMR
Training  catboost
Training first layer selector
Fold 1 Val MAE: 6.279565550756611
Training  LGBMR
Training  catboost
Training first layer selector
Fold 2 Val MAE: 6.266214868364509
Training  LGBMR
Training  catboost
Training first layer selector
Fold 3 Val MAE: 6.271564502268674
Training  LGBMR
Training  catboost
Training first layer selector
Fold 4 Val MAE: 6.272215527258818
Training  LGBMR
Training  catboost
Training first layer selector
Fold 5 Val MAE: 6.26350170347782
Training  LGBMR
Training  catboost
Training first layer selector
Fold 6 Val MAE: 6.2572012953523135
Training  LGBMR
Training  catboost
Training first layer selector
Fold 7 Val MAE: 6.238689633136678
Training  LGBMR
Training  catboost
Training first layer selector
Fold 8 Val MAE: 6.28545893455102
Training  LGBMR
Training  catboost
Training first layer selector
Fold 9 Val MAE: 6.261338069601696
Val Score: 

In [38]:
# Average the test predictions of every fold regressor collected in models_reg.
fold_predictions = [model.predict(X_test) for model in models_reg]
pred = np.mean(fold_predictions, axis=0)
print("MAE:", np.mean(np.abs(pred-y_test)))

MAE: 6.263430775828357
