### Loading Libraries and Datasets

In [1]:
import numpy as np
import pandas as pd
import plotly_express as px

from sklearn import set_config
from sklearn.base import clone
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

pd.set_option('display.max_rows', 100)
set_config(transform_output = 'pandas')
pd.options.mode.chained_assignment = None

In [2]:
data = pd.read_csv("train.csv")
medianVol = pd.read_csv("MedianVolV2.csv")
data = data.merge(medianVol, how = "left", on = "stock_id")
seed = 42

### Cross-Validation

In [3]:
cv = KFold(n_splits=5)
models = []

def cross_val_score(estimatorConstructor, X, y, cv = cv, split=False):
    preds_not_null, good_preds_not_null = [], []
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        
        model = estimatorConstructor()
        
        #define train set
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        
        #define validation set
        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx]
        
        model.fit(X_train, y_train)
        models.append(model)

        val_preds = model.predict(X_val)
        pred_not_null, good_pred_not_null = evaluate(val_preds, y_val)
        print("Fold", fold, "----------")
        print("Pred != 0:", pred_not_null)
        print("Good Pred | Pred != 0:", good_pred_not_null)
        print("Good Pred:", pred_not_null*good_pred_not_null)
        
        preds_not_null.append(pred_not_null)
        good_preds_not_null.append(good_pred_not_null)
    print("--------------------")
    print("Mean Pred != 0:", np.mean(preds_not_null))
    print("Mean Good Pred | Pred != 0:", np.mean(good_preds_not_null))

### Model

In [4]:
def generate_features(df):
    features = ['seconds_in_bucket', 'imbalance_buy_sell_flag',
                'imbalance_size', 'matched_size', 'bid_size', 'ask_size',
                'reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap',
                'imb_s1', 'imb_s2', 'overall_medvol', 'first5min_medvol', 'last5min_medvol']
    
    df['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')
    
    df['mid_price'] = (df['ask_price'] + df['bid_price']) / 2
    
    prices = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap', 'mid_price']
    
    for i,a in enumerate(prices):
        for j,b in enumerate(prices):
            if i>j:
                df[f'{a}_{b}_imb'] = df.eval(f'({a}-{b})/({a}+{b})')
                features.append(f'{a}_{b}_imb')    
                    
    for i,a in enumerate(prices):
        for j,b in enumerate(prices):
            for k,c in enumerate(prices):
                if i>j and j>k:
                    max_ = df[[a,b,c]].max(axis=1)
                    min_ = df[[a,b,c]].min(axis=1)
                    mid_ = df[[a,b,c]].sum(axis=1)-min_-max_

                    df[f'{a}_{b}_{c}_imb2'] = (max_-mid_)/(mid_-min_)
                    features.append(f'{a}_{b}_{c}_imb2')
    
    return df[features]

In [24]:
class MetaModel : 
    def __init__(self):
        self.models = [
            {
                "type": "LGBM Classifier",
                "model": LGBMClassifier(random_seed=seed, n_estimators=100, verbose=0)
            },
            {
                "type": "CatBoost Classifier",
                "model": CatBoostClassifier(random_seed=seed, n_estimators=100, verbose=0)
            }
        ]
        
        return
    
    def fit(self, X, y):
        y = np.where(y >= 0, 1, 0)
        
        for i, method in enumerate(self.models):
            type, model = method.values()
            print("Training ", type)
            model.fit(X, y)
        
        return

    def predict(self, X):
        models_pred_class = []
        models_pred_proba = []
        for i, method in enumerate(self.models):
            type, model = method.values()
            models_pred_class.append(model.predict(X))
            models_pred_proba.append(np.max(model.predict_proba(X), axis=1))
        
        pred_class = np.mean(np.array(models_pred_class).T, axis=1)
        pred_proba = np.max(np.array(models_pred_proba).T, axis=1)
            
        prediction = np.where((pred_class == 1) & (pred_proba > 0.57), 1, 0)
        prediction = np.where((pred_class == 0) & (pred_proba > 0.57), -1, prediction)
        
        return prediction

In [6]:
def evaluate(pred, value):
    pred_not_null = len(value[pred!=0])/len(value)
    good_pred_not_null = len(value[(np.sign(value) == pred) & (pred!=0)])/len(value[pred!=0])
    return pred_not_null, good_pred_not_null

### Performance

In [7]:
X = data[~data.target.isna()]
y = X.pop('target')
X = generate_features(X)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

mu = X_train.mean()
X_train.fillna(mu, inplace=True)
X_test.fillna(mu, inplace=True)

In [11]:
model = MetaModel()
model.fit(X_train, y_train)
pred = model.predict(X_test)
pred_not_null, good_pred_not_null = evaluate(pred, y_test)
print("Pred != 0:", pred_not_null)
print("Good Pred | Pred != 0:", good_pred_not_null)
print("Good Pred:", pred_not_null*good_pred_not_null)

Training  LGBM Classifier
Training  CatBoost Classifier
Pred != 0: 0.37076953783761113
Good Pred | Pred != 0: 0.6213033078110152
Good Pred: 0.23036034029406915


In [101]:
cross_val_score(MetaModel, X_train, y_train)

Training  LGBM Classifier
Training  CatBoost Classifier
Fold 0 ----------
Pred != 0: 0.34536486397921007
Good Pred | Pred != 0: 0.6243502582465635
Good Pred: 0.2156286420147091
Training  LGBM Classifier
Training  CatBoost Classifier
Fold 1 ----------
Pred != 0: 0.34257805487563364
Good Pred | Pred != 0: 0.6253535067873304
Good Pred: 0.21423238796486
Training  LGBM Classifier
Training  CatBoost Classifier
Fold 2 ----------
Pred != 0: 0.34430961550243133
Good Pred | Pred != 0: 0.624928102357415
Good Pred: 0.21516875463934562
Training  LGBM Classifier
Training  CatBoost Classifier
Fold 3 ----------
Pred != 0: 0.34296037624826003
Good Pred | Pred != 0: 0.62419199388491
Good Pred: 0.21407312107392035
Training  LGBM Classifier
Training  CatBoost Classifier
Fold 4 ----------
Pred != 0: 0.33852085194414405
Good Pred | Pred != 0: 0.6245286195286195
Good Pred: 0.21141596034632848
--------------------
Mean Pred != 0: 0.3427467525099358
Mean Good Pred | Pred != 0: 0.6246704961609677


In [105]:
pred_mean = np.mean([model.predict(X_test) for model in models], axis=0)
pred = np.where(pred_mean > 0.2, 1, 0)
pred = np.where(pred_mean < -0.2, -1, pred)
pred_not_null, good_pred_not_null = evaluate(pred, y_test)
print("Pred != 0:", pred_not_null)
print("Good Pred | Pred != 0:", good_pred_not_null)
print("Good Pred:", pred_not_null*good_pred_not_null)

Pred != 0: 0.3736419622737568
Good Pred | Pred != 0: 0.6219245573994878
Good Pred: 0.2323771120129823
