In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import os
from tqdm import tqdm
import gc

from sklearn import set_config
from sklearn.base import clone
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor

# Plotting defaults: white seaborn theme with the viridis palette; `pal` holds
# the palette colours returned by seaborn.
sns.set_theme(palette = 'viridis', style = 'white')
pal = sns.color_palette('viridis')

# pandas: show up to 100 rows and silence chained-assignment warnings.
pd.options.display.max_rows = 100
pd.options.mode.chained_assignment = None

# scikit-learn: make transformers return pandas objects.
set_config(transform_output = 'pandas')

2023-10-04 21:56:10.140212: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-04 21:56:10.318859: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-04 21:56:10.988564: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-04 21:56:10.988645: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-04 21:56:10.991641: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [2]:
# Resolve the dataset directory at run time instead of toggling a commented-out
# line: use the Kaggle mount when it exists, otherwise the local `input` folder.
# No trailing slash, because the loaders build paths as f'{DATA_DIR}/<file>'.
_KAGGLE_DATA_DIR = '/kaggle/input/optiver-trading-at-the-close'
DATA_DIR = _KAGGLE_DATA_DIR if os.path.isdir(_KAGGLE_DATA_DIR) else 'input'

In [3]:
# Read the training data and the example test file, then drop the
# `row_id` / `time_id` columns from both frames.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
train = train.drop(columns = ['row_id', 'time_id'])

test = pd.read_csv(f'{DATA_DIR}/example_test_files/test.csv')
test = test.drop(columns = ['row_id', 'time_id'])

# Preparation

In [4]:
# Keep only the rows whose target is known, then split the label column off
# the feature frame (`pop` removes `target` from X and returns it).
X = train[train['target'].notna()]
y = X.pop('target')

# One seed value, reused for the hash seed and TensorFlow below so they cannot
# drift apart when it is changed.
seed = 42

# 10-fold time-ordered splitter.
tss = TimeSeriesSplit(n_splits = 10)

# NOTE: PYTHONHASHSEED is read when an interpreter starts, so this assignment
# cannot change hashing in the already-running kernel; it is only inherited by
# processes started afterwards.
os.environ['PYTHONHASHSEED'] = str(seed)
tf.keras.utils.set_random_seed(seed)

In [6]:
def imbalance_calculator(x):
    """Return a copy of ``x`` extended with size and price imbalance columns.

    The input frame is left untouched. The following columns are appended,
    in this order:

    * ``imb_s1`` and ``imb_s2``: normalised differences of the bid/ask sizes
      and of the imbalance/matched sizes.
    * ``{a}_{b}_imb``: ``(a - b) / (a + b)`` for every pair of price columns,
      where ``a`` appears later than ``b`` in the price list.
    * ``{a}_{b}_{c}_imb2``: ``(max - mid) / (mid - min)`` of every triplet of
      price columns, computed row by row.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain ``bid_size``, ``ask_size``, ``imbalance_size``,
        ``matched_size`` and the six price columns listed in the body.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns followed by the new features.
    """
    price_cols = ['reference_price','far_price', 'near_price', 'ask_price', 'bid_price', 'wap']
    n_prices = len(price_cols)

    features = x.copy()

    # Size-based imbalances, evaluated against the untouched input frame.
    features['imb_s1'] = x.eval('(bid_size - ask_size) / (bid_size + ask_size)')
    features['imb_s2'] = x.eval('(imbalance_size - matched_size) / (matched_size + imbalance_size)')

    # Pairwise price imbalances: the later-listed price always comes first.
    for first_idx in range(1, n_prices):
        a = price_cols[first_idx]
        for second_idx in range(first_idx):
            b = price_cols[second_idx]
            features[f'{a}_{b}_imb'] = x.eval(f'({a} - {b}) / ({a} + {b})')

    # Triplet imbalances: how far the middle price sits from the two extremes.
    for first_idx in range(2, n_prices):
        a = price_cols[first_idx]
        for second_idx in range(1, first_idx):
            b = price_cols[second_idx]
            for third_idx in range(second_idx):
                c = price_cols[third_idx]

                trio = x[[a,b,c]]
                highest = trio.max(axis=1)
                lowest = trio.min(axis=1)
                # The middle value is whatever remains after removing both extremes.
                middle = trio.sum(axis=1)-lowest-highest

                features[f'{a}_{b}_{c}_imb2'] = (highest-middle)/(middle-lowest)

    return features

# Wrap the feature function as a transformer so it can be used as a pipeline step.
ImbalanceCalculator = FunctionTransformer(func = imbalance_calculator)

In [10]:
class LGBMRegressorCV:
    """Train one LGBMRegressor per cross-validation fold and keep the best one.

    For every split produced by ``cv`` a fresh ``LGBMRegressor`` is fitted on
    the training indices and scored with mean absolute error on both the
    training and the held-out indices. The fitted model with the lowest
    held-out MAE is kept as ``best_model`` and used by ``predict``.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to ``LGBMRegressor``.
    cv : splitter object, default ``tss``
        Object exposing ``split(X, y)`` that yields (train_index, test_index).
    n_estimators : int, default 1000
        Forwarded to ``LGBMRegressor`` as ``n_estimators``.

    Attributes
    ----------
    models : list
        The fitted model of every fold, in fold order.
    train_scores, val_scores : list of float
        Per-fold MAE on the training / held-out part.
    best_val_score : float or None
        Lowest held-out MAE seen across the folds (None before ``fit``).
    best_model : LGBMRegressor or None
        The fitted model that achieved ``best_val_score``.
    """

    def __init__(self, params, cv = tss, n_estimators = 1000):

        self.params = params
        self.cv = cv
        self.n_estimators = n_estimators

        self._reset_state()

    def _reset_state(self):
        # Clear everything produced by a previous call to ``fit``.
        self.models = []
        self.best_val_score = None
        self.val_scores = []
        self.train_scores = []
        self.best_model = None

    def fit(self, X, y):
        """Fit one model per fold and remember the one with the lowest held-out MAE.

        ``X`` and ``y`` are indexed positionally with ``.iloc``.
        Returns ``self``.
        """
        # Start from a clean slate so repeated fits do not accumulate folds.
        self._reset_state()

        for train_index, test_index in tqdm(self.cv.split(X, y)):

            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            model = LGBMRegressor(**self.params, n_estimators = self.n_estimators)
            model.fit(X_train, y_train, eval_set = [(X_test, y_test)], eval_metric='mae')

            train_score = mean_absolute_error(y_train, model.predict(X_train))
            val_score = mean_absolute_error(y_test, model.predict(X_test))

            self.models.append(model)
            self.train_scores.append(train_score)
            self.val_scores.append(val_score)

            # Compare inside the loop so every fold competes, and keep the
            # fitted model itself: sklearn's clone() would return an unfitted
            # copy that cannot predict.
            if self.best_val_score is None or val_score < self.best_val_score:
                self.best_val_score = val_score
                self.best_model = model

        return self


    def predict(self, X):
        """Predict with the best fold model; raises RuntimeError before ``fit``."""
        if self.best_model is None:
            raise RuntimeError('LGBMRegressorCV has no fitted model; call fit() first.')
        return self.best_model.predict(X)

    def get_best_model_params(self):
        """Return ``get_params()`` of the best fold model; raises RuntimeError before ``fit``."""
        if self.best_model is None:
            raise RuntimeError('LGBMRegressorCV has no fitted model; call fit() first.')
        return self.best_model.get_params()

In [11]:
# Build the two-step pipeline (imbalance features -> cross-validated LightGBM)
# and fit it on the prepared training data.
cv_regressor = LGBMRegressorCV(params = {'random_state': seed}, n_estimators = 1000)
model = make_pipeline(ImbalanceCalculator, cv_regressor)
model.fit(X, y)

0it [00:00, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.049837 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12283
[LightGBM] [Info] Number of data points in the train set: 476172, number of used features: 51
[LightGBM] [Info] Start training from score -0.096914


1it [01:17, 77.67s/it]

: 