In [None]:
import numpy as np 
import pandas as pd 
import warnings
warnings.filterwarnings("ignore")
import os
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("dark") # Theme for plots as Dark
# sns.set_palette("viridis")
sns.color_palette("flare")
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate, StratifiedKFold, RepeatedKFold, TimeSeriesSplit
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
import optuna
import imblearn
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from catboost import Pool, CatBoostRegressor, cv
from itertools import combinations

<div id="1" style="background-color: #DDDDDD; padding: 20px; border-radius: 20px; border: 2px solid black;">
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #000000; font-weight: bold; font-size: 42px;">
   Dataset Overview
    </h1>
</div>
<div style="padding: 20px;font-size: 16px; font-family: 'Verdana'; color: #222222; text-align: left;">This notebook expands upon the work done by @bzeniti: <a href="https://www.kaggle.com/code/bzeniti/simple-imputation-xgb-regressor" style="color: #222222;text-decoration: underline;">Notebook Link</a>.<br> Make sure to visit and upvote it. 
</div>

In [None]:
# Load the training data and the example test files that ship with the competition dataset.
df = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/train.csv")
test_data = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv")
test_target = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv")
sample_submission = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/example_test_files/sample_submission.csv")

# Preview the first rows of the training frame.
df.head()

In [None]:
# Number of missing values per column, largest first.
print(df.isna().sum().sort_values(ascending=False))

In [None]:
# Drop rows with a missing value in any column other than far_price / near_price;
# NaNs in those two columns are tolerated and stay in the frame.
columns_to_keep = df.columns.difference(['far_price', 'near_price'])
df = df.dropna(subset=columns_to_keep)


# Re-check the per-column missing-value counts after the row filter.
print(df.isna().sum().sort_values(ascending=False))

<div id="2" style="background-color: #DDDDDD; padding: 20px; border-radius: 20px; border: 2px solid black;">
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #000000; font-weight: bold; font-size: 42px;">
   Simple Preprocessing
    </h1>
</div>

In [None]:
def reduce_mem_usage(df, verbose=0):
    """
    Downcast the numeric columns of a dataframe in place to reduce memory usage.

    * Columns whose dtype name starts with ``int`` are cast to the narrowest of
      int8 / int16 / int32 that can hold their observed minimum and maximum
      (bounds inclusive). They are left unchanged when none of those fits or
      when the column is empty.
    * Every other numeric column (floats, bools, unsigned ints, ...) is cast
      to float32.
    * Non-numeric columns (object, string, datetime, category, ...) are left
      untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    verbose : int or bool, default 0
        When truthy, print the memory usage before and after the conversion.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its columns downcast.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object or not pd.api.types.is_numeric_dtype(col_type):
            # Only numeric columns can be downcast safely; skip everything else.
            continue
        if str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            # Try the candidate widths from narrowest to widest and stop at the first fit.
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float16 is deliberately never used: every non-int numeric column becomes float32.
            df[col] = df[col].astype(np.float32)
    if verbose:
        # print() rather than a logger: no logger is configured in this notebook.
        end_mem = df.memory_usage().sum() / 1024**2
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        if start_mem:
            decrease = 100 * (start_mem - end_mem) / start_mem
            print(f"Decreased by {decrease:.2f}%")

    return df

In [None]:
from numba import njit, prange
@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    """
    Compute a triplet imbalance ratio for every row and every column triplet.

    For each triplet of column positions (a, b, c) in ``comb_indices`` and each
    row of ``df_values``, the three values are split into min / mid / max and
    the feature is ``(max - mid) / (mid - min)``. It is NaN when ``mid == min``.

    Returns an array with one row per input row and one column per triplet.
    """
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))
    # The loop over triplets is a numba prange (the function is jitted with parallel=True);
    # each iteration writes only to its own output column i.
    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            # The middle value is what remains of the sum once min and max are removed.
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            if mid_val == min_val:
                # The denominator would be zero: mark the feature as missing instead.
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

    return imbalance_features

In [None]:
def calculate_triplet_imbalance_numba(price, df):
    """
    Build the triplet-imbalance feature frame for the columns listed in ``price``.

    Every 3-combination of the names in ``price`` yields one output column named
    ``"<a>_<b>_<c>_imb2"``. The values come from ``compute_triplet_imbalance``,
    which receives ``df[price].values`` and the positions of each triplet
    within ``price``. The returned frame has a fresh default index.
    """
    triplets = list(combinations(price, 3))
    # Translate each name triplet into its positions inside `price`.
    comb_indices = [tuple(price.index(name) for name in triplet) for triplet in triplets]
    features_array = compute_triplet_imbalance(df[price].values, comb_indices)
    return pd.DataFrame(
        features_array,
        columns=[f"{a}_{b}_{c}_imb2" for a, b, c in triplets],
    )

In [None]:
def imbalance_features(df):
    """
    Add the engineered price/size imbalance features to ``df``.

    The new columns are written onto ``df`` itself (the input is mutated); the
    return value is a copy of that frame in which +inf / -inf are replaced by 0.
    NaNs produced along the way are left in place.

    The frame must contain the columns named in ``prices`` and ``sizes`` below,
    plus ``stock_id`` and ``imbalance_buy_sell_flag``.
    """
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
    # Basic book aggregates and normalised imbalances.
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")
    # Pairwise normalised difference (x - y) / (x + y) for every pair of price columns.
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")
    # Triplet imbalance over four of the price columns, then over the size columns.
    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        # Assign by position (.values) because the helper's frame has its own default index.
        df[triplet_feature.columns] = triplet_feature.values
    # Row-to-row changes are taken within each stock_id group, in the frame's current row order.
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    # Row-wise summary statistics across the price columns and across the size columns.
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
    # Per-stock lagged values and percentage changes over several horizons.
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)
    # Per-stock differences of the quote prices and sizes over the same horizons.
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size']:
        for window in [1, 2, 3, 10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)
    # Divisions by zero above produce +/-inf; map those to 0 in the returned copy.
    return df.replace([np.inf, -np.inf], 0)


In [None]:
# Add the engineered features to the training frame; the returned frame replaces `df`.
df = imbalance_features(df)

In [None]:
# Lift the row-display limit so the full per-column NaN listing is printed.
pd.set_option('display.max_rows', None)
print(df.isna().sum().sort_values(ascending=False))

In [None]:
# Replace every remaining NaN with the sentinel value -1, then print the counts again to confirm.
df = df.fillna(-1)
pd.set_option('display.max_rows', None)
print(df.isna().sum().sort_values(ascending=False))

<div id="4" style="background-color: #DDDDDD; padding: 20px; border-radius: 20px; border: 2px solid black;">
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #000000; font-weight: bold; font-size: 42px;">
   Training Models
    </h1>
</div>

In [None]:
# Seed shared by the models below. np.random.seed() returns None, so the seed value must be
# stored in its own variable; assigning the call's result would pass random_state=None
# to every estimator that uses `seed`.
seed = 6
np.random.seed(seed)

# Use row_id as the index so it is not treated as a feature. The guard lets the cell be
# re-run after row_id has already been moved into the index.
if 'row_id' in df.columns:
    df.set_index('row_id', inplace=True)

# Feature matrix: every column except the target.
X = df.drop(['target'], axis=1)
y = df['target']


# Random 70/30 hold-out split with its own fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 21)

<div id="4.1" >
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #263A29; font-weight: bold; font-size: 36px;">
   3.1 Baseline XGB Model
    </h1>
</div>
<hr>

In [None]:
# Baseline XGBoost (100 trees, gpu_hist tree method) scored by 3-fold cross-validated MAE.
# Presumably the score recorded from an earlier run: 6.366228412975999
xgb = XGBRegressor(n_estimators = 100,tree_method="gpu_hist")
print("CV score of XGB is ",-1*cross_val_score(xgb,X,y,cv=3, scoring = 'neg_mean_absolute_error').mean())

<div id="4.2" >
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #263A29; font-weight: bold; font-size: 36px;">
   3.2 Baseline CatBoost Model
    </h1>
</div>
<hr>

In [None]:
# Baseline CatBoost (50 trees) scored by 3-fold cross-validated MAE.
# Presumably the score recorded from an earlier run: 6.296262366690487
cat = CatBoostRegressor(n_estimators = 50,verbose=50)
print("CV score of CAT is ",-1*cross_val_score(cat,X,y,cv=3, scoring = 'neg_mean_absolute_error').mean())

<div id="4.3" >
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #263A29; font-weight: bold; font-size: 36px;">
  3.3 Baseline LGBM Model
    </h1>
</div>
<hr>

In [None]:
# Baseline LightGBM (100 trees, GPU device) scored by 3-fold cross-validated MAE.
# Presumably the score recorded from an earlier run: 6.2747298085620935
lgbm = LGBMRegressor(n_estimators = 100, verbose=0, force_col_wise=True, device="gpu")
print("CV score of LGBM is ",-1*cross_val_score(lgbm,X,y,cv=3, scoring = 'neg_mean_absolute_error').mean())

<div id="4.4" >
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #263A29; font-weight: bold; font-size: 36px;">
   3.4 Optuna-Tuning XGB Model
    </h1>
</div>
<hr>

In [None]:
# def objective(trial):
#     params = {
#         'n_estimators' : trial.suggest_int('n_estimators',100,500),
#         'max_depth':  trial.suggest_int('max_depth',3,25),
#         'min_child_weight': trial.suggest_float('min_child_weight', 2,50),
#         "learning_rate" : trial.suggest_float('learning_rate',1e-4, 0.2,log=True),
#         'subsample': trial.suggest_float('subsample', 0.2, 1),
#         'gamma': trial.suggest_float("gamma", 1e-4, 1.0),
#         "colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
#         "colsample_bylevel" : trial.suggest_float('colsample_bylevel',0.2,1),
#         "colsample_bynode" : trial.suggest_float('colsample_bynode',0.2,1),
#     }
#     xgbmodel_optuna = XGBRegressor(**params,random_state=seed,tree_method = "gpu_hist")
#     cv = -1*cross_val_score(xgbmodel_optuna, X, y, cv = 3,scoring='neg_mean_absolute_error').mean()
#     return cv

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100,timeout=2000)

In [None]:
# def objective(trial):
#     params = {
#         'n_estimators' : trial.suggest_int('n_estimators',100,500),
#         'max_depth':  trial.suggest_int('max_depth',3,25),
#         'min_child_weight': trial.suggest_float('min_child_weight', 2,50),
#         "learning_rate" : trial.suggest_float('learning_rate',1e-4, 0.2,log=True),
#         'subsample': trial.suggest_float('subsample', 0.2, 1),
#         'gamma': trial.suggest_float("gamma", 1e-4, 1.0),
#         "colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
#         "colsample_bylevel" : trial.suggest_float('colsample_bylevel',0.2,1),
#         "colsample_bynode" : trial.suggest_float('colsample_bynode',0.2,1),
#     }
#     xgbmodel_optuna = XGBRegressor(**params,random_state=seed,tree_method = "gpu_hist")
#     cv = -1*cross_val_score(xgbmodel_optuna, X, y, cv = 3,scoring='neg_mean_absolute_error').mean()
#     return cv

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100,timeout=2000)

In [None]:
# XGBoost with hard-coded hyper-parameters (presumably the best trial of the Optuna search
# that is commented out above), scored by 3-fold cross-validated MAE.
# Presumably the score recorded from an earlier run: 6.272049186519806
xgb_params = {'max_depth': 20, 'min_child_weight': 25.551523309923255,
              'learning_rate': 0.006449859500510082, 'subsample': 0.7408613470518393,
              'gamma': 0.517595045309937, 'colsample_bytree': 0.25779788854085006,
              'colsample_bylevel': 0.5934831570321946, 'colsample_bynode': 0.5628928537973386}

xgb_opt = XGBRegressor(**xgb_params,n_estimators = 387,random_state=seed,tree_method = "gpu_hist")
print("CV score of Optuna XGB is ",-1*cross_val_score(xgb_opt,X,y,cv=3, scoring = 'neg_mean_absolute_error').mean())

<div id="4.5" >
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #263A29; font-weight: bold; font-size: 36px;">
   3.5 Optuna-Tuning LGBM Model
    </h1>
</div>
<hr>

In [None]:
# def objective(trial):
#     params = {
#         'n_estimators' : trial.suggest_int('n_estimators',100,1000),
#         "max_depth":trial.suggest_int('max_depth',3,40),
#         "learning_rate" : trial.suggest_float('learning_rate',1e-4, 0.25, log=True),
#         "min_child_weight" : trial.suggest_float('min_child_weight', 0.5,4),
#         "min_child_samples" : trial.suggest_int('min_child_samples',1,100),
#         "subsample" : trial.suggest_float('subsample', 0.2, 1),
#         "subsample_freq" : trial.suggest_int('subsample_freq',0,5),
#         "colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
#         'num_leaves' : trial.suggest_int('num_leaves', 2, 128),
#     }
#     lgbmmodel_optuna = LGBMRegressor(**params,random_state=seed,device="gpu")
#     cv = -1*cross_val_score(lgbmmodel_optuna, X, y, cv = 3,scoring='neg_mean_absolute_error').mean()
#     return cv

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100,timeout=2000)

In [None]:
# def objective(trial):
#     params = {
#         'n_estimators' : trial.suggest_int('n_estimators',100,1000),
#         "max_depth":trial.suggest_int('max_depth',3,40),
#         "learning_rate" : trial.suggest_float('learning_rate',1e-4, 0.25, log=True),
#         "min_child_weight" : trial.suggest_float('min_child_weight', 0.5,4),
#         "min_child_samples" : trial.suggest_int('min_child_samples',1,100),
#         "subsample" : trial.suggest_float('subsample', 0.2, 1),
#         "subsample_freq" : trial.suggest_int('subsample_freq',0,5),
#         "colsample_bytree" : trial.suggest_float('colsample_bytree',0.2,1),
#         'num_leaves' : trial.suggest_int('num_leaves', 2, 128),
#     }
#     lgbmmodel_optuna = LGBMRegressor(**params,random_state=seed,device="gpu")
#     cv = -1*cross_val_score(lgbmmodel_optuna, X, y, cv = 3,scoring='neg_mean_absolute_error').mean()
#     return cv

# study = optuna.create_study(direction='minimize')
# study.optimize(objective, n_trials=100,timeout=2000)

In [None]:
# LightGBM with hard-coded hyper-parameters (presumably the best trial of the Optuna search
# that is commented out above), scored by 3-fold cross-validated MAE.
# Note: no random_state is passed to this estimator.
lgbm_params = {'max_depth': 36, 'learning_rate': 0.0105942655225861,
               'min_child_weight': 1.9489096317119756, 'min_child_samples': 66,
               'subsample': 0.34776401409168767, 'subsample_freq': 4,
               'colsample_bytree': 0.8255341076749996, 'num_leaves': 55}

lgbm_opt = LGBMRegressor(**lgbm_params, n_estimators = 208, device="gpu")
print("CV score of Optuna LGBM is ",-1*cross_val_score(lgbm_opt,X,y,cv=3, scoring = 'neg_mean_absolute_error').mean())

<div id="4.6" >
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #263A29; font-weight: bold; font-size: 36px;">
   3.6 Time-Series Split 4-Fold EarlyStopping
    </h1>
</div>
<hr>

In [None]:
SPLITS = 4
cat_score = []
models = []

# Forward-chaining validation: each fold trains on the leading rows and validates on the
# next block of 1,000,000 rows. A fold's model is kept only when its MAE is below 7.
for i, (tr, val) in enumerate(TimeSeriesSplit(n_splits=SPLITS, test_size=1000000).split(X, y)):

    print("-"*30,f"FOLD {i+1}/{SPLITS}","-"*30)
    X_train, y_train = X.iloc[tr, :], y.iloc[tr]
    X_test, y_test = X.iloc[val, :], y.iloc[val]
    print("Train Dataset:",len(X_train),"Test Dataset:",len(X_test))

    # The validation block doubles as the early-stopping evaluation set.
    train_dataset = Pool(data=X_train, label=y_train)
    eval_dataset = Pool(data=X_test, label=y_test)
    catmodel = CatBoostRegressor(
        iterations=2000,
        learning_rate=0.085,
        eval_metric="MAE",
        od_type="Iter",
        early_stopping_rounds=250,
        random_seed=seed,
        verbose=250,
    )
    catmodel.fit(train_dataset, eval_set=eval_dataset, use_best_model=True)

    mae = mean_absolute_error(y_test, catmodel.predict(X_test))
    cat_score.append(mae)
    if mae < 7:
        models.append(catmodel)

print("\n\n","-"*50,sep="")
print("CV score of CAT is ", np.mean(cat_score))

In [None]:
# SPLITS = 4
# REPEATS = 1
# cat_score = []
# models = []

# for i,(tr,val) in enumerate(RepeatedKFold(n_splits=SPLITS, n_repeats=REPEATS,random_state=seed).split(X,y)):
    
#     print("-"*30,f"FOLD {i+1}/{SPLITS*REPEATS}","-"*30)
#     X_train, X_test, y_train, y_test = X.iloc[tr,:],X.iloc[val,:],y.iloc[tr],y.iloc[val]
    
#     train_dataset = Pool(data=X.iloc[tr,:],label=y.iloc[tr])
#     eval_dataset = Pool(data=X.iloc[val,:],label=y.iloc[val])
#     catmodel = CatBoostRegressor(iterations=1000,verbose=100, od_type="Iter",eval_metric="MAE", random_seed=seed,early_stopping_rounds=50)
#     catmodel.fit(train_dataset, use_best_model=True, eval_set=eval_dataset)
#     mae = mean_absolute_error(y.iloc[val], catmodel.predict(X.iloc[val,:]))
#     cat_score.append(mae)
#     models.append(catmodel)
    
# print("\n\n","-"*50,sep="")
# print("CV score of CAT is ",np.array(cat_score).mean())

<div id="4.7" >
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #263A29; font-weight: bold; font-size: 36px;">
   3.7 Evaluating OOF 4-Fold CatBoost
    </h1>
</div>
<hr>

In [None]:
# Average the predictions of every model currently in `models` on X_test.
# NOTE(review): X_test / y_test are whatever was assigned last upstream (the final fold's
# validation block when the time-series CV cell has been run) - confirm this is the
# intended evaluation set.
sub = pd.DataFrame()
sub["index"] = X_test.index
sub["preds"] = 0

# Accumulate the sum of the per-model predictions ...
for i,model in enumerate(models):
    print(f"> Getting preds from Model{i+1}")
    sub["preds"] += model.predict(X_test)

# ... then divide by the number of models to get the mean.
sub["preds"] = sub["preds"]/len(models)    

In [None]:
# MAE of the averaged predictions against the y_test labels.
mae = mean_absolute_error(y_test, sub["preds"])
print(f"MAE of merged predictions: {mae}")

In [None]:
# Fit the tuned XGBoost and LightGBM models on the full training set and add them to the ensemble.
xgb_opt.fit(X,y)
lgbm_opt.fit(X,y)
models.extend([xgb_opt,lgbm_opt])
# Number of models now in the ensemble.
len(models)

<div id="4.8" >
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #263A29; font-weight: bold; font-size: 36px;">
   3.8 Predictions on Example 'test.csv'
    </h1>
</div>
<hr>

In [None]:
def Preprocess1(data):
    """
    Return a copy of ``data`` indexed by ``row_id`` and without the
    ``far_price`` / ``near_price`` columns.

    The input frame is not modified. The function is safe to call again on its
    own output: a frame already indexed by ``row_id`` is accepted, and columns
    that have already been removed are ignored.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``row_id`` column (or already indexed by ``row_id``).

    Returns
    -------
    pandas.DataFrame
        New frame indexed by ``row_id`` without ``far_price`` / ``near_price``.

    Raises
    ------
    KeyError
        If ``row_id`` is neither a column nor the index name.
    """
    if "row_id" in data.columns:
        data = data.set_index("row_id")
    elif data.index.name != "row_id":
        raise KeyError("row_id")
    # errors="ignore" keeps the call idempotent when the columns are already gone.
    return data.drop(columns=['far_price', 'near_price'], errors="ignore")

In [None]:
# The models were fitted on `X`: the raw columns (including far_price / near_price) plus the
# engineered features, with NaNs filled by -1. Build exactly that matrix for the example
# test rows; a frame with a different column set makes model.predict() fail.
# The guard makes the cell a no-op when re-run on an already processed frame.
if "row_id" in test_data.columns:
    test_data = test_data.set_index("row_id")
    test_data = imbalance_features(test_data).fillna(-1)
    # Same columns in the same order as training; training-only columns are filled with
    # the -1 sentinel and test-only columns are dropped.
    test_data = test_data.reindex(columns=X.columns, fill_value=-1)
test_data.head()

In [None]:
# Preview the revealed targets that accompany the example test file.
test_target.head()

In [None]:
# Average the predictions of every model in `models` on the example test rows.
y_sample_pred = pd.DataFrame()
y_sample_pred["index"] =  test_data.index
y_sample_pred["preds"] = 0

# Accumulate the sum of the per-model predictions ...
for i,model in enumerate(models):
    print(f"> Getting preds from Model{i+1}")
    y_sample_pred["preds"] += model.predict(test_data)

# ... then divide by the number of models to get the mean.
y_sample_pred["preds"] = y_sample_pred["preds"]/len(models)    

# Ground-truth values: the non-missing entries of revealed_target.
# NOTE(review): nothing here checks that these rows line up one-to-one with test_data - confirm.
y_sample_true = test_target['revealed_target'].dropna()

In [None]:
# MAE between the averaged predictions and the revealed targets. The arguments are passed
# as (predictions, truth); MAE is symmetric, so the value is unaffected by the order.
mae_test = mean_absolute_error(y_sample_pred["preds"], y_sample_true)
print(f"MAE on the test Dataset: {mae_test}")

<div id="5" style="background-color: #DDDDDD; padding: 20px; border-radius: 20px; border: 2px solid black;">
    <h1 style="font-family:  'Garamond', 'Lucida Sans', sans-serif; text-align: center; color: #000000; font-weight: bold; font-size: 42px;">
   Submission from API
    </h1>
</div>

In [None]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()
counter = 0
for (test, revealed_targets, sample_prediction) in iter_test:
    if counter == 0:
        print("Submitting Predictions")

    # Build the feature matrix the models were fitted on: raw columns plus engineered
    # features, NaNs filled with -1, columns ordered exactly like the training matrix `X`.
    # (The previous code called an undefined `applyPreprocess2` and dropped far_price /
    # near_price, which the fitted models expect.)
    test_df = test.set_index("row_id") if "row_id" in test.columns else test.copy()
    test_df = imbalance_features(test_df).fillna(-1)
    test_df = test_df.reindex(columns=X.columns, fill_value=-1)
    # NOTE(review): the shift / diff / pct_change features only see the rows of this batch.
    # If the API delivers a single time bucket per iteration they all end up as -1 -
    # consider carrying a per-stock history across iterations.

    # Mean of the per-model predictions.
    sample_prediction['target'] = 0

    for i,model in enumerate(models):
        sample_prediction["target"] += model.predict(test_df)

    sample_prediction["target"] = sample_prediction["target"]/len(models)

    env.predict(sample_prediction)
    counter += 1

In [None]:
# Show the first 20 rows of the last prediction frame produced by the submission loop.
print(sample_prediction.head(20))