In [1]:
import gc
from itertools import combinations
import pathlib
from typing import Any, Dict, List
import warnings
import yaml

import lightgbm as lgb
import matplotlib.pyplot as plt
from mlxtend.evaluate.time_series import (GroupTimeSeriesSplit, plot_splits)
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import polars as pl
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Notebook-wide runtime configuration (the three settings are independent of each other).
warnings.simplefilter('ignore')            # suppress every warning category for the whole session
pd.set_option('display.max_columns', 200)  # render up to 200 columns before pandas elides them
gc.enable()                                # make sure automatic garbage collection is switched on

In [2]:
# Where the raw inputs live and where every artefact of this notebook is written.
inputs_dir_path = pathlib.Path('../inputs')
outputs_dir_path = pathlib.Path('../outputs')
outputs_dir_path.mkdir(exist_ok=True)  # no-op when the directory is already there

# Show the raw frame first; the identifier columns are removed only afterwards.
train_df = pd.read_csv(inputs_dir_path / 'train.csv')
display(train_df)
train_df = train_df.drop(columns=['row_id', 'time_id'])

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.50,1.000026,8493.03,1.000000,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.000660,20605.09,1.000000,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.00,1.000298,18995.00,1.000000,-8.389950,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.90,1.000214,479032.40,1.000000,-4.010200,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.10,1.000000,-7.349849,0,0_0_4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237975,195,480,540,2440722.89,-1,1.000317,28280361.74,0.999734,0.999734,1.000317,32257.04,1.000434,319862.40,1.000328,2.310276,26454,480_540_195
5237976,196,480,540,349510.47,-1,1.000643,9187699.11,1.000129,1.000386,1.000643,205108.40,1.000900,93393.07,1.000819,-8.220077,26454,480_540_196
5237977,197,480,540,0.00,0,0.995789,12725436.10,0.995789,0.995789,0.995789,16790.66,0.995883,180038.32,0.995797,1.169443,26454,480_540_197
5237978,198,480,540,1000898.84,1,0.999210,94773271.05,0.999210,0.999210,0.998970,125631.72,0.999210,669893.00,0.999008,-1.540184,26454,480_540_198


In [None]:
# Columns to downcast, listed in the order they appear in the mapping.
_columns_to_cast = [
    'stock_id',
    'date_id',
    'seconds_in_bucket',
    'imbalance_size',
    'imbalance_buy_sell_flag',
    'reference_price',
    'matched_size',
    'far_price',
    'near_price',
    'bid_price',
    'bid_size',
    'ask_price',
    'ask_size',
    'wap',
    'target',
]
# Identifier / flag columns are stored as int16; every other listed column as float32.
_int16_columns = {'stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_buy_sell_flag'}

cast_dtypes = {
    column: (np.int16 if column in _int16_columns else np.float32)
    for column in _columns_to_cast
}

# Display the dtypes before the cast, then the frame after it.
display(train_df.dtypes)
train_df = train_df.astype(cast_dtypes)
display(train_df)

In [None]:
# Per-stock index weights; entry i is looked up with stock_id == i.
weights = np.array([
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
])

# Weighted-average WAP of all stocks in each (date_id, seconds_in_bucket) bucket, broadcast
# back onto every row of that bucket as the new `index_wap` column.
# Computed with vectorised group sums instead of a Python lambda per bucket.
bucket_keys = [train_df['date_id'], train_df['seconds_in_bucket']]
row_weights = pd.Series(weights[train_df['stock_id'].to_numpy()], index=train_df.index)
# A stock whose WAP is missing contributes nothing to the numerator (NaN is skipped by the
# sum), so its weight must be excluded from the denominator too; otherwise the index is
# biased towards zero for that bucket.
row_weights = row_weights.where(train_df['wap'].notna())
weighted_wap_total = (row_weights * train_df['wap']).groupby(bucket_keys).transform('sum')
weight_total = row_weights.groupby(bucket_keys).transform('sum')
train_df = train_df.assign(index_wap=weighted_wap_total / weight_total)
display(train_df)

# Release the row-sized temporaries.
del bucket_keys, row_weights, weighted_wap_total, weight_total
gc.collect()

In [None]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Build the model features from the raw order-book columns.

    The work is done in polars and handed back as pandas. Steps, in order:

    1. Sort rows by ``stock_id``, ``date_id``, ``seconds_in_bucket``.
    2. Divide the four size columns by 1e6 (in place, same column names).
    3. Add ``feature1`` .. ``feature46``: pairwise differences, sums, ratios and
       imbalance-weighted price gaps of the price/size columns.
    4. Add ``pct_change{1,6,12}_<col>``: relative change of each base column versus
       1, 6 and 12 rows earlier within the same ``stock_id``. The window is grouped by
       stock only, so it runs across ``date_id`` boundaries.
    5. Add ``standardized_<col>``: z-score of each base column within its
       ``(date_id, seconds_in_bucket)`` bucket.
    6. Replace +/-inf with null in every float column, derived features included.
    7. Drop ``row_id`` if it is present.

    Args:
        df: Frame holding ``stock_id``, ``date_id``, ``seconds_in_bucket``, the
            price/size columns and ``index_wap``. ``row_id`` is optional.

    Returns:
        A new pandas DataFrame sorted by stock, date and second, with the added
        feature columns.

    Note:
        The size columns are rescaled on every call, so applying this function twice
        to the same frame divides them by 1e6 twice.
    """
    # Sorting first guarantees each stock's rows are already in time order, so the
    # windowed pct_change below lands on the row it was computed for.
    df = pl.from_pandas(df).sort(by=['stock_id', 'date_id', 'seconds_in_bucket'])

    # Shorthand column expressions.
    ask_price = pl.col('ask_price')
    bid_price = pl.col('bid_price')
    reference_price = pl.col('reference_price')
    far_price = pl.col('far_price')
    near_price = pl.col('near_price')
    wap = pl.col('wap')
    index_wap = pl.col('index_wap')
    ask_size = pl.col('ask_size')
    bid_size = pl.col('bid_size')
    matched_size = pl.col('matched_size')
    imbalance_size = pl.col('imbalance_size')

    # Rescale the size columns; `.name.keep()` keeps the original column names.
    new_features0 = [
        (imbalance_size / 1e+06).name.keep(),
        (matched_size / 1e+06).name.keep(),
        (bid_size / 1e+06).name.keep(),
        (ask_size / 1e+06).name.keep(),
    ]

    # The five price gaps used on their own (feature1-5) and again, weighted by a
    # size imbalance (feature22-26) or a matched/imbalance ratio (feature27-31).
    price_gaps = [
        ask_price - bid_price,
        ask_price - reference_price,
        bid_price - reference_price,
        ask_price - wap,
        bid_price - wap,
    ]
    book_imbalance = (ask_size - bid_size) / (ask_size + bid_size)
    matched_imbalance = (matched_size - imbalance_size) / (matched_size + imbalance_size)

    new_features1 = [
        price_gaps[0].alias('feature1'),
        price_gaps[1].alias('feature2'),
        price_gaps[2].alias('feature3'),
        price_gaps[3].alias('feature4'),
        price_gaps[4].alias('feature5'),
        (far_price - near_price).alias('feature6'),
        (far_price - reference_price).alias('feature7'),
        (near_price - reference_price).alias('feature8'),
        (index_wap - reference_price).alias('feature9'),
        (index_wap - ask_price).alias('feature10'),
        (index_wap - bid_price).alias('feature11'),
        (index_wap - far_price).alias('feature12'),
        (index_wap - near_price).alias('feature13'),
        (index_wap - wap).alias('feature14'),
        (ask_size - bid_size).alias('feature15'),
        (ask_size - matched_size).alias('feature16'),
        (bid_size - matched_size).alias('feature17'),
        (imbalance_size - matched_size).alias('feature18'),
        (ask_price + bid_price).alias('feature19'),
        (far_price + near_price).alias('feature20'),
        (ask_size + bid_size).alias('feature21'),
    ]
    new_features1 += [
        (gap * book_imbalance).alias(f'feature{number}')
        for number, gap in enumerate(price_gaps, start=22)
    ]
    new_features1 += [
        (gap * matched_imbalance).alias(f'feature{number}')
        for number, gap in enumerate(price_gaps, start=27)
    ]
    new_features1 += [
        (bid_price / ask_price).alias('feature32'),
        (reference_price / ask_price).alias('feature33'),
        (reference_price / bid_price).alias('feature34'),
        (wap / ask_price).alias('feature35'),
        (wap / bid_price).alias('feature36'),
        (index_wap / ask_price).alias('feature37'),
        (index_wap / bid_price).alias('feature38'),
        (index_wap / reference_price).alias('feature39'),
        (index_wap / wap).alias('feature40'),

        (bid_size / ask_size).alias('feature41'),
        (ask_size / matched_size).alias('feature42'),
        (bid_size / matched_size).alias('feature43'),
        (bid_size / imbalance_size).alias('feature44'),
        (ask_size / imbalance_size).alias('feature45'),
        (matched_size / imbalance_size).alias('feature46'),
    ]

    base_features = [
        'imbalance_size',
        'imbalance_buy_sell_flag',
        'matched_size',
        'reference_price',
        'far_price',
        'near_price',
        'bid_size',
        'bid_price',
        'ask_size',
        'ask_price',
        'wap',
        'index_wap',
    ]
    #base_features += [f'feature{i}' for i in range(1, 47)]

    # Relative change versus 1, 6 and 12 rows earlier within each stock. The frame is
    # already time-sorted per stock, so no per-group sort is needed here.
    new_features2 = [
        pl.col(base_features).pct_change(n=lag).over(['stock_id']).name.prefix(f'pct_change{lag}_')
        for lag in (1, 6, 12)
    ]

    # Cross-sectional z-score of each base feature within its time bucket.
    new_features3 = [
        ((pl.col(feature) - pl.col(feature).mean()) / pl.col(feature).std())
        .over(['date_id', 'seconds_in_bucket'])
        .alias(f'standardized_{feature}')
        for feature in base_features
    ]

    df = (
        df
        .with_columns(new_features0)
        .with_columns(new_features1)
        .with_columns(new_features2)
        .with_columns(new_features3)
    )

    # The ratio and pct_change features are where divisions by zero occur, so the
    # float columns are collected only now, after every feature exists.
    float_columns = [
        name
        for name, dtype in zip(df.columns, df.dtypes)
        if dtype in (pl.Float32, pl.Float64)
    ]
    df = df.with_columns([
        pl.when(pl.col(name).is_infinite()).then(None).otherwise(pl.col(name)).alias(name)
        for name in float_columns
    ])

    # `row_id` may already have been removed by the caller; dropping a missing column
    # raises when polars' `drop` is strict.
    if 'row_id' in df.columns:
        df = df.drop('row_id')

    return df.to_pandas()

In [None]:
# Engineer the features, then keep only the rows that carry a target value.
# NOTE: re-running this cell feeds the already-processed frame back into preprocess().
train_df = preprocess(train_df).dropna(subset=['target'])
display(train_df)

## Train LightGBM models with cross-validation

In [None]:
def train(
        df: pd.DataFrame,
        model_params: Dict[str, Any],
        outputs_dir: pathlib.Path,
        step: int,
    ):
    """Train one LightGBM model per cross-validation fold and collect diagnostics.

    Folds are produced by ``GroupTimeSeriesSplit(n_splits=5, test_size=1)`` with the
    ``cluster`` column as the group label. For every fold the model is trained with
    early stopping on the validation fold, saved to
    ``outputs_dir / 'lightgbm_fold{k}.txt'``, and scored with MAE on both parts.
    The overall out-of-fold MAE is printed and written to
    ``outputs_dir / 'oofs_lightgbm_optuna.yaml'``.

    Args:
        df: Feature frame. Must contain ``stock_id``, ``date_id``,
            ``seconds_in_bucket``, ``target`` and ``cluster``. Every column except
            ``date_id``, ``target`` and ``cluster`` is used as a model feature.
        model_params: Parameters forwarded unchanged to ``lgb.train``.
        outputs_dir: Directory for the saved models and the YAML summary; created
            if it does not exist.
        step: Not used by this function; kept so existing calls keep working.

    Returns:
        A tuple ``(history, oofs, fimps)``:
        ``history`` - one row per fold with ``train_mae`` and ``valid_mae``;
        ``oofs`` - the validation rows of all folds (identifier columns, ``target``
        and the prediction in ``regression``);
        ``fimps`` - gain importances per fold plus ``mean_fimps`` / ``std_fimps``,
        sorted ascending by ``mean_fimps``.

    Raises:
        KeyError: If ``df`` has no ``cluster`` column.
    """
    if 'cluster' not in df.columns:
        raise KeyError(
            "train() needs a 'cluster' column in `df` to build the cross-validation groups"
        )

    target_columns = ['stock_id', 'date_id', 'seconds_in_bucket', 'target']
    excluded_columns = {'date_id', 'target', 'cluster'}
    feature_columns = [col for col in df.columns if col not in excluded_columns]
    fimps = []
    history = {
        'train_mae': [],
        'valid_mae': [],
    }
    oofs = []

    outputs_dir.mkdir(parents=True, exist_ok=True)

    # Sorting by cluster first makes the rows of each group contiguous before splitting.
    df = df.sort_values(by=['cluster', 'stock_id', 'date_id', 'seconds_in_bucket'])
    groups = df['cluster']

    cv_args = {'n_splits': 5, 'test_size': 1}
    kfold = GroupTimeSeriesSplit(**cv_args)

    plot_splits(df, y=None, groups=groups, **cv_args)

    for k, (train_indices, valid_indices) in enumerate(kfold.split(X=df, groups=groups)):

        train_part = df.iloc[train_indices]
        valid_part = df.iloc[valid_indices]
        train_X = train_part[feature_columns]
        train_y = train_part[target_columns]
        valid_X = valid_part[feature_columns]
        # Copied because a prediction column is added to it below.
        valid_y = valid_part[target_columns].copy()
        del train_part, valid_part
        print(f'train_X.shape: {train_X.shape}, train_y.shape: {train_y.shape}')
        print(f'valid_X.shape: {valid_X.shape}, valid_y.shape: {valid_y.shape}')

        callbacks = [
            lgb.early_stopping(stopping_rounds=100, verbose=True),
            lgb.log_evaluation(500),
        ]

        train_dataset = lgb.Dataset(
            train_X,
            train_y['target'],
            #categorical_feature=['imbalance_buy_sell_flag'],
        )

        valid_dataset = lgb.Dataset(
            valid_X,
            valid_y['target'],
            #categorical_feature=['imbalance_buy_sell_flag'],
        )

        model = lgb.train(
            params=model_params,
            train_set=train_dataset,
            valid_sets=[train_dataset, valid_dataset],
            valid_names=['train', 'valid'],
            callbacks=callbacks,
            num_boost_round=5000,
        )
        # Passed as str so the call does not depend on LightGBM accepting Path objects.
        model.save_model(
            str(outputs_dir.joinpath(f'lightgbm_fold{k+1}.txt')),
            num_iteration=model.best_iteration
        )

        fimp = model.feature_importance(importance_type='gain')
        fimp = pd.DataFrame(fimp, index=feature_columns, columns=[f'fold{k+1}'])
        fimps.append(fimp)

        train_pred = model.predict(train_X, num_iteration=model.best_iteration)
        valid_pred = model.predict(valid_X, num_iteration=model.best_iteration)

        history['train_mae'].append(mean_absolute_error(train_y['target'], train_pred))
        history['valid_mae'].append(mean_absolute_error(valid_y['target'], valid_pred))

        valid_y['regression'] = valid_pred
        oofs.append(valid_y)

        # Free the fold-sized objects before the next fold allocates its own.
        del train_X, train_y, train_dataset, valid_X, valid_y, valid_dataset, model, fimp
        del train_pred, valid_pred
        gc.collect()

    history = pd.DataFrame.from_dict(history)

    fimps = pd.concat(fimps, axis=1)
    # Both statistics are computed before either is added, so they cover fold columns only.
    mean_fimps = fimps.mean(axis=1)
    std_fimps = fimps.std(axis=1)
    fimps['mean_fimps'] = mean_fimps
    fimps['std_fimps'] = std_fimps
    fimps.sort_values(by='mean_fimps', inplace=True)

    oofs = pd.concat(oofs)
    # Cast to a built-in float: PyYAML writes NumPy scalars as opaque python/object tags.
    oof_mae = float(mean_absolute_error(oofs['target'], oofs['regression']))

    print(f'test_y mae: {oof_mae:.4f}')

    with open(outputs_dir.joinpath('oofs_lightgbm_optuna.yaml'), 'w') as f:
        yaml.dump(
            {
                'oof_mae': oof_mae,
            },
            f,
            default_flow_style=False
        )
    return history, oofs, fimps


def plot_time(all_time, train_time, valid_time):
    """Draw a two-row horizontal bar chart comparing a train/valid split with the full range.

    The ``'all'`` row is one bar of width ``len(all_time)`` starting at 0. The second
    row holds a train bar and a valid bar; each starts at its own ``.min()`` and is
    ``len(...)`` wide. Every bar is annotated with its name and length.

    Args:
        all_time: Full set of time points; only its length is used.
        train_time: Training time points; must support ``len`` and ``.min()``.
        valid_time: Validation time points; must support ``len``, ``.min()`` and ``.max()``.
            (Presumably integer positions, e.g. a NumPy array or pandas Series.)
    """
    n_all, n_train, n_valid = len(all_time), len(train_time), len(valid_time)
    train_start = train_time.min()
    valid_start = valid_time.min()

    _, ax = plt.subplots()
    ax.barh(y='all', height=0.6, width=n_all, left=0, color='tab:blue')
    ax.barh(
        y='train+valid+test',
        height=0.6,
        width=[n_train, n_valid],
        left=[train_start, valid_start],
        color=['tab:orange', 'tab:green', 'tab:red'],
    )

    # (x, y, text) of the label centred on each bar; y is the bar's row position.
    bar_labels = (
        (n_all // 2, 0, f'all\n{n_all}'),
        (train_start + n_train // 2, 1, f'train\n{n_train}'),
        (valid_start + n_valid // 2, 1, f'valid\n{n_valid}'),
    )
    for x_pos, y_pos, text in bar_labels:
        ax.text(x_pos, y_pos, str(text),  ha='center', va='center')

    ax.set_xticks([train_start, valid_start, valid_time.max(), n_all])
    ax.grid(axis='x', linestyle='--')
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()

In [None]:
# LightGBM settings: gradient-boosted trees optimised for, and evaluated with, MAE.
params = dict(
    task='train',
    boosting_type='gbdt',
    objective='mae',
    metric='mae',
    learning_rate=0.01,
    seed=42,
    max_depth=10,
    min_data_in_leaf=50,
    feature_fraction=1.0,
    feature_fraction_bynode=0.6,
    lambda_l2=0.0,
    # bagging_fraction=0.6,  (disabled)
    verbose=-1,
)

# Run the cross-validated training; models and the OOF summary go to `outputs_dir_path`.
history, oofs, fimps = train(
    df=train_df,
    model_params=params,
    outputs_dir=outputs_dir_path,
    step=120,
)

In [None]:
# Out-of-fold predictions, then the 50 rows at the tail of the importance table.
display(oofs)
print(fimps.shape)
display(fimps.tail(50))

# Mean gain importance per feature, with the across-fold standard deviation as error bars.
_, ax = plt.subplots(figsize=(12, 24))
fimps['mean_fimps'].plot.barh(xerr=fimps['std_fimps'], capsize=3, ax=ax)
plt.tight_layout()
plt.show()

# Features whose mean importance lies below the 20th percentile.
fimps_quantile_th = fimps['mean_fimps'].quantile(q=0.2)
display(fimps.index[fimps['mean_fimps'] < fimps_quantile_th])

In [None]:
# Train and validation MAE of every fold, one line per metric.
history.plot.line(marker='.', linestyle=':')
plt.show()

In [None]:
# 2-D histogram of out-of-fold prediction (x) against target (y).
_, ax = plt.subplots()
ax.hist2d(oofs['regression'], oofs['target'], bins=100, cmap='Blues', vmax=1e+03)
# Diagonal reference line: points on it are predicted exactly.
ax.plot([-100, 100], [-100, 100], color='tab:orange')
ax.set_xlabel('regression')
ax.set_ylabel('target')
plt.show()

# Pearson correlation between prediction and target (off-diagonal entry of the 2x2 matrix).
r = np.corrcoef(oofs['regression'], oofs['target'])
print(f'correlation coefficient: {r[0, 1]:.4f}')

In [None]:
# Out-of-fold MAE of every stock, computed with a vectorised group mean.
# (The variable keeps its original spelling because a later cell refers to it.)
mae_per_sotck = (
    oofs
    .assign(abs_error=(oofs['target'] - oofs['regression']).abs())
    .groupby('stock_id')['abs_error']
    .mean()
)
display(mae_per_sotck.describe())

_, ax = plt.subplots()
ax.plot(mae_per_sotck.values, marker='o', linestyle=':')
plt.show()

# MAE value separating "well predicted" from "poorly predicted" stocks in this analysis.
mae_threshold = 6

low_mae_stocks = mae_per_sotck[mae_per_sotck <= mae_threshold]
display(low_mae_stocks)
display(low_mae_stocks.describe())

high_mae_stocks = mae_per_sotck[mae_per_sotck > mae_threshold]
display(high_mae_stocks)
display(high_mae_stocks.describe())

In [None]:
# Density of `target` (top panel) and `regression` (bottom panel), drawn separately for
# the stocks with per-stock MAE <= 6 and for those with MAE > 6.
_, axs = plt.subplots(2, 1, sharex=True)
bins = np.linspace(-100, 100, 100)

low_mae_ids = mae_per_sotck[mae_per_sotck <= 6].index
high_mae_ids = mae_per_sotck[mae_per_sotck > 6].index

for panel, column in zip(axs, ('target', 'regression')):
    for stock_ids in (low_mae_ids, high_mae_ids):
        selected = oofs.loc[oofs['stock_id'].isin(stock_ids), column]
        panel.hist(selected, bins=bins, histtype='step', density=True)
plt.show()

In [None]:
stock = 0

# Out-of-fold rows of the selected stock, queried once and reused for both traces.
stock_oofs = oofs.query('stock_id==@stock')

fig = make_subplots(specs=[[{'secondary_y': True}]])
# One trace per series; each legend entry carries the name of the column it plots.
for column in ('target', 'regression'):
    fig.add_trace(
        go.Scatter(
            x=stock_oofs.index, y=stock_oofs[column],
            name=column, mode='lines+markers', marker={'size': 5},
        ),
    )
# Last expression of the cell, so the notebook renders the figure.
fig

In [None]:
stock = 31

# Out-of-fold rows of the selected stock, queried once and reused for both traces.
stock_oofs = oofs.query('stock_id==@stock')

fig = make_subplots(specs=[[{'secondary_y': True}]])
# One trace per series; each legend entry carries the name of the column it plots.
for column in ('target', 'regression'):
    fig.add_trace(
        go.Scatter(
            x=stock_oofs.index, y=stock_oofs[column],
            name=column, mode='lines+markers', marker={'size': 5},
        ),
    )
# Last expression of the cell, so the notebook renders the figure.
fig

## Train a LightGBM model on all data

In [None]:
# target_columns = ['stock_id', 'date_id', 'seconds_in_bucket', 'target']
# feature_columns = [col for col in train_df.columns if col not in ['date_id', 'target']]

# callbacks = [
#     lgb.log_evaluation(500),
# ]

# train_dataset = lgb.Dataset(
#     train_df[feature_columns],
#     train_df[target_columns]['target'],
# )

# del train_df
# gc.collect()

# model = lgb.train(
#     params=params,
#     train_set=train_dataset,
#     callbacks=callbacks,
#     num_boost_round=5000,
# )

# model.save_model(
#     outputs_dir_path.joinpath(f'lightgbm_trained_using_alldata.txt'),
#     num_iteration=model.best_iteration,
# )