In [None]:
import gc
from itertools import combinations
import pathlib
from typing import Any, Dict, List
import warnings
import yaml

from catboost import CatBoostRegressor, Pool
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit

# Make sure automatic garbage collection is switched on (it is Python's default).
gc.enable()
# Let pandas render up to 200 columns so the wide frames below are not elided.
pd.set_option('display.max_columns', 200)
# NOTE(review): this silences *every* warning for the whole notebook, including
# pandas/CatBoost deprecation and SettingWithCopy warnings -- consider narrowing it.
warnings.simplefilter('ignore')

In [None]:
# Input and output locations, relative to the notebook's working directory.
inputs_dir_path = pathlib.Path('../inputs')
outputs_dir_path = pathlib.Path('../outputs')
# `parents=True` also creates missing parent directories and `exist_ok=True`
# makes the call a no-op when the directory is already there, so the cell is
# safe to re-run and has no check-then-create race.
outputs_dir_path.mkdir(parents=True, exist_ok=True)

# Load the raw training data and drop the two identifier columns that are not
# used anywhere below.
train_df = (
    pd.read_csv(inputs_dir_path.joinpath('train.csv'))
    .drop(columns=['row_id', 'time_id'])
)
display(train_df)

In [None]:
# Columns grouped by the narrower dtype they are cast to; the smaller dtypes
# reduce the memory footprint of the frame.
int16_columns = [
    'stock_id',
    'date_id',
    'seconds_in_bucket',
    'imbalance_buy_sell_flag',
]
float32_columns = [
    'imbalance_size',
    'reference_price',
    'matched_size',
    'far_price',
    'near_price',
    'bid_price',
    'bid_size',
    'ask_price',
    'ask_size',
    'wap',
    'target',
]

# Column name -> target dtype mapping consumed by `DataFrame.astype`.
cast_dtypes = {
    **dict.fromkeys(int16_columns, np.int16),
    **dict.fromkeys(float32_columns, np.float32),
}

# Show the dtypes before the cast, then the converted frame.
display(train_df.dtypes)
train_df = train_df.astype(cast_dtypes)
display(train_df)

In [None]:
# Per-stock weights, indexed by `stock_id` (one entry per stock, 200 in total).
weights = np.array([
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
])
assert weights.shape == (200,), 'expected one weight per stock_id'

# Weighted-average WAP per (date_id, seconds_in_bucket) bucket.
# Rows whose `wap` is missing contribute neither to the numerator nor to the
# denominator; otherwise their weight would still be counted in the
# normalisation and bias the index towards zero.
wap_values = train_df['wap'].to_numpy(dtype=np.float64)
has_wap = ~np.isnan(wap_values)
row_weights = np.where(has_wap, weights[train_df['stock_id'].to_numpy()], 0.0)

bucket_terms = pd.DataFrame({
    'date_id': train_df['date_id'].to_numpy(),
    'seconds_in_bucket': train_df['seconds_in_bucket'].to_numpy(),
    'weighted_wap': np.where(has_wap, wap_values * row_weights, 0.0),
    'weight': row_weights,
})
# `transform('sum')` broadcasts each bucket's totals back onto its rows, so no
# merge is needed and the original row order is kept.
bucket_sums = (
    bucket_terms
    .groupby(['date_id', 'seconds_in_bucket'], sort=False)[['weighted_wap', 'weight']]
    .transform('sum')
)
# Assigning the column (instead of merging) keeps the cell idempotent on re-run.
# A bucket with no valid `wap` at all yields NaN (0 / 0).
train_df['index_wap'] = (bucket_sums['weighted_wap'] / bucket_sums['weight']).to_numpy()
display(train_df)

# Release the temporaries before the memory-hungry pivot that follows.
del wap_values, has_wap, row_weights, bucket_terms, bucket_sums
gc.collect()

In [None]:
# Pivot from one row per (stock, date, bucket) to one row per (date, bucket):
# every original column is fanned out into one column per stock.
index_levels = ['stock_id', 'date_id', 'seconds_in_bucket']
train_df = (
    train_df
    .set_index(index_levels)
    .sort_index()
    .unstack(level=0)
)
# Flatten the (column, stock_id) MultiIndex into '<column>_<stock_id>' names.
train_df.columns = [f'{name}_{stock}' for name, stock in train_df.columns]
# Bring date_id / seconds_in_bucket back as ordinary columns.
train_df = train_df.reset_index(drop=False)

# Targets that are missing after the pivot are replaced by 0.
targets = [f'target_{i}' for i in range(200)]
train_df[targets] = train_df[targets].fillna(0)
display(train_df[targets])

display(train_df)

## Train CatBoost models on rolling training windows with a fixed validation period

In [None]:
def train(
        dataset: pd.DataFrame,
        model_params: Dict[str, Any],
        outputs_dir: pathlib.Path,
        step: int = 60,
        n_label_targets: int = 10,
        n_targets: int = 200,
    ):
    """Train one CatBoost model per training window and score them on a common hold-out.

    The last ``step`` days of ``dataset`` form the validation period. The days
    before it are cut into consecutive, non-overlapping windows of ``step``
    days; one model is fitted per window, early-stopped on the validation
    period and saved to ``outputs_dir``. The per-window validation predictions
    are averaged into an ensemble prediction.

    Parameters
    ----------
    dataset:
        Wide frame with ``date_id``, ``seconds_in_bucket``, the columns
        ``target_0 .. target_{n_targets-1}`` and the feature columns.
        ``date_id`` is treated as a contiguous integer range.
    model_params:
        Keyword arguments forwarded to ``CatBoostRegressor``.
    outputs_dir:
        Existing directory receiving the saved models and the result YAML.
    step:
        Length in days of each training window and of the validation period.
    n_label_targets:
        Number of leading ``target_<i>`` columns used as regression labels.
    n_targets:
        Total number of ``target_<i>`` columns; all of them are kept out of
        the feature set.

    Returns
    -------
    history:
        DataFrame with one row per window and columns ``train_mae`` / ``valid_mae``.
    valid_y:
        Long-format validation frame, one row per (date_id, seconds_in_bucket,
        stock_id), with ``target``, one ``regression_fold<k>`` column per
        window and the window-averaged ``regression``.
    fimps:
        Feature importances per window plus ``mean_fimps`` / ``std_fimps``,
        sorted ascending by the mean.

    Raises
    ------
    ValueError
        If ``dataset`` spans fewer than ``2 * step`` days, i.e. there is no
        room for a training window before the validation period.
    """
    key_columns = ['date_id', 'seconds_in_bucket']
    all_target_columns = [f'target_{i}' for i in range(n_targets)]
    label_columns = all_target_columns[:n_label_targets]
    non_feature_columns = set(key_columns) | set(all_target_columns)
    feature_columns = [col for col in dataset.columns if col not in non_feature_columns]

    # `+ 1` so that the most recent day is part of the range (np.arange excludes
    # its stop value).
    first_day = int(dataset['date_id'].min())
    last_day = int(dataset['date_id'].max())
    days = np.arange(first_day, last_day + 1)
    if len(days) < 2 * step:
        raise ValueError(
            f'dataset spans {len(days)} days; at least {2 * step} are needed '
            f'for one training window plus the validation period'
        )

    fimps = []
    fold_valid_preds = []
    history = {
        'train_mae': [],
        'valid_mae': [],
    }

    # Hold-out period: the last `step` days, selected once and shared by all folds.
    valid_days = days[-step:]
    valid_mask = dataset['date_id'].isin(valid_days)
    valid_X = dataset.loc[valid_mask, feature_columns]
    valid_keys = dataset.loc[valid_mask, key_columns]
    valid_labels = dataset.loc[valid_mask, label_columns]

    # Window starts derived from the data; the stop value guarantees that every
    # window ends before the validation period begins.
    train_day_lower_limits = np.arange(first_day, int(valid_days[0]) - step + 1, step)

    for k, lower_limit in enumerate(train_day_lower_limits):
        train_days = np.arange(lower_limit, lower_limit + step)
        print(f'fold {k+1}')
        print(train_days)
        print(valid_days)

        plot_time(days, train_days, valid_days)

        train_mask = dataset['date_id'].isin(train_days)
        train_X = dataset.loc[train_mask, feature_columns]
        train_y = dataset.loc[train_mask, label_columns]
        print(f'train_X.shape: {train_X.shape}, train_y.shape: {train_y.shape}')
        print(f'valid_X.shape: {valid_X.shape}, valid_y.shape: {valid_labels.shape}')

        train_pool = Pool(data=train_X, label=train_y)
        valid_pool = Pool(data=valid_X, label=valid_labels)

        model = CatBoostRegressor(**model_params)
        model.fit(
            X=train_pool,
            eval_set=[valid_pool],
            use_best_model=True,
            early_stopping_rounds=100,
            verbose=100,
        )
        # str() keeps this working with CatBoost versions that only accept
        # string paths.
        model.save_model(str(outputs_dir.joinpath(f'catboost_fold{k+1}.txt')))

        fimp = model.get_feature_importance(valid_pool, type='PredictionValuesChange')
        fimp = pd.DataFrame(fimp, index=feature_columns, columns=[f'fold{k+1}'])
        fimps.append(fimp)

        # Multi-target predictions are 2-D (rows x labels); the reshape also
        # covers a single-label model returning a 1-D array.
        train_pred = np.asarray(model.predict(train_X)).reshape(len(train_X), -1)
        valid_pred = np.asarray(model.predict(valid_X)).reshape(len(valid_X), -1)
        fold_valid_preds.append(valid_pred)

        # With 2-D inputs sklearn averages the per-label MAE uniformly.
        history['train_mae'].append(mean_absolute_error(train_y, train_pred))
        history['valid_mae'].append(mean_absolute_error(valid_labels, valid_pred))

        del train_X, train_y, train_pool, valid_pool, model, fimp
        del train_pred, valid_pred
        gc.collect()

    del valid_X
    gc.collect()

    history = pd.DataFrame.from_dict(history)

    fimps = pd.concat(fimps, axis=1)
    mean_fimps = fimps.mean(axis=1)
    std_fimps = fimps.std(axis=1)
    fimps['mean_fimps'] = mean_fimps
    fimps['std_fimps'] = std_fimps
    fimps.sort_values(by='mean_fimps', inplace=True)

    # Long-format result: the row-major ravel puts the labels of one
    # (date_id, seconds_in_bucket) row next to each other, so keys are repeated
    # per label and stock ids are tiled per row. Label `target_<i>` belongs to
    # stock id `i`.
    n_rows = len(valid_keys)
    n_labels = len(label_columns)
    valid_y = pd.DataFrame({
        'date_id': np.repeat(valid_keys['date_id'].to_numpy(), n_labels),
        'seconds_in_bucket': np.repeat(valid_keys['seconds_in_bucket'].to_numpy(), n_labels),
        'stock_id': np.tile(np.arange(n_labels), n_rows),
        'target': valid_labels.to_numpy().ravel(),
    })
    for k, fold_pred in enumerate(fold_valid_preds):
        valid_y[f'regression_fold{k+1}'] = fold_pred.ravel()
    valid_y['regression'] = np.mean(fold_valid_preds, axis=0).ravel()

    test_y_mae = float(mean_absolute_error(valid_y['target'], valid_y['regression']))
    print(f'test_y mae: {test_y_mae:.4f}')

    # NOTE(review): the file name still says "lightgbm_optuna" although the
    # model is CatBoost; kept so existing consumers of the file keep working.
    with open(outputs_dir.joinpath('result_lightgbm_optuna.yaml'), 'w') as f:
        yaml.dump(
            {
                # The stored value is an MAE; it is cast to a plain float so the
                # YAML holds a number rather than a serialised numpy object.
                'test_y mae': test_y_mae,
            },
            f,
            default_flow_style=False
        )
    return history, valid_y, fimps


def plot_time(all_time, train_time, valid_time):
    """Draw a two-row bar chart showing where the train and validation days lie.

    The lower row is a single bar as long as ``all_time``; the upper row holds
    one bar for ``train_time`` and one for ``valid_time``, each starting at its
    minimum value. Every bar is annotated with its name and length.
    """
    n_all, n_train, n_valid = len(all_time), len(train_time), len(valid_time)
    train_start, valid_start = train_time.min(), valid_time.min()

    _, ax = plt.subplots()
    # Lower row: the complete period, drawn from 0.
    ax.barh(y='all', height=0.6, width=n_all, left=0, color='tab:blue')
    # Upper row: training window followed by the validation window.
    ax.barh(
        y='train+valid+test',
        height=0.6,
        width=[n_train, n_valid],
        left=[train_start, valid_start],
        color=['tab:orange', 'tab:green', 'tab:red'],
    )

    # (x, y, text) of the label centred on each bar.
    annotations = [
        (n_all // 2, 0, f'all\n{n_all}'),
        (train_start + n_train // 2, 1, f'train\n{n_train}'),
        (valid_start + n_valid // 2, 1, f'valid\n{n_valid}'),
    ]
    for x_pos, y_pos, text in annotations:
        ax.text(x_pos, y_pos, str(text), ha='center', va='center')

    ax.set_xticks([train_start, train_time.max(), valid_start, n_all])
    ax.grid(axis='x', linestyle='--')
    ax.tick_params(axis='x', labelrotation=45)
    plt.show()

In [None]:
# CatBoost hyper-parameters shared by the windowed models and the final model.
params = dict(
    loss_function='MultiRMSE',
    eval_metric='MultiRMSE',
    iterations=5000,
    depth=10,
    learning_rate=5e-03,
    random_state=42,
    bagging_temperature=0.8,
    random_strength=0.8,
    # 'subsample' is intentionally left unset (it was disabled in the original cell).
    colsample_bylevel=0.8,
    l2_leaf_reg=0.0,
    min_data_in_leaf=20,
)

# Fit the per-window models and collect the metrics, the validation
# predictions and the feature importances.
history, result, fimps = train(
    train_df,
    params,
    outputs_dir_path,
)

In [None]:
# Validation predictions, then the 50 features with the highest mean importance
# (`fimps` is sorted ascending, so they sit at the tail).
display(result)
print(fimps.shape)
display(fimps.tail(50))

# Horizontal bars of the mean importance with the across-fold std as error bars.
_, ax = plt.subplots(figsize=(12, 36))
fimps['mean_fimps'].plot(
    kind='barh',
    xerr=fimps['std_fimps'],
    capsize=3,
    ax=ax,
)
plt.tight_layout()
plt.show()

# Names of the features whose mean importance is below the 20th percentile.
fimps_quantile_th = fimps['mean_fimps'].quantile(q=0.2)
low_importance_mask = fimps['mean_fimps'] < fimps_quantile_th
display(fimps.index[low_importance_mask])

In [None]:
# Train / validation MAE of each training window; labelled so the figure can be
# read on its own.
ax = history.plot(marker='.', linestyle=':')
ax.set_xlabel('fold index')
ax.set_ylabel('MAE')
ax.set_title('MAE per training window')
plt.show()

In [None]:
# 2-D histogram of the averaged prediction against the true target; the orange
# diagonal marks a perfect prediction.
_, ax = plt.subplots()
ax.hist2d(result['regression'], result['target'], bins=100, cmap='Blues', vmax=1e+03)
ax.plot([-100, 100], [-100, 100], color='tab:orange')
ax.set_xlabel('regression')
ax.set_ylabel('target')
ax.set_title('validation: prediction vs. target')
plt.show()

# Pearson correlation between prediction and target (off-diagonal entry of the
# 2x2 correlation matrix).
r = np.corrcoef(result['regression'], result['target'])
print(f'correlation coefficient: {r[0, 1]:.4f}')

## Train CatBoost model using all data

In [None]:
# Column selection mirrors `train()`: the keys and *all* target columns are
# kept out of the features (otherwise the labels leak into the inputs), and
# the first 10 per-stock targets are the multi-output labels.
target_columns = [f'target_{i}' for i in range(200)]
label_columns = target_columns[:10]
non_feature_columns = {'date_id', 'seconds_in_bucket', *target_columns}
feature_columns = [col for col in train_df.columns if col not in non_feature_columns]

train_pool = Pool(
    data=train_df[feature_columns],
    label=train_df[label_columns],
)

# Free the wide frame before fitting. Re-running this cell afterwards requires
# re-running the preprocessing cells first.
del train_df
gc.collect()

model = CatBoostRegressor(**params)
# No `use_best_model` here: it needs an evaluation set, and none exists when
# every row is used for training.
model.fit(train_pool, verbose=100)

model.save_model(
    str(outputs_dir_path.joinpath('catboost_trained_using_alldata.txt')),
)