In [1]:
import gc
import os
import time
import warnings
from itertools import combinations
from warnings import simplefilter
# import psutil
import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, TimeSeriesSplit

warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
is_offline = False
is_train = True
is_infer = False
max_lookback = np.nan
split_day = 435



In [2]:
df = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/train.csv")
df = df.dropna(subset=["target"])
df.reset_index(drop=True, inplace=True)
df.shape

(5237892, 17)

In [3]:
def reduce_mem_usage(df, verbose=0):
    """
    Iterate through all numeric columns of a dataframe and modify the data type
    to reduce memory usage.
    """

    start_mem = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float32)

    if verbose:
        logger.info(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        logger.info(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        logger.info(f"Decreased by {decrease:.2f}%")

    return df



from numba import njit, prange

@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))

    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            if mid_val == min_val:  # Prevent division by zero
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

    return imbalance_features


def calculate_triplet_imbalance_numba(price, df):
    # Convert DataFrame to numpy array for Numba compatibility
    df_values = df[price].values
    comb_indices = [(price.index(a), price.index(b), price.index(c)) for a, b, c in combinations(price, 3)]

    # Calculate the triplet imbalance
    features_array = compute_triplet_imbalance(df_values, comb_indices)

    # Create a DataFrame from the results
    columns = [f"{a}_{b}_{c}_imb2" for a, b, c in combinations(price, 3)]
    features = pd.DataFrame(features_array, columns=columns)

    return features



# generate imbalance features
def imbalance_features(df):
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")
    # V1
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")
    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values
    # V2
    df["stock_weights"] = df["stock_id"].map(weights)
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df['wap_momentum'] = df.groupby('stock_id')['weighted_wap'].pct_change(periods=6)
    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])
    df['spread_depth_ratio'] = (df['ask_price'] - df['bid_price']) / (df['bid_size'] + df['ask_size'])
    df['mid_price_movement'] = df['mid_price'].diff(periods=5).apply(lambda x: 1 if x > 0 else (-1 if x < 0 else 0))
    df['micro_price'] = ((df['bid_price'] * df['ask_size']) + (df['ask_price'] * df['bid_size'])) / (df['bid_size'] + df['ask_size'])
    df['relative_spread'] = (df['ask_price'] - df['bid_price']) / df['wap']
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)
    # V3
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 2, 3, 5, 10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size',
                'wap', 'near_price', 'far_price','market_urgency', 'imbalance_momentum', 'size_imbalance']:
        for window in [1, 2, 3, 5, 10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)

    return df.replace([np.inf, -np.inf], 0)

# generate time & stock features
def other_features(df):
    df["dow"] = df["date_id"] % 5
    df["dom"] = df["date_id"] % 20
    df["seconds"] = df["seconds_in_bucket"] % 60
    df["minute"] = df["seconds_in_bucket"] // 60
    for key, value in global_stock_id_feats.items():
        df[f"global_{key}"] = df["stock_id"].map(value.to_dict())
    return df

# 添加基础排名因子
def dfrank(newdf): 
    columns=[column for column in newdf.columns if (
            ('target' not in column) # 这个是预测目标，不能排名
        and ('date_id' not in column) # 这个在回测时不显示，不能使用
        and ('time_id' not in column) # 这个在回测时不显示，不能使用
        and ('row_id' not in column)
        and ('stock_id' not in column)
        and ('seconds_in_bucket' not in column)
        )]
    for column in columns:
        # 从小到大排名【测试下双排名有效果是因为加上了na_option='bottom'的处理机制还是因为实现的双排名方案】
        newdf=pd.concat([newdf,(newdf[str(column)].rank(method="max", ascending=False,na_option='bottom')/len(newdf)).rename(f"{str(column)}_rank")], axis=1)
        # 从大到小排名
        newdf=pd.concat([newdf,(newdf[str(column)].rank(method="max", ascending=True,na_option='bottom')/len(newdf)).rename(f"{str(column)}_rerank")], axis=1) # 从大到小排序
    return newdf

# generate all features
def generate_all_features(df):
    cols = [c for c in df.columns if c not in ["row_id", "time_id", "target"]]
    df = df[cols]
    
    # 计算基础排名因子
    df = df.groupby(['date_id','seconds_in_bucket']).apply(dfrank) # 计算排名因子
    df = df.reset_index(drop=True) # 重置索引并且不使其生成新的列【提交之前尽量对df的格式进行还原,需要注意这俩索引的值在分组之后是保留了的，如果不去掉容易造成资源竞争】
    df = reduce_mem_usage(df)
#     print("基础排名因子")
#     mem = psutil.virtual_memory()
#     zj = float(mem.total) / 1024 / 1024 / 1024
#     ysy = float(mem.used) / 1024 / 1024 / 1024
#     kx = float(mem.free) / 1024 / 1024 / 1024
#     print('系统总计内存:%d.3GB' % zj)
#     print('系统已经使用内存:%d.3GB' % ysy)
#     print('系统空闲内存:%d.3GB' % kx)
    gc.collect()
    df = imbalance_features(df)
    df = reduce_mem_usage(df)
#     print("不平衡特征")
    gc.collect()
    df = other_features(df)
    df = reduce_mem_usage(df)
#     print("其余特征")
    gc.collect()
    feature_name = [i for i in df.columns if i not in ["row_id", "target", "time_id", "date_id"]]
    return df[feature_name]





weights = [
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
]
weights = {int(k):v for k,v in enumerate(weights)}

if is_offline:
    df_train = df[df["date_id"] <= split_day]
    df_valid = df[df["date_id"] > split_day]
    print("Offline mode")
    print(f"train : {df_train.shape}, valid : {df_valid.shape}")
else:
    df_train = df
    print("Online mode")

Online mode


In [4]:
if is_train:
    global_stock_id_feats = {
        "median_size": df_train.groupby("stock_id")["bid_size"].median() + df_train.groupby("stock_id")["ask_size"].median(),
        "std_size": df_train.groupby("stock_id")["bid_size"].std() + df_train.groupby("stock_id")["ask_size"].std(),
        "ptp_size": df_train.groupby("stock_id")["bid_size"].max() - df_train.groupby("stock_id")["bid_size"].min(),
        "median_price": df_train.groupby("stock_id")["bid_price"].median() + df_train.groupby("stock_id")["ask_price"].median(),
        "std_price": df_train.groupby("stock_id")["bid_price"].std() + df_train.groupby("stock_id")["ask_price"].std(),
        "ptp_price": df_train.groupby("stock_id")["bid_price"].max() - df_train.groupby("stock_id")["ask_price"].min(),
    }
    if is_offline:
        df_train_feats = generate_all_features(df_train)
        print("Build Train Feats Finished.")
        df_valid_feats = generate_all_features(df_valid)
        print("Build Valid Feats Finished.")
        df_valid_feats = reduce_mem_usage(df_valid_feats)
    else:
        df_train_feats = generate_all_features(df_train)
        print("Build Online Train Feats Finished.")
        
df_train_feats = reduce_mem_usage(df_train_feats)

Build Online Train Feats Finished.


In [5]:
# df_train_feats.reset_index(drop = True).to_feather("/kaggle/working/df_train_Nov6.fea")

In [6]:
print("df_train_feats_length: ", len(df_train_feats))

df_train_feats_length:  5237892


In [7]:
df_train_feats

Unnamed: 0,stock_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,...,dow,dom,seconds,minute,global_median_size,global_std_size,global_ptp_size,global_median_price,global_std_price,global_ptp_price
0,0,0,3.180603e+06,1,0.999812,13380277.00,,,0.999812,60651.500000,...,0,0,0,0,42739.160156,132986.921875,5.898990e+06,1.999695,0.003353,0.017414
1,1,0,1.666039e+05,-1,0.999896,1642214.25,,,0.999896,3233.040039,...,0,0,0,0,25548.500000,66444.906250,6.938986e+05,1.999827,0.005588,0.029370
2,2,0,3.028799e+05,-1,0.999561,1819368.00,,,0.999403,37956.000000,...,0,0,0,0,26228.099609,75674.656250,1.069838e+06,2.000200,0.005333,0.051622
3,3,0,1.191768e+07,-1,1.000171,18389746.00,,,0.999999,2324.899902,...,0,0,0,0,41667.000000,93875.773438,1.928848e+06,1.999980,0.002903,0.018551
4,4,0,4.475500e+05,-1,0.999532,17860614.00,,,0.999394,16485.539062,...,0,0,0,0,34014.578125,80670.273438,1.604066e+06,1.999816,0.003717,0.017379
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5237887,195,540,2.440723e+06,-1,1.000317,28280362.00,0.999734,0.999734,1.000317,32257.039062,...,0,0,0,9,51941.550781,98218.031250,2.761659e+06,1.999930,0.003051,0.014076
5237888,196,540,3.495105e+05,-1,1.000643,9187699.00,1.000129,1.000386,1.000643,205108.406250,...,0,0,0,9,42476.949219,78070.062500,4.596574e+05,2.000042,0.003416,0.017398
5237889,197,540,0.000000e+00,0,0.995789,12725436.00,0.995789,0.995789,0.995789,16790.660156,...,0,0,0,9,30070.039062,71964.171875,1.575294e+06,1.999984,0.004696,0.020387
5237890,198,540,1.000899e+06,1,0.999210,94773272.00,0.999210,0.999210,0.998970,125631.718750,...,0,0,0,9,304739.250000,354682.781250,2.159163e+06,1.999917,0.003146,0.015738


In [8]:
# if is_train:
#     feature_name = list(df_train_feats.columns)
#     # 3.399
# #     lgb_params = {
# #         "objective" : "mae",
# #         "n_estimators" : 6000,
# #         "num_leaves" : 512,
# #         "subsample" : 0.65,
# #         "colsample_bytree" : 0.65,
# #         "learning_rate" : 0.007,
# #         "n_jobs" : 4,
# #         "device" : "gpu",
# #         "verbosity": -1,
# #         "importance_type" : "gain",
# #         "max_depth": 12,  # Maximum depth of the tree
# #         "min_child_samples": 15,  # Minimum number of data points in a leaf
# #         "reg_alpha": 0.1,  # L1 regularization term
# #         "reg_lambda": 0.3,  # L2 regularization term
# #         "min_split_gain": 0.2,  # Minimum loss reduction required for further partitioning
# #         "min_child_weight": 0.001,  # Minimum sum of instance weight (hessian) in a leaf
# #         "bagging_fraction": 0.9,  # Fraction of data to be used for training each tree
# #         "bagging_freq": 5,  # Frequency for bagging
# #         "feature_fraction": 0.9,  # Fraction of features to be used for training each tree
# #         "num_threads": 4,  # Number of threads for LightGBM to use
# #     }
#     lgb_params = {
#         "objective": "mae",
#         "n_estimators": 6000,
#         "num_leaves": 384,  # 比较确定会更好
#         "subsample": 0.6,   # 比较确定会更好
#         "colsample_bytree": 0.8,
#         "learning_rate": 0.1, # 0.00871 学习率影响不大
#         'max_depth': 12,
#         "n_jobs": 4,
#         "device": "gpu",
#         "verbosity": -1,
#         "importance_type": "gain",
#     }
    

#     print(f"Feature length = {len(feature_name)}")
#     offline_split = df_train['date_id']>(split_day - 45)
# #     df_offline_train = df_train_feats[~offline_split]
# #     df_offline_valid = df_train_feats[offline_split]
# #     df_offline_train_target = df_train['target'][~offline_split]
# #     df_offline_valid_target = df_train['target'][offline_split]

#     print("Valid Model Trainning.")
#     lgb_model = lgb.LGBMRegressor(**lgb_params)
#     lgb_model.fit(
#         df_train_feats[~offline_split][feature_name],
#         df_train['target'][~offline_split],
#         eval_set=[(df_train_feats[offline_split][feature_name], df_train['target'][offline_split])],
#         callbacks=[
#             lgb.callback.early_stopping(stopping_rounds=100),
#             lgb.callback.log_evaluation(period=100),
#         ],
#     )

# #     del df_offline_train, df_offline_valid, df_offline_train_target, df_offline_valid_target
#     gc.collect()
#     # infer
#     df_train_target = df_train["target"]
#     print("Infer Model Trainning.")
#     infer_params = lgb_params.copy()
#     infer_params["n_estimators"] = int(1.2 * lgb_model.best_iteration_)
#     infer_lgb_model = lgb.LGBMRegressor(**infer_params)
#     infer_lgb_model.fit(df_train_feats[feature_name], df_train_target)
#     if is_offline:   
#         # offline predictions
#         df_valid_target = df_valid["target"]
#         offline_predictions = infer_lgb_model.predict(df_valid_feats[feature_name])
#         offline_score = mean_absolute_error(offline_predictions, df_valid_target)
#         print(f"Offline Score {np.round(offline_score, 4)}")

In [9]:
import numpy as np
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
import gc

lgb_params = {
    "objective": "mae",
    "n_estimators": 6000,
    "num_leaves": 384,
    "subsample": 0.6,  # 0.6和0.8没区别
    "colsample_bytree": 0.8,
    "learning_rate": 0.00871,
    'max_depth': 11,
    "n_jobs": 4,
    "device": "gpu",
    "verbosity": -1,
    "importance_type": "gain",
}
feature_name = list(df_train_feats.columns)
print(f"Feature length = {len(feature_name)}")

num_folds = 5
fold_size = 480 // num_folds
gap = 5

models = []
scores = []

model_save_path = 'modelitos_para_despues' 
if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)

date_ids = df['date_id'].values

for i in range(num_folds):
    start = i * fold_size
    end = start + fold_size
    if i < num_folds - 1:  # No need to purge after the last fold
        purged_start = end - 2
        purged_end = end + gap + 2
        train_indices = (date_ids >= start) & (date_ids < purged_start) | (date_ids > purged_end)
    else:
        train_indices = (date_ids >= start) & (date_ids < end)
    
    test_indices = (date_ids >= end) & (date_ids < end + fold_size)
    
#     df_fold_train = df_train_feats[train_indices]
#     df_fold_train_target = df['target'][train_indices]
#     df_fold_valid = df_train_feats[test_indices]
#     df_fold_valid_target = df['target'][test_indices]

    print(f"Fold {i+1} Model Training")
    
    # Train a LightGBM model for the current fold
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    lgb_model.fit(
        df_train_feats[train_indices][feature_name],
        df['target'][train_indices],
        eval_set=[(df_train_feats[test_indices][feature_name], df['target'][test_indices])],
        callbacks=[
            lgb.callback.early_stopping(stopping_rounds=100),
            lgb.callback.log_evaluation(period=100),
        ],
    )

    models.append(lgb_model)
    # Save the model to a file
    model_filename = os.path.join(model_save_path, f'doblez_{i+1}.txt')
    lgb_model.booster_.save_model(model_filename)
    print(f"Model for fold {i+1} saved to {model_filename}")

    # Evaluate model performance on the validation set
    fold_predictions = lgb_model.predict(df_train_feats[test_indices][feature_name])
    fold_score = mean_absolute_error(fold_predictions, df['target'][test_indices])
    scores.append(fold_score)
    print(f"Fold {i+1} MAE: {fold_score}")

    # Free up memory by deleting fold specific variables
#     del df_fold_train, df_fold_train_target, df_fold_valid, df_fold_valid_target
    gc.collect()

# Calculate the average best iteration from all regular folds
average_best_iteration = int(np.mean([model.best_iteration_ for model in models]))

# Update the lgb_params with the average best iteration
final_model_params = lgb_params.copy()
final_model_params['n_estimators'] = average_best_iteration

print(f"Training final model with average best iteration: {average_best_iteration}")

# Train the final model on the entire dataset
final_model = lgb.LGBMRegressor(**final_model_params)
final_model.fit(
    df_train_feats[feature_name],
    df['target'],
    callbacks=[
        lgb.callback.log_evaluation(period=100),
    ],
)

# Append the final model to the list of models
models.append(final_model)

# Save the final model to a file
final_model_filename = os.path.join(model_save_path, 'doblez-conjunto.txt')
final_model.booster_.save_model(final_model_filename)
print(f"Final model saved to {final_model_filename}")

# Now 'models' holds the trained models for each fold and 'scores' holds the validation scores
print(f"Average MAE across all folds: {np.mean(scores)}")

Feature length = 184
Fold 1 Model Training
Training until validation scores don't improve for 100 rounds
[100]	valid_0's l1: 7.09465
[200]	valid_0's l1: 7.04375
[300]	valid_0's l1: 7.01561
[400]	valid_0's l1: 6.99261
[500]	valid_0's l1: 6.97344
[600]	valid_0's l1: 6.95565
[700]	valid_0's l1: 6.93968
[800]	valid_0's l1: 6.92541
[900]	valid_0's l1: 6.91291
[1000]	valid_0's l1: 6.9008
[1100]	valid_0's l1: 6.88944
[1200]	valid_0's l1: 6.87882
[1300]	valid_0's l1: 6.86879
[1400]	valid_0's l1: 6.85951
[1500]	valid_0's l1: 6.85118
[1600]	valid_0's l1: 6.84226
[1700]	valid_0's l1: 6.83432
[1800]	valid_0's l1: 6.82669
[1900]	valid_0's l1: 6.81843
[2000]	valid_0's l1: 6.81079
[2100]	valid_0's l1: 6.80359
[2200]	valid_0's l1: 6.79636
[2300]	valid_0's l1: 6.78868
[2400]	valid_0's l1: 6.78084
[2500]	valid_0's l1: 6.77416
[2600]	valid_0's l1: 6.76729
[2700]	valid_0's l1: 6.7607
[2800]	valid_0's l1: 6.7534
[2900]	valid_0's l1: 6.74603
[3000]	valid_0's l1: 6.74027
[3100]	valid_0's l1: 6.73303
[3200]	v

In [10]:
# # Testing
# def zero_sum(prices, volumes):
#     std_error = np.sqrt(volumes)
#     step = np.sum(prices)/np.sum(std_error)
#     out = prices-std_error*step
#     return out

# if is_infer:
#     import optiver2023
#     env = optiver2023.make_env()
#     iter_test = env.iter_test()
#     counter = 0
#     y_min, y_max = -64, 64
#     qps, predictions = [], []
#     cache = pd.DataFrame()
#     for (test, revealed_targets, sample_prediction) in iter_test:
#         now_time = time.time()
#         cache = pd.concat([cache, test], ignore_index=True, axis=0)
#         if counter > 0:
#             cache = cache.groupby(['stock_id']).tail(21).sort_values(by=['date_id', 'seconds_in_bucket', 'stock_id']).reset_index(drop=True)
#         feat = generate_all_features(cache)[-len(test):]
#         lgb_prediction = infer_lgb_model.predict(feat)
# #         lgb_prediction = zero_sum(lgb_prediction, test['bid_size'] + test['ask_size'])
#         lgb_prediction = lgb_prediction - np.mean(lgb_prediction)
#         clipped_predictions = np.clip(lgb_prediction, y_min, y_max)
#         sample_prediction['target'] = clipped_predictions
#         env.predict(sample_prediction)
#         counter += 1
#         qps.append(time.time() - now_time)
#         if counter % 10 == 0:
#             print(counter, 'qps:', np.mean(qps))
#     time_cost = 1.146 * np.mean(qps)
#     print(f"The code will take approximately {np.round(time_cost, 4)} hours to reason about")

In [11]:
def zero_sum(prices, volumes):
    std_error = np.sqrt(volumes)
    step = np.sum(prices) / np.sum(std_error)
    out = prices - std_error * step
    return out

if is_infer:
    import optiver2023
    env = optiver2023.make_env()
    iter_test = env.iter_test()
    counter = 0
    y_min, y_max = -64, 64
    qps, predictions = [], []
    cache = pd.DataFrame()

    # Weights for each fold model
    model_weights = [1/len(models)] * len(models) 
    
    for (test, revealed_targets, sample_prediction) in iter_test:
        now_time = time.time()
        cache = pd.concat([cache, test], ignore_index=True, axis=0)
        if counter > 0:
            cache = cache.groupby(['stock_id']).tail(21).sort_values(by=['date_id', 'seconds_in_bucket', 'stock_id']).reset_index(drop=True)
        feat = generate_all_features(cache)[-len(test):]

        # Generate predictions for each model and calculate the weighted average
        lgb_predictions = np.zeros(len(test))
        for model, weight in zip(models, model_weights):
            lgb_predictions += weight * model.predict(feat)

        lgb_predictions = zero_sum(lgb_predictions, test['bid_size'] + test['ask_size'])
        clipped_predictions = np.clip(lgb_predictions, y_min, y_max)
        sample_prediction['target'] = clipped_predictions
        env.predict(sample_prediction)
        counter += 1
        qps.append(time.time() - now_time)
        if counter % 10 == 0:
            print(counter, 'qps:', np.mean(qps))

    time_cost = 1.146 * np.mean(qps)
    print(f"The code will take approximately {np.round(time_cost, 4)} hours to reason about")


This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
10 qps: 1.46881103515625
20 qps: 1.5808643341064452
30 qps: 1.6635185480117798
40 qps: 1.7057941436767579
50 qps: 1.736162166595459
60 qps: 1.7592510024706522
70 qps: 1.770152143069676
80 qps: 1.7811688750982284
90 qps: 1.7888750500149198
100 qps: 1.7978293251991273
110 qps: 1.8059961232272062
120 qps: 1.809061481555303
130 qps: 1.8133647221785325
140 qps: 1.8142146706581115
150 qps: 1.8178698174158732
160 qps: 1.8203248128294944
The code will take approximately 2.089 hours to reason about
