In [1]:
import gc
import os
import time
import warnings
from itertools import combinations
from warnings import simplefilter

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold, TimeSeriesSplit

# Silence every warning category, pandas' PerformanceWarning included.
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

# Run-mode switches read by the cells below.
is_offline = False  # True: rows with date_id > split_day are held out as df_valid
is_train = True  # True: build the features and fit the LightGBM models
is_infer = True  # True: run the optiver2023 inference loop
max_lookback = np.nan  # NOTE(review): not referenced by any visible cell
split_day = 435  # date_id boundary used by the offline split and the early-stopping split

In [2]:
# Load the training data. Forward slashes are accepted as separators on Windows
# and avoid the invalid "\o" escape sequence the backslash form contained.
df = pd.read_csv("A:/optiver-trading-at-the-close/train.csv")
# Rows without a target cannot be used for supervised training.
df = df.dropna(subset=["target"])
df.reset_index(drop=True, inplace=True)
df.shape

(5237892, 17)

In [3]:
def reduce_mem_usage(df, verbose=0):
    """Downcast the numeric columns of ``df`` in place to reduce memory usage.

    Columns whose dtype name starts with ``int`` are cast to the narrowest of
    int8/int16/int32/int64 whose range contains the column's min and max
    (bounds inclusive). Every other non-object column is cast to float32;
    float16 is never used. Object columns are left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When truthy, print the memory usage before and after and the
            percentage saved.

    Returns:
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    int_types = (np.int8, np.int16, np.int32, np.int64)
    for col in df.columns:
        col_type = df[col].dtype
        if col_type == object:
            continue
        if str(col_type)[:3] == "int":
            c_min = df[col].min()
            c_max = df[col].max()
            # Pick the first (narrowest) integer type that can hold the range.
            # An empty column yields NaN bounds, fails every check and is kept as is.
            for int_type in int_types:
                limits = np.iinfo(int_type)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            df[col] = df[col].astype(np.float32)
    if verbose:
        # print() is used because no logger object is defined in this notebook.
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        print(f"Decreased by {decrease:.2f}%")
    return df

## Speed up triplet imbalance calculations

In [None]:
from numba import njit, prange,jit

@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    """Per-row triplet imbalance for every column triplet.

    For each triplet of column positions in ``comb_indices`` and each row of
    ``df_values``, the three values are ranked as min/mid/max and the output
    is ``(max - mid) / (mid - min)``, or NaN when mid equals min.

    Returns a float array of shape ``(rows, number of triplets)``.
    """
    n_rows = df_values.shape[0]
    n_triplets = len(comb_indices)
    out = np.empty((n_rows, n_triplets))
    # Triplets are independent of each other, so they are the parallel axis.
    for t in prange(n_triplets):
        a, b, c = comb_indices[t]
        for r in range(n_rows):
            va = df_values[r, a]
            vb = df_values[r, b]
            vc = df_values[r, c]
            hi = max(va, vb, vc)
            lo = min(va, vb, vc)
            # The middle value is what remains after removing min and max from the sum.
            mid = va + vb + vc - lo - hi

            if mid != lo:
                out[r, t] = (hi - mid) / (mid - lo)
            else:
                out[r, t] = np.nan

    return out

def calculate_triplet_imbalance_numba(price, df):
    """Build a DataFrame of triplet-imbalance features for the columns in ``price``.

    Args:
        price: list of column names; every 3-combination of them is evaluated.
        df: DataFrame containing those columns.

    Returns:
        DataFrame with one ``{a}_{b}_{c}_imbtriplet`` column per combination,
        carrying a fresh default index (not ``df``'s index).
    """
    triplets = list(combinations(price, 3))
    # Positions of each triplet's columns inside the df[price] value matrix.
    positions = [tuple(price.index(name) for name in triplet) for triplet in triplets]
    values = compute_triplet_imbalance(df[price].values, positions)
    names = [f"{a}_{b}_{c}_imbtriplet" for a, b, c in triplets]
    return pd.DataFrame(values, columns=names)

from itertools import combinations
import numpy as np
from numba import njit, prange

@njit(parallel=True)
def compute_imb2(df_values, comb_indices):
    """Per-row min/max ratio for every column triplet.

    For each triplet of column positions in ``comb_indices`` and each row of
    ``df_values``, the output is ``min / max`` of the three values, or NaN
    when the max is zero.

    Returns a float array of shape ``(rows, number of triplets)``.
    """
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))
    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            # Guard the denominator, as compute_triplet_imbalance does: with
            # numba's default error model a zero divisor raises ZeroDivisionError.
            if max_val == 0:
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = min_val / max_val

    return imbalance_features

def calculate_imb2_numba(cols, df):
    """Build a DataFrame of min/max ratio features for the columns in ``cols``.

    Args:
        cols: list of column names; every 3-combination of them is evaluated.
        df: DataFrame containing those columns.

    Returns:
        DataFrame with one ``{a}_{b}_{c}_newimb2`` column per combination,
        carrying a fresh default index (not ``df``'s index).
    """
    triplets = list(combinations(cols, 3))
    # Positions of each triplet's columns inside the df[cols] value matrix.
    positions = [tuple(cols.index(name) for name in triplet) for triplet in triplets]
    values = compute_imb2(df[cols].values, positions)
    names = [f"{a}_{b}_{c}_newimb2" for a, b, c in triplets]
    return pd.DataFrame(values, columns=names)

from numba import njit, prange
import numpy as np
import pandas as pd
from itertools import combinations

@njit
def nanmean(arr):
    """Mean of the non-NaN entries of ``arr``; NaN when there are none."""
    n_valid = 0
    acc = 0.0
    for x in arr:
        if np.isnan(x):
            continue
        acc += x
        n_valid += 1
    if n_valid > 0:
        return acc / n_valid
    return np.nan

@njit
def nanstd(arr):
    """Standard deviation of the non-NaN entries of ``arr``.

    The squared deviations are divided by the number of non-NaN entries.
    NaN is returned when fewer than two such entries exist.
    """
    centre = nanmean(arr)
    n_valid = 0
    sum_sq = 0.0
    for x in arr:
        if np.isnan(x):
            continue
        sum_sq += (x - centre) ** 2
        n_valid += 1
    if n_valid > 1:
        return np.sqrt(sum_sq / n_valid)
    return np.nan

@njit
def normalize_series_numba(arr):
    """Z-score ``arr`` with its NaN-aware mean and standard deviation.

    Returns an all-NaN array of the same shape when the standard deviation is zero.
    """
    mu = nanmean(arr)
    sigma = nanstd(arr)
    if sigma != 0:
        return (arr - mu) / sigma
    return np.full(arr.shape, np.nan)

@njit(parallel=True)
def compute_imb3(df_values, comb_indices):
    """Per-row imbalance between z-scored column pairs.

    For each pair of column positions in ``comb_indices``, both columns are
    normalised with ``normalize_series_numba`` and every row receives
    ``(za - zb) / (za + zb)``, or NaN when the denominator is zero.

    Returns a float array of shape ``(rows, number of pairs)``.
    """
    n_rows = df_values.shape[0]
    n_pairs = len(comb_indices)
    out = np.empty((n_rows, n_pairs))

    for p in prange(n_pairs):
        a, b = comb_indices[p]
        z_a = normalize_series_numba(df_values[:, a])
        z_b = normalize_series_numba(df_values[:, b])

        for r in range(n_rows):
            denom = z_a[r] + z_b[r]
            if denom != 0:  # avoid dividing by zero
                out[r, p] = (z_a[r] - z_b[r]) / denom
            else:
                out[r, p] = np.nan

    return out

def calculate_imb3_numba(cols, df):
    """Build a DataFrame of pairwise z-score imbalance features for ``cols``.

    Args:
        cols: list of column names; every 2-combination of them is evaluated.
        df: DataFrame containing those columns.

    Returns:
        DataFrame with one ``{a}_{b}_newimb3`` column per combination,
        carrying a fresh default index (not ``df``'s index).
    """
    pairs = list(combinations(cols, 2))
    # Positions of each pair's columns inside the df[cols] value matrix.
    positions = [tuple(cols.index(name) for name in pair) for pair in pairs]
    values = compute_imb3(df[cols].values, positions)
    names = [f"{a}_{b}_newimb3" for a, b in pairs]
    return pd.DataFrame(values, columns=names)

def calculate_rsi(data, window_size=14):
    """Relative Strength Index of the ``wap`` column.

    Args:
        data: DataFrame with a ``wap`` column, in chronological row order.
        window_size: number of rows in the simple rolling mean of gains and losses.

    Returns:
        Series aligned with ``data``; NaN until a full window is available.
    """
    delta = data['wap'].diff()
    # Split the moves into up and down parts; anything else (NaN included) becomes 0.
    ups = delta.where(delta > 0, 0)
    downs = -delta.where(delta < 0, 0)

    mean_up = ups.rolling(window=window_size).mean()
    mean_down = downs.rolling(window=window_size).mean()

    relative_strength = mean_up / mean_down
    return 100 - (100 / (1 + relative_strength))


def wap_feats(df):
    """Add WAP-based technical indicators to ``df`` in place and return it.

    Columns added: ``vol_st``, ``rolling_vol_di``, ``std_st``, ``wap_pctch``,
    ``short_ema``, ``long_ema``, ``wap_vs_market``, ``macd``,
    ``bollinger_upper``, ``bollinger_lower`` and ``rsi``.

    The per-stock rolling, EWM and RSI series are computed with
    ``GroupBy.transform`` so each value lands on the row it was computed from.
    Taking ``.values`` of a grouped rolling/ewm/apply result yields group order,
    which only matches the frame when it is already sorted by ``stock_id``.

    Args:
        df: DataFrame with ``stock_id``, ``date_id`` and ``wap`` columns.

    Returns:
        The same ``df`` object with the new columns.
    """
    window_size = 7
    short_window = 7
    long_window = 30

    wap_by_stock = df.groupby(['stock_id'])['wap']

    # Row-aligned per-stock series (transform keeps the frame's own index).
    rolling_std = wap_by_stock.transform(lambda s: s.rolling(window=window_size).std())
    rolling_mean_long = wap_by_stock.transform(lambda s: s.rolling(window=long_window).mean())
    short_ema = wap_by_stock.transform(lambda s: s.ewm(span=short_window, adjust=False).mean())
    long_ema = wap_by_stock.transform(lambda s: s.ewm(span=long_window, adjust=False).mean())
    rsi = wap_by_stock.transform(lambda s: calculate_rsi(s.to_frame('wap')))

    # NOTE(review): in the next two features only pct_change is grouped; the
    # rolling std then runs over consecutive rows of the whole frame. Left
    # unchanged -- confirm whether a per-group rolling window was intended.
    df['vol_st'] = df.groupby(['stock_id'])['wap'].pct_change().rolling(window=window_size).std()
    df['rolling_vol_di'] = df.groupby(['date_id'])['wap'].pct_change().rolling(window=window_size).std()
    df['std_st'] = rolling_std
    df['wap_pctch'] = df.groupby(['stock_id','date_id'])['wap'].pct_change().values*100
    df['short_ema'] = short_ema
    df['long_ema'] = long_ema
    # Deviation of wap from the stock's own mean wap.
    df['wap_vs_market'] = df['wap'] - wap_by_stock.transform('mean')
    df['macd'] = df['short_ema'] - df['long_ema']

    # Bollinger bands: long-window mean +/- 2 * short-window std (windows kept as in the original).
    df['bollinger_upper'] = rolling_mean_long + 2 * rolling_std
    df['bollinger_lower'] = rolling_mean_long - 2 * rolling_std
    # RSI computed within each stock.
    df['rsi'] = rsi

    return df

In [None]:
# Read the list of low-importance features; generate_all_features drops these columns.
low_importance_features = pd.read_csv('A:/optiver-trading-at-the-close/lower_than_random_feature_importances.csv')
low_importance_feature_names = low_importance_features['Feature'].tolist()


## Feature groups

In [None]:
def imbalance_features(df):
    """Add order-book imbalance, rolling, lag and diff features to ``df``.

    New columns are written onto ``df`` itself. The returned frame is the
    result of replacing +/-inf with 0, which is a new object.

    Feature families, in insertion order:
      * basic ratios (volume, mid_price, liquidity/matched/size imbalance);
      * pairwise ``(a - b) / (a + b)`` imbalances over prices and over sizes;
      * triplet imbalances over four price columns and over the size columns;
      * momentum / spread / pressure features;
      * row-wise mean/std/skew/kurt over all prices and over all sizes;
      * rolling mean and std within each (stock_id, date_id);
      * shifts and percentage changes, per stock and per (stock, date);
      * differences, per stock and per (stock, date).

    Args:
        df: DataFrame holding the raw price and size columns, ``stock_id``,
            ``date_id`` and ``imbalance_buy_sell_flag``.

    Returns:
        DataFrame with all added features and infinities replaced by 0.
    """
    # Price and size column names used to build the combinations below.
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    # Pairwise imbalances.
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    for c in combinations(sizes, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    # Triplet imbalances. The size triplets are computed once here; the former
    # second loop only recomputed these same columns.
    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values

    # The imb2 / imb3 experiments (calculate_imb2_numba, calculate_imb3_numba)
    # are not part of the feature set.

    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])

    # Row-wise statistical aggregations.
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)

    # Rolling mean / std within each (stock_id, date_id). Each column is listed
    # once; a repeated entry would only recompute identical features.
    rolling_cols = ['matched_size', 'reference_price', "liquidity_imbalance", "all_prices_skew", 'ask_price', 'bid_price', 'ask_size', 'bid_size', 'size_imbalance', "wap", 'market_urgency']
    for col in rolling_cols:
        for window in [2,3,5,10,15,20,25,30,35]:
            col_name = f'{col}_mawithdate_{window}'
            df[col_name] = df.groupby(['stock_id', 'date_id'])[col].rolling(window=window).mean(engine='numba').reset_index(level=['stock_id', 'date_id'], drop=True)

    for col in rolling_cols:
        for window in [3,5,10,15,20,25,30,35]:
            col_name = f'{col}_mstdwithdate_{window}'
            df[col_name] = df.groupby(['stock_id', 'date_id'])[col].rolling(window=window).std(engine='numba').reset_index(level=['stock_id', 'date_id'], drop=True)

    # Shifts and percentage changes, per stock and per (stock, date).
    lag_cols = ['matched_size', 'reference_price', "liquidity_imbalance", "all_prices_skew", 'ask_price', 'bid_price', 'ask_size', 'bid_size', 'size_imbalance', "wap", 'imbalance_buy_sell_flag']
    for col in lag_cols:
        for window in [1,2,3, 5, 10, 15, 20,25,30,35]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)
            df[f"{col}_shiftwithdate_{window}"] = df.groupby(['stock_id','date_id'])[col].shift(window)
            df[f"{col}_retwithdate_{window}"] = df.groupby(['stock_id','date_id'])[col].pct_change(window)

    # Differences, per stock and per (stock, date).
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'market_urgency', 'size_imbalance', "liquidity_imbalance", "all_prices_skew", "wap", 'near_price', 'far_price', 'imbalance_momentum']:
        for window in [1,2,3, 5, 10, 15, 20,25,30,35]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)
            df[f"{col}_diffwithdate_{window}"] = df.groupby(['stock_id','date_id'])[col].diff(window)

    # Ratios with a zero denominator produce +/-inf; map them to 0.
    return df.replace([np.inf, -np.inf], 0)

# Read the list of low-importance features (repeats the read already done in an earlier cell).
low_importance_features = pd.read_csv('A:/optiver-trading-at-the-close/lower_than_random_feature_importances.csv')
low_importance_feature_names = low_importance_features['Feature'].tolist()



def other_features(df):
    """Add time-bucket features and per-stock global statistics to ``df``.

    Adds ``dow`` (``date_id`` modulo 5), ``seconds`` and ``minute`` (derived
    from ``seconds_in_bucket``) and one ``global_<name>`` column for every
    entry of the module-level ``global_stock_id_feats`` mapping, looked up by
    ``stock_id``.

    Returns the same ``df`` object.
    """
    bucket_seconds = df["seconds_in_bucket"]
    df["dow"] = df["date_id"] % 5  # Day of the week
    df["seconds"] = bucket_seconds % 60
    df["minute"] = bucket_seconds // 60
    for stat_name, per_stock in global_stock_id_feats.items():
        lookup = per_stock.to_dict()
        df[f"global_{stat_name}"] = df["stock_id"].map(lookup)

    return df

import numpy as np

def generate_all_features(df):
    """Build the full model feature matrix from a raw data frame.

    Steps: drop the non-feature columns (``row_id``, ``time_id``, ``target``),
    add the imbalance and time/global features, drop every column listed in
    ``low_importance_feature_names`` and finally exclude ``date_id``.

    Args:
        df: raw train or test frame.

    Returns:
        DataFrame holding only the feature columns.
    """
    # Select relevant columns for feature generation
    cols = [c for c in df.columns if c not in ["row_id", "time_id", "target"]]
    df = df[cols]

    # Generate imbalance features, then the time and per-stock global features
    df = imbalance_features(df)
    df = other_features(df)
    gc.collect()

    # Drop the low-importance features; names missing from the frame are ignored
    df.drop(columns=low_importance_feature_names, errors='ignore', inplace=True)
    feature_name = [i for i in df.columns if i not in ["row_id", "target", "time_id", "date_id"]]

    return df[feature_name]

In [None]:
# Run a garbage-collection pass before the data is split in the next cell.
gc.collect()

In [None]:
# Online mode trains on everything; offline mode holds out the rows whose
# date_id is greater than split_day as df_valid.
if not is_offline:
    df_train = df
    print("Online mode")
else:
    df_train = df.loc[df["date_id"].le(split_day)]
    df_valid = df.loc[df["date_id"].gt(split_day)]
    print("Offline mode")
    print(f"train : {df_train.shape}, valid : {df_valid.shape}")

In [9]:
if is_train:
    # Per-stock statistics of the training frame; other_features() maps them
    # onto rows through stock_id.
    _by_stock = df_train.groupby("stock_id")
    global_stock_id_feats = {
        "median_size": _by_stock["bid_size"].median() + _by_stock["ask_size"].median(),
        "std_size": _by_stock["bid_size"].std() + _by_stock["ask_size"].std(),
        "ptp_size": _by_stock["bid_size"].max() - _by_stock["bid_size"].min(),
        "median_price": _by_stock["bid_price"].median() + _by_stock["ask_price"].median(),
        "std_price": _by_stock["bid_price"].std() + _by_stock["ask_price"].std(),
        "ptp_price": _by_stock["bid_price"].max() - _by_stock["ask_price"].min(),
    }
    del _by_stock

    # The training features are built in both modes; only the offline mode
    # also builds (and shrinks) the validation features.
    df_train_feats = generate_all_features(df_train)
    if not is_offline:
        print("Build Online Train Feats Finished.")
    else:
        print("Build Train Feats Finished.")
        df_valid_feats = generate_all_features(df_valid)
        print("Build Valid Feats Finished.")
        df_valid_feats = reduce_mem_usage(df_valid_feats)

    df_train_feats = reduce_mem_usage(df_train_feats)

MemoryError: Unable to allocate 36.1 GiB for an array with shape (925, 5237892) and data type float64

## Because this is a time-series dataset, we cannot partition the data with a random KFold: that would cause data leakage (and the model would have difficulty converging). Instead, we strictly hold back part of the data as a test set that is never used for fitting, and split all *training* data chronologically into (train, valid, test). The valid split is the basis for iterating on train, and the optimal parameters found there are then used on train.

In [None]:
# Fit LightGBM twice: first with early stopping on a time-based hold-out to find
# the number of boosting rounds, then on all of df_train with that number scaled by 1.2.
if is_train:
    feature_name = list(df_train_feats.columns)
    # NOTE(review): LightGBM documents "subsample"/"bagging_fraction",
    # "colsample_bytree"/"feature_fraction" and "n_jobs"/"num_threads" as aliases
    # of each other. The first two pairs are given different values here
    # (0.6 vs 0.9) -- confirm which value is actually applied.
    lgb_params = {
        "objective": "mae",
        "n_estimators": 5500,
        "num_leaves": 256,
        "subsample": 0.6,
        "colsample_bytree": 0.6,
        "learning_rate": 0.00877,
        "n_jobs": 4,
        'seed': 79,
        "device": "gpu",
        "verbosity": -1,
        "importance_type": "gain",
        "max_depth": 12,  # Maximum depth of the tree
        "min_child_samples": 15,  # Minimum number of data points in a leaf
        "reg_alpha": 0.1,  # L1 regularization term
        "reg_lambda": 0.3,  # L2 regularization term
        "min_split_gain": 0.2,  # Minimum loss reduction required for further partitioning
        "min_child_weight": 0.001,  # Minimum sum of instance weight (hessian) in a leaf
        "bagging_fraction": 0.9,  # Fraction of data to be used for training each tree
        "bagging_freq": 5,  # Frequency for bagging
        "feature_fraction": 0.9,  # Fraction of features to be used for training each tree
        "num_threads": 4,  # Number of threads for LightGBM to use
}
    
    print(f"Feature length = {len(feature_name)}")

    # Rows with date_id > split_day - 45 form the early-stopping validation set;
    # all earlier rows are used for fitting. The boolean mask comes from df_train
    # and is applied to df_train_feats, so both frames must share the same index.
    offline_split = df_train['date_id']>(split_day - 45)
    df_offline_train = df_train_feats[~offline_split]
    df_offline_valid = df_train_feats[offline_split]
    df_offline_train_target = df_train['target'][~offline_split]
    df_offline_valid_target = df_train['target'][offline_split]

    print("Valid Model Trainning.")
    lgb_model = lgb.LGBMRegressor(**lgb_params)
    # Stop once the validation metric has not improved for 100 rounds; log every 100 rounds.
    lgb_model.fit(
        df_offline_train[feature_name],
        df_offline_train_target,
        eval_set=[(df_offline_valid[feature_name], df_offline_valid_target)],
        callbacks=[
            lgb.callback.early_stopping(stopping_rounds=100),
            lgb.callback.log_evaluation(period=100),
        ],
    )

    # Release the split frames before the second fit.
    del df_offline_train, df_offline_valid, df_offline_train_target, df_offline_valid_target
    gc.collect()

    # infer
    # Refit on every row of df_train, with no eval set, using 1.2x the best
    # iteration count found above as the fixed number of trees.
    df_train_target = df_train["target"]
    print("Infer Model Trainning.")
    infer_params = lgb_params.copy()
    infer_params["n_estimators"] = int(1.2 * lgb_model.best_iteration_)
    infer_lgb_model = lgb.LGBMRegressor(**infer_params)
    infer_lgb_model.fit(df_train_feats[feature_name], df_train_target)

    if is_offline:   
        # offline predictions
        # df_valid / df_valid_feats only exist in offline mode. The arguments are
        # passed as (predictions, targets); MAE is symmetric, so the score is unaffected.
        df_valid_target = df_valid["target"]
        offline_predictions = infer_lgb_model.predict(df_valid_feats[feature_name])
        offline_score = mean_absolute_error(offline_predictions, df_valid_target)
        print(f"Offline Score {np.round(offline_score, 4)}")

In [10]:
import pickle

# Save the inference model as a .pkl file. Forward slashes are accepted as
# separators on Windows and avoid the invalid "\o" and "\l" escape sequences
# that the backslash form of this path contained.
model_filename = 'A:/optiver-trading-at-the-close/lgb-models-optv2/lgb_model570fe.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(infer_lgb_model, file)

| Version  | Feature Description                           | MAE(t/v) offline   | MAE(t/t) online       |  Inference QPS  | Model              | Lookback |
| :-----:  | :-------------------------------------------: | :----------------: | :-------------------: | :-------------: | :----------------: | :------: |
| V1       | *Baseline* (41 feats)                         | 5.9712(*5.8578*)   | 5.9161(**5.3693**)    |  0.1887s        | LightGBM           |  /       |
| V2       | Add *imbalance* feature (58 feats)            | 5.9589(*5.8465*)   | 5.9055(**5.3700**)    |  0.2351s        | LightGBM           |  /       |
| V3       | Add *global* feature (68 feats)      total 147         | 5.9616(*5.8449*)   | 5.9069(**5.3683**)    |  0.1964s        | LightGBM           |  /       |
| V4       | Add *sliding window* feature (33 feats)   total 184            | 5.94319(*5.8288*)   | 5.88859(**5.3683**)    |  0.1964s        | LightGBM           |  /       |
| V5       | Add *sliding window* feature (150 feats)   total 336            | 5.94257(*5.8293*)   | 5.88859(**5.3683**)    |  0.1964s        | LightGBM           |  /       |
| V6       |   total 336; changed params: LR and no. of estimators          | 5.93954(*5.8247*)   | 5.88498(**5.3683**)    |  0.1964s        | LightGBM           |  /       |
| V7       | changed params: LR and no. of estimators (8500 estimators)         | 5.93905(*5.825*)   | /(**/**)    |  0.1964s        | LightGBM           |  /       |
| V8       | Add *sliding window* feature (180 feats)   total 408 feature. 5500 estimator         | 5.93896(*5.8245*)   | /(**/**)    |  0.1964s        | LightGBM           |  /       |
| V9       | Add *sliding window* feature (120 feats)   total 540 feature.        | 5.93755(*5.8243*)   | 5.88389 (  )  |  /  0.1964s        | LightGBM           |  /       |
| V10       | Add imbalance feature (12feats)   total 546 feature.        | 5.93752(*5.8227*)   | /(**/**)    |  0.1964s        | LightGBM           |  /       |
| V11       | Add imbalance feature (12feats)   total 570 feature.        | 5.93706(*5.8336*)   | /(**/**)    |  0.1964s        | LightGBM           |  /       |
| V12       | Add rolling mean feature  total 616 feature.        | 5.93697(*5.822*)   | /(**/**)    |  0.1964s        | LightGBM           |  /       |
| V13       | Add rolling min max feature  total 696 feature.        | 5.93873(*5.824*)   | /(**/**)    |  0.1964s        | LightGBM           |  /       |
| V14       | Add rolling std feature  total 648 feature.        | 5.93789(*5.824*)   | /(**/**)    |  0.1964s        | LightGBM           |  /       |
| V15       | delete all the feature less important than random number | 5.93739(*5.824*)   | /(**/**)    |  0.1964s        | LightGBM           |  /       |
| V16       | delete all the feature less important than random number and add all the big window 658feature| 5.93789(*5.8219*)   | /(**/**)    |  0.1964s        | LightGBM           |  /       |
| V17       | v7feature total 498feature| 5.93476(*5.8217*)   | /(**/**)    |  0.1964s        | LightGBM           |  /       |
> **MAE(t/v)** means train/valid offline scoring; **MAE(t/t)** means train/test online scoring. **Inference QPS** is the inference time for a single piece of data (including feature engineering and model prediction). **Lookback** is the maximum number of look-back sequences needed by the data and models.


In [1]:
def zero_sum(prices, volumes):
    """Shift ``prices`` so they sum to zero, weighting the shift by sqrt(volume).

    Each element is reduced by ``sqrt(volume) * sum(prices) / sum(sqrt(volumes))``.
    """
    weights = np.sqrt(volumes)
    per_unit_shift = np.sum(prices) / np.sum(weights)
    return prices - weights * per_unit_shift

# Inference loop: for each batch served by the competition API, extend the
# per-stock history cache, rebuild the features, predict, centre and clip.
if is_infer:
    import optiver2023
    env = optiver2023.make_env()
    iter_test = env.iter_test()
    counter = 0
    y_min, y_max = -64, 64
    qps, predictions = [], []
    cache = pd.DataFrame()
    # imbalance_features() uses shift/diff/pct_change lags and rolling windows
    # of up to 35 rows; a lag of 35 needs 36 rows of history per stock. Keeping
    # fewer rows makes every longer-window feature NaN at inference time.
    cache_rows_per_stock = 35 + 1
    for (test, revealed_targets, sample_prediction) in iter_test:
        now_time = time.time()
        cache = pd.concat([cache, test], ignore_index=True, axis=0)
        if counter > 0:
            # Trim the history to what the features need and restore time order.
            cache = cache.groupby(['stock_id']).tail(cache_rows_per_stock).sort_values(by=['date_id', 'seconds_in_bucket', 'stock_id']).reset_index(drop=True)
        # Only the last len(test) rows (the current batch) are scored.
        feat = generate_all_features(cache)[-len(test):]
        lgb_prediction = infer_lgb_model.predict(feat)
        # Alternative post-processing, currently disabled:
        #   lgb_prediction = zero_sum(lgb_prediction, test['bid_size'] + test['ask_size'])
        # Centre the predictions of the batch on zero, then clip them.
        lgb_prediction = lgb_prediction - np.mean(lgb_prediction)
        clipped_predictions = np.clip(lgb_prediction, y_min, y_max)
        sample_prediction['target'] = clipped_predictions
        env.predict(sample_prediction)
        counter += 1
        qps.append(time.time() - now_time)
        if counter % 10 == 0:
            print(counter, 'qps:', np.mean(qps))

    # NOTE(review): 1.146 is an unexplained scale factor that converts the mean
    # seconds per batch into an estimated total run time in hours.
    time_cost = 1.146 * np.mean(qps)
    print(f"The code will take approximately {np.round(time_cost, 4)} hours to reason about")

NameError: name 'is_infer' is not defined