In [1]:

import os
import numpy as np
import pandas as pd
import glob
from joblib import Parallel, delayed
import gc
from scipy.stats import kurtosis
import category_encoders as ce
from sklearn.model_selection import GroupKFold
from sklearn.metrics import make_scorer
import optuna
import lightgbm as lgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import StackingRegressor, VotingRegressor
from sklearn.linear_model import Ridge
import matplotlib.pyplot as plt

DATA_DIR = r'/Users/ahmadashfaq/Stock-Volatility-Prediction-/optiver-realized-volatility-prediction'
df_train_raw = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test_raw = pd.read_csv(f'{DATA_DIR}/test.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

# Feature Engineering Functions (Enhanced from both models)
def calc_wap1(df):
    return (df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']) / (df['bid_size1'] + df['ask_size1'])

def calc_wap2(df):
    return (df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']) / (df['bid_size2'] + df['ask_size2'])

def log_return(series):
    return np.log(series).diff().fillna(0)

def realized_volatility(series):
    return np.sqrt(np.sum(series**2))

def book_preprocess(file_path):
    df = pd.read_parquet(file_path)
    
    # WAP and Log Returns (from both models)
    df['wap1'] = calc_wap1(df)
    df['wap2'] = calc_wap2(df)
    df['log_ret1'] = df.groupby('time_id')['wap1'].transform(log_return)
    df['log_ret2'] = df.groupby('time_id')['wap2'].transform(log_return)
    
    # Volatility Features
    df['volatility'] = df.groupby('time_id')['log_ret1'].transform(realized_volatility)
    df['volatility_5min'] = df[df['seconds_in_bucket'] <= 300].groupby('time_id')['log_ret1'].transform(realized_volatility)
    df['wap_balance'] = df['wap1'] - df['wap2']  # From previous model
    
    # Order Book Features
    df['spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['imbalance'] = (df['bid_size1'] - df['ask_size1']) / (df['bid_size1'] + df['ask_size1'])
    df['total_volume'] = df['ask_size1'] + df['ask_size2'] + df['bid_size1'] + df['bid_size2']  # From previous model
    
    # Time-based Features
    df['time_norm'] = df['seconds_in_bucket'] / 600
    
    # Aggregations (enhanced)
    agg_dict = {
        'wap1': ['mean', 'std', 'skew'],
        'wap2': ['mean', 'std'],
        'log_ret1': ['std', 'skew', realized_volatility],
        'log_ret2': ['std'],
        'spread': ['mean', 'std', 'max'],
        'imbalance': ['mean', 'std'],
        'volatility': ['mean'],
        'volatility_5min': ['mean'],
        'bid_size1': ['sum'],
        'ask_size1': ['sum'],
        'wap_balance': ['mean', 'std'],
        'total_volume': ['mean', 'std'],
        'time_norm': ['mean', 'std']
    }
    df_agg = df.groupby('time_id').agg(agg_dict).reset_index()
    df_agg.columns = ['time_id'] + ['_'.join(col) if isinstance(col, tuple) else col for col in df_agg.columns[1:]]
    
    stock_id = file_path.split('=')[1]
    df_agg['row_id'] = str(stock_id) + '-' + df_agg['time_id'].astype(str)
    df_agg.drop('time_id', axis=1, inplace=True)
    return df_agg

def trade_preprocess(file_path):
    df = pd.read_parquet(file_path)
    df['log_ret'] = df.groupby('time_id')['price'].transform(log_return)
    df['trade_intensity'] = df['size'] / df['order_count']  # From previous enhancement
    
    agg_dict = {
        'log_ret': ['std', 'mean'],
        'size': ['sum', 'mean'],
        'order_count': ['sum'],
        'trade_intensity': ['mean']
    }
    df_agg = df.groupby('time_id').agg(agg_dict).reset_index()
    df_agg.columns = ['time_id'] + ['_'.join(col) if isinstance(col, tuple) else col for col in df_agg.columns[1:]]
    
    stock_id = file_path.split('=')[1]
    df_agg['row_id'] = str(stock_id) + '-' + df_agg['time_id'].astype(str)
    df_agg.drop('time_id', axis=1, inplace=True)
    return df_agg

def preprocessor(stock_id_list, train_mode=True):
    df = Parallel(n_jobs=-1, verbose=1)(
        delayed(lambda x: pd.merge(book_preprocess(f'{DATA_DIR}/{"book_train" if train_mode else "book_test"}.parquet/stock_id={x}'),
                                 trade_preprocess(f'{DATA_DIR}/{"trade_train" if train_mode else "trade_test"}.parquet/stock_id={x}'),
                                 on='row_id', how='left'))(stock_id) 
        for stock_id in stock_id_list
    )
    return pd.concat(df, ignore_index=True)

# Data Processing
train_stock_id_list = df_train_raw['stock_id'].unique()
df_train = preprocessor(train_stock_id_list, train_mode=True)
df_train = pd.merge(df_train, 
                   df_train_raw.assign(row_id=lambda x: x['stock_id'].astype(str) + '-' + x['time_id'].astype(str))[['row_id', 'target']],
                   on='row_id', how='right')
df_train['stock_id'] = df_train['row_id'].str.split('-').str[0].astype('category')
df_train['time_id'] = df_train['row_id'].str.split('-').str[1].astype(int)

test_stock_id_list = df_test_raw['stock_id'].unique()
df_test = preprocessor(test_stock_id_list, train_mode=False)
df_test = pd.merge(df_test, 
                  df_test_raw.assign(row_id=lambda x: x['stock_id'].astype(str) + '-' + x['time_id'].astype(str))[['row_id']],
                  on='row_id', how='right')
df_test['stock_id'] = df_test['row_id'].str.split('-').str[0].astype('category')
df_test['time_id'] = df_test['row_id'].str.split('-').str[1].astype(int)

# Data Preparation (No explicit imputation - let LightGBM handle NaNs)
X_train = df_train.drop(columns=['target', 'row_id'])
Y_train = df_train['target']
X_test = df_test.drop(columns=['row_id'])

t_encoder = ce.TargetEncoder(smoothing=10)
X_train['stock_id'] = t_encoder.fit_transform(X_train['stock_id'], Y_train)
X_test['stock_id'] = t_encoder.transform(X_test['stock_id'])

# RMSPE Metric
def rmspe(y_true, y_pred):
    return np.sqrt(np.mean(np.square((y_true - y_pred) / y_true)))

# Training Process Callback for LightGBM
def training_callback():
    train_rmspe_history = []
    val_rmspe_history = []
    
    def callback(env):
        if env.iteration % 100 == 0:  # Record every 100 iterations
            train_pred = env.model.predict(env.data[0])  # Training data
            val_pred = env.model.predict(env.data[1])    # Validation data
            train_rmspe = rmspe(Y_train.iloc[env.data[0].index], train_pred)
            val_rmspe = rmspe(Y_train.iloc[env.data[1].index], val_pred)
            train_rmspe_history.append(train_rmspe)
            val_rmspe_history.append(val_rmspe)
            print(f"Iteration {env.iteration}: Train RMSPE = {train_rmspe:.4f}, Val RMSPE = {val_rmspe:.4f}")
    
    return callback, train_rmspe_history, val_rmspe_history

# Model Definitions (Incorporating previous tuning)
lgb_params = {
    'objective': 'regression',
    'learning_rate': 0.005,  # From previous model
    'num_leaves': 80,
    'max_depth': 6,
    'reg_alpha': 0.01,
    'reg_lambda': 0.01,
    'colsample_bytree': 0.1,
    'subsample': 0.01,
    'n_estimators': 1500,
    'random_state': 2025
}

cat_params = {
    'iterations': 1500,
    'learning_rate': 0.005,
    'depth': 8,
    'l2_leaf_reg': 3.0,
    'bagging_temperature': 0.2,
    'random_seed': 2025,
    'verbose': 100
}

# Enhanced Model with Training Monitoring
def train_and_predict(X_train, X_val, y_train, y_val, X_test):
    # Split for validation
    train_idx = X_train.index
    val_idx = X_val.index
    
    # LightGBM with callback
    callback, train_rmspe_history, val_rmspe_history = training_callback()
    lgb_model = LGBMRegressor(**lgb_params)
    lgb_model.fit(
        X_train.drop(columns=['time_id']), y_train,
        eval_set=[(X_train.drop(columns=['time_id']), y_train), (X_val.drop(columns=['time_id']), y_val)],
        eval_metric='rmse',
        callbacks=[callback]
    )
    
    # CatBoost
    cat_model = CatBoostRegressor(**cat_params)
    cat_model.fit(X_train.drop(columns=['time_id']), y_train, verbose=100)
    
    # MLP
    mlp_model = MLPRegressor(hidden_layer_sizes=(256, 128, 64), early_stopping=True, random_state=42, max_iter=500)
    mlp_model.fit(X_train.drop(columns=['time_id']), y_train)
    
    # Ensemble (Combining Voting from previous model with Stacking)
    voting_model = VotingRegressor([
        ('lgb', lgb_model),
        ('cat', cat_model),
        ('mlp', mlp_model)
    ], weights=[0.4, 0.35, 0.25])  # Adjusted weights from previous model
    
    final_model = StackingRegressor(
        estimators=[
            ('voting', voting_model)
        ],
        final_estimator=Ridge(alpha=1.0)
    )
    
    final_model.fit(X_train.drop(columns=['time_id']), y_train)
    
    # Predictions
    val_preds = final_model.predict(X_val.drop(columns=['time_id']))
    test_preds = final_model.predict(X_test.drop(columns=['time_id']))
    test_preds = np.clip(test_preds, Y_train.min(), Y_train.max())
    
    # Plot training process
    plt.figure(figsize=(10, 6))
    plt.plot(range(0, len(train_rmspe_history) * 100, 100), train_rmspe_history, label='Train RMSPE')
    plt.plot(range(0, len(val_rmspe_history) * 100, 100), val_rmspe_history, label='Validation RMSPE')
    plt.xlabel('Iteration')
    plt.ylabel('RMSPE')
    plt.title('Training Process: RMSPE over Iterations (LightGBM)')
    plt.legend()
    plt.grid(True)
    plt.show()
    
    return final_model, val_preds, test_preds

# Temporal Validation
def temporal_validation(X, y):
    cv = GroupKFold(n_splits=5)
    group_ids = X['time_id']
    all_val_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups=group_ids)):
        X_train_fold, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val = y.iloc[train_idx], y.iloc[val_idx]
        model, val_preds, _ = train_and_predict(X_train_fold, X_val, y_train_fold, y_val, X_test)
        fold_score = rmspe(y_val, val_preds)
        all_val_scores.append(fold_score)
        print(f'Fold {fold+1} RMSPE: {fold_score:.4f}')
    
    overall_rmspe = np.mean(all_val_scores)
    print(f'Overall RMSPE: {overall_rmspe:.4f}')
    return overall_rmspe

# Main Execution
def main():
    X = X_train.copy()
    y = Y_train.copy()
    rmspe_score = temporal_validation(X, y)
    
    final_model, _, test_preds = train_and_predict(X, X, y, y, X_test)
    submission = pd.DataFrame({
        'row_id': sample_submission['row_id'],
        'target': test_preds
    })
    submission.to_csv('submission.csv', index=False)
    print(f"Submission saved with RMSPE: {rmspe_score:.4f}")

if __name__ == "__main__":
    main()

  from .autonotebook import tqdm as notebook_tqdm
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   21.4s
[Parallel(n_jobs=-1)]: Done 112 out of 112 | elapsed:   59.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001182 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7763
[LightGBM] [Info] Number of data points in the train set: 343145, number of used features: 31
[LightGBM] [Info] Start training from score 0.003863


AttributeError: 'CallbackEnv' object has no attribute 'data'