In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_path = os.path.join(dirname, filename)
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# ============================================================
# Hull Tactical Market Prediction - GPU Ensemble Enhanced
# ============================================================

import os
import time
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr
import optuna
import warnings
warnings.filterwarnings('ignore')

# ============================================================
# 1. Global Variables
# ============================================================
# Name of the column used as the regression target (read as `y` when the training set is built).
TARGET = 'market_forward_excess_returns'
# Training rows whose date_id is below this value are discarded before feature engineering.
MODERN_ERA_START = 1000
# Base columns that receive lag and rolling-window features in preprocess().
FEATURES_TO_ENGINEER = ['I1', 'P10', 'S1', 'V1']
# Row offsets passed to Series.shift() to create the lagged copies of each engineered column.
LAGS = [1, 3, 5]
# Window lengths (in rows) for the rolling mean / rolling standard deviation features.
ROLL_WINDOWS = [10, 20, 50]

# ============================================================
# 2. Feature Engineering
# ============================================================
def preprocess(df: pd.DataFrame, features=None, lags=None, roll_windows=None) -> pd.DataFrame:
    """Fill gaps and append lag / rolling-window features.

    The input frame is not modified; a new frame is returned.

    Parameters
    ----------
    df : pd.DataFrame
        Rows in the order over which lags and rolling windows should be computed.
    features : sequence of str, optional
        Columns to derive features from. Defaults to ``FEATURES_TO_ENGINEER``.
    lags : sequence of int, optional
        Row offsets for the shifted copies. Defaults to ``LAGS``.
    roll_windows : sequence of int, optional
        Rolling window lengths in rows. Defaults to ``ROLL_WINDOWS``.

    Returns
    -------
    pd.DataFrame
        The gap-filled frame plus, for every feature column ``c``, the columns
        ``c_lag_<n>``, ``c_roll_mean_<w>`` and ``c_roll_std_<w>``. Any value
        still missing afterwards is replaced by 0.

    Raises
    ------
    KeyError
        If a requested feature column is absent from ``df``.
    """
    feature_cols = FEATURES_TO_ENGINEER if features is None else features
    lag_steps = LAGS if lags is None else lags
    windows = ROLL_WINDOWS if roll_windows is None else roll_windows

    # ffill/bfill return a new frame, so the caller's object is left untouched.
    # NOTE(review): bfill fills leading gaps from later rows; on time-ordered
    # data that is a look-ahead -- confirm this is acceptable.
    df = df.ffill().bfill()

    for col in feature_cols:
        base = df[col]
        for lag in lag_steps:
            df[f'{col}_lag_{lag}'] = base.shift(lag)
        for window in windows:
            # min_periods=1 lets the first rows use a shorter window instead of
            # producing NaN for the mean (the std of a single value is still NaN).
            rolling = base.rolling(window=window, min_periods=1)
            df[f'{col}_roll_mean_{window}'] = rolling.mean()
            df[f'{col}_roll_std_{window}'] = rolling.std()

    # Shifted rows and single-observation std values are NaN here; zero them out.
    return df.fillna(0)

# ============================================================
# 3. Load & Prepare Data
# ============================================================
print("üìÇ Loading training data...")
df_train = pd.read_csv("/kaggle/input/hull-tactical-market-prediction/train.csv")
train_modern = df_train[df_train['date_id'] >= MODERN_ERA_START].copy()
train_processed = preprocess(train_modern)

features_to_drop = ['forward_returns', 'risk_free_rate', 'date_id', TARGET]
model_features = [col for col in train_processed.columns if col not in features_to_drop]

X = train_processed[model_features]
y = train_processed[TARGET]
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.15, random_state=42, shuffle=True)

# ============================================================
# 4. Optuna Hyperparameter Tuning (LightGBM)
# ============================================================
def objective_lgb(trial):
    """Optuna objective: fit a LightGBM regressor and score it on the hold-out set.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Negated Spearman correlation between ``y_valid`` and the predictions
        (the study minimizes, so a higher correlation gives a lower value).
        Returns 0.0 when the correlation is undefined (NaN).
    """
    params = {
        "objective": "regression",
        "metric": "rmse",
        "device": "gpu",
        "n_estimators": 1000,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "num_leaves": trial.suggest_int("num_leaves", 31, 127),
        "subsample": trial.suggest_float("subsample", 0.6, 0.9),
        # LightGBM only applies `subsample` when bagging is enabled, i.e. when
        # subsample_freq > 0. Sampling it here also puts it into
        # study.best_params, so a model built from those params keeps bagging on.
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 5),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 0.9),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 0.3),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 0.3),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 100)
    }

    model = lgb.LGBMRegressor(**params)
    # Early stopping monitors RMSE on the validation set and halts after
    # 100 rounds without improvement.
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
              eval_metric="rmse",
              callbacks=[lgb.early_stopping(stopping_rounds=100)])
    preds = model.predict(X_valid)
    corr, _ = spearmanr(y_valid, preds)
    # spearmanr yields NaN for constant predictions; score that as "no
    # correlation" instead of handing NaN back to Optuna.
    if np.isnan(corr):
        return 0.0
    return -corr

print("üéØ Running Optuna tuning for LightGBM...")
study_lgb = optuna.create_study(direction="minimize")
study_lgb.optimize(objective_lgb, n_trials=25, show_progress_bar=True)
best_lgb_params = study_lgb.best_params
print("‚úÖ Best LightGBM params:", best_lgb_params)

# ============================================================
# 5. Train Final LightGBM
# ============================================================
# Refit on the full training set (X, y) with the tuned hyperparameters,
# a fixed budget of 1500 trees and GPU training.
final_lgb = lgb.LGBMRegressor(
    n_estimators=1500,
    device="gpu",
    **best_lgb_params,
)
final_lgb.fit(X, y)
print("‚úÖ LightGBM trained.")

# ============================================================
# 6. Train XGBoost (GPU)
# ============================================================
print("‚ö° Training XGBoost on GPU...")
# XGBoost 2.0 deprecated tree_method="gpu_hist" in favour of tree_method="hist"
# combined with device="cuda". Pick the spelling that matches the installed
# version so the model trains on the GPU either way.
if int(xgb.__version__.split(".")[0]) >= 2:
    xgb_gpu_params = {"tree_method": "hist", "device": "cuda"}
else:
    xgb_gpu_params = {"tree_method": "gpu_hist"}

# Fixed (untuned) hyperparameters for the XGBoost member of the ensemble.
params_xgb = {
    **xgb_gpu_params,
    "n_estimators": 1200,
    "learning_rate": 0.03,
    "max_depth": 7,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "reg_alpha": 0.1,
    "reg_lambda": 0.1,
    "random_state": 42
}
model_xgb = xgb.XGBRegressor(**params_xgb)
# Trained on the full training set (X, y); no evaluation set is supplied.
model_xgb.fit(X, y, verbose=False)
print("‚úÖ XGBoost trained.")

# ============================================================
# 7. Train CatBoost (GPU) with Fixed Bootstrap
# ============================================================
print("ü¶ã Training CatBoost on GPU...")
best_cat_params = {
    "iterations": 1200,
    "learning_rate": 0.03,
    "depth": 7,
    "l2_leaf_reg": 3,
    "bootstrap_type": "Poisson",   # ‚úÖ Fixes subsample issue
    "task_type": "GPU",
    "devices": "0:1",
    "verbose": 200
}
model_cat = CatBoostRegressor(**best_cat_params)
model_cat.fit(X, y)
print("‚úÖ CatBoost trained.")

# ============================================================
# 8. Ensemble Predictions
# ============================================================
def ensemble_predict(df, already_processed=False):
    """Blend LightGBM, XGBoost and CatBoost predictions for the rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows (feature engineering is applied here), or, when
        ``already_processed`` is true, rows that already contain every column
        listed in ``model_features``.
    already_processed : bool, default False
        When true, ``preprocess`` is skipped. Use this for frames sliced from
        ``train_processed``: running ``preprocess`` on them again would
        recompute the lag/rolling columns over the slice's own row order.

    Returns
    -------
    Array-like of blended predictions, one value per input row, weighted
    0.45 (LightGBM) / 0.35 (XGBoost) / 0.20 (CatBoost).
    """
    df_proc = df if already_processed else preprocess(df)
    X_test = df_proc[model_features].fillna(0)

    preds_lgb = final_lgb.predict(X_test)
    preds_xgb = model_xgb.predict(X_test)
    preds_cat = model_cat.predict(X_test)

    final_pred = (0.45 * preds_lgb) + (0.35 * preds_xgb) + (0.20 * preds_cat)
    return final_pred

# ============================================================
# 9. Local Validation
# ============================================================
print("üìà Evaluating ensemble locally...")
# X_valid was taken from train_processed, so it already holds the engineered
# features and must not be passed through preprocess() a second time.
# NOTE: the three final models were fit on all of X, which includes X_valid,
# so this number is an in-sample sanity check rather than an out-of-sample
# performance estimate.
valid_preds = ensemble_predict(X_valid, already_processed=True)
corr, _ = spearmanr(y_valid, valid_preds)
print(f"üìä Local validation Spearman correlation: {corr:.4f}")

# ============================================================
# 10. Inference Server (for Kaggle)
# ============================================================
import polars as pl
import kaggle_evaluation.default_inference_server as kes

def predict(test_df: pl.DataFrame) -> float:
    """Turn the ensemble's prediction for the last row of ``test_df`` into an allocation.

    Returns 2.0 when the blended prediction for the final row is positive and
    0.0 otherwise. Any exception raised along the way is printed and 0.0 is
    returned, so a failure never propagates to the caller.

    NOTE(review): ensemble_predict() runs preprocess() on this batch alone, so
    the lag and rolling features only see the rows contained in ``test_df`` --
    confirm each batch carries enough history for them to be meaningful.
    """
    try:
        scores = ensemble_predict(test_df.to_pandas())
        latest_is_positive = scores[-1] > 0
        return 2.0 if latest_is_positive else 0.0
    except Exception as e:
        print(f"‚ö†Ô∏è Prediction error: {e}")
        return 0.0

print("üîß Starting inference server...")
inference_server = kes.DefaultInferenceServer(predict)

if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.serve()
else:
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))

print("‚úÖ Final GPU Ensemble Submission Ready!")