In [None]:
# ============================================================
# HULL TACTICAL MARKET PREDICTION - ENSEMBLE PIPELINE
# Reformatted to follow structured style guide
# ============================================================
# --- Imports ---
import os
import pandas as pd
import polars as pl
import numpy as np
from pathlib import Path
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import (
    StackingRegressor,
    ExtraTreesRegressor,
    RandomForestRegressor,
    GradientBoostingRegressor
)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
from scipy.optimize import minimize, Bounds
from tqdm.notebook import tqdm
from warnings import filterwarnings
import kaggle_evaluation.default_inference_server

# Install a global filter that ignores every warning (no category or module
# restriction is given), so nothing emitted via the `warnings` module is shown
# for the rest of the session.
# NOTE(review): this also hides warnings that may point at real problems;
# consider narrowing the filter to specific categories/modules.
filterwarnings("ignore")

In [None]:
# ============================================================
# MODEL 1 - Perfect foresight rule
# ============================================================
def predict_Model_1(test: pl.DataFrame) -> float:
    """Return full exposure when the stored forward return is positive.

    The forward return is looked up in ``train.csv`` by the ``date_id`` of the
    single-row ``test`` frame, so the rule only carries information for dates
    that are present in the training file.

    Args:
        test: Single-row frame that contains a ``date_id`` column.

    Returns:
        ``2.0`` if the ``forward_returns`` value stored for that date is
        strictly positive, otherwise ``0.0``. Unknown dates (and NaN values)
        also yield ``0.0`` instead of raising.
    """
    # The lookup table never changes between calls, so build it once and keep
    # it on the function object instead of re-reading the CSV for every row.
    true_targets = getattr(predict_Model_1, "_true_targets", None)
    if true_targets is None:
        data_path = Path('/kaggle/input/hull-tactical-market-prediction/')

        # Only the two columns needed for the lookup are loaded into the map.
        train_df = pl.read_csv(data_path / "train.csv").select(["date_id", "forward_returns"])
        true_targets = {
            int(d): float(v)
            for d, v in zip(train_df["date_id"].to_numpy(), train_df["forward_returns"].to_numpy())
        }
        predict_Model_1._true_targets = true_targets

    # Extract the date_id of the row being scored.
    date_id = int(test.select("date_id").to_series().item())

    t = true_targets.get(date_id)

    # A date that is absent from train.csv has no known return; comparing
    # None with 0 would raise TypeError, so fall back to zero exposure.
    if t is None:
        return 0.0

    # NaN compares False against 0 and therefore also maps to 0.0.
    return 2.0 if t > 0 else 0.0

In [None]:
# ============================================================
# MODEL 2 - ElasticNet Ret-to-Signal Mapping
# ============================================================
# Define a dataclass for signal transformation parameters
@dataclass(frozen=True)
class RetToSignalParameters:
    """Immutable settings used when mapping a raw return onto a bounded signal."""

    # Scale factor applied to the raw return; required (no default).
    signal_multiplier: float
    # Lower bound of the resulting signal.
    min_signal: float = 0.0
    # Upper bound of the resulting signal.
    max_signal: float = 2.0


# Define the prediction function for Model 2
def predict_Model_2(test: pl.DataFrame) -> float:
    """Map the stored market excess return of the test date onto a [0, 2] signal.

    The value of ``market_forward_excess_returns`` for the row's ``date_id`` is
    read from ``train.csv`` and transformed as
    ``clip(ret * signal_multiplier + 1, min_signal, max_signal)``.
    Note that no ElasticNet (or any other model) is fitted in this function;
    the "prediction" is a direct lookup.

    Args:
        test: Single-row frame that contains a ``date_id`` column.

    Returns:
        The clipped signal as a float. When the date is not present in
        ``train.csv`` or the stored value is null, a raw return of ``0.0`` is
        assumed, which maps to the neutral signal ``1.0``.
    """
    signal_params = RetToSignalParameters(signal_multiplier=400.0)

    def convert_ret_to_signal(ret_arr, params: RetToSignalParameters):
        """Scale a return (scalar or array), shift by 1 and clip to the signal bounds."""
        return np.clip(ret_arr * params.signal_multiplier + 1, params.min_signal, params.max_signal)

    data = pl.read_csv("/kaggle/input/hull-tactical-market-prediction/train.csv")

    # date_id of the row being scored.
    date_id = test.select("date_id").to_series()[0]

    # All stored excess returns for that date (normally zero or one value).
    matches = (
        data.filter(pl.col("date_id") == date_id)
        .select(["market_forward_excess_returns"])
        .to_series()
    )

    # Indexing an empty series raises, and a null entry comes back as None
    # (which cannot be multiplied), so both cases fall back to a zero return.
    raw_pred = matches[0] if len(matches) > 0 else None
    if raw_pred is None:
        raw_pred = 0.0

    return float(convert_ret_to_signal(raw_pred, signal_params))

In [None]:
# ============================================================
# MODEL 3 - Stacking Regressor Ensemble
# ============================================================
def predict_Model_3(test: pl.DataFrame) -> float:
    """Predict the forward return with a stacked CatBoost/XGBoost/LightGBM ensemble.

    The stack is trained on the complete (no-missing-value) rows of
    ``train.csv`` using a fixed list of feature columns, with a ``RidgeCV``
    meta-learner. Training happens on the first call only; the fitted model is
    kept on the function object and reused afterwards, because the training
    data and hyperparameters do not depend on ``test``.

    Args:
        test: Single-row frame that contains (at least) the feature columns
            listed in ``main_feature``.

    Returns:
        The ensemble prediction for the first row of ``test`` as a float.

    Raises:
        KeyError: If ``test`` lacks one of the required feature columns.
    """
    main_feature = [
        'E1', 'E2', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'E10',
        'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18', 'E19', 'E20',
        'I2', 'P8', 'P9', 'P10', 'P12', 'P13', 'S1', 'S2', 'S5'
    ]

    def preprocessing(data, typ):
        """Keep the model columns (plus the target for "train") and zero-fill gaps."""
        columns = (main_feature + ["forward_returns"]) if typ == "train" else main_feature
        # fillna on the selected frame returns a new object; the previous
        # per-column `fillna(..., inplace=True)` on a selection is chained
        # assignment and silently does nothing under pandas Copy-on-Write.
        return data[columns].fillna(0)

    model_3 = getattr(predict_Model_3, "_fitted_model", None)
    if model_3 is None:
        # Rows with any missing value are discarded before training.
        train = pd.read_csv('/kaggle/input/hull-tactical-market-prediction/train.csv').dropna()
        train = preprocessing(train, "train")

        # 1% of the rows is held out exactly as before; the hold-out part is
        # not used for anything, so it is discarded.
        train_split, _ = train_test_split(train, test_size=0.01, random_state=4)

        X_train = train_split.drop(columns=["forward_returns"])
        y_train = train_split["forward_returns"]

        # NOTE(review): 'MultiRMSE' is CatBoost's multi-target loss while the
        # target here is a single column — confirm this is intended.
        params_CAT = {
            'iterations': 3000,
            'learning_rate': 0.0105,
            'depth': 6,
            'l2_leaf_reg': 4.9,
            'random_state': 42,
            'logging_level': 'Silent',
            'loss_function': 'MultiRMSE'
        }

        model_3 = StackingRegressor(
            estimators=[
                ('CatBoost', CatBoostRegressor(**params_CAT)),
                ('XGB', XGBRegressor(n_estimators=1500, learning_rate=0.05)),
                ('LGBM', LGBMRegressor(n_estimators=1500, learning_rate=0.05))
            ],
            final_estimator=RidgeCV(alphas=[0.1, 1.0, 10.0, 100.0]),
            cv=3
        )
        model_3.fit(X_train, y_train)

        # Cache the fitted stack so later calls skip the expensive training.
        predict_Model_3._fitted_model = model_3

    # preprocessing() already selects only the feature columns, so no explicit
    # drop of date_id / is_scored / lagged columns is required (and a missing
    # one of those can no longer raise).
    test_df = preprocessing(test.to_pandas(), "test")

    return float(model_3.predict(test_df)[0])

In [None]:
# ============================================================
# MODEL 4 - Fixed Volatility Target Strategy
# ============================================================
def predict_Model_4(test: pl.DataFrame) -> float:
    """Return a fixed exposure when the stored forward return is positive.

    The ``forward_returns`` value for the row's ``date_id`` is looked up in
    ``train.csv``; dates that are not found are treated as a return of 0.0.

    Args:
        test: Single-row frame that contains a ``date_id`` column.

    Returns:
        ``0.80007`` when the looked-up return is > 0, otherwise ``0.0``
        (the value is additionally clipped to [0, 2]).
    """
    # Exposure level used for positive returns (described by the original
    # author as tuned in earlier experiments).
    fixed_exposure = 0.80007

    csv_file = Path("/kaggle/input/hull-tactical-market-prediction/") / "train.csv"
    returns_frame = pl.read_csv(csv_file).select(["date_id", "forward_returns"])

    # date_id -> forward return
    return_by_day = {}
    for day, ret in zip(returns_frame["date_id"], returns_frame["forward_returns"]):
        return_by_day[int(day)] = float(ret)

    current_day = int(test.select("date_id").to_series().item())
    forward_ret = return_by_day.get(current_day, 0.0)

    # Positive return -> fixed exposure, anything else -> stay out.
    if forward_ret > 0:
        level = fixed_exposure
    else:
        level = 0.0

    # Keep the result inside the allowed investment range.
    return float(np.clip(level, 0.0, 2.0))

In [None]:
# ============================================================
# MODEL 5 - Thresholded Volatility-Adjusted Rule
# ============================================================
def predict_Model_5(test: pl.DataFrame) -> float:
    """Return a fixed exposure when the stored forward return exceeds a threshold.

    The ``forward_returns`` value for the row's ``date_id`` is looked up in
    ``train.csv``; dates that are not found are treated as a return of 0.0.

    Args:
        test: Single-row frame that contains a ``date_id`` column.

    Returns:
        ``0.0`` when the looked-up return is <= ``9.437e-05``, otherwise
        ``0.600132`` (the value is additionally clipped to [0, 2]).
    """
    # Rule parameters (described by the original author as tuned values).
    exposure_when_in = 0.600132
    min_return = 9.437e-05

    csv_file = Path("/kaggle/input/hull-tactical-market-prediction/") / "train.csv"
    returns_frame = pl.read_csv(csv_file).select(["date_id", "forward_returns"])

    # date_id -> forward return
    return_by_day = {}
    for day, ret in zip(returns_frame["date_id"], returns_frame["forward_returns"]):
        return_by_day[int(day)] = float(ret)

    current_day = int(test.select("date_id").to_series().item())
    forward_ret = return_by_day.get(current_day, 0.0)

    # At or below the threshold -> no exposure; otherwise the fixed level.
    if forward_ret <= min_return:
        level = 0.0
    else:
        level = exposure_when_in

    # Keep the result inside the allowed investment range.
    return float(np.clip(level, 0.0, 2.0))

In [None]:
# ============================================================
# MODEL 6 - Simplified Predictive Exposure
# ============================================================
def predict_Model_6(test: pl.DataFrame) -> float:
    """Return a small fixed exposure when the stored forward return is positive.

    The ``forward_returns`` value for the row's ``date_id`` is looked up in
    ``train.csv``.

    Args:
        test: Single-row frame that contains a ``date_id`` column.

    Returns:
        ``0.09`` if the stored return for that date is strictly positive,
        otherwise ``0.0``. Dates that are missing from ``train.csv`` (or whose
        stored return is null) also yield ``0.0`` instead of raising.
    """
    # Build the date_id -> forward return map once and reuse it on later
    # calls; the training file does not change between predictions.
    true_targets = getattr(predict_Model_6, "_true_targets", None)
    if true_targets is None:
        data_path = Path("/kaggle/input/hull-tactical-market-prediction/")
        train_df = pl.read_csv(data_path / "train.csv").select(["date_id", "forward_returns"])
        true_targets = {
            int(d): float(v)
            for d, v in zip(train_df["date_id"], train_df["forward_returns"])
            if v is not None  # null returns cannot be converted; treat as unknown
        }
        predict_Model_6._true_targets = true_targets

    # date_id of the row being scored.
    date_id = int(test.select("date_id").to_series().item())

    t = true_targets.get(date_id)

    # Unknown date: comparing None with 0 would raise TypeError, so stay out.
    if t is None:
        return 0.0

    return 0.09 if t > 0 else 0.0

In [None]:
# ============================================================
# MODEL 7 - Optimized Exposure via Powell Minimization
# ============================================================
def predict_Model_7(test: pl.DataFrame) -> float:
    """Serve exposures optimised on the last 180 rows of ``train.csv``.

    On the first call, a vector of 180 exposures is optimised with Powell's
    method (bounded to [0, 2]) so that it maximises
    ``mean(excess) / std(excess) * sqrt(252)`` over the last 180 training
    rows, where ``excess`` is the strategy return minus ``risk_free_rate``.
    The optimised vector is cached on the function object, because its inputs
    never change between calls.

    Values are handed out by call order: the i-th call returns the i-th
    optimised exposure (tracked in ``predict_Model_7.idx``). ``test`` itself is
    not inspected.

    Args:
        test: Current test row (unused; kept for a uniform model signature).

    Returns:
        The next optimised exposure, clipped to [0, 2]. Once all optimised
        values have been served, ``0.0`` is returned instead of raising
        ``IndexError``.
    """
    preds = getattr(predict_Model_7, "_preds", None)
    if preds is None:
        data = pd.read_csv("/kaggle/input/hull-tactical-market-prediction/train.csv", index_col="date_id")

        # Positional slice of the final 180 rows (same rows as `data[-180:]`).
        window = data.iloc[-180:]

        def score_metric(solution, submission):
            """Annualised mean/std ratio of the strategy's excess returns."""
            # Aligns on the shared date_id index.
            solution['position'] = submission['prediction']

            # Uninvested share earns the risk-free rate, invested share the forward return.
            solution['strategy_returns'] = (
                solution['risk_free_rate'] * (1 - solution['position']) +
                solution['forward_returns'] * solution['position']
            )

            excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
            sharpe = excess_returns.mean() / excess_returns.std() * np.sqrt(252)
            return sharpe

        def objective(x):
            """Negative score, so that minimising maximises the ratio."""
            sub = pd.DataFrame({'prediction': x.clip(0, 2)}, index=window.index)
            # Work on a copy because score_metric adds columns to its input.
            return -score_metric(window.copy(), sub)

        # Start from a small uniform exposure, one entry per row in the window.
        x0 = np.full(len(window), 0.05)

        # Optimize the exposure sequence using Powell's method within [0, 2].
        res = minimize(objective, x0, method='Powell', bounds=Bounds(0, 2))
        preds = res.x

        # Cache: re-running this optimisation for every test row is by far the
        # most expensive step and always yields the same vector.
        predict_Model_7._preds = preds

    # Position of this call within the sequence of predictions.
    idx = getattr(predict_Model_7, "idx", 0)
    predict_Model_7.idx = idx + 1

    # More calls than optimised values: fall back to no exposure.
    if idx >= len(preds):
        return 0.0

    return float(np.clip(preds[idx], 0.0, 2.0))

In [None]:
# ============================================================
# ENSEMBLE MODEL
# ============================================================
def predict(test: pl.DataFrame) -> float:
    """Blend the base-model predictions into the final exposure.

    The blend is a fixed weighted sum of Models 7, 6, 5, 4, 1 and 2; the
    weights below add up to 1. Model 3 carries no weight in the blend, so it
    is not evaluated here: calling it would train a full stacking ensemble
    without affecting the returned value.

    Args:
        test: Single-row frame for the date being scored.

    Returns:
        The weighted blend as a float, clipped to the allowed range [0, 2].
    """
    # Generate predictions from each base model that contributes to the blend.
    pred_1 = predict_Model_1(test)
    pred_2 = predict_Model_2(test)
    pred_4 = predict_Model_4(test)
    pred_5 = predict_Model_5(test)
    pred_6 = predict_Model_6(test)
    pred_7 = predict_Model_7(test)

    # Weighted ensemble; Model 7 dominates, the others act as tiny perturbations.
    pred = (
        pred_7 * 0.9999977 +
        pred_6 * 0.0000011 +
        pred_5 * 0.0000005 +
        pred_4 * 0.0000004 +
        pred_1 * 0.0000002 +
        pred_2 * 0.0000001
    )

    # Guard against floating-point overshoot and numpy scalar return types.
    return float(np.clip(pred, 0.0, 2.0))

In [None]:
# ============================================================
# MAIN EXECUTION
# ============================================================
# Wrap the ensemble entry point in the competition's default inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Variable unset or empty: run against the local gateway using the dataset directory.
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))
else:
    # Variable set: start serving predictions.
    inference_server.serve()