In [None]:
# =============================================================================
# 1. IMPORTS
# =============================================================================
import os
import time
from pathlib import Path
from dataclasses import dataclass, field

import numpy as np
import polars as pl
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
import kaggle_evaluation.default_inference_server

In [None]:
# =============================================================================
# 2. CONFIGURATION
# Centralizes all tunable parameters for easy experimentation.
# =============================================================================
@dataclass
class ModelConfig:
    """Single place for every tunable knob: model hyperparameters, allocation strategy, retraining."""

    # ---- ElasticNet ----
    enet_alpha: float = 0.001
    enet_l1_ratio: float = 0.5

    # ---- XGBoost ----
    xgb_n_estimators: int = 500
    xgb_max_depth: int = 8
    xgb_learning_rate: float = 0.04
    xgb_subsample: float = 0.8
    xgb_colsample_bytree: float = 0.8

    # ---- LightGBM ----
    lgb_n_estimators: int = 500
    lgb_max_depth: int = 8
    lgb_learning_rate: float = 0.04
    lgb_num_leaves: int = 20
    lgb_subsample: float = 0.8
    lgb_colsample_bytree: float = 0.8

    # ---- Allocation strategy ----
    # Per-model weight applied to each prediction when forming the ensemble.
    # Built through a factory so every config instance owns its own dict.
    ensemble_weights: dict = field(default_factory=lambda: dict(enet=0.1, xgb=0.5, lgb=0.4))
    # Number of most recent target values used for the volatility estimate.
    vol_window: int = 20
    # Factor that turns the raw ensemble prediction into a signal.
    signal_multiplier: float = 500.0
    # Lower allocation bound (0.0 means no shorting).
    min_signal: float = 0.0
    # Upper allocation bound (2.0 means up to 2x leverage).
    max_signal: float = 2.0
    # Multiplies the volatility estimate before it divides the signal.
    vol_scaling: float = 1.0
    # Weight of the new allocation when blending with the previous one
    # (0.8 * new + 0.2 * old) to reduce turnover.
    smoothing_factor: float = 0.8

    # ---- Training control ----
    # Retrain the models once every `retrain_freq` test timesteps.
    retrain_freq: int = 20
    # Size of the rolling window of most recent rows kept for training.
    max_train_rows: int = 1000

In [None]:
# =============================================================================
# 3. MODEL CLASS
# Encapsulates the entire modeling pipeline, including state management.
# =============================================================================
class Model:
    """
    Ensemble regressor with online updating that turns each test row into an allocation.

    Holds three regressors (ElasticNet, XGBoost, LightGBM), one shared StandardScaler
    and a rolling window of raw training rows. After the first `predict` call, every
    further call appends the previous test row (labelled with the lagged return found
    in the current row) to that window, which is capped at `config.max_train_rows`.
    """

    # Inputs of `_create_features`. They live at class level so that `predict` can
    # size its history buffer from the same numbers rather than from an unrelated
    # config value.
    ROLLING_FEATURES = ('V1', 'S1', 'P1', 'E1', 'M11')  # Key features for time-series analysis
    ROLLING_WINDOWS = (5, 10, 20)                       # Short, medium, and long-term windows
    EWM_SPANS = (5, 10, 20)                             # Spans for exponentially weighted means

    def __init__(self, config: ModelConfig):
        """
        Constructor for the Model class.
        Initializes models, scaler, and state-tracking variables.

        Args:
            config: Hyperparameters and strategy settings used by every method.
        """
        self.config = config
        self.scaler = StandardScaler()
        self.models = self._init_models()

        # State variables to be updated during online inference
        self.training_data = None      # Stores the raw training data (features + target)
        self.median_values = {}        # Caches median values for imputation
        self.feature_cols = []         # List of feature names used for training
        self.last_allocation = 0.0     # Stores the allocation from the previous timestep
        self.test_row_count = 0        # Counter for incoming test rows
        self.previous_test_df = None   # Stores the previous test row for online learning

        print("Model initialized.")

    def _init_models(self):
        """
        Initializes all model instances with hyperparameters from the config.

        Returns:
            Dict mapping the ensemble keys 'enet', 'xgb' and 'lgb' to unfitted estimators.
        """
        cfg = self.config
        return {
            'enet': ElasticNet(
                alpha=cfg.enet_alpha,
                l1_ratio=cfg.enet_l1_ratio,
                max_iter=10000,
            ),
            'xgb': xgb.XGBRegressor(
                objective='reg:squarederror',
                n_estimators=cfg.xgb_n_estimators,
                max_depth=cfg.xgb_max_depth,
                learning_rate=cfg.xgb_learning_rate,
                subsample=cfg.xgb_subsample,
                colsample_bytree=cfg.xgb_colsample_bytree,
                random_state=42,
                n_jobs=-1,
            ),
            'lgb': lgb.LGBMRegressor(
                objective='regression_l1',
                n_estimators=cfg.lgb_n_estimators,
                max_depth=cfg.lgb_max_depth,
                learning_rate=cfg.lgb_learning_rate,
                num_leaves=cfg.lgb_num_leaves,
                subsample=cfg.lgb_subsample,
                # LightGBM only applies `subsample` (bagging_fraction) when the bagging
                # frequency is non-zero; without this the configured value is ignored.
                subsample_freq=1,
                colsample_bytree=cfg.lgb_colsample_bytree,
                random_state=42,
                n_jobs=-1,
            ),
        }

    @staticmethod
    def _align_to_schema(df: pl.DataFrame, reference: pl.DataFrame) -> pl.DataFrame:
        """Selects `reference`'s columns from `df`, in order, cast to `reference`'s dtypes."""
        return df.select([
            pl.col(name).cast(dtype, strict=False)
            for name, dtype in reference.schema.items()
        ])

    def _create_features(self, df: pl.DataFrame) -> pl.DataFrame:
        """
        Generates time-series features for the input dataframe.
        - Rolling Mean / Std: Captures recent trends and volatility.
        - EWMA: Gives more weight to recent data, adapting faster to changes.

        Columns listed in `ROLLING_FEATURES` that are absent from `df` are skipped.
        Returns `df` unchanged when none of them is present.
        """
        rolling_exprs = []

        for col_name in self.ROLLING_FEATURES:
            if col_name not in df.columns:
                continue
            for w in self.ROLLING_WINDOWS:
                rolling_exprs.append(pl.col(col_name).rolling_mean(w).alias(f'{col_name}_roll_mean_{w}'))
                rolling_exprs.append(pl.col(col_name).rolling_std(w).alias(f'{col_name}_roll_std_{w}'))
            for s in self.EWM_SPANS:
                rolling_exprs.append(pl.col(col_name).ewm_mean(span=s).alias(f'{col_name}_ewm_mean_{s}'))

        return df.with_columns(rolling_exprs) if rolling_exprs else df

    def _run_training_cycle(self):
        """
        Runs a complete feature engineering and model training cycle on `self.training_data`.
        This is a reusable function for both initial training and periodic retraining.

        Refreshes `self.feature_cols`, `self.median_values`, the scaler and every model.
        """
        print("Running a full training cycle...")

        # Step 1: Generate features from the raw training data
        processed_data = self._create_features(self.training_data)

        # Rows whose target is null/NaN/inf cannot be fitted. They are removed only
        # after feature generation so they still contribute to the rolling statistics.
        processed_data = processed_data.filter(pl.col('target').is_finite())

        # Step 2: Identify feature columns and cache median values for imputation
        self.feature_cols = [col for col in processed_data.columns if col not in ('date_id', 'target')]
        self.median_values = {
            col: processed_data[col].median()
            for col in self.feature_cols
            if processed_data[col].is_not_null().any()
        }

        # Step 3: Impute missing values (columns without any value fall back to 0.0)
        impute_exprs = [pl.col(c).fill_null(self.median_values.get(c, 0.0)) for c in self.feature_cols]
        processed_data = processed_data.with_columns(impute_exprs)

        # Step 4: Prepare data for Scikit-learn models (convert to Pandas, scale)
        # Final fill_null in Polars before converting to Pandas to prevent errors
        X_train_pd = processed_data.select(self.feature_cols).fill_null(0.0).to_pandas()
        y_train_pd = processed_data['target'].to_pandas()
        X_train_scaled = self.scaler.fit_transform(X_train_pd)

        # Step 5: Train each model in the ensemble
        for model in self.models.values():
            model.fit(X_train_scaled, y_train_pd)
        print("Training cycle completed.")

    def train(self, train_df: pl.DataFrame):
        """
        Initializes the model by performing the first training cycle.

        Args:
            train_df: Raw training frame. It must contain 'date_id' and
                'market_forward_excess_returns' (renamed to 'target' here).
        """
        print("Starting initialization and first training...")

        # Drop columns that exist in train.csv but not in test.csv to ensure consistency
        unused_cols = ['forward_returns', 'risk_free_rate']
        train_df = train_df.drop([col for col in unused_cols if col in train_df.columns])

        # Prepare and store the initial raw training data.
        # NOTE(review): the `date_id >= 37` cutoff is not explained anywhere in this
        # file; presumably earlier rows are unusable - confirm.
        self.training_data = (
            train_df.rename({'market_forward_excess_returns': 'target'})
            .with_columns(pl.exclude('date_id').cast(pl.Float64, strict=False))
            .filter(pl.col('date_id') >= 37)
            .tail(self.config.max_train_rows)
        )

        # Run the first training cycle
        self._run_training_cycle()

    def predict(self, test_df: pl.DataFrame) -> float:
        """
        Generates a prediction and allocation for a single test data point.
        This function is called iteratively by the Kaggle API.

        Args:
            test_df: One-row frame holding the feature columns seen in training plus
                'lagged_market_forward_excess_returns'.

        Returns:
            The smoothed allocation for this timestep as a plain float.
        """
        # Ensure incoming data has the correct float type to prevent errors
        test_df = test_df.with_columns(pl.exclude(['date_id', 'is_scored']).cast(pl.Float64, strict=False))

        # --- Online Learning Step ---
        # If this is not the first prediction, update the training data with the previous step's observation.
        if self.previous_test_df is not None:
            # The lagged return from the current test_df is the true target for the previous test_df
            target_value = test_df['lagged_market_forward_excess_returns'][0]
            # Typed literal: a missing value would otherwise create a Null-typed column
            # whose schema no longer matches the stored training data.
            new_training_row = self.previous_test_df.with_columns(
                pl.lit(target_value, dtype=pl.Float64).alias('target')
            )

            # Append the new row and keep the training data size fixed
            self.training_data = pl.concat([
                self.training_data,
                self._align_to_schema(new_training_row, self.training_data),
            ]).tail(self.config.max_train_rows)

            # Retrain periodically based on the configured frequency
            # (the counter is only incremented at the end of this method).
            if self.test_row_count % self.config.retrain_freq == 0:
                self._run_training_cycle()

        # --- Prediction Step for the Current Timestep ---
        # Step 1: Create features for the current test row using historical context.
        # The buffer is sized from the feature windows themselves (twice the longest
        # one) so the rolling features of the last row are always fully populated.
        history_for_features = self.training_data.drop('target')
        history_rows = 2 * max(self.ROLLING_WINDOWS + self.EWM_SPANS)
        current_test_with_history = pl.concat([
            history_for_features.tail(history_rows),
            self._align_to_schema(test_df, history_for_features),
        ])
        featured_test = self._create_features(current_test_with_history).tail(1)

        # Step 2: Impute and prepare the test data for prediction
        impute_exprs = [pl.col(c).fill_null(self.median_values.get(c, 0.0)) for c in self.feature_cols]
        featured_test = featured_test.with_columns(impute_exprs)
        X_test_pd = featured_test.select(self.feature_cols).fill_null(0.0).to_pandas()
        X_test_scaled = self.scaler.transform(X_test_pd)

        # Step 3: Get predictions from all models and create an ensemble prediction
        predictions = {name: model.predict(X_test_scaled)[0] for name, model in self.models.items()}
        raw_pred = sum(predictions[name] * self.config.ensemble_weights[name] for name in self.models)

        # --- Allocation Strategy ---
        # Step 4: Estimate recent volatility from the training data returns,
        # ignoring missing / non-finite targets so they cannot turn the estimate into NaN.
        known_targets = self.training_data['target'].drop_nulls()
        known_targets = known_targets.filter(known_targets.is_finite())
        recent_returns = known_targets.tail(self.config.vol_window).to_numpy()
        volatility = np.std(recent_returns) if len(recent_returns) > 1 else 0.01
        if not np.isfinite(volatility):
            # `max(nan, 0.01)` would keep the NaN, so reset explicitly.
            volatility = 0.01
        volatility = max(volatility, 0.01) # Avoid division by zero

        # Step 5: Convert raw prediction to a signal and adjust for volatility
        signal = np.clip(raw_pred * self.config.signal_multiplier, self.config.min_signal, self.config.max_signal)
        allocation = np.clip(signal / (volatility * self.config.vol_scaling), self.config.min_signal, self.config.max_signal)

        # Step 6: Smooth the final allocation to reduce rapid changes
        final_allocation = (self.config.smoothing_factor * allocation + (1 - self.config.smoothing_factor) * self.last_allocation)

        # --- State Update ---
        # Update state variables for the next iteration
        self.last_allocation = final_allocation
        self.test_row_count += 1
        self.previous_test_df = test_df # Store the current test data for the next learning step

        return float(final_allocation)

In [None]:
# =============================================================================
# 4. MAIN EXECUTION BLOCK
# =============================================================================
if __name__ == '__main__':
    # Location of the competition data; train.csv seeds the model.
    DATA_PATH = Path('/kaggle/input/hull-tactical-market-prediction/')
    train_csv = DATA_PATH / "train.csv"
    initial_train_df = pl.read_csv(train_csv)

    # Build the model and run the first training cycle, timing both together.
    start_time = time.time()
    model_config = ModelConfig()
    model = Model(model_config)
    model.train(initial_train_df)
    startup_time = time.time() - start_time

    # Report the startup duration and abort when it is above the 900-second limit.
    print(f"Model startup and initial training time: {startup_time:.2f} seconds")
    if startup_time > 900:
        raise RuntimeError("Startup time exceeded the 900-second limit.")

    # The inference server is handed a plain module-level function rather than
    # a bound method, so delegate to the model through this thin wrapper.
    def predict(test_df: pl.DataFrame) -> float:
        return model.predict(test_df)

    server_cls = kaggle_evaluation.default_inference_server.DefaultInferenceServer
    inference_server = server_cls(predict)

    # Outside a competition rerun, drive the server from the local data files;
    # during a rerun, serve requests from the Kaggle gateway.
    if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
        inference_server.run_local_gateway((str(DATA_PATH),))
    else:
        inference_server.serve()