## Imports

In [11]:
import os
from pathlib import Path
import datetime
from dataclasses import dataclass, asdict

import polars as pl 
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNet, ElasticNetCV, LinearRegression
from sklearn.preprocessing import StandardScaler


## Project Directory Structure

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Configurations

In [None]:
# ============ PATHS ============
DATA_PATH: Path = Path(r'\Users\chenx\Documents\Hull-comp\hull-tactical-market-prediction')

# ============ RETURNS TO SIGNAL CONFIGS ============
MIN_SIGNAL: float = 0.0                         # Minimum value for the daily signal 
MAX_SIGNAL: float = 2.0                         # Maximum value for the daily signal 
SIGNAL_MULTIPLIER: float = 400.0                # Multiplier of the OLS market forward excess returns predictions to signal 

# ============ MODEL CONFIGS ============
CV: int = 10                                    # Number of cross validation folds in the model fitting
L1_RATIO: float = 0.5                           # ElasticNet mixing parameter
ALPHAS: np.ndarray = np.logspace(-4, 2, 100)    # Constant that multiplies the penalty terms
MAX_ITER: int = 1000000                         # The maximum number of iterations

## Dataset Loading/Creating Helper Functions

In [6]:
def load_trainset() -> pl.DataFrame:
    """
    Loads and preprocesses the training dataset.

    Returns:
        pl.DataFrame: The preprocessed training DataFrame.
    """
    return (
        pl.read_csv(DATA_PATH / "train.csv")
        .rename({'market_forward_excess_returns':'target'})
        .with_columns(
            pl.exclude('date_id').cast(pl.Float64, strict=False)
        )
        .head(-10)
    )

def load_testset() -> pl.DataFrame:
    """
    Loads and preprocesses the testing dataset.

    Returns:
        pl.DataFrame: The preprocessed testing DataFrame.
    """
    return (
        pl.read_csv(DATA_PATH / "test.csv")
        .rename({'lagged_forward_returns':'target'})
        .with_columns(
            pl.exclude('date_id').cast(pl.Float64, strict=False)
        )
    )

    
def join_train_test_dataframes(train: pl.DataFrame, test: pl.DataFrame) -> pl.DataFrame:
    """
    Joins two dataframes by common columns and concatenates them vertically.

    Args:
        train (pl.DataFrame): The training DataFrame.
        test (pl.DataFrame): The testing DataFrame.

    Returns:
        pl.DataFrame: A single DataFrame with vertically stacked data from common columns.
    """
    common_columns: list[str] = [col for col in train.columns if col in test.columns]
    
    return pl.concat([train.select(common_columns), test.select(common_columns)], how="vertical")

def split_dataset(train: pl.DataFrame, test: pl.DataFrame, features: list[str]) -> DatasetOutput: 
    """
    Splits the data into features (X) and target (y), and scales the features.

    Args:
        train (pl.DataFrame): The processed training DataFrame.
        test (pl.DataFrame): The processed testing DataFrame.
        features (list[str]): List of features to used in model. 

    Returns:
        DatasetOutput: A dataclass containing the scaled feature sets, target series, and the fitted scaler.
    """
    X_train = train.drop(['date_id','target']) 
    y_train = train.get_column('target')
    X_test = test.drop(['date_id','target']) 
    y_test = test.get_column('target')
    
    scaler = StandardScaler() 
    
    X_train_scaled_np = scaler.fit_transform(X_train)
    X_train = pl.from_numpy(X_train_scaled_np, schema=features)
    
    X_test_scaled_np = scaler.transform(X_test)
    X_test = pl.from_numpy(X_test_scaled_np, schema=features)
    
    
    return DatasetOutput(
        X_train = X_train,
        y_train = y_train, 
        X_test = X_test, 
        y_test = y_test,
        scaler = scaler
    )

In [7]:
train: pl.DataFrame = load_trainset()
test: pl.DataFrame = load_testset() 
print(train.columns,train)
print(test.columns,test)

['date_id', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'E1', 'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18', 'E19', 'E2', 'E20', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'M1', 'M10', 'M11', 'M12', 'M13', 'M14', 'M15', 'M16', 'M17', 'M18', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'P1', 'P10', 'P11', 'P12', 'P13', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'S1', 'S10', 'S11', 'S12', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'V1', 'V10', 'V11', 'V12', 'V13', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'forward_returns', 'risk_free_rate', 'target'] shape: (9_011, 98)
┌─────────┬─────┬─────┬─────┬───┬───────────┬─────────────────┬────────────────┬───────────┐
│ date_id ┆ D1  ┆ D2  ┆ D3  ┆ … ┆ V9        ┆ forward_returns ┆ risk_free_rate ┆ target    │
│ ---     ┆ --- ┆ --- ┆ --- ┆   ┆ ---       ┆ ---             ┆ ---            ┆ ---       │
│ i64     ┆ f64 ┆ f64 ┆ f64 ┆   ┆ f64       ┆ f6

## Converting Return Prediction to Signal

Here is an example of a potential function used to convert a prediction based on the market forward excess return to a daily signal position. 

## Generating the Train and Test

In [8]:
df: pl.DataFrame = join_train_test_dataframes(train, test)
df = df.fill_null(0)
train: pl.DataFrame = df.filter(pl.col('date_id').is_in(train.get_column('date_id')))
test: pl.DataFrame = df.filter(pl.col('date_id').is_in(test.get_column('date_id')))

FEATURES: list[str] = [col for col in test.columns if col not in ['date_id', 'target']]

dataset: DatasetOutput = split_dataset(train=train, test=test, features=FEATURES) 

X_train: pl.DataFrame = dataset.X_train
X_test: pl.DataFrame = dataset.X_test
y_train: pl.DataFrame = dataset.y_train
y_test: pl.DataFrame = dataset.y_test
scaler: StandardScaler = dataset.scaler 

Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  train: pl.DataFrame = df.filter(pl.col('date_id').is_in(train.get_column('date_id')))
Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  test: pl.DataFrame = df.filter(pl.col('date_id').is_in(test.get_column('date_id')))


In [9]:
print(df.shape)

(9021, 96)


## Fitting the Model 

In [10]:
model_cv: ElasticNetCV = ElasticNetCV(
    **asdict(enet_params)
)
model_cv.fit(X_train, y_train) 
        
# Fit the final model using the best alpha found by cross-validation
model: ElasticNet = ElasticNet(alpha=model_cv.alpha_, l1_ratio=enet_params.l1_ratio) 
model.fit(X_train, y_train)

NameError: name 'enet_params' is not defined

## Prediction Function via Kaggle Server

In [11]:
def predict(test: pl.DataFrame) -> float:
    # rename only if present, avoid raising when key missing
    if 'lagged_forward_returns' in test.columns and 'target' not in test.columns:
        test = test.rename({'lagged_forward_returns': 'target'}, strict=False)

    if 'target' not in test.columns:
        raise ValueError("Input test DataFrame must contain 'lagged_forward_returns' or 'target' column")
    
    df: pl.DataFrame = test

    # ensure all features are present after preprocessing
    missing = [f for f in FEATURES if f not in df.columns]
    if missing:
        raise ValueError(f"Missing feature columns after preprocessing: {missing}")

    X_test_df: pl.DataFrame = df.select(FEATURES)
    X_test_np: np.ndarray = X_test_df.to_numpy()
    X_test_scaled_np: np.ndarray = scaler.transform(X_test_np)
    raw_pred: np.ndarray = model.predict(X_test_scaled_np)
    return convert_ret_to_signal(raw_pred, ret_signal_params)

In [12]:
prediction = predict(test = test)
print(prediction)

[1.05144634 1.15416558 0.99950111 0.96067718 0.93712809 0.97670075
 0.97337924 1.03456828 0.95085851 0.83524328 1.05144634 1.15416558
 0.99950111 0.96067718 0.93712809 0.97670075 0.97337924 1.03456828
 0.95085851 0.83524328]




In [17]:
MIN_INVESTMENT = 0
MAX_INVESTMENT = 2


class ParticipantVisibleError(Exception):
    pass


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Calculates a custom evaluation metric (volatility-adjusted Sharpe ratio).

    This metric penalizes strategies that take on significantly more volatility
    than the underlying market.

    Returns:
        float: The calculated adjusted Sharpe ratio.
    """

    #if not pandas.api.types.is_numeric_dtype(submission['prediction']):
    #    raise ParticipantVisibleError('Predictions must be numeric')

    solution = solution
    solution['position'] = submission['prediction']

    if solution['position'].max() > MAX_INVESTMENT:
        raise ParticipantVisibleError(f'Position of {solution["position"].max()} exceeds maximum of {MAX_INVESTMENT}')
    if solution['position'].min() < MIN_INVESTMENT:
        raise ParticipantVisibleError(f'Position of {solution["position"].min()} below minimum of {MIN_INVESTMENT}')

    solution['strategy_returns'] = solution['risk_free_rate'] * (1 - solution['position']) + solution['position'] * solution['forward_returns']

    # Calculate strategy's Sharpe ratio
    strategy_excess_returns = solution['strategy_returns'] - solution['risk_free_rate']
    strategy_excess_cumulative = (1 + strategy_excess_returns).prod()
    strategy_mean_excess_return = (strategy_excess_cumulative) ** (1 / len(solution)) - 1
    strategy_std = solution['strategy_returns'].std()

    trading_days_per_yr = 252
    if strategy_std == 0:
        raise ParticipantVisibleError('Division by zero, strategy std is zero')
    sharpe = strategy_mean_excess_return / strategy_std * np.sqrt(trading_days_per_yr)
    strategy_volatility = float(strategy_std * np.sqrt(trading_days_per_yr) * 100)

    # Calculate market return and volatility
    market_excess_returns = solution['forward_returns'] - solution['risk_free_rate']
    market_excess_cumulative = (1 + market_excess_returns).prod()
    market_mean_excess_return = (market_excess_cumulative) ** (1 / len(solution)) - 1
    market_std = solution['forward_returns'].std()

    market_volatility = float(market_std * np.sqrt(trading_days_per_yr) * 100)

    if market_volatility == 0:
        raise ParticipantVisibleError('Division by zero, market std is zero')

    # Calculate the volatility penalty
    excess_vol = max(0, strategy_volatility / market_volatility - 1.2) if market_volatility > 0 else 0
    vol_penalty = 1 + excess_vol

    # Calculate the return penalty
    return_gap = max(
        0,
        (market_mean_excess_return - strategy_mean_excess_return) * 100 * trading_days_per_yr,
    )
    return_penalty = 1 + (return_gap**2) / 100

    # Adjust the Sharpe ratio by the volatility and return penalty
    adjusted_sharpe = sharpe / (vol_penalty * return_penalty)
    return min(float(adjusted_sharpe), 1_000_000)

In [18]:
score(test, prediction, 'date_id')

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [None]:
# ============================================================================
# Supervised Learning Program with Score Function as Reward
# ============================================================================

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import cross_validate
import warnings
warnings.filterwarnings('ignore')

class SupervisedLearningReward:
    """
    A supervised learning framework that uses the score function as the reward metric.
    """
    
    def __init__(self, train_data: pl.DataFrame, test_data: pl.DataFrame, features: list[str]):
        """
        Initialize the supervised learning framework.
        
        Args:
            train_data: Training DataFrame with features and target
            test_data: Test DataFrame with features and target
            features: List of feature column names
        """
        self.train_data = train_data
        self.test_data = test_data
        self.features = features
        self.models_results = []
        
    def prepare_data_for_scoring(self, predictions: pd.Series, test_data: pl.DataFrame) -> tuple:
        """
        Prepare solution and submission DataFrames required by the score function.
        
        Args:
            predictions: Model predictions (positions to take)
            test_data: Test DataFrame containing market data
            
        Returns:
            Tuple of (solution_df, submission_df)
        """
        solution = test_data.select(['date_id', 'risk_free_rate', 'forward_returns']).to_pandas()
        submission = pd.DataFrame({'prediction': predictions})
        
        return solution, submission
    
    def evaluate_model(self, model, X_train, y_train, X_test, y_test, test_data: pl.DataFrame, model_name: str) -> dict:
        """
        Train a model and evaluate it using the score function.
        
        Args:
            model: Scikit-learn model instance
            X_train, y_train: Training features and target
            X_test, y_test: Test features and target
            test_data: Test data for scoring
            model_name: Name of the model for tracking
            
        Returns:
            Dictionary with evaluation results including score
        """
        # Train the model
        model.fit(X_train, y_train)
        
        # Make predictions
        train_predictions = model.predict(X_train)
        test_predictions = model.predict(X_test)
        
        # Clip predictions to valid range [MIN_INVESTMENT, MAX_INVESTMENT]
        test_predictions = np.clip(test_predictions, MIN_INVESTMENT, MAX_INVESTMENT)
        
        # Calculate score using the reward function
        solution, submission = self.prepare_data_for_scoring(test_predictions, test_data)
        
        try:
            reward_score = score(solution, submission, 'date_id')
        except ParticipantVisibleError as e:
            reward_score = -float('inf')
            print(f"  Error: {e}")
        
        # Calculate traditional metrics
        from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
        mse = mean_squared_error(y_test, test_predictions)
        mae = mean_absolute_error(y_test, test_predictions)
        r2 = r2_score(y_test, test_predictions)
        
        result = {
            'model_name': model_name,
            'reward_score': reward_score,
            'mse': mse,
            'mae': mae,
            'r2': r2,
            'model': model,
            'predictions': test_predictions
        }
        
        self.models_results.append(result)
        return result
    
    def train_and_compare(self):
        """
        Train multiple models and compare them using the score function.
        """
        # Convert polars to numpy for scikit-learn
        X_train = X_train.to_numpy()
        y_train = y_train.to_numpy()
        X_test = X_test.to_numpy()
        y_test = y_test.to_numpy()
        
        models = {
            'Linear Regression': Ridge(alpha=1.0),
            'Lasso': Lasso(alpha=0.01, max_iter=5000),
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
            'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
        }
        
        print("Training models and evaluating with score function...\n")
        
        for model_name, model in models.items():
            print(f"Training {model_name}...")
            result = self.evaluate_model(
                model, X_train, y_train, X_test, y_test, 
                test_data, model_name
            )
            print(f"  Reward Score: {result['reward_score']:.4f}")
            print(f"  MSE: {result['mse']:.6f}, R²: {result['r2']:.4f}\n")
        
        return self.get_best_model()
    
    def get_best_model(self):
        """Get the model with the highest score."""
        if not self.models_results:
            return None
        
        best = max(self.models_results, key=lambda x: x['reward_score'])
        print(f"Best Model: {best['model_name']} with Score: {best['reward_score']:.4f}")
        return best


# ============================================================================
# Run the Supervised Learning Framework
# ============================================================================

print("=" * 70)
print("SUPERVISED LEARNING WITH SCORE-BASED REWARD")
print("=" * 70)

learner = SupervisedLearningReward(train, test, FEATURES)
best_result = learner.train_and_compare()

print("\n" + "=" * 70)
print("Summary of All Models:")
print("=" * 70)
for result in learner.models_results:
    print(f"{result['model_name']:25} | Score: {result['reward_score']:10.4f} | R²: {result['r2']:8.4f}")