## Instructions:
Complete the code for all functions that have a "TODO" in their docstring.

This notebook should take you ~20 minutes to complete. AI usage is allowed, but try not to overuse it.

## Imports

In [None]:

import os
from pathlib import Path
import datetime

from tqdm import tqdm
from dataclasses import dataclass, asdict

import polars as pl
import numpy as np
from sklearn.linear_model import ElasticNet, ElasticNetCV, LinearRegression
from sklearn.preprocessing import StandardScaler

import kaggle_evaluation.default_inference_server


## Project Directory Structure

In [None]:

# Print the full path of every file found under the Kaggle input directory.
# `os` comes from the Imports cell at the top of the notebook, so it is not
# re-imported here (all imports live in that single cell).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


## Configurations

In [None]:

# ============ PATHS ============
# Directory the dataset loaders read 'train.csv' and 'test.csv' from.
DATA_PATH: Path = Path('/kaggle/input/hull-tactical-market-prediction/')

# ============ RETURNS TO SIGNAL CONFIGS ============
# MIN_SIGNAL / MAX_SIGNAL are the default clip bounds of RetToSignalParameters.
MIN_SIGNAL: float = 0.0
MAX_SIGNAL: float = 2.0
# NOTE(review): presumably passed as RetToSignalParameters.signal_multiplier;
# the call site is not in this part of the notebook - confirm.
SIGNAL_MULTIPLIER: float = 400.0

# ============ MODEL CONFIGS ============
# NOTE(review): these look like ElasticNet / ElasticNetCV settings (fold count,
# L1/L2 mix, alpha grid, iteration cap) - confirm where the model is built.
CV: int = 10
L1_RATIO: float = 0.5
# 100 log-spaced values from 1e-4 to 1e2.
ALPHAS: np.ndarray = np.logspace(start=-4, stop=2, num=100)
MAX_ITER: int = 1_000_000


## Dataclasses Helpers

In [None]:

@dataclass
class DatasetOutput:
    """
    Bundle of the pieces produced by split_dataset().

    Attributes:
        X_train: Feature DataFrame for the training rows.
        X_test: Feature DataFrame for the testing rows.
        y_train: Target Series for the training rows.
        y_test: Target Series for the testing rows.
        scaler: The StandardScaler used to scale the features.
    """
    X_train: pl.DataFrame
    X_test: pl.DataFrame
    y_train: pl.Series
    y_test: pl.Series
    scaler: StandardScaler

@dataclass(frozen=True)
class RetToSignalParameters:
    """
    Immutable settings for turning predicted returns into a signal.

    Attributes:
        signal_multiplier: Factor the predicted returns are multiplied by.
        min_signal: Lower clip bound of the signal (defaults to MIN_SIGNAL).
        max_signal: Upper clip bound of the signal (defaults to MAX_SIGNAL).
    """
    signal_multiplier: float
    min_signal: float = MIN_SIGNAL
    max_signal: float = MAX_SIGNAL


## Set the Parameters

## Dataset Loading/Creating Helper Functions

In [None]:

def load_trainset() -> pl.DataFrame:
    """
    Load and preprocess the training dataset.

    Reads 'train.csv' from DATA_PATH, renames the column
    'market_forward_excess_returns' to 'target', and casts every column
    except 'date_id' to Float64.

    NOTE(review): the expected output shown in this notebook has train ending
    at date_id 8979, right before the first test date_id (8980). If the CSV
    on disk overlaps with 'test.csv', the overlapping tail rows would need to
    be trimmed here - confirm against the data.

    Returns:
        pl.DataFrame: The preprocessed training DataFrame.
    """
    return (
        pl.read_csv(DATA_PATH / "train.csv")
        .rename({"market_forward_excess_returns": "target"})
        # Cast everything but the integer key. strict=False turns values that
        # cannot be represented as floats into nulls instead of raising.
        .with_columns(pl.exclude("date_id").cast(pl.Float64, strict=False))
    )


In [None]:

def load_testset() -> pl.DataFrame:
    """
    Load and preprocess the testing dataset.

    Reads 'test.csv' from DATA_PATH, renames the column
    'lagged_forward_returns' to 'target', and casts every column except
    'date_id' to Float64 (the expected output in this notebook shows
    'is_scored' as f64 as well).

    Returns:
        pl.DataFrame: The preprocessed testing DataFrame.
    """
    return (
        pl.read_csv(DATA_PATH / "test.csv")
        .rename({"lagged_forward_returns": "target"})
        # Cast everything but the integer key. strict=False turns values that
        # cannot be represented as floats into nulls instead of raising.
        .with_columns(pl.exclude("date_id").cast(pl.Float64, strict=False))
    )


In [None]:

def create_example_dataset(df: pl.DataFrame) -> pl.DataFrame:
    """
    Create the engineered features and clean the data.

    Steps:
    1. Add 'U1' = I2 - I1 and 'U2' = M11 / ((I2 + I9 + I7) / 3).
    2. Keep 'date_id', 'target' and the 13 feature columns listed in the
       notebook's expected output.
    3. Fill nulls in each feature with that column's exponentially weighted
       mean.
    4. Drop every row that still contains a null.

    Args:
        df (pl.DataFrame): Frame containing 'date_id', 'target' and the raw
            columns used below (I1, I2, I7, I9, M11, S1, S2, S5, E2, E3,
            P8, P9, P10, P12, P13).

    Returns:
        pl.DataFrame: Frame with columns ['date_id', 'target', *features]
            and no null values.
    """
    # Feature columns, in the order of the expected FEATURES list printed
    # further down in the notebook.
    kept_features: list[str] = [
        "S2", "E2", "E3", "P9", "S1", "S5", "I2", "P8",
        "P10", "P12", "P13", "U1", "U2",
    ]
    # Centre-of-mass of the exponentially weighted mean used for imputation.
    ewm_com: float = 0.5

    mean_i2_i9_i7 = (pl.col("I2") + pl.col("I9") + pl.col("I7")) / 3

    return (
        df.with_columns(
            (pl.col("I2") - pl.col("I1")).alias("U1"),
            (pl.col("M11") / mean_i2_i9_i7).alias("U2"),
        )
        # Select only after U1/U2 exist: their inputs (I1, I7, I9, M11) are
        # not part of the kept feature set.
        .select(["date_id", "target", *kept_features])
        .with_columns(
            [
                pl.col(name).fill_null(pl.col(name).ewm_mean(com=ewm_com))
                for name in kept_features
            ]
        )
        # Whatever the EWM fill could not cover is removed here.
        .drop_nulls()
    )


In [None]:

def join_train_test_dataframes(train: pl.DataFrame, test: pl.DataFrame) -> pl.DataFrame:
    """
    Combine the training and testing DataFrames.

    Only the columns present in both frames are kept (in the order they
    appear in `train`); the train rows come first, followed by the test rows.

    Args:
        train (pl.DataFrame): The training DataFrame.
        test (pl.DataFrame): The testing DataFrame.

    Returns:
        pl.DataFrame: The vertically concatenated DataFrame.
    """
    test_columns = set(test.columns)
    common_columns: list[str] = [col for col in train.columns if col in test_columns]

    # Selecting the same ordered column list on both sides gives the two
    # frames the matching layout that a vertical concat requires.
    return pl.concat(
        [train.select(common_columns), test.select(common_columns)],
        how="vertical",
    )


In [None]:

def split_dataset(train: pl.DataFrame, test: pl.DataFrame, features: list[str]) -> DatasetOutput:
    """
    Split the data into features/target and scale the features.

    Steps:
    1. Separate X (the `features` columns) and y ('target') for train and test.
    2. Fit a StandardScaler on the train features only, then transform both
       train and test with it.
    3. Convert the scaled numpy arrays back to Polars DataFrames whose columns
       are named after `features`.
    4. Return everything in a DatasetOutput.

    Args:
        train (pl.DataFrame): Training rows; must contain `features` and 'target'.
        test (pl.DataFrame): Testing rows; must contain `features` and 'target'.
        features (list[str]): Names of the feature columns, in the order they
            should appear in the scaled frames.

    Returns:
        DatasetOutput: Scaled X_train/X_test, y_train/y_test and the fitted scaler.
    """
    y_train: pl.Series = train.get_column("target")
    y_test: pl.Series = test.get_column("target")

    scaler = StandardScaler()
    # Fit on train only so that no statistics from the test rows leak into
    # the scaling parameters.
    train_scaled: np.ndarray = scaler.fit_transform(train.select(features).to_numpy())
    test_scaled: np.ndarray = scaler.transform(test.select(features).to_numpy())

    # orient="row" is explicit so a frame with as many rows as features is
    # never interpreted column-wise.
    X_train: pl.DataFrame = pl.from_numpy(train_scaled, schema=features, orient="row")
    X_test: pl.DataFrame = pl.from_numpy(test_scaled, schema=features, orient="row")

    return DatasetOutput(
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
        scaler=scaler,
    )


## Converting Return Prediction to Signal

In [None]:

def convert_ret_to_signal(ret_arr: np.ndarray, params: RetToSignalParameters) -> np.ndarray:
    """
    Convert model return predictions into a daily signal.

    The signal is `ret_arr * params.signal_multiplier + 1`, clipped to the
    closed interval [params.min_signal, params.max_signal].

    Args:
        ret_arr (np.ndarray): Predicted returns.
        params (RetToSignalParameters): Multiplier and clip bounds.

    Returns:
        np.ndarray: The clipped signal, same shape as `ret_arr`.
    """
    # np.asarray lets plain sequences be passed in as well as arrays.
    raw_signal = np.asarray(ret_arr) * params.signal_multiplier + 1.0
    return np.clip(raw_signal, params.min_signal, params.max_signal)


## Looking at the Data

In [None]:

# With the loaders implemented, this cell reads both datasets and prints the
# last three training rows followed by the first three testing rows.
train: pl.DataFrame = load_trainset()
test: pl.DataFrame = load_testset()
print(train.tail(3), test.head(3), sep='\n')


Here is what the output should look like:

```
shape: (3, 98)
┌─────────┬─────┬─────┬─────┬───┬───────────┬─────────────────┬────────────────┬──────────┐
│ date_id ┆ D1  ┆ D2  ┆ D3  ┆ … ┆ V9        ┆ forward_returns ┆ risk_free_rate ┆ target   │
│ ---     ┆ --- ┆ --- ┆ --- ┆   ┆ ---       ┆ ---             ┆ ---            ┆ ---      │
│ i64     ┆ f64 ┆ f64 ┆ f64 ┆   ┆ f64       ┆ f64             ┆ f64            ┆ f64      │
╞═════════╪═════╪═════╪═════╪═══╪═══════════╪═════════════════╪════════════════╪══════════╡
│ 8977    ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ … ┆ -0.708599 ┆ 0.004187        ┆ 0.000162       ┆ 0.003713 │
│ 8978    ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ … ┆ -0.725858 ┆ 0.002279        ┆ 0.000162       ┆ 0.001805 │
│ 8979    ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ … ┆ -0.720092 ┆ 0.003541        ┆ 0.000161       ┆ 0.003068 │
└─────────┴─────┴─────┴─────┴───┴───────────┴─────────────────┴────────────────┴──────────┘
shape: (3, 99)
┌─────────┬─────┬─────┬─────┬───┬───────────┬───────────┬─────────────────────┬────────────────────┐
│ date_id ┆ D1  ┆ D2  ┆ D3  ┆ … ┆ is_scored ┆ target    ┆ lagged_risk_free_ra ┆ lagged_market_forw │
│ ---     ┆ --- ┆ --- ┆ --- ┆   ┆ ---       ┆ ---       ┆ te                  ┆ ard_excess_r…      │
│ i64     ┆ f64 ┆ f64 ┆ f64 ┆   ┆ f64       ┆ f64       ┆ ---                 ┆ ---                │
│         ┆     ┆     ┆     ┆   ┆           ┆           ┆ f64                 ┆ f64                │
╞═════════╪═════╪═════╪═════╪═══╪═══════════╪═══════════╪═════════════════════╪════════════════════╡
│ 8980    ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ … ┆ 1.0       ┆ 0.003541  ┆ 0.000161            ┆ 0.003068           │
│ 8981    ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ … ┆ 1.0       ┆ -0.005964 ┆ 0.000162            ┆ -0.006437          │
│ 8982    ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ … ┆ 1.0       ┆ -0.00741  ┆ 0.00016             ┆ -0.007882          │
└─────────┴─────┴─────┴─────┴───┴───────────┴───────────┴─────────────────────┴────────────────────┘
```

## Generating the Train and Test

In [None]:

# Feature-engineer train and test together as one frame ...
df: pl.DataFrame = create_example_dataset(df=join_train_test_dataframes(train, test))

# ... then carve it back into train/test using the date_ids each raw frame
# covered. The (train, test) tuple is evaluated before either name is rebound,
# so both filters see the original frames' date_ids.
train, test = (
    df.filter(pl.col('date_id').is_in(raw_frame.get_column('date_id')))
    for raw_frame in (train, test)
)

# Every column except the key and the target is a model feature.
FEATURES: list[str] = [col for col in test.columns if col not in {'date_id', 'target'}]
dataset: DatasetOutput = split_dataset(train=train, test=test, features=FEATURES)

# Unpack the split into the module-level names used from here on.
scaler: StandardScaler = dataset.scaler
X_train: pl.DataFrame = dataset.X_train
y_train: pl.Series = dataset.y_train
X_test: pl.DataFrame = dataset.X_test
y_test: pl.Series = dataset.y_test

print(df.head(3), FEATURES, sep='\n')


Here's what the output should look like:
```
shape: (3, 15)
┌─────────┬───────────┬───────────┬──────────┬───┬───────────┬──────────┬───────────┬───────────┐
│ date_id ┆ target    ┆ S2        ┆ E2       ┆ … ┆ P12       ┆ P13      ┆ U1        ┆ U2        │
│ ---     ┆ ---       ┆ ---       ┆ ---      ┆   ┆ ---       ┆ ---      ┆ ---       ┆ ---       │
│ i64     ┆ f64       ┆ f64       ┆ f64      ┆   ┆ f64       ┆ f64      ┆ f64       ┆ f64       │
╞═════════╪═══════════╪═══════════╪══════════╪═══╪═══════════╪══════════╪═══════════╪═══════════╡
│ 1511    ┆ 0.003079  ┆ -0.28579  ┆ 2.029588 ┆ … ┆ -0.162462 ┆ 0.592262 ┆ -2.318559 ┆ -0.731815 │
│ 1512    ┆ 0.004344  ┆ -0.399753 ┆ 2.045731 ┆ … ┆ -0.578615 ┆ 0.591931 ┆ -2.305802 ┆ -0.220781 │
│ 1513    ┆ -0.001013 ┆ 0.059127  ┆ 2.075762 ┆ … ┆ -0.781019 ┆ 0.591601 ┆ -2.370795 ┆ -0.300937 │
└─────────┴───────────┴───────────┴──────────┴───┴───────────┴──────────┴───────────┴───────────┘
['S2', 'E2', 'E3', 'P9', 'S1', 'S5', 'I2', 'P8', 'P10', 'P12', 'P13', 'U1', 'U2']
```