## Imports

In [1]:
# ============================================================
# 1. Imports & Global Setup
# ============================================================
from __future__ import annotations

# ---------- Standard Library ----------
import os
import datetime
import random
import warnings
import logging
import json
from pathlib import Path
from dataclasses import dataclass, asdict, field
from typing import List, Dict, Tuple, Optional, Any

# ---------- Third-party ----------
import numpy as np
import polars as pl

from sklearn.linear_model import ElasticNet, ElasticNetCV, LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score

from tqdm.auto import tqdm  # progress bar di notebook

import kaggle_evaluation.default_inference_server as kei
# nanti dipakai sebagai: kei.DefaultInferenceServer(predict)


# ============================================================
# 2. Reproducibility
# ============================================================
SEED: int = 42
np.random.seed(SEED)
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
# kalau nanti pakai model lain (mis. RandomizedSearch, dsb.), pakai SEED ini juga


# ============================================================
# 3. Polars / Display Config (QoL)
# ============================================================
# Supaya print DataFrame tidak terlalu panjang di output
pl.Config.set_tbl_cols(20)
pl.Config.set_tbl_rows(20)


# ============================================================
# 4. Logging & Warning Setup
# ============================================================
logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s][%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

logger.info("Environment initialised | SEED=%d", SEED)


## Project Directory Structure

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hull-tactical-market-prediction/train.csv
/kaggle/input/hull-tactical-market-prediction/test.csv
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/default_inference_server.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/default_gateway.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/__init__.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/templates.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/base_gateway.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/relay.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/__init__.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2_grpc.py
/kaggl

## Configurations

In [3]:
# ============================================================
# 1. IMPORTS & GLOBAL SETUP
# ============================================================
from __future__ import annotations

# ---- Standard library ----
import os
import datetime
import random
import warnings
import logging
from pathlib import Path
from dataclasses import dataclass, asdict, field
from typing import List, Dict, Tuple, Optional, Any

# ---- Third-party ----
import numpy as np
import polars as pl
from tqdm.auto import tqdm

from sklearn.linear_model import (
    ElasticNet,
    ElasticNetCV,
    LinearRegression,
    Ridge,
    Lasso,
)
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

# Optional: tree-based model (kalau nanti kamu mau coba ensemble non-linear)
try:
    import lightgbm as lgb
    HAS_LGBM = True
except ImportError:
    HAS_LGBM = False

import kaggle_evaluation.default_inference_server as kei


# ---- Reproducibility ----
SEED: int = 42
np.random.seed(SEED)
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

# ---- Polars & Logging QoL ----
pl.Config.set_tbl_cols(20)
pl.Config.set_tbl_rows(20)

logging.basicConfig(
    level=logging.INFO,
    format="[%(asctime)s][%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

logger.info("Environment initialised | SEED=%d | HAS_LGBM=%s", SEED, HAS_LGBM)


# ============================================================
# 2. PROJECT DIRECTORY STRUCTURE & CONFIG
# ============================================================

# Nama kompetisi (dipakai untuk bikin folder kerja terpisah)
COMP_NAME: str = "hull-tactical-market-prediction"

# Nama eksperimen (bisa kamu ganti saat coba konfigurasi lain)
EXPERIMENT_NAME: str = f"enet_v1_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}"

# ---- Input & Working Dirs ----
INPUT_DIR: Path = Path("/kaggle/input") / COMP_NAME
WORK_DIR: Path  = Path("/kaggle/working") / COMP_NAME

WORK_DIR.mkdir(parents=True, exist_ok=True)

# File data utama
TRAIN_PATH: Path = INPUT_DIR / "train.csv"
TEST_PATH: Path  = INPUT_DIR / "test.csv"

# Folder resmi dari Kaggle Evaluation API (source & copy ke working jika perlu)
KAGGLE_EVAL_SRC: Path  = INPUT_DIR / "kaggle_evaluation"
KAGGLE_EVAL_WORK: Path = WORK_DIR / "kaggle_evaluation"
KAGGLE_EVAL_WORK.mkdir(parents=True, exist_ok=True)

# ---- Output structure (untuk hasil eksperimen) ----
OUT_DIR: Path        = WORK_DIR / "outputs"
MODEL_DIR: Path      = OUT_DIR / "models"       # simpan model, scaler, dsb.
FEATURE_DIR: Path    = OUT_DIR / "features"     # simpan dataset hasil FE (opsional)
LOG_DIR: Path        = OUT_DIR / "logs"         # catatan eksperimen, metrik
SUBMISSION_DIR: Path = OUT_DIR / "submissions"  # submission lokal untuk dicek

for p in [OUT_DIR, MODEL_DIR, FEATURE_DIR, LOG_DIR, SUBMISSION_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Flag environment: apakah ini run dalam mode kompetisi (rerun) atau lokal
IS_COMP_RERUN: bool = os.getenv("KAGGLE_IS_COMPETITION_RERUN") is not None
logger.info("IS_COMP_RERUN = %s", IS_COMP_RERUN)


# ============================================================
# 3. RETURNS -> SIGNAL CONFIGS
# ============================================================

@dataclass(frozen=True)
class SignalConfig:
    """
    Konfigurasi untuk mengubah prediksi return -> sinyal trading harian.

    min_signal / max_signal : batas posisi (0 = cash, 2 = 2x leverage)
    multiplier              : skala sensitivitas sinyal terhadap prediksi return
    """
    min_signal: float = 0.0
    max_signal: float = 2.0
    multiplier: float = 400.0


# Beberapa preset sinyal yang nanti bisa kamu pilih via validasi Sharpe
SIGNAL_PRESETS: Dict[str, SignalConfig] = {
    "baseline":     SignalConfig(min_signal=0.0, max_signal=2.0, multiplier=400.0),
    "conservative": SignalConfig(min_signal=0.5, max_signal=1.5, multiplier=200.0),
    "aggressive":   SignalConfig(min_signal=0.0, max_signal=2.0, multiplier=600.0),
}

ACTIVE_SIGNAL_KEY: str = "baseline"  # bisa kamu ubah setelah ada hasil CV
SIGNAL_CFG: SignalConfig = SIGNAL_PRESETS[ACTIVE_SIGNAL_KEY]


# ============================================================
# 4. MODEL & EVALUATION CONFIGS
# ============================================================

@dataclass(frozen=True)
class ModelConfig:
    """
    Konfigurasi utama untuk model ElasticNet baseline.
    """
    cv_folds: int = 10
    l1_ratio: float = 0.5
    alphas: np.ndarray = field(
        default_factory=lambda: np.logspace(-4, 2, 100)
    )
    max_iter: int = 1_000_000
    random_state: int = SEED

ENET_CFG = ModelConfig()


@dataclass(frozen=True)
class EvaluationConfig:
    """
    Konfigurasi evaluasi internal (Sharpe, validasi time-series).
    """
    n_folds: int = 5                 # jumlah fold time-series CV
    val_fraction: float = 0.2        # porsi akhir data untuk pure hold-out (opsional)
    sharpe_annualization_factor: float = float(np.sqrt(252.0))  # 252 hari bursa

EVAL_CFG = EvaluationConfig()
logger.info(
    "Configs ready | ENET l1_ratio=%.3f | n_folds=%d | signal_preset=%s",
    ENET_CFG.l1_ratio,
    EVAL_CFG.n_folds,
    ACTIVE_SIGNAL_KEY,
)


In [4]:
print("WORK_DIR :", WORK_DIR)
print("TRAIN_PATH exists:", TRAIN_PATH.exists())
print("TEST_PATH exists :", TEST_PATH.exists())
print("Signal config    :", SIGNAL_CFG)
print("Model config     :", ENET_CFG)

WORK_DIR : /kaggle/working/hull-tactical-market-prediction
TRAIN_PATH exists: True
TEST_PATH exists : True
Signal config    : SignalConfig(min_signal=0.0, max_signal=2.0, multiplier=400.0)
Model config     : ModelConfig(cv_folds=10, l1_ratio=0.5, alphas=array([1.00000000e-04, 1.14975700e-04, 1.32194115e-04, 1.51991108e-04,
       1.74752840e-04, 2.00923300e-04, 2.31012970e-04, 2.65608778e-04,
       3.05385551e-04, 3.51119173e-04, 4.03701726e-04, 4.64158883e-04,
       5.33669923e-04, 6.13590727e-04, 7.05480231e-04, 8.11130831e-04,
       9.32603347e-04, 1.07226722e-03, 1.23284674e-03, 1.41747416e-03,
       1.62975083e-03, 1.87381742e-03, 2.15443469e-03, 2.47707636e-03,
       2.84803587e-03, 3.27454916e-03, 3.76493581e-03, 4.32876128e-03,
       4.97702356e-03, 5.72236766e-03, 6.57933225e-03, 7.56463328e-03,
       8.69749003e-03, 1.00000000e-02, 1.14975700e-02, 1.32194115e-02,
       1.51991108e-02, 1.74752840e-02, 2.00923300e-02, 2.31012970e-02,
       2.65608778e-02, 3.05385551e-0

## Dataclasses Helpers

In [5]:
# ============================================================
# 5. DATACLASSES HELPERS
# ============================================================

@dataclass
class DatasetOutput:
    """
    Paket hasil preprocessing dataset.

    - X_train, y_train : data untuk melatih model
    - X_test,  y_test  : data untuk evaluasi (mock test / hold-out)
    - scaler           : StandardScaler yang sudah di-fit pada X_train
    - feature_names    : daftar nama fitur (opsional, berguna untuk debugging/analisis)
    """
    X_train: pl.DataFrame
    X_test: pl.DataFrame
    y_train: pl.Series
    y_test: pl.Series
    scaler: StandardScaler
    feature_names: list[str] | None = None


@dataclass(frozen=True)
class ElasticNetParameters:
    """
    Parameter yang dipakai ketika membangun ElasticNetCV / ElasticNet.

    Default-nya diambil dari ENET_CFG (ModelConfig) supaya konsisten
    dengan konfigurasi global, tapi tetap bisa dioverride kalau perlu.
    """
    l1_ratio: float = ENET_CFG.l1_ratio
    cv: int = ENET_CFG.cv_folds
    alphas: np.ndarray = field(
        default_factory=lambda: ENET_CFG.alphas.copy()
    )
    max_iter: int = ENET_CFG.max_iter
    random_state: int = ENET_CFG.random_state

    def __post_init__(self) -> None:
        if not (0.0 <= self.l1_ratio <= 1.0):
            raise ValueError(
                "ElasticNet l1_ratio harus berada di dalam interval [0, 1]."
            )


@dataclass(frozen=True)
class RetToSignalParameters:
    """
    Parameter untuk mengubah prediksi return -> sinyal trading harian.

    Default diambil dari SIGNAL_CFG supaya selaras dengan konfigurasi global.
    """
    signal_multiplier: float = SIGNAL_CFG.multiplier
    min_signal: float = SIGNAL_CFG.min_signal
    max_signal: float = SIGNAL_CFG.max_signal

    def __post_init__(self) -> None:
        if self.min_signal >= self.max_signal:
            raise ValueError(
                "min_signal harus lebih kecil daripada max_signal."
            )


# ============================================================
# 5b. EXPERIMENT & CV RESULT STRUCTS
# ============================================================

@dataclass(frozen=True)
class ExperimentConfig:
    """
    Konfigurasi satu eksperimen penuh:
    - feature_set_name  : nama subset fitur / skenario FE yang dipakai
    - enet_params       : parameter model ElasticNet
    - signal_params     : parameter konversi return -> signal
    - use_gbm           : placeholder kalau nanti mau ensemble dengan tree model
    """
    name: str
    feature_set_name: str
    enet_params: ElasticNetParameters
    signal_params: RetToSignalParameters
    use_gbm: bool = False


@dataclass
class FoldResult:
    """
    Hasil evaluasi satu fold time-series.
    """
    fold_idx: int
    train_start_date: int
    train_end_date: int
    val_start_date: int
    val_end_date: int
    sharpe: float
    mean_return: float
    vol_return: float
    n_train: int
    n_val: int


@dataclass
class CVSummary:
    """
    Ringkasan hasil cross-validation untuk satu ExperimentConfig.

    - fold_results : daftar hasil per fold
    - sharpe_mean  : rata-rata Sharpe antar fold
    - sharpe_std   : standar deviasi Sharpe antar fold (stabilitas)
    """
    experiment_name: str
    fold_results: list[FoldResult]

    @property
    def sharpe_mean(self) -> float:
        if not self.fold_results:
            return float("nan")
        return float(np.mean([fr.sharpe for fr in self.fold_results]))

    @property
    def sharpe_std(self) -> float:
        if not self.fold_results:
            return float("nan")
        return float(np.std([fr.sharpe for fr in self.fold_results], ddof=1))


## Set the Parameters

In [6]:
# ============================================================
# 5. DATACLASSES HELPERS + PARAMETER OBJECTS
# ============================================================

@dataclass
class DatasetOutput:
    """
    Paket hasil preprocessing dataset.

    - X_train, y_train : data untuk melatih model
    - X_test,  y_test  : data untuk evaluasi (mock test / hold-out)
    - scaler           : StandardScaler yang sudah di-fit pada X_train
    - feature_names    : daftar nama fitur (opsional, untuk debugging/analisis)
    - dates_train      : date_id untuk baris X_train (opsional, untuk TS-CV)
    - dates_test       : date_id untuk baris X_test  (opsional, untuk TS-CV)
    """
    X_train: pl.DataFrame
    X_test: pl.DataFrame
    y_train: pl.Series
    y_test: pl.Series
    scaler: StandardScaler
    feature_names: list[str] | None = None
    dates_train: pl.Series | None = None
    dates_test: pl.Series | None = None


@dataclass(frozen=True)
class ElasticNetParameters:
    """
    Parameter yang dipakai ketika membangun ElasticNetCV / ElasticNet.

    Default diambil dari ENET_CFG (ModelConfig) supaya konsisten dengan
    konfigurasi global, tapi bisa dioverride kalau perlu.
    """
    l1_ratio: float = ENET_CFG.l1_ratio
    cv: int = ENET_CFG.cv_folds
    alphas: np.ndarray = field(
        default_factory=lambda: ENET_CFG.alphas.copy()
    )
    max_iter: int = ENET_CFG.max_iter
    random_state: int = ENET_CFG.random_state
    n_jobs: int = -1
    fit_intercept: bool = True

    def __post_init__(self) -> None:
        if not (0.0 <= self.l1_ratio <= 1.0):
            raise ValueError(
                "ElasticNet l1_ratio harus berada di dalam interval [0, 1]."
            )


@dataclass(frozen=True)
class RetToSignalParameters:
    """
    Parameter untuk mengubah prediksi return -> sinyal trading harian.

    Default diambil dari SIGNAL_CFG supaya selaras dengan konfigurasi global.
    """
    signal_multiplier: float = SIGNAL_CFG.multiplier
    min_signal: float = SIGNAL_CFG.min_signal
    max_signal: float = SIGNAL_CFG.max_signal

    def __post_init__(self) -> None:
        if self.min_signal >= self.max_signal:
            raise ValueError(
                "min_signal harus lebih kecil daripada max_signal."
            )
        if self.signal_multiplier <= 0:
            raise ValueError("signal_multiplier harus > 0.")


# ---- GBMParameters: hanya aktif kalau LightGBM tersedia ----
if HAS_LGBM:
    @dataclass(frozen=True)
    class GBMParameters:
        """
        Parameter untuk model tree-based (LightGBM) yang akan kita gunakan
        sebagai pelengkap model linear (ElasticNet/Ridge) kalau HAS_LGBM=True.
        Default-nya dibuat eksplisit di sini supaya tidak bergantung pada GBM_CFG.
        """
        num_leaves: int = 31
        max_depth: int = -1          # -1 = unlimited depth
        learning_rate: float = 0.03
        n_estimators: int = 500
        subsample: float = 0.8       # row subsampling
        colsample_bytree: float = 0.8
        reg_alpha: float = 0.0
        reg_lambda: float = 1.0
        random_state: int = SEED
else:
    GBMParameters = None  # type: ignore


# ------------------------------------------------------------
# 5b. EXPERIMENT & CV RESULT STRUCTS
# ------------------------------------------------------------

@dataclass(frozen=True)
class ExperimentConfig:
    """
    Konfigurasi satu eksperimen penuh:
    - name             : nama unik eksperimen
    - feature_set_name : nama subset fitur / skenario FE yang dipakai
    - enet_params      : parameter model ElasticNet
    - signal_params    : parameter konversi return -> signal
    - use_gbm          : apakah akan menambahkan model GBM untuk ensemble
    """
    name: str
    feature_set_name: str
    enet_params: ElasticNetParameters
    signal_params: RetToSignalParameters
    use_gbm: bool = False


@dataclass
class FoldResult:
    """
    Hasil evaluasi satu fold time-series.
    """
    fold_idx: int
    train_start_date: int
    train_end_date: int
    val_start_date: int
    val_end_date: int
    sharpe: float
    mean_return: float
    vol_return: float
    n_train: int
    n_val: int


@dataclass
class CVSummary:
    """
    Ringkasan hasil cross-validation untuk satu ExperimentConfig.
    """
    experiment_name: str
    fold_results: list[FoldResult]

    @property
    def sharpe_mean(self) -> float:
        return float(np.mean([fr.sharpe for fr in self.fold_results])) if self.fold_results else float("nan")

    @property
    def sharpe_std(self) -> float:
        return float(np.std([fr.sharpe for fr in self.fold_results], ddof=1)) if self.fold_results else float("nan")


# ------------------------------------------------------------
# Instansiasi objek parameter utama yang akan dipakai di pipeline
# ------------------------------------------------------------

ret_signal_params = RetToSignalParameters()
enet_params       = ElasticNetParameters()
gbm_params: Optional["GBMParameters"] = GBMParameters() if HAS_LGBM and GBMParameters is not None else None

print("ElasticNet params:", enet_params)
print("Signal params    :", ret_signal_params)
print("GBM params       :", gbm_params)


ElasticNet params: ElasticNetParameters(l1_ratio=0.5, cv=10, alphas=array([1.00000000e-04, 1.14975700e-04, 1.32194115e-04, 1.51991108e-04,
       1.74752840e-04, 2.00923300e-04, 2.31012970e-04, 2.65608778e-04,
       3.05385551e-04, 3.51119173e-04, 4.03701726e-04, 4.64158883e-04,
       5.33669923e-04, 6.13590727e-04, 7.05480231e-04, 8.11130831e-04,
       9.32603347e-04, 1.07226722e-03, 1.23284674e-03, 1.41747416e-03,
       1.62975083e-03, 1.87381742e-03, 2.15443469e-03, 2.47707636e-03,
       2.84803587e-03, 3.27454916e-03, 3.76493581e-03, 4.32876128e-03,
       4.97702356e-03, 5.72236766e-03, 6.57933225e-03, 7.56463328e-03,
       8.69749003e-03, 1.00000000e-02, 1.14975700e-02, 1.32194115e-02,
       1.51991108e-02, 1.74752840e-02, 2.00923300e-02, 2.31012970e-02,
       2.65608778e-02, 3.05385551e-02, 3.51119173e-02, 4.03701726e-02,
       4.64158883e-02, 5.33669923e-02, 6.13590727e-02, 7.05480231e-02,
       8.11130831e-02, 9.32603347e-02, 1.07226722e-01, 1.23284674e-01,
       1.

## Dataset Loading/Creating Helper Functions

In [7]:
# ============================================================
# 6. DATA LOADING & PREPROCESSING HELPERS
# ============================================================

def load_trainset(path: Path = TRAIN_PATH, drop_last_n: int = 10) -> pl.DataFrame:
    """
    Load dan praproses training dataset

    - Mengganti nama kolom target menjadi 'target'
      (market_forward_excess_returns -> target).
    - Meng-cast semua kolom selain 'date_id' ke Float64.
    - Sorting berdasarkan date_id.
    - Opsional: membuang N baris terakhir (drop_last_n) untuk
      menghindari kebocoran saat mock test.

    Args:
        path (Path): lokasi file train.csv.
        drop_last_n (int): jumlah baris terakhir yang dibuang.

    Returns:
        pl.DataFrame: DataFrame training yang sudah rapi.
    """
    df = (
        pl.read_csv(path)
        .rename({"market_forward_excess_returns": "target"})
        .with_columns(
            pl.col("date_id").cast(pl.Int32, strict=False)
        )
        .with_columns(
            pl.exclude("date_id").cast(pl.Float64, strict=False)
        )
        .sort("date_id")
    )

    if drop_last_n > 0:
        df = df.head(-drop_last_n)

    return df


def load_testset(path: Path = TEST_PATH) -> pl.DataFrame:
    """
    Load dan praproses test/mock dataset.

    - Mengganti nama 'lagged_forward_returns' -> 'target' agar
      struktur mirip train (meski tidak dipakai sebagai ground truth).
    - Meng-cast semua kolom selain 'date_id' ke Float64.
    - Sorting berdasarkan date_id.

    Args:
        path (Path): lokasi file test.csv.

    Returns:
        pl.DataFrame: DataFrame test yang sudah rapi.
    """
    df = (
        pl.read_csv(path)
        .rename({"lagged_forward_returns": "target"})
        .with_columns(
            pl.col("date_id").cast(pl.Int32, strict=False)
        )
        .with_columns(
            pl.exclude("date_id").cast(pl.Float64, strict=False)
        )
        .sort("date_id")
    )
    return df


def create_example_dataset(df: pl.DataFrame) -> pl.DataFrame:
    """
    Membuat fitur baseline + beberapa fitur tambahan (momentum, trend & volatilitas),
    lalu membersihkan DataFrame.

    Fitur baru:
        - U1 = I2 - I1
        - U2 = M11 / mean(I2, I9, I7)
        - Untuk setiap fitur dasar F di base_vars:
            * F_diff1      : F_t - F_{t-1}         (momentum 1 hari)
            * F_rm{w}      : rolling mean w hari   (trend jangka pendek/menengah)
            * F_vol{w}     : rolling std w hari    (volatilitas lokal)

    Setelah itu:
        - Pilih subset kolom: ['date_id', 'target'] + semua fitur di vars_to_keep
        - Imputasi missing dengan exponential weighted mean (EWM)
        - Drop baris yang masih mengandung null
        - Sort by date_id

    Args:
        df (pl.DataFrame): input Polars DataFrame (train+test gabungan).

    Returns:
        pl.DataFrame: DataFrame dengan fitur baru & tanpa null.
    """
    # Fitur dasar yang akan dipakai sebagai anchor untuk FE
    base_vars: List[str] = [
        "S2", "E2", "E3", "P9", "S1", "S5", "I2", "P8",
        "P10", "P12", "P13",
    ]

    # Kolom yang wajib ada untuk menghitung U1 & U2
    required_base_cols = ["I1", "I2", "M11", "I7", "I9"] + base_vars
    missing = [c for c in required_base_cols if c not in df.columns]
    if missing:
        raise KeyError(f"Kolom berikut hilang di DataFrame: {missing}")

    # Pastikan urutan berdasarkan waktu
    df = df.sort("date_id")

    # Fitur U1 & U2
    df_feat = df.with_columns(
        (pl.col("I2") - pl.col("I1")).alias("U1"),
        (
            pl.col("M11")
            / ((pl.col("I2") + pl.col("I9") + pl.col("I7")) / 3.0)
        ).alias("U2"),
    )

    # Konfigurasi horizon untuk trend & volatilitas
    trend_windows = [5, 10, 20]   # MA pendek-menengah
    vol_windows   = [10, 20]      # volatilitas jangka pendek/menengah

    derived_vars: List[str] = []
    new_exprs: List[pl.Expr] = []

    for col in base_vars:
        # Momentum 1 hari
        diff_name = f"{col}_diff1"
        derived_vars.append(diff_name)
        new_exprs.append(
            (pl.col(col) - pl.col(col).shift(1)).alias(diff_name)
        )

        # Rolling mean beberapa horizon
        for w in trend_windows:
            rm_name = f"{col}_rm{w}"
            derived_vars.append(rm_name)
            new_exprs.append(
                pl.col(col).rolling_mean(window_size=w, min_periods=1).alias(rm_name)
            )

        # Rolling std (volatilitas)
        for w in vol_windows:
            vol_name = f"{col}_vol{w}"
            derived_vars.append(vol_name)
            new_exprs.append(
                pl.col(col).rolling_std(window_size=w, min_periods=1).alias(vol_name)
            )

    # Tambahkan semua fitur turunan dalam satu with_columns supaya efisien
    df_feat = df_feat.with_columns(new_exprs)

    # Kumpulan semua fitur yang akan kita pakai
    vars_to_keep: List[str] = base_vars + derived_vars + ["U1", "U2"]

    # Imputasi missing dengan EWM untuk semua fitur
    df_feat = (
        df_feat
        .select(["date_id", "target"] + vars_to_keep)
        .with_columns(
            [
                pl.col(col).fill_null(pl.col(col).ewm_mean(com=0.5))
                for col in vars_to_keep
            ]
        )
        .drop_nulls()
        .sort("date_id")
    )

    return df_feat


def join_train_test_dataframes(train: pl.DataFrame, test: pl.DataFrame) -> pl.DataFrame:
    """
    Menggabungkan train dan test berdasarkan kolom yang sama
    (untuk memastikan feature engineering konsisten).

    Args:
        train (pl.DataFrame): DataFrame training mentah.
        test (pl.DataFrame): DataFrame test mentah.

    Returns:
        pl.DataFrame: DataFrame hasil concatenation vertical train+test
                      pada kolom-kolom yang sama.
    """
    common_columns: list[str] = [
        col for col in train.columns if col in test.columns
    ]

    if "date_id" not in common_columns:
        raise KeyError("'date_id' harus ada di kedua DataFrame.")

    return (
        pl.concat(
            [train.select(common_columns), test.select(common_columns)],
            how="vertical",
        )
        .sort("date_id")
    )


def split_dataset(train: pl.DataFrame, test: pl.DataFrame, features: list[str]) -> DatasetOutput:
    """
    Memisahkan data menjadi fitur (X) dan target (y), lalu melakukan scaling.

    Args:
        train (pl.DataFrame): DataFrame training yang sudah diproses.
        test (pl.DataFrame): DataFrame test yang sudah diproses.
        features (list[str]): Daftar nama fitur yang digunakan model.

    Returns:
        DatasetOutput: Dataclass berisi X_train, y_train, X_test, y_test,
                       scaler yang sudah di-fit, feature_names,
                       dan date_id masing-masing set.
    """
    # Pastikan kolom wajib ada
    for col in ["date_id", "target"]:
        if col not in train.columns or col not in test.columns:
            raise KeyError(f"Kolom wajib '{col}' hilang di train/test.")

    # Simpan date_id untuk keperluan time-series CV nanti
    dates_train = train.get_column("date_id")
    dates_test  = test.get_column("date_id")

    X_train = train.drop(["date_id", "target"])
    y_train = train.get_column("target")

    X_test = test.drop(["date_id", "target"])
    y_test = test.get_column("target")

    scaler = StandardScaler()

    # Sklearn lebih nyaman kalau dikasih numpy array
    X_train_np = X_train.to_numpy()
    X_test_np  = X_test.to_numpy()

    # fit_transform pada train
    X_train_scaled_np = scaler.fit_transform(X_train_np)
    X_train_scaled = pl.from_numpy(X_train_scaled_np, schema=features)

    # transform pada test
    X_test_scaled_np = scaler.transform(X_test_np)
    X_test_scaled = pl.from_numpy(X_test_scaled_np, schema=features)

    return DatasetOutput(
        X_train=X_train_scaled,
        X_test=X_test_scaled,
        y_train=y_train,
        y_test=y_test,
        scaler=scaler,
        feature_names=features,
        dates_train=dates_train,
        dates_test=dates_test,
    )


## Converting Return Prediction to Signal

Here is an example of a potential function used to convert a prediction based on the market forward excess return to a daily signal position. 

In [8]:
# ============================================================
# 7. RETURN -> SIGNAL MAPPING
# ============================================================

def convert_ret_to_signal(
    ret_arr: np.ndarray | float | list[float],
    params: RetToSignalParameters,
    debug: bool = False,
) -> np.ndarray:
    """
    Convert raw model predictions (expected excess returns) into a trading signal.

    Mode utama (linear):
        signal = clip( ret * signal_multiplier + 1, min_signal, max_signal )

    Di mana:
        - signal ≈ 1  : posisi netral / benchmark
        - signal < 1  : underweight (kurang dari pasar)
        - signal > 1  : overweight (lebih agresif dari pasar)

    Args:
        ret_arr:
            Predicted returns (bisa scalar, list, atau numpy array).
        params (RetToSignalParameters):
            Parameter scaling dan clipping (min/max signal, multiplier).
        debug (bool):
            Jika True dan bukan mode kompetisi, log ringkas statistik sinyal.

    Returns:
        np.ndarray:
            Array sinyal trading dengan shape 1D (n_samples,),
            sudah di-clip di [min_signal, max_signal].
            (Kalau input scalar, tetap dikembalikan array 1 elemen.)
    """
    # Pastikan dalam bentuk numpy array float (1D)
    ret_arr = np.asarray(ret_arr, dtype=float).reshape(-1)

    if ret_arr.size == 0:
        raise ValueError("ret_arr kosong, tidak ada prediksi yang bisa dikonversi ke sinyal.")

    # Sanity-check: tidak boleh ada NaN / inf
    if not np.all(np.isfinite(ret_arr)):
        raise ValueError(
            "ret_arr mengandung nilai non-finite (NaN/inf). "
            "Pastikan prediksi model sudah dibersihkan dulu."
        )

    # Mapping linear dari return -> posisi
    raw_signal = ret_arr * params.signal_multiplier + 1.0

    # Clip supaya tidak keluar dari range yang diizinkan
    signal = np.clip(raw_signal, params.min_signal, params.max_signal)

    # Debug ringan (hanya kalau diminta, dan bukan di rerun kompetisi)
    if debug and not IS_COMP_RERUN:
        s_min = float(signal.min())
        s_max = float(signal.max())
        s_mean = float(signal.mean())
        logger.info(
            "[signal debug] n=%d | min=%.4f | max=%.4f | mean=%.4f",
            signal.size, s_min, s_max, s_mean,
        )

    return signal


def convert_ret_to_signal_ranked(
    ret_arr: np.ndarray | float | list[float],
    params: RetToSignalParameters,
    lower_q: float = 0.3,
    upper_q: float = 0.7,
) -> np.ndarray:
    """
    Versi alternatif: konversi prediksi return menjadi sinyal berdasarkan ranking.

    Ide:
      - titik di bawah quantile lower_q  -> bias ke sisi bawah (dekat min_signal)
      - titik di atas quantile upper_q  -> bias ke sisi atas (dekat max_signal)
      - titik di tengah                 -> sekitar 1.0 (netral)

    Cocok untuk eksperimen offline dengan Sharpe internal.
    Untuk submission, kamu bisa pilih apakah mau pakai versi linear atau ranking
    berdasarkan hasil validasi.

    Args:
        ret_arr:
            Predicted returns (scalar/list/array).
        params:
            Batas min/max dan titik netral (1.0).
        lower_q, upper_q:
            Quantile untuk menentukan zona bawah/tengah/atas.

    Returns:
        np.ndarray:
            Sinyal 1D terklip dalam [min_signal, max_signal].
    """
    ret_arr = np.asarray(ret_arr, dtype=float).reshape(-1)

    if ret_arr.size == 0:
        raise ValueError("ret_arr kosong, tidak ada prediksi yang bisa dikonversi ke sinyal.")

    if not np.all(np.isfinite(ret_arr)):
        raise ValueError(
            "ret_arr mengandung nilai non-finite (NaN/inf). "
            "Pastikan prediksi model sudah dibersihkan dulu."
        )

    # Hitung threshold quantile
    q_low = np.quantile(ret_arr, lower_q)
    q_high = np.quantile(ret_arr, upper_q)

    # Skala 0–1 berdasarkan posisi antara q_low dan q_high
    # lalu map ke [min_signal, max_signal], dengan 1.0 sebagai titik netral.
    # (Ini bisa kamu tweak lagi setelah lihat hasil Sharpe internal.)
    scaled = np.zeros_like(ret_arr, dtype=float)

    # Zona bawah
    mask_low = ret_arr <= q_low
    scaled[mask_low] = params.min_signal

    # Zona atas
    mask_high = ret_arr >= q_high
    scaled[mask_high] = params.max_signal

    # Zona tengah: lerp ke sekitar 1.0
    mask_mid = ~(mask_low | mask_high)
    if mask_mid.any() and q_high > q_low:
        mid_vals = ret_arr[mask_mid]
        # normalisasi ke [0,1] di antara q_low dan q_high
        norm_mid = (mid_vals - q_low) / (q_high - q_low)
        # map ke [min_signal, max_signal], tapi tarik ke arah 1.0
        mid_signal = params.min_signal + norm_mid * (params.max_signal - params.min_signal)
        # blend dengan 1.0 supaya tidak terlalu ekstrem
        alpha = 0.5  # 0.5 = setengah ke arah 1.0
        mid_signal = alpha * 1.0 + (1 - alpha) * mid_signal
        scaled[mask_mid] = mid_signal
    else:
        # fallback kalau distribusi ret_arr terlalu degenerate
        scaled[mask_mid] = 1.0

    signal = np.clip(scaled, params.min_signal, params.max_signal)
    return signal


## Looking at the Data

In [9]:
# ============================================================
# 8. QUICK DATA CHECK: TRAIN & TEST (LOCAL ONLY)
# ============================================================

# Bagian ini hanya dijalankan saat TIDAK rerun kompetisi,
# supaya tidak menambah waktu eksekusi saat submit.
if not IS_COMP_RERUN:
    # --------------------------------------------------------
    # 8.1 Load data mentah
    # --------------------------------------------------------
    train: pl.DataFrame = load_trainset()
    test: pl.DataFrame  = load_testset()

    print("=== SHAPE ===")
    print("Train shape:", train.shape)
    print("Test shape :", test.shape)

    # --------------------------------------------------------
    # 8.2 Range date_id untuk memastikan urut dan tidak bolong
    # --------------------------------------------------------
    print("\n=== DATE RANGE ===")
    print(
        "Train date_id range:",
        int(train["date_id"].min()),
        "→",
        int(train["date_id"].max()),
    )
    print(
        "Test  date_id range:",
        int(test["date_id"].min()),
        "→",
        int(test["date_id"].max()),
    )

    # Cek beberapa baris terakhir train
    print("\n=== TRAIN SAMPLE (tail 3) ===")
    print(train.tail(3))

    # Cek beberapa kolom penting di test (date_id, is_scored, target)
    cols_to_show_test = [
        c for c in ["date_id", "is_scored", "target"] if c in test.columns
    ]
    print("\n=== TEST SAMPLE (head 5) ===")
    print(test.select(cols_to_show_test).head(5))

    # --------------------------------------------------------
    # 8.3 Statistik dasar target
    # --------------------------------------------------------
    print("\n=== TARGET STATS (train.target) ===")
    print(train.select("target").describe())

    # Tambahan: proporsi return positif/negatif & outlier kasar
    target_np = train["target"].to_numpy()
    n = target_np.size
    pos_ratio = float((target_np > 0).sum()) / n
    neg_ratio = float((target_np < 0).sum()) / n
    zero_ratio = float((target_np == 0).sum()) / n

    print("\n=== TARGET SIGN COUNTS (train.target) ===")
    print(f"n       : {n}")
    print(f"> 0     : {pos_ratio:.3%}")
    print(f"< 0     : {neg_ratio:.3%}")
    print(f"== 0    : {zero_ratio:.3%}")

    # Outlier sederhana (5th, 95th, max abs)
    q05, q95 = np.quantile(target_np, [0.05, 0.95])
    max_abs = float(np.max(np.abs(target_np)))
    print("\n=== TARGET QUANTILES (train.target) ===")
    print(f"5%      : {q05:.6f}")
    print(f"95%     : {q95:.6f}")
    print(f"max|x|  : {max_abs:.6f}")

    # --------------------------------------------------------
    # 8.4 Missing values (top 10)
    # --------------------------------------------------------
    print("\n=== TOP 10 NULL COUNTS (TRAIN) ===")
    null_counts_train = (
        train.null_count()
        .melt(variable_name="column", value_name="n_null")
        .with_columns(
            (pl.col("n_null") / train.height).alias("null_frac")
        )
        .sort("n_null", descending=True)
        .head(10)
    )
    print(null_counts_train)

    print("\n=== TOP 10 NULL COUNTS (TEST) ===")
    null_counts_test = (
        test.null_count()
        .melt(variable_name="column", value_name="n_null")
        .with_columns(
            (pl.col("n_null") / test.height).alias("null_frac")
        )
        .sort("n_null", descending=True)
        .head(10)
    )
    print(null_counts_test)

    # --------------------------------------------------------
    # 8.5 Korelasi sederhana fitur–target (top 15)
    # --------------------------------------------------------
    # Hanya untuk numeric, dan exclude date_id/target
    num_cols = [
        c for c, dt in train.schema.items()
        if c not in ("date_id", "target") and dt in (pl.Float64, pl.Float32, pl.Int64, pl.Int32)
    ]

    if num_cols:
        # pakai pandas sebentar untuk corrwith (lebih simpel)
        train_pd = train.select(["target"] + num_cols).to_pandas()
        corr_series = train_pd[num_cols].corrwith(train_pd["target"])
        corr_df = (
            pl.DataFrame({
                "feature": corr_series.index.to_list(),
                "corr_target": corr_series.values,
            })
            .with_columns(
                pl.col("corr_target").abs().alias("abs_corr")
            )
            .sort("abs_corr", descending=True)
            .head(15)
        )

        print("\n=== TOP 15 |CORR(feature, target)| (TRAIN) ===")
        print(corr_df)
    else:
        print("\n[WARN] Tidak ada numeric feature yang bisa dihitung korelasinya.")


=== SHAPE ===
Train shape: (9011, 98)
Test shape : (10, 99)

=== DATE RANGE ===
Train date_id range: 0 → 9010
Test  date_id range: 8980 → 8989

=== TRAIN SAMPLE (tail 3) ===
shape: (3, 98)
┌─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬───┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┐
│ dat ┆ D1  ┆ D2  ┆ D3  ┆ D4  ┆ D5  ┆ D6  ┆ D7  ┆ D8  ┆ D9  ┆ … ┆ V3  ┆ V4  ┆ V5  ┆ V6  ┆ V7  ┆ V8  ┆ V9  ┆ for ┆ ris ┆ tar │
│ e_i ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆   ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ war ┆ k_f ┆ get │
│ d   ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆   ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ f64 ┆ d_r ┆ ree ┆ --- │
│ --- ┆     ┆     ┆     ┆     ┆     ┆     ┆     ┆     ┆     ┆   ┆     ┆     ┆     ┆     ┆     ┆     ┆     ┆ etu ┆ _ra ┆ f64 │
│ i32 ┆     ┆     ┆     ┆     ┆     ┆     ┆     ┆     ┆     ┆   ┆     ┆     ┆     ┆     ┆     ┆     ┆     ┆ rns ┆ te  ┆     │
│     ┆     ┆     ┆     ┆     ┆     ┆     ┆     ┆     ┆

  .melt(variable_name="column", value_name="n_null")
  .melt(variable_name="column", value_name="n_null")


## Generating the Train and Test

In [10]:
# ============================================================
# 9. FEATURE ENGINEERING + TRAIN/TEST SPLIT
# ============================================================

# 1) Selalu load train & test mentah di sini
#    (jangan bergantung ke cell EDA yang hanya jalan saat not IS_COMP_RERUN)
train_raw: pl.DataFrame = load_trainset()
test_raw: pl.DataFrame  = load_testset()

# (Opsional) Simpan is_scored dari test untuk analisis lokal / LB-mirroring
is_scored_test: Optional[pl.Series] = (
    test_raw.get_column("is_scored") if "is_scored" in test_raw.columns else None
)

# 2) Gabungkan train & test mentah pada kolom yang sama (supaya FE konsisten)
df_all: pl.DataFrame = join_train_test_dataframes(train_raw, test_raw)

# 3) Terapkan feature engineering (U1, U2, momentum, rolling trend & vol, EWM fill, drop_null)
df_fe: pl.DataFrame = create_example_dataset(df=df_all)

# 4) Kembalikan lagi ke train_fe dan test_fe berdasarkan date_id awal
train_ids = train_raw.get_column("date_id").unique()
test_ids  = test_raw.get_column("date_id").unique()

train_fe: pl.DataFrame = df_fe.filter(pl.col("date_id").is_in(train_ids))
test_fe: pl.DataFrame  = df_fe.filter(pl.col("date_id").is_in(test_ids))

# Pastikan sudah terurut waktu
train_fe = train_fe.sort("date_id")
test_fe  = test_fe.sort("date_id")

# Sanity-check: tidak ada overlap aneh (harusnya union = seluruh df_fe)
if not IS_COMP_RERUN:
    n_all = df_fe.height
    n_union = train_fe.height + test_fe.height
    print(f"[DEBUG] df_fe height     : {n_all}")
    print(f"[DEBUG] train_fe + test_fe: {n_union}")
    if n_union != n_all:
        print("[WARN] Ada baris yang tidak terklasifikasi ke train/test berdasarkan date_id.")

# 5) Definisikan daftar fitur (semua kolom kecuali 'date_id' dan 'target')
FEATURES: list[str] = sorted(
    [col for col in test_fe.columns if col not in ["date_id", "target"]]
)

# 6) Split menjadi X/y + scaling, dibungkus dalam DatasetOutput
dataset: DatasetOutput = split_dataset(
    train=train_fe,
    test=test_fe,
    features=FEATURES,
)

X_train: pl.DataFrame      = dataset.X_train
X_test: pl.DataFrame       = dataset.X_test
y_train: pl.Series         = dataset.y_train
y_test: pl.Series          = dataset.y_test
scaler: StandardScaler     = dataset.scaler
feature_names: list[str]   = dataset.feature_names or FEATURES
dates_train: pl.Series     = dataset.dates_train
dates_test: pl.Series      = dataset.dates_test

# (Opsional) Quick check – hanya saat lokal, supaya tidak nambah waktu di kompetisi
if not IS_COMP_RERUN:
    print("=== FE & SPLIT SUMMARY ===")
    print("X_train shape :", X_train.shape)
    print("X_test  shape :", X_test.shape)
    print("y_train length:", y_train.len())
    print("y_test  length:", y_test.len())
    print("Num features  :", len(FEATURES))
    print("Train dates   :", int(dates_train.min()), "→", int(dates_train.max()))
    print("Test  dates   :", int(dates_test.min()),  "→", int(dates_test.max()))

    assert X_train.height == y_train.len(), "X_train dan y_train tidak sebaris!"
    assert X_test.height == y_test.len(), "X_test dan y_test tidak sebaris!"

    if is_scored_test is not None:
        n_scored = int((is_scored_test == 1).sum())
        print(f"Test is_scored=1 count : {n_scored} / {is_scored_test.len()}")


[DEBUG] df_fe height     : 7509
[DEBUG] train_fe + test_fe: 7529
[WARN] Ada baris yang tidak terklasifikasi ke train/test berdasarkan date_id.
=== FE & SPLIT SUMMARY ===
X_train shape : (7509, 79)
X_test  shape : (20, 79)
y_train length: 7509
y_test  length: 20
Num features  : 79
Train dates   : 1512 → 9010
Test  dates   : 8980 → 8989
Test is_scored=1 count : 9 / 10


  pl.col(col).rolling_mean(window_size=w, min_periods=1).alias(rm_name)
  pl.col(col).rolling_std(window_size=w, min_periods=1).alias(vol_name)


## Fitting the Model 

In [11]:
# ============================================================
# 10. FIT ELASTICNET BASELINE (TIME-SERIES CV + SHARPE)
# ============================================================

# Sklearn lebih aman kalau dikasih numpy array
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()
dates_train_np = dates_train.to_numpy()

def compute_sharpe(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    signal_params: RetToSignalParameters,
) -> float:
    """
    Hitung Sharpe ratio tahunan sederhana dari prediksi return.

    - Konversi prediksi return -> sinyal (0..2) via convert_ret_to_signal
    - Return strategi harian = signal * y_true
    - Sharpe harian = mean / std
    - Sharpe tahunan = Sharpe harian * sqrt(252)
    """
    signal = convert_ret_to_signal(y_pred, signal_params, debug=False)
    strat_ret = signal * y_true

    vol = strat_ret.std(ddof=1)
    if vol == 0 or not np.isfinite(vol):
        return 0.0

    mean_ret = strat_ret.mean()
    sharpe_daily = mean_ret / vol
    sharpe_annual = float(sharpe_daily * EVAL_CFG.sharpe_annualization_factor)
    return sharpe_annual


def build_time_series_folds(
    dates: np.ndarray,
    n_folds: int,
    gap_days: int = 3,
) -> list[dict]:
    """
    Bangun fold time-series expanding window sederhana berbasis date_id.

    - Setiap fold:
        * train: semua tanggal sebelum window val, dikurangi gap_days terakhir
        * val  : satu window tanggal setelah train
    """
    unique_dates = np.unique(dates)
    unique_dates.sort()

    n_dates = len(unique_dates)
    if n_dates < (n_folds + 1):
        raise ValueError(f"Jumlah tanggal ({n_dates}) terlalu sedikit untuk n_folds={n_folds}.")

    fold_size = n_dates // (n_folds + 1)  # 1 blok awal + n_folds blok val

    folds: list[dict] = []
    for k in range(n_folds):
        # Definisi window tanggal untuk fold k
        val_start_idx = (k + 1) * fold_size
        val_end_idx   = (k + 2) * fold_size if k < n_folds - 1 else n_dates

        train_end_idx = max(0, val_start_idx - gap_days)
        train_dates = unique_dates[:train_end_idx]
        val_dates   = unique_dates[val_start_idx:val_end_idx]

        train_mask = np.isin(dates, train_dates)
        val_mask   = np.isin(dates, val_dates)

        train_idx = np.where(train_mask)[0]
        val_idx   = np.where(val_mask)[0]

        if train_idx.size == 0 or val_idx.size == 0:
            continue

        folds.append(
            {
                "fold_idx": k,
                "train_idx": train_idx,
                "val_idx": val_idx,
                "train_start_date": int(train_dates.min()) if train_dates.size > 0 else int(unique_dates[0]),
                "train_end_date": int(train_dates.max()) if train_dates.size > 0 else int(unique_dates[0]),
                "val_start_date": int(val_dates.min()),
                "val_end_date": int(val_dates.max()),
            }
        )

    if not folds:
        raise RuntimeError("Gagal membentuk time-series folds. Cek konfigurasi n_folds / gap_days.")

    return folds


# ------------------------------------------------------------
# 10.1 Bangun time-series folds
# ------------------------------------------------------------
ts_folds = build_time_series_folds(
    dates=dates_train_np,
    n_folds=EVAL_CFG.n_folds,
    gap_days=3,
)

if not IS_COMP_RERUN:
    print(f"[TS-CV] Built {len(ts_folds)} folds (n_folds target = {EVAL_CFG.n_folds})")
    for f in ts_folds:
        print(
            f"  Fold {f['fold_idx']}: "
            f"train {f['train_start_date']}→{f['train_end_date']} "
            f"| val {f['val_start_date']}→{f['val_end_date']} "
            f"| n_train={len(f['train_idx'])}, n_val={len(f['val_idx'])}"
        )


# ------------------------------------------------------------
# 10.2 Grid search alpha berbasis Sharpe internal
# ------------------------------------------------------------

candidate_alphas = enet_params.alphas  # dari konfigurasi global
alpha_sharpe_scores: dict[float, float] = {}

for alpha in candidate_alphas:
    fold_sharpes: list[float] = []

    for f in ts_folds:
        tr_idx = f["train_idx"]
        va_idx = f["val_idx"]

        X_tr, y_tr = X_train_np[tr_idx], y_train_np[tr_idx]
        X_va, y_va = X_train_np[va_idx], y_train_np[va_idx]

        # Model per-fold (ElasticNet)
        model_fold = ElasticNet(
            alpha=float(alpha),
            l1_ratio=enet_params.l1_ratio,
            max_iter=enet_params.max_iter,
            fit_intercept=enet_params.fit_intercept,
            random_state=enet_params.random_state,
        )
        model_fold.fit(X_tr, y_tr)

        y_va_pred = model_fold.predict(X_va)
        sharpe_val = compute_sharpe(y_true=y_va, y_pred=y_va_pred, signal_params=ret_signal_params)
        fold_sharpes.append(sharpe_val)

    if fold_sharpes:
        alpha_sharpe_scores[float(alpha)] = float(np.mean(fold_sharpes))

# Pilih alpha dengan Sharpe rata-rata tertinggi
if not alpha_sharpe_scores:
    raise RuntimeError("Tidak ada skor Sharpe yang berhasil dihitung dalam TS-CV.")

best_alpha = max(alpha_sharpe_scores, key=alpha_sharpe_scores.get)
best_sharpe = alpha_sharpe_scores[best_alpha]

# (Opsional) lihat beberapa alpha terbaik saat lokal
if not IS_COMP_RERUN:
    sorted_alphas = sorted(alpha_sharpe_scores.items(), key=lambda kv: kv[1], reverse=True)
    print("\n[TS-CV] Top 5 alpha berdasarkan Sharpe (annualised):")
    for a, s in sorted_alphas[:5]:
        print(f"  alpha={a:.6f} | Sharpe={s:.4f}")

    print(f"\n[TS-CV] Best alpha (by Sharpe) : {best_alpha:.6f}")
    print(f"[TS-CV] Best Sharpe (mean CV)  : {best_sharpe:.4f}")


# ------------------------------------------------------------
# 10.3 Fit final ElasticNet di seluruh training data
# ------------------------------------------------------------
model: ElasticNet = ElasticNet(
    alpha=best_alpha,
    l1_ratio=enet_params.l1_ratio,
    max_iter=enet_params.max_iter,
    fit_intercept=enet_params.fit_intercept,
    random_state=enet_params.random_state,
)
model.fit(X_train_np, y_train_np)

# Quick sanity check: performa di training (bukan metric kompetisi, hanya cek leak/bug)
y_pred_train = model.predict(X_train_np)
r2_train = r2_score(y_train_np, y_pred_train)
mse_train = mean_squared_error(y_train_np, y_pred_train)

print("\n=== ElasticNet Baseline Fitted (TS-CV Sharpe-tuned) ===")
print(f"Best alpha (Sharpe-CV): {best_alpha:.6f}")
print(f"L1 ratio              : {enet_params.l1_ratio}")
print(f"Train R²              : {r2_train:.6f}")
print(f"Train MSE             : {mse_train:.6e}")
print(f"(Internal CV Sharpe   : {best_sharpe:.4f} annualised)")


[TS-CV] Built 5 folds (n_folds target = 5)
  Fold 0: train 1512→2757 | val 2761→4009 | n_train=1246, n_val=1249
  Fold 1: train 1512→4006 | val 4010→5258 | n_train=2495, n_val=1249
  Fold 2: train 1512→5255 | val 5259→6507 | n_train=3744, n_val=1249
  Fold 3: train 1512→6504 | val 6508→7756 | n_train=4993, n_val=1249
  Fold 4: train 1512→7753 | val 7757→9010 | n_train=6242, n_val=1264

[TS-CV] Top 5 alpha berdasarkan Sharpe (annualised):
  alpha=0.000266 | Sharpe=0.3673
  alpha=0.000305 | Sharpe=0.3667
  alpha=0.000351 | Sharpe=0.3658
  alpha=0.000231 | Sharpe=0.3657
  alpha=0.000404 | Sharpe=0.3610

[TS-CV] Best alpha (by Sharpe) : 0.000266
[TS-CV] Best Sharpe (mean CV)  : 0.3673

=== ElasticNet Baseline Fitted (TS-CV Sharpe-tuned) ===
Best alpha (Sharpe-CV): 0.000266
L1 ratio              : 0.5
Train R²              : 0.014995
Train MSE             : 1.212204e-04
(Internal CV Sharpe   : 0.3673 annualised)


## Prediction Function via Kaggle Server

In [12]:
# ============================================================
# 11. PREDICTION FUNCTION FOR KAGGLE EVALUATION API
# ============================================================

def predict(test: pl.DataFrame) -> float:
    """
    Fungsi prediksi yang dipanggil oleh Kaggle Evaluation API.

    Alur:
    1) Rename kolom target (lagged_forward_returns -> target) jika perlu.
    2) Terapkan feature engineering (create_example_dataset).
    3) Pilih fitur sesuai FEATURES dan lakukan scaling dengan scaler yang sudah di-fit.
    4) Prediksi expected excess return pakai model.
    5) Konversi return -> sinyal trading via convert_ret_to_signal.
    6) Return satu nilai float (sinyal) untuk batch pertama.

    Catatan:
    - DefaultInferenceServer biasanya memanggil fungsi ini dengan
      1 baris per call, tapi kita tetap handle jika >1 baris.
    """
    # 1. Pastikan kolom target ada
    if "target" not in test.columns:
        if "lagged_forward_returns" in test.columns:
            test = test.rename({"lagged_forward_returns": "target"})
        else:
            raise KeyError(
                "Test dataframe harus memiliki kolom 'target' atau 'lagged_forward_returns'."
            )

    # 2. Pastikan 'date_id' ada
    if "date_id" not in test.columns:
        raise KeyError("Kolom 'date_id' wajib ada di dataframe test.")

    # 3. Feature engineering (U1, U2, imputasi EWM, subset fitur, drop_null)
    df = create_example_dataset(test)

    # Jika setelah FE tidak ada baris (misal semua null dan ter-drop),
    # kembalikan sinyal netral 1.0 agar tidak crash saat evaluasi.
    if df.height == 0:
        return float(1.0)

    # 4. Ambil hanya fitur yang sudah didefinisikan di training
    X_test = df.select(FEATURES)

    # Sanity-check: pastikan semua fitur ada
    missing_feats = [f for f in FEATURES if f not in X_test.columns]
    if missing_feats:
        raise KeyError(
            f"Fitur berikut hilang di data FE test: {missing_feats}"
        )

    # 5. Sklearn pakai numpy array
    X_test_np = X_test.to_numpy()
    X_test_scaled_np = scaler.transform(X_test_np)

    # 6. Prediksi expected excess return
    raw_pred = model.predict(X_test_scaled_np)  # array shape (n_samples,)

    # 7. Konversi ke sinyal trading
    signal_arr = convert_ret_to_signal(raw_pred, ret_signal_params)

    # 8. Ambil satu nilai (biasanya satu baris per call)
    return float(signal_arr[0])


## Launch Server

In [13]:
# ============================================================
# 12. START KAGGLE EVALUATION SERVER
# ============================================================

inference_server = kei.DefaultInferenceServer(predict)

if IS_COMP_RERUN:
    # Mode ini dipakai saat SUBMIT notebook ke kompetisi
    print("Detected competition rerun environment. Starting inference server...")
    inference_server.serve()
else:
    # Mode lokal / saat run manual di Notebook
    print("Running in LOCAL GATEWAY mode for debugging...")
    print(f"Using input dir: {INPUT_DIR}")
    inference_server.run_local_gateway((str(INPUT_DIR),))

    # Setelah lokal gateway selesai, cek apakah submission.parquet sudah dibuat
    sub_path = Path("submission.parquet")
    if sub_path.exists():
        print("\nsubmission.parquet generated at:", sub_path.resolve())
    else:
        print("\nWARNING: submission.parquet not found in working directory.")


Running in LOCAL GATEWAY mode for debugging...
Using input dir: /kaggle/input/hull-tactical-market-prediction

submission.parquet generated at: /kaggle/working/submission.parquet


  pl.col(col).rolling_mean(window_size=w, min_periods=1).alias(rm_name)
  pl.col(col).rolling_std(window_size=w, min_periods=1).alias(vol_name)
