## Imports

In [1]:
# ============================================================
# 1. Imports & Global Setup
# ============================================================
from __future__ import annotations

import os
import datetime
import random
import warnings
from pathlib import Path
from dataclasses import dataclass, asdict
from typing import List, Dict, Tuple, Optional, Any

import numpy as np
import polars as pl

from sklearn.linear_model import ElasticNet, ElasticNetCV, LinearRegression
from sklearn.preprocessing import StandardScaler

from tqdm.auto import tqdm  # lebih aman di notebook

import kaggle_evaluation.default_inference_server as kei
# nantinya dipakai sebagai: kei.DefaultInferenceServer(...)

# ============================================================
# 2. Reproducibility
# ============================================================
SEED: int = 42
np.random.seed(SEED)
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

# optional: kalau nanti pakai model lain yang ada random_state, pakai SEED ini

# ============================================================
# 3. Warning / Logging Setup (minimal)
# ============================================================
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)


## Project Directory Structure

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hull-tactical-market-prediction/train.csv
/kaggle/input/hull-tactical-market-prediction/test.csv
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/default_inference_server.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/default_gateway.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/__init__.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/templates.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/base_gateway.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/relay.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/kaggle_evaluation.proto
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/__init__.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2.py
/kaggle/input/hull-tactical-market-prediction/kaggle_evaluation/core/generated/kaggle_evaluation_pb2_grpc.py
/kaggl

## Configurations

In [3]:
# ============================================================
# 1. IMPORTS & GLOBAL SETUP
# ============================================================
from __future__ import annotations

import os
import datetime
import random
import warnings
from pathlib import Path
from dataclasses import dataclass, asdict, field
from typing import List, Dict, Tuple, Optional, Any

import numpy as np
import polars as pl
from tqdm.auto import tqdm

from sklearn.linear_model import ElasticNet, ElasticNetCV, LinearRegression, Ridge, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error

# Optional: tree-based model (akan dipakai nanti untuk naikkan skor)
try:
    import lightgbm as lgb
    HAS_LGBM = True
except ImportError:
    HAS_LGBM = False

import kaggle_evaluation.default_inference_server as kei

# Reproducibility
SEED: int = 42
np.random.seed(SEED)
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)

# Sedikit bersihin warning yang kurang penting
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)


# ============================================================
# 2. PROJECT DIRECTORY STRUCTURE & CONFIG
# ============================================================

# Nama kompetisi (dipakai untuk bikin folder kerja terpisah)
COMP_NAME: str = "hull-tactical-market-prediction"

# Nama eksperimen (bisa kamu ganti saat coba konfigurasi lain)
EXPERIMENT_NAME: str = f"enet_lgbm_v1_{datetime.datetime.now().strftime('%Y%m%d_%H%M')}"

# ---- Input & Working Dirs ----
INPUT_DIR: Path = Path("/kaggle/input") / COMP_NAME
WORK_DIR: Path  = Path("/kaggle/working") / COMP_NAME

# Pastikan WORK_DIR ada (supaya semua output ngumpul di sini)
WORK_DIR.mkdir(parents=True, exist_ok=True)

# File data utama
TRAIN_PATH: Path = INPUT_DIR / "train.csv"
TEST_PATH: Path  = INPUT_DIR / "test.csv"

# Folder resmi dari Kaggle Evaluation API (source & copy ke working jika perlu)
KAGGLE_EVAL_SRC: Path  = INPUT_DIR / "kaggle_evaluation"
KAGGLE_EVAL_WORK: Path = WORK_DIR / "kaggle_evaluation"
KAGGLE_EVAL_WORK.mkdir(parents=True, exist_ok=True)

# ---- Output structure (untuk hasil eksperimen) ----
OUT_DIR: Path        = WORK_DIR / "outputs"
MODEL_DIR: Path      = OUT_DIR / "models"       # simpan model, scaler, dsb.
FEATURE_DIR: Path    = OUT_DIR / "features"     # simpan dataset hasil FE (opsional)
LOG_DIR: Path        = OUT_DIR / "logs"         # catatan eksperimen, metrik
SUBMISSION_DIR: Path = OUT_DIR / "submissions"  # submission lokal untuk dicek

for p in [OUT_DIR, MODEL_DIR, FEATURE_DIR, LOG_DIR, SUBMISSION_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# Flag environment: apakah ini run dalam mode kompetisi (rerun) atau lokal
IS_COMP_RERUN: bool = os.getenv("KAGGLE_IS_COMPETITION_RERUN") is not None


# ============================================================
# 3. RETURNS -> SIGNAL CONFIGS
# ============================================================

@dataclass(frozen=True)
class SignalConfig:
    """
    Konfigurasi untuk mengubah prediksi return -> sinyal trading harian.

    min_signal / max_signal : batas posisi (0 = cash, 2 = 2x leverage)
    multiplier              : skala sensitivitas sinyal terhadap prediksi return
    """
    min_signal: float = 0.0
    max_signal: float = 2.0
    multiplier: float = 400.0


# Beberapa preset sinyal yang nanti bisa kamu coba via validasi Sharpe
SIGNAL_PRESETS: Dict[str, SignalConfig] = {
    "baseline": SignalConfig(min_signal=0.0, max_signal=2.0, multiplier=400.0),
    "conservative": SignalConfig(min_signal=0.5, max_signal=1.5, multiplier=200.0),
    "aggressive": SignalConfig(min_signal=0.0, max_signal=2.0, multiplier=600.0),
}

ACTIVE_SIGNAL_KEY: str = "baseline"  # ganti di sini kalau mau coba preset lain
SIGNAL_CFG: SignalConfig = SIGNAL_PRESETS[ACTIVE_SIGNAL_KEY]


# ============================================================
# 4. MODEL & EVALUATION CONFIGS
# ============================================================

@dataclass(frozen=True)
class ModelConfig:
    """
    Konfigurasi utama untuk model ElasticNet baseline.
    Akan jadi 'core signal' yang stabil dan cepat.
    """
    cv_folds: int = 10
    l1_ratio: float = 0.5
    alphas: np.ndarray = field(
        default_factory=lambda: np.logspace(-4, 2, 100)
    )
    max_iter: int = 1_000_000
    random_state: int = SEED

ENET_CFG = ModelConfig()


@dataclass(frozen=True)
class GBMConfig:
    """
    Konfigurasi default untuk model tree-based (LightGBM).

    Model ini yang nanti akan kita pakai untuk menangkap non-linearitas
    dan interaksi fitur, kemudian di-ensemble dengan ElasticNet.
    """
    num_leaves: int = 31
    max_depth: int = -1          # -1 = unlimited, tapi kita nanti bisa kecilkan
    learning_rate: float = 0.03
    n_estimators: int = 500
    subsample: float = 0.8       # row subsampling
    colsample_bytree: float = 0.8
    reg_alpha: float = 0.0
    reg_lambda: float = 1.0
    random_state: int = SEED

GBM_CFG = GBMConfig()


@dataclass(frozen=True)
class EvaluationConfig:
    """
    Konfigurasi evaluasi internal (Sharpe, validasi time-series).
    """
    n_folds: int = 5                 # jumlah fold time-series CV
    val_fraction: float = 0.2        # porsi akhir data untuk pure hold-out (opsional)
    sharpe_annualization_factor: float = np.sqrt(252.0)  # 252 hari bursa setahun

EVAL_CFG = EvaluationConfig()


In [4]:
print("WORK_DIR :", WORK_DIR)
print("TRAIN_PATH exists:", TRAIN_PATH.exists())
print("TEST_PATH exists :", TEST_PATH.exists())
print("Signal config    :", SIGNAL_CFG)
print("Model config     :", ENET_CFG)

WORK_DIR : /kaggle/working/hull-tactical-market-prediction
TRAIN_PATH exists: True
TEST_PATH exists : True
Signal config    : SignalConfig(min_signal=0.0, max_signal=2.0, multiplier=400.0)
Model config     : ModelConfig(cv_folds=10, l1_ratio=0.5, alphas=array([1.00000000e-04, 1.14975700e-04, 1.32194115e-04, 1.51991108e-04,
       1.74752840e-04, 2.00923300e-04, 2.31012970e-04, 2.65608778e-04,
       3.05385551e-04, 3.51119173e-04, 4.03701726e-04, 4.64158883e-04,
       5.33669923e-04, 6.13590727e-04, 7.05480231e-04, 8.11130831e-04,
       9.32603347e-04, 1.07226722e-03, 1.23284674e-03, 1.41747416e-03,
       1.62975083e-03, 1.87381742e-03, 2.15443469e-03, 2.47707636e-03,
       2.84803587e-03, 3.27454916e-03, 3.76493581e-03, 4.32876128e-03,
       4.97702356e-03, 5.72236766e-03, 6.57933225e-03, 7.56463328e-03,
       8.69749003e-03, 1.00000000e-02, 1.14975700e-02, 1.32194115e-02,
       1.51991108e-02, 1.74752840e-02, 2.00923300e-02, 2.31012970e-02,
       2.65608778e-02, 3.05385551e-0

## Dataclasses Helpers

In [5]:
# ============================================================
# 5. DATACLASSES HELPERS
# ============================================================

@dataclass
class DatasetOutput:
    """
    Paket hasil preprocessing dataset.

    - X_train, y_train : data untuk melatih model
    - X_test,  y_test  : data untuk evaluasi (mock test / hold-out)
    - scaler           : StandardScaler yang sudah di-fit pada X_train
    - feature_names    : daftar nama fitur (opsional, tapi berguna untuk debugging)
    """
    X_train: pl.DataFrame
    X_test: pl.DataFrame
    y_train: pl.Series
    y_test: pl.Series
    scaler: StandardScaler
    feature_names: list[str] | None = None


@dataclass(frozen=True)
class ElasticNetParameters:
    """
    Parameter yang dipakai ketika membangun ElasticNetCV / ElasticNet.

    Default-nya diambil dari ENET_CFG (ModelConfig) supaya konsisten
    dengan konfigurasi global, tapi tetap bisa dioverride kalau perlu.
    """
    l1_ratio: float = ENET_CFG.l1_ratio
    cv: int = ENET_CFG.cv_folds
    alphas: np.ndarray = field(
        default_factory=lambda: ENET_CFG.alphas.copy()
    )
    max_iter: int = ENET_CFG.max_iter

    def __post_init__(self) -> None:
        if not (0.0 <= self.l1_ratio <= 1.0):
            raise ValueError(
                "ElasticNet l1_ratio harus berada di dalam interval [0, 1]."
            )


@dataclass(frozen=True)
class RetToSignalParameters:
    """
    Parameter untuk mengubah prediksi return -> sinyal trading harian.

    Default diambil dari SIGNAL_CFG supaya selaras dengan konfigurasi global.
    """
    signal_multiplier: float = SIGNAL_CFG.multiplier
    min_signal: float = SIGNAL_CFG.min_signal
    max_signal: float = SIGNAL_CFG.max_signal

    def __post_init__(self) -> None:
        if self.min_signal >= self.max_signal:
            raise ValueError(
                "min_signal harus lebih kecil daripada max_signal."
            )


## Set the Parameters

In [6]:
# ============================================================
# 5. DATACLASSES HELPERS + PARAMETER OBJECTS
# ============================================================

@dataclass
class DatasetOutput:
    """
    Paket hasil preprocessing dataset.

    - X_train, y_train : data untuk melatih model
    - X_test,  y_test  : data untuk evaluasi (mock test / hold-out)
    - scaler           : StandardScaler yang sudah di-fit pada X_train
    - feature_names    : daftar nama fitur (opsional, untuk debugging/analisis)
    - dates_train      : date_id untuk baris X_train (opsional, untuk TS-CV)
    - dates_test       : date_id untuk baris X_test  (opsional, untuk TS-CV)
    """
    X_train: pl.DataFrame
    X_test: pl.DataFrame
    y_train: pl.Series
    y_test: pl.Series
    scaler: StandardScaler
    feature_names: list[str] | None = None
    dates_train: pl.Series | None = None
    dates_test: pl.Series | None = None


@dataclass(frozen=True)
class ElasticNetParameters:
    """
    Parameter yang dipakai ketika membangun ElasticNetCV / ElasticNet.

    Default diambil dari ENET_CFG (ModelConfig) supaya konsisten dengan
    konfigurasi global, tapi bisa dioverride kalau perlu.
    """
    l1_ratio: float = ENET_CFG.l1_ratio
    cv: int = ENET_CFG.cv_folds
    alphas: np.ndarray = field(
        default_factory=lambda: ENET_CFG.alphas.copy()
    )
    max_iter: int = ENET_CFG.max_iter
    random_state: int = ENET_CFG.random_state
    n_jobs: int = -1
    fit_intercept: bool = True

    def __post_init__(self) -> None:
        if not (0.0 <= self.l1_ratio <= 1.0):
            raise ValueError(
                "ElasticNet l1_ratio harus berada di dalam interval [0, 1]."
            )


@dataclass(frozen=True)
class RetToSignalParameters:
    """
    Parameter untuk mengubah prediksi return -> sinyal trading harian.

    Default diambil dari SIGNAL_CFG supaya selaras dengan konfigurasi global.
    """
    signal_multiplier: float = SIGNAL_CFG.multiplier
    min_signal: float = SIGNAL_CFG.min_signal
    max_signal: float = SIGNAL_CFG.max_signal

    def __post_init__(self) -> None:
        if self.min_signal >= self.max_signal:
            raise ValueError(
                "min_signal harus lebih kecil daripada max_signal."
            )


@dataclass(frozen=True)
class GBMParameters:
    """
    Parameter untuk model tree-based (LightGBM) yang akan kita gunakan
    sebagai pelengkap model linear (ElasticNet/Ridge).

    Default diambil dari GBM_CFG (GBMConfig).
    """
    num_leaves: int = GBM_CFG.num_leaves
    max_depth: int = GBM_CFG.max_depth
    learning_rate: float = GBM_CFG.learning_rate
    n_estimators: int = GBM_CFG.n_estimators
    subsample: float = GBM_CFG.subsample
    colsample_bytree: float = GBM_CFG.colsample_bytree
    reg_alpha: float = GBM_CFG.reg_alpha
    reg_lambda: float = GBM_CFG.reg_lambda
    random_state: int = GBM_CFG.random_state


# ------------------------------------------------------------
# Instansiasi objek parameter yang akan dipakai di pipeline
# ------------------------------------------------------------

ret_signal_params = RetToSignalParameters()
enet_params       = ElasticNetParameters()
gbm_params: Optional[GBMParameters] = GBMParameters() if HAS_LGBM else None


## Dataset Loading/Creating Helper Functions

In [7]:
# ============================================================
# 6. DATA LOADING & PREPROCESSING HELPERS
# ============================================================

def load_trainset(path: Path = TRAIN_PATH, drop_last_n: int = 10) -> pl.DataFrame:
    """
    Load dan praproses training dataset.

    - Mengganti nama kolom target menjadi 'target'
      (market_forward_excess_returns -> target).
    - Meng-cast semua kolom selain 'date_id' ke Float64.
    - Sorting berdasarkan date_id.
    - Opsional: membuang N baris terakhir (drop_last_n) untuk
      menghindari kebocoran saat mock test.

    Args:
        path (Path): lokasi file train.csv.
        drop_last_n (int): jumlah baris terakhir yang dibuang.

    Returns:
        pl.DataFrame: DataFrame training yang sudah rapi.
    """
    df = (
        pl.read_csv(path)
        .rename({"market_forward_excess_returns": "target"})
        .with_columns(
            pl.col("date_id").cast(pl.Int32, strict=False)
        )
        .with_columns(
            pl.exclude("date_id").cast(pl.Float64, strict=False)
        )
        .sort("date_id")
    )

    if drop_last_n > 0:
        df = df.head(-drop_last_n)

    return df


def load_testset(path: Path = TEST_PATH) -> pl.DataFrame:
    """
    Load dan praproses test/mock dataset.

    - Mengganti nama 'lagged_forward_returns' -> 'target' agar
      struktur mirip train (meski tidak dipakai sebagai ground truth).
    - Meng-cast semua kolom selain 'date_id' ke Float64.
    - Sorting berdasarkan date_id.

    Args:
        path (Path): lokasi file test.csv.

    Returns:
        pl.DataFrame: DataFrame test yang sudah rapi.
    """
    df = (
        pl.read_csv(path)
        .rename({"lagged_forward_returns": "target"})
        .with_columns(
            pl.col("date_id").cast(pl.Int32, strict=False)
        )
        .with_columns(
            pl.exclude("date_id").cast(pl.Float64, strict=False)
        )
        .sort("date_id")
    )
    return df


def create_example_dataset(df: pl.DataFrame) -> pl.DataFrame:
    """
    Membuat fitur baseline + beberapa fitur tambahan (momentum & rolling),
    lalu membersihkan DataFrame.

    Fitur baru:
        - U1 = I2 - I1
        - U2 = M11 / mean(I2, I9, I7)
        - Untuk setiap fitur dasar F di base_vars:
            * F_diff1 : F_t - F_{t-1}   (momentum harian)
            * F_rm5   : rolling mean 5 hari

    Setelah itu:
        - Pilih subset kolom:
            ['date_id', 'target'] + semua fitur di vars_to_keep
        - Imputasi missing dengan exponential weighted mean (EWM)
        - Drop baris yang masih mengandung null
        - Sort by date_id

    Args:
        df (pl.DataFrame): input Polars DataFrame (train+test gabungan).

    Returns:
        pl.DataFrame: DataFrame dengan fitur baru & tanpa null.
    """
    # Fitur dasar yang akan dipakai sebagai anchor untuk FE
    base_vars: List[str] = [
        "S2", "E2", "E3", "P9", "S1", "S5", "I2", "P8",
        "P10", "P12", "P13",
    ]

    # Kolom yang wajib ada untuk menghitung U1 & U2
    required_base_cols = ["I1", "I2", "M11", "I7", "I9"] + base_vars
    missing = [c for c in required_base_cols if c not in df.columns]
    if missing:
        raise KeyError(f"Kolom berikut hilang di DataFrame: {missing}")

    # Pastikan urutan berdasarkan waktu
    df = df.sort("date_id")

    # Fitur U1 & U2
    df_feat = df.with_columns(
        (pl.col("I2") - pl.col("I1")).alias("U1"),
        (
            pl.col("M11")
            / ((pl.col("I2") + pl.col("I9") + pl.col("I7")) / 3.0)
        ).alias("U2"),
    )

    # Buat fitur momentum & rolling mean 5 hari untuk setiap base_var
    derived_vars: List[str] = []
    for col in base_vars:
        diff_name = f"{col}_diff1"
        rm5_name  = f"{col}_rm5"

        df_feat = df_feat.with_columns(
            (pl.col(col) - pl.col(col).shift(1)).alias(diff_name),
            pl.col(col).rolling_mean(window_size=5, min_periods=1).alias(rm5_name),
        )

        derived_vars.extend([diff_name, rm5_name])

    # Kumpulan semua fitur yang akan kita pakai
    vars_to_keep: List[str] = base_vars + derived_vars + ["U1", "U2"]

    # Imputasi missing dengan EWM untuk semua fitur
    df_feat = (
        df_feat
        .select(["date_id", "target"] + vars_to_keep)
        .with_columns(
            [
                pl.col(col).fill_null(pl.col(col).ewm_mean(com=0.5))
                for col in vars_to_keep
            ]
        )
        .drop_nulls()
        .sort("date_id")
    )

    return df_feat


def join_train_test_dataframes(train: pl.DataFrame, test: pl.DataFrame) -> pl.DataFrame:
    """
    Menggabungkan train dan test berdasarkan kolom yang sama
    (untuk memastikan feature engineering konsisten).

    Args:
        train (pl.DataFrame): DataFrame training mentah.
        test (pl.DataFrame): DataFrame test mentah.

    Returns:
        pl.DataFrame: DataFrame hasil concatenation vertical train+test
                      pada kolom-kolom yang sama.
    """
    common_columns: list[str] = [
        col for col in train.columns if col in test.columns
    ]

    if "date_id" not in common_columns:
        raise KeyError("'date_id' harus ada di kedua DataFrame.")

    return (
        pl.concat(
            [train.select(common_columns), test.select(common_columns)],
            how="vertical",
        )
        .sort("date_id")
    )


def split_dataset(train: pl.DataFrame, test: pl.DataFrame, features: list[str]) -> DatasetOutput:
    """
    Memisahkan data menjadi fitur (X) dan target (y), lalu melakukan scaling.

    Args:
        train (pl.DataFrame): DataFrame training yang sudah diproses.
        test (pl.DataFrame): DataFrame test yang sudah diproses.
        features (list[str]): Daftar nama fitur yang digunakan model.

    Returns:
        DatasetOutput: Dataclass berisi X_train, y_train, X_test, y_test,
                       scaler yang sudah di-fit, feature_names,
                       dan date_id masing-masing set.
    """
    # Pastikan kolom wajib ada
    for col in ["date_id", "target"]:
        if col not in train.columns or col not in test.columns:
            raise KeyError(f"Kolom wajib '{col}' hilang di train/test.")

    # Simpan date_id untuk keperluan time-series CV nanti
    dates_train = train.get_column("date_id")
    dates_test  = test.get_column("date_id")

    X_train = train.drop(["date_id", "target"])
    y_train = train.get_column("target")

    X_test = test.drop(["date_id", "target"])
    y_test = test.get_column("target")

    scaler = StandardScaler()

    # Sklearn lebih nyaman kalau dikasih numpy array
    X_train_np = X_train.to_numpy()
    X_test_np  = X_test.to_numpy()

    # fit_transform pada train
    X_train_scaled_np = scaler.fit_transform(X_train_np)
    X_train_scaled = pl.from_numpy(X_train_scaled_np, schema=features)

    # transform pada test
    X_test_scaled_np = scaler.transform(X_test_np)
    X_test_scaled = pl.from_numpy(X_test_scaled_np, schema=features)

    return DatasetOutput(
        X_train=X_train_scaled,
        X_test=X_test_scaled,
        y_train=y_train,
        y_test=y_test,
        scaler=scaler,
        feature_names=features,
        dates_train=dates_train,
        dates_test=dates_test,
    )


## Converting Return Prediction to Signal

Here is an example of a potential function used to convert a prediction based on the market forward excess return to a daily signal position. 

In [8]:
# ============================================================
# 7. RETURN -> SIGNAL MAPPING
# ============================================================

def convert_ret_to_signal(
    ret_arr: np.ndarray | float | list[float],
    params: RetToSignalParameters
) -> np.ndarray:
    """
    Convert raw model predictions (expected excess returns) into a trading signal.

    Mapping dasar (mode linear):
        signal = clip( ret * signal_multiplier + 1, min_signal, max_signal )

    Di mana:
        - signal ≈ 1  : posisi netral / benchmark
        - signal < 1  : underweight (kurang dari pasar)
        - signal > 1  : overweight (lebih agresif dari pasar)

    Args:
        ret_arr:
            Predicted returns (bisa scalar, list, atau numpy array).
        params (RetToSignalParameters):
            Parameter scaling dan clipping (min/max signal, multiplier).

    Returns:
        np.ndarray:
            Array sinyal trading dengan shape 1D (n_samples,),
            sudah di-clip di [min_signal, max_signal].
            (Kalau input scalar, tetap dikembalikan array 1 elemen.)
    """
    # Pastikan dalam bentuk numpy array float (1D)
    ret_arr = np.asarray(ret_arr, dtype=float).reshape(-1)

    # Sanity-check: tidak boleh ada NaN / inf
    if not np.all(np.isfinite(ret_arr)):
        raise ValueError(
            "ret_arr mengandung nilai non-finite (NaN/inf). "
            "Pastikan prediksi model sudah dibersihkan dulu."
        )

    # Mapping linear dari return -> posisi
    raw_signal = ret_arr * params.signal_multiplier + 1.0

    # Clip supaya tidak keluar dari range yang diizinkan
    signal = np.clip(raw_signal, params.min_signal, params.max_signal)

    # (Opsional) Debug ringan saat run lokal
    if not IS_COMP_RERUN:
        # hanya log sangat singkat, supaya tidak flood output
        s_min, s_max, s_mean = float(signal.min()), float(signal.max()), float(signal.mean())
        print(f"[DEBUG signal] min={s_min:.3f}, max={s_max:.3f}, mean={s_mean:.3f}")

    return signal


## Looking at the Data

In [9]:
# ============================================================
# 8. QUICK DATA CHECK: TRAIN & TEST (LOCAL ONLY)
# ============================================================

# Bagian ini hanya dijalankan saat TIDAK rerun kompetisi,
# supaya tidak menambah waktu eksekusi saat submit.
if not IS_COMP_RERUN:
    # Load data mentah
    train: pl.DataFrame = load_trainset()
    test: pl.DataFrame  = load_testset()

    print("=== SHAPE ===")
    print("Train shape:", train.shape)
    print("Test shape :", test.shape)

    # Range date_id untuk memastikan urut dan tidak kosong
    print("\n=== DATE RANGE ===")
    print(
        "Train date_id range:",
        int(train["date_id"].min()),
        "→",
        int(train["date_id"].max()),
    )
    print(
        "Test  date_id range:",
        int(test["date_id"].min()),
        "→",
        int(test["date_id"].max()),
    )

    # Cek beberapa baris terakhir train
    print("\n=== TRAIN SAMPLE (tail 3) ===")
    print(train.tail(3))

    # Cek beberapa kolom penting di test (date_id, is_scored, target)
    cols_to_show_test = [
        c for c in ["date_id", "is_scored", "target"] if c in test.columns
    ]
    print("\n=== TEST SAMPLE (head 5) ===")
    print(test.select(cols_to_show_test).head(5))

    # Statistik dasar target di train
    print("\n=== TARGET STATS (train.target) ===")
    print(train.select("target").describe())

    # Sedikit info missing values di train
    print("\n=== TOP 10 NULL COUNTS (TRAIN) ===")
    null_counts_train = (
        train.null_count()
        .melt(variable_name="column", value_name="n_null")
        .sort("n_null", descending=True)
        .head(10)
    )
    print(null_counts_train)

    # Dan di test
    print("\n=== TOP 10 NULL COUNTS (TEST) ===")
    null_counts_test = (
        test.null_count()
        .melt(variable_name="column", value_name="n_null")
        .sort("n_null", descending=True)
        .head(10)
    )
    print(null_counts_test)


=== SHAPE ===
Train shape: (9011, 98)
Test shape : (10, 99)

=== DATE RANGE ===
Train date_id range: 0 → 9010
Test  date_id range: 8980 → 8989

=== TRAIN SAMPLE (tail 3) ===
shape: (3, 98)
┌─────────┬─────┬─────┬─────┬───┬───────────┬─────────────────┬────────────────┬───────────┐
│ date_id ┆ D1  ┆ D2  ┆ D3  ┆ … ┆ V9        ┆ forward_returns ┆ risk_free_rate ┆ target    │
│ ---     ┆ --- ┆ --- ┆ --- ┆   ┆ ---       ┆ ---             ┆ ---            ┆ ---       │
│ i32     ┆ f64 ┆ f64 ┆ f64 ┆   ┆ f64       ┆ f64             ┆ f64            ┆ f64       │
╞═════════╪═════╪═════╪═════╪═══╪═══════════╪═════════════════╪════════════════╪═══════════╡
│ 9008    ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ … ┆ -0.530228 ┆ -0.002897       ┆ 0.0001525      ┆ -0.003362 │
│ 9009    ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ … ┆ -0.512769 ┆ -0.027028       ┆ 0.000153       ┆ -0.027493 │
│ 9010    ┆ 0.0 ┆ 0.0 ┆ 0.0 ┆ … ┆ -0.015503 ┆ 0.015344        ┆ 0.000153       ┆ 0.014879  │
└─────────┴─────┴─────┴─────┴───┴───────────┴─────────────────┴────

  .melt(variable_name="column", value_name="n_null")
  .melt(variable_name="column", value_name="n_null")


## Generating the Train and Test

In [10]:
# ============================================================
# 9. FEATURE ENGINEERING + TRAIN/TEST SPLIT
# ============================================================

# 1) Selalu load train & test mentah di sini
#    (jangan bergantung ke cell EDA yang hanya jalan saat not IS_COMP_RERUN)
train_raw: pl.DataFrame = load_trainset()
test_raw: pl.DataFrame  = load_testset()

# 2) Gabungkan train & test mentah pada kolom yang sama (supaya FE konsisten)
df_all: pl.DataFrame = join_train_test_dataframes(train_raw, test_raw)

# 3) Terapkan feature engineering (U1, U2, momentum, rolling, EWM fill, drop_null)
df_fe: pl.DataFrame = create_example_dataset(df=df_all)

# 4) Kembalikan lagi ke train_fe dan test_fe berdasarkan date_id awal
train_ids = train_raw.get_column("date_id").unique()
test_ids  = test_raw.get_column("date_id").unique()

train_fe: pl.DataFrame = df_fe.filter(pl.col("date_id").is_in(train_ids))
test_fe: pl.DataFrame  = df_fe.filter(pl.col("date_id").is_in(test_ids))

# Pastikan sudah terurut waktu
train_fe = train_fe.sort("date_id")
test_fe  = test_fe.sort("date_id")

# 5) Definisikan daftar fitur (semua kolom kecuali 'date_id' dan 'target')
FEATURES: list[str] = sorted(
    [col for col in test_fe.columns if col not in ["date_id", "target"]]
)

# 6) Split menjadi X/y + scaling, dibungkus dalam DatasetOutput
dataset: DatasetOutput = split_dataset(
    train=train_fe,
    test=test_fe,
    features=FEATURES,
)

X_train: pl.DataFrame      = dataset.X_train
X_test: pl.DataFrame       = dataset.X_test
y_train: pl.Series         = dataset.y_train
y_test: pl.Series          = dataset.y_test
scaler: StandardScaler     = dataset.scaler
feature_names: list[str]   = dataset.feature_names or FEATURES
dates_train: pl.Series     = dataset.dates_train
dates_test: pl.Series      = dataset.dates_test

# (Opsional) Quick check – hanya saat lokal, supaya tidak nambah waktu di kompetisi
if not IS_COMP_RERUN:
    print("=== FE & SPLIT SUMMARY ===")
    print("X_train shape :", X_train.shape)
    print("X_test  shape :", X_test.shape)
    print("y_train length:", y_train.len())
    print("y_test  length:", y_test.len())
    print("Num features  :", len(FEATURES))
    print("Train dates   :", int(dates_train.min()), "→", int(dates_train.max()))
    print("Test  dates   :", int(dates_test.min()),  "→", int(dates_test.max()))

    assert X_train.height == y_train.len(), "X_train dan y_train tidak sebaris!"
    assert X_test.height == y_test.len(), "X_test dan y_test tidak sebaris!"


=== FE & SPLIT SUMMARY ===
X_train shape : (7509, 35)
X_test  shape : (20, 35)
y_train length: 7509
y_test  length: 20
Num features  : 35
Train dates   : 1512 → 9010
Test  dates   : 8980 → 8989


  pl.col(col).rolling_mean(window_size=5, min_periods=1).alias(rm5_name),


## Fitting the Model 

In [11]:
# ============================================================
# 10. FIT ELASTICNET BASELINE (WITH CV)
# ============================================================
from sklearn.metrics import r2_score, mean_squared_error

# Sklearn lebih aman kalau dikasih numpy array
X_train_np = X_train.to_numpy()
y_train_np = y_train.to_numpy()

# Siapkan argumen untuk ElasticNetCV dari enet_params + tambahan
enet_cv_kwargs = asdict(enet_params).copy()
enet_cv_kwargs.update(
    {
        "fit_intercept": True,
        "random_state": SEED,
        "n_jobs": -1,   # pakai semua core yang tersedia
    }
)

# 1) Cross-validated ElasticNet untuk cari alpha terbaik
model_cv: ElasticNetCV = ElasticNetCV(**enet_cv_kwargs)
model_cv.fit(X_train_np, y_train_np)

best_alpha: float = float(model_cv.alpha_)

# 2) Fit final ElasticNet dengan alpha terbaik
model: ElasticNet = ElasticNet(
    alpha=best_alpha,
    l1_ratio=enet_params.l1_ratio,
    max_iter=enet_params.max_iter,
    fit_intercept=True,
    random_state=SEED,
)
model.fit(X_train_np, y_train_np)

# ------------------------------------------------------------
# Quick sanity check: performa di training
# ------------------------------------------------------------
y_pred_train = model.predict(X_train_np)
r2_train = r2_score(y_train_np, y_pred_train)
mse_train = mean_squared_error(y_train_np, y_pred_train)

print("=== ElasticNet Baseline Fitted ===")
print(f"Best alpha (CV) : {best_alpha:.6f}")
print(f"L1 ratio        : {enet_params.l1_ratio}")
print(f"CV folds        : {enet_params.cv}")
print(f"Train R²        : {r2_train:.6f}")
print(f"Train MSE       : {mse_train:.6e}")


=== ElasticNet Baseline Fitted ===
Best alpha (CV) : 0.000305
L1 ratio        : 0.5
CV folds        : 10
Train R²        : 0.014019
Train MSE       : 1.213405e-04


## Prediction Function via Kaggle Server

In [12]:
# ============================================================
# 11. PREDICTION FUNCTION FOR KAGGLE EVALUATION API
# ============================================================

def predict(test: pl.DataFrame) -> float:
    """
    Fungsi prediksi yang dipanggil oleh Kaggle Evaluation API.

    Alur:
    1) Rename kolom target (lagged_forward_returns -> target) jika perlu.
    2) Terapkan feature engineering (create_example_dataset).
    3) Pilih fitur sesuai FEATURES dan lakukan scaling dengan scaler yang sudah di-fit.
    4) Prediksi expected excess return pakai model.
    5) Konversi return -> sinyal trading via convert_ret_to_signal.
    6) Return satu nilai float (sinyal) untuk batch pertama.

    Catatan:
    - DefaultInferenceServer biasanya memanggil fungsi ini dengan
      1 baris per call, tapi kita tetap handle jika >1 baris.
    """
    # 1. Pastikan kolom target ada
    if "target" not in test.columns:
        if "lagged_forward_returns" in test.columns:
            test = test.rename({"lagged_forward_returns": "target"})
        else:
            raise KeyError(
                "Test dataframe harus memiliki kolom 'target' atau 'lagged_forward_returns'."
            )

    # 2. Pastikan 'date_id' ada
    if "date_id" not in test.columns:
        raise KeyError("Kolom 'date_id' wajib ada di dataframe test.")

    # 3. Feature engineering (U1, U2, imputasi EWM, subset fitur, drop_null)
    df = create_example_dataset(test)

    # Jika setelah FE tidak ada baris (misal semua null dan ter-drop),
    # kembalikan sinyal netral 1.0 agar tidak crash saat evaluasi.
    if df.height == 0:
        return float(1.0)

    # 4. Ambil hanya fitur yang sudah didefinisikan di training
    X_test = df.select(FEATURES)

    # Sanity-check: pastikan semua fitur ada
    missing_feats = [f for f in FEATURES if f not in X_test.columns]
    if missing_feats:
        raise KeyError(
            f"Fitur berikut hilang di data FE test: {missing_feats}"
        )

    # 5. Sklearn pakai numpy array
    X_test_np = X_test.to_numpy()
    X_test_scaled_np = scaler.transform(X_test_np)

    # 6. Prediksi expected excess return
    raw_pred = model.predict(X_test_scaled_np)  # array shape (n_samples,)

    # 7. Konversi ke sinyal trading
    signal_arr = convert_ret_to_signal(raw_pred, ret_signal_params)

    # 8. Ambil satu nilai (biasanya satu baris per call)
    return float(signal_arr[0])


## Launch Server

In [13]:
# ============================================================
# 12. START KAGGLE EVALUATION SERVER
# ============================================================

inference_server = kei.DefaultInferenceServer(predict)

if IS_COMP_RERUN:
    # Mode ini dipakai saat SUBMIT notebook ke kompetisi
    print("Detected competition rerun environment. Starting inference server...")
    inference_server.serve()
else:
    # Mode lokal / saat run manual di Notebook
    print("Running in LOCAL GATEWAY mode for debugging...")
    print(f"Using input dir: {INPUT_DIR}")
    inference_server.run_local_gateway((str(INPUT_DIR),))

    # Setelah lokal gateway selesai, cek apakah submission.parquet sudah dibuat
    sub_path = Path("submission.parquet")
    if sub_path.exists():
        print("\nsubmission.parquet generated at:", sub_path.resolve())
    else:
        print("\nWARNING: submission.parquet not found in working directory.")


Running in LOCAL GATEWAY mode for debugging...
Using input dir: /kaggle/input/hull-tactical-market-prediction

submission.parquet generated at: /kaggle/working/submission.parquet


  pl.col(col).rolling_mean(window_size=5, min_periods=1).alias(rm5_name),
