# Prediction Function via Kaggle Server

In [1]:
# =========================================================
# SINGLE SUBMISSION CELL — Prediction + Inference Server
# =========================================================
import os, warnings
from typing import List

import numpy as np
import pandas as pd
import polars as pl
import kaggle_evaluation.default_inference_server as deis

warnings.filterwarnings("ignore")

# ---------------------------------------------------------
# Helpers
# ---------------------------------------------------------
def _to_float(x, lo=-1.0, hi=1.0) -> float:
    """Convert to float safely, clip to [lo, hi]."""
    try:
        val = float(np.asarray(x).ravel()[0])
    except Exception:
        val = 0.0
    if not np.isfinite(val):
        val = 0.0
    return float(np.clip(val, lo, hi))


def _select_numeric_pl(df: pl.DataFrame, exclude: List[str] = None) -> pl.DataFrame:
    """Select numeric columns from Polars DF using Pandas fallback."""
    exclude = set(exclude or [])
    pdf = df.to_pandas()
    num = pdf.select_dtypes(include=[np.number])
    for c in list(num.columns):
        if c in exclude:
            num.drop(columns=c, inplace=True)
    return pl.from_pandas(num)

# ---------------------------------------------------------
# predict() — dipanggil sekali per timestep oleh gateway
# ---------------------------------------------------------
def predict(test: pl.DataFrame) -> float:
    """
    Mengembalikan satu float allocation ∈ [-1, 1].
    Robust terhadap fitur hilang, artefak tidak tersedia, atau model error.
    """

    # Normalisasi nama target (jika ada)
    if "lagged_forward_returns" in test.columns and "target" not in test.columns:
        test = test.rename({"lagged_forward_returns": "target"})

    # Buat dataset jika ada pipeline custom
    if "create_example_dataset" in globals():
        try:
            df = create_example_dataset(test)
        except Exception:
            df = test
    else:
        df = test

    # Pilih fitur
    if "FEATURES" in globals():
        feats = [c for c in FEATURES if c in df.columns]
        if feats:
            X_pl = df.select(feats)
        else:
            X_pl = _select_numeric_pl(df, exclude=["target"])
    else:
        X_pl = _select_numeric_pl(df, exclude=["target"])

    # Scaling (opsional)
    X_np = X_pl.to_numpy()
    if "scaler" in globals() and hasattr(scaler, "transform"):
        try:
            X_np = scaler.transform(X_np)
        except Exception:
            pass

    # Prediksi model (jika tersedia)
    raw_pred = None
    if "model" in globals() and hasattr(model, "predict"):
        try:
            raw_pred = np.asarray(model.predict(X_np)).ravel()[0]
        except Exception:
            raw_pred = None

    # Fallback jika model tidak ada
    if raw_pred is None:
        if "target" in df.columns:
            try:
                raw_pred = float(df.select("target").to_numpy().ravel()[-1])
            except Exception:
                raw_pred = 0.0
        else:
            raw_pred = 0.0

    # Konversi sinyal (opsional)
    if "convert_ret_to_signal" in globals() and "ret_signal_params" in globals():
        try:
            sig = convert_ret_to_signal(raw_pred, ret_signal_params)
        except Exception:
            sig = raw_pred
    else:
        sig = raw_pred

    return _to_float(sig, lo=-1.0, hi=1.0)

# ---------------------------------------------------------
# Launch Inference Server
# ---------------------------------------------------------
IS_RERUN = bool(os.getenv("KAGGLE_IS_COMPETITION_RERUN"))
server = deis.DefaultInferenceServer(predict)

if IS_RERUN:
    server.serve()
else:
    # Warm-up (opsional)
    try:
        _ = predict(pl.DataFrame({"date_id":[0], "D1":[0.0]}))
    except Exception:
        pass
    server.run_local_gateway(("/kaggle/input/hull-tactical-market-prediction/",))

# Tujuan & Artefak

In [2]:
# ============================================
# Tahap 0 — Tujuan & Artefak (Kaggle)
# ============================================
import os, json, sys, hashlib, platform, textwrap, random
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

# ============================================================
# 1) Lokasi data & direktori kerja
# ============================================================
DATA_DIR  = "/kaggle/input/hull-tactical-market-prediction"
TRAIN_CSV = f"{DATA_DIR}/train.csv"
TEST_CSV  = f"{DATA_DIR}/test.csv"

ROOT = "/kaggle/working/hull-tactical-market-prediction"

OUT_DIRS = {
    "preprocessed": f"{ROOT}/outputs/preprocessed",
    "features":     f"{ROOT}/outputs/features",
    "signals":      f"{ROOT}/outputs/signals",
    "models":       f"{ROOT}/outputs/models",
    "allocations":  f"{ROOT}/outputs/allocations",
    "backtests":    f"{ROOT}/outputs/backtests",
    "kaggle_eval":  f"{ROOT}/kaggle_evaluation",
    "logs":         f"{ROOT}/outputs",
}

for p in OUT_DIRS.values():
    Path(p).mkdir(parents=True, exist_ok=True)

META_JSON = f"{ROOT}/outputs/project_meta.json"
PROGRESS  = f"{ROOT}/outputs/progress.json"

# ============================================================
# 2) Tujuan proyek (ringkas)
# ============================================================
GOALS = textwrap.dedent("""
    Tujuan:
    1) Membangun pipeline prediksi harian anti-leak untuk Hull Tactical Market Prediction.
    2) Menghasilkan alokasi ('allocation') untuk setiap date_id pada test set.
    3) Menjaga reprodusibilitas (seed, versi library) & dokumentasi artefak setiap tahap.

    Artefak inti:
    - outputs/preprocessed/   : hasil hygiene/pembersihan awal
    - outputs/features/       : fitur turunan (momentum, risk, PCA rolling)
    - outputs/signals/        : sinyal OOF + sinyal test
    - outputs/models/         : model terlatih
    - outputs/allocations/    : bobot hasil mapping sinyal
    - outputs/backtests/      : hasil backtest & stress test
    - kaggle_evaluation/      : submission.csv & audit
""").strip()

print(GOALS, "\n")

# ============================================================
# 3) Seed & hashing
# ============================================================
SEED = 2025

def set_seed(seed=2025):
    random.seed(seed)
    np.random.seed(seed)

set_seed(SEED)

def sha1sum(path, chunksize: int = 1 << 20):
    """SHA1 cepat & stabil (hemat memori)."""
    h = hashlib.sha1()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunksize), b""):
            h.update(chunk)
    return h.hexdigest()

# ============================================================
# 4) Ringkasan dataset
# ============================================================
def quick_stats(csv_path: str):
    """Ringkasan cepat tanpa load full CSV (aman untuk file besar)."""
    out = {"path": csv_path}

    if not os.path.isfile(csv_path):
        out["error"] = "File tidak ditemukan"
        return out

    try:
        # ambil kolom untuk metadata ringan
        try:
            tmp = pd.read_csv(csv_path, usecols=["date_id"])
        except Exception:
            sample = pd.read_csv(csv_path, nrows=1000)
            cols = list(sample.columns)
            if "date_id" in cols:
                tmp = pd.read_csv(csv_path, usecols=["date_id"])
            else:
                tmp = sample

        # hitung jumlah baris cepat
        n_rows = sum(1 for _ in open(csv_path, "rb")) - 1
        out.update({
            "sha1": sha1sum(csv_path),
            "rows": int(n_rows),
            "has_date_id": "date_id" in tmp.columns,
        })

        # properti date_id
        if "date_id" in tmp.columns:
            out["date_id_min"] = int(tmp["date_id"].min())
            out["date_id_max"] = int(tmp["date_id"].max())
            out["date_id_nunique"] = int(tmp["date_id"].nunique())

        # ambil kolom
        header = pd.read_csv(csv_path, nrows=3)
        out["columns"] = list(header.columns)

        return out

    except Exception as e:
        out["error"] = repr(e)
        return out

train_info = quick_stats(TRAIN_CSV)
test_info  = quick_stats(TEST_CSV)

print("Ringkasan DATASET")
print("-"*60)
for info in (train_info, test_info):
    print(f"File     : {info.get('path')}")
    if "error" in info:
        print(f"Status   : ERROR → {info['error']}\n")
        continue
    print(f"SHA1     : {info['sha1']}")
    print(f"Rows     : {info['rows']}")
    print(f"date_id? : {info['has_date_id']}")
    if info.get("has_date_id"):
        print(f"date_id  : [{info['date_id_min']}, {info['date_id_max']}] "
              f"(unique={info['date_id_nunique']})")
    print(f"Columns  : {info['columns']}\n")

# ============================================================
# 5) Snapshot lingkungan & artefak
# ============================================================
def safe_version(pkg):
    try:
        mod = __import__(pkg)
        return getattr(mod, "__version__", "unknown")
    except Exception:
        return "not-installed"

ENV = {
    "python": sys.version.split()[0],
    "platform": platform.platform(),
    "pandas": safe_version("pandas"),
    "numpy": safe_version("numpy"),
    "scipy": safe_version("scipy"),
    "sklearn": safe_version("sklearn"),
    "lightgbm": safe_version("lightgbm"),
    "torch": safe_version("torch"),
    "xgboost": safe_version("xgboost"),
}

ARTEFACTS = {
    "preprocessed_train": f"{OUT_DIRS['preprocessed']}/train_preprocessed.parquet",
    "preprocessed_test":  f"{OUT_DIRS['preprocessed']}/test_preprocessed.parquet",
    "features_train":     f"{OUT_DIRS['features']}/train_features.parquet",
    "features_test":      f"{OUT_DIRS['features']}/test_features.parquet",
    "baseline_oof":       f"{OUT_DIRS['signals']}/baseline_oof.csv",
    "baseline_test":      f"{OUT_DIRS['signals']}/test_signal_baseline.csv",
    "alloc_stage3_test":  f"{OUT_DIRS['allocations']}/test_allocation_stage3.csv",
    "submission":         f"{OUT_DIRS['kaggle_eval']}/submission.csv",
}

meta = {
    "generated_at": datetime.now().isoformat(timespec="seconds"),
    "seed": SEED,
    "data": {"train": train_info, "test": test_info},
    "env": ENV,
    "root": ROOT,
    "goals": GOALS,
    "artefacts": ARTEFACTS,
}

Path(META_JSON).write_text(json.dumps(meta, indent=2), encoding="utf-8")

# progress.json
try:
    progress = json.loads(Path(PROGRESS).read_text()) if os.path.isfile(PROGRESS) else {}
except Exception:
    progress = {}

progress["objectives_and_artefacts"] = {
    "timestamp": datetime.now().isoformat(timespec="seconds"),
    "env": ENV,
    "artefacts": ARTEFACTS,
}

Path(PROGRESS).write_text(json.dumps(progress, indent=2))

print("-"*60)
print(f"Meta ditulis ke : {META_JSON}")
print(f"Progress log   : {PROGRESS}")
print("Struktur OUT   :")
for k, v in OUT_DIRS.items():
    print(f"  - {k:<12} -> {v}")

print("\nTahap 0 selesai. Lanjut ke 'Higiene Data & Target'.")


Tujuan:
1) Membangun pipeline prediksi harian anti-leak untuk Hull Tactical Market Prediction.
2) Menghasilkan alokasi ('allocation') untuk setiap date_id pada test set.
3) Menjaga reprodusibilitas (seed, versi library) & dokumentasi artefak setiap tahap.

Artefak inti:
- outputs/preprocessed/   : hasil hygiene/pembersihan awal
- outputs/features/       : fitur turunan (momentum, risk, PCA rolling)
- outputs/signals/        : sinyal OOF + sinyal test
- outputs/models/         : model terlatih
- outputs/allocations/    : bobot hasil mapping sinyal
- outputs/backtests/      : hasil backtest & stress test
- kaggle_evaluation/      : submission.csv & audit 

Ringkasan DATASET
------------------------------------------------------------
File     : /kaggle/input/hull-tactical-market-prediction/train.csv
SHA1     : d6cc777a2d19a1c36e28ff49f6e33549dbc046d6
Rows     : 9021
date_id? : True
date_id  : [0, 9020] (unique=9021)
Columns  : ['date_id', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', '

# Higiene Data & Target

In [3]:
# ============================================
# Tahap 1 — Higiene Data & Target
# ============================================
import os, json, re, warnings
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

# ============================================================
# 1) Lokasi file & folder output
# ============================================================
DATA_DIR  = "/kaggle/input/hull-tactical-market-prediction"
TRAIN_CSV = f"{DATA_DIR}/train.csv"
TEST_CSV  = f"{DATA_DIR}/test.csv"

ROOT      = "/kaggle/working/hull-tactical-market-prediction"
OUT_PRE   = f"{ROOT}/outputs/preprocessed"
OUT_LOG   = f"{ROOT}/outputs"

Path(OUT_PRE).mkdir(parents=True, exist_ok=True)

TRAIN_PP       = f"{OUT_PRE}/train_preprocessed.parquet"
TEST_PP        = f"{OUT_PRE}/test_preprocessed.parquet"
HYGIENE_META   = f"{OUT_LOG}/hygiene_meta.json"
PROGRESS       = f"{OUT_LOG}/progress.json"

# ============================================================
# 2) Helper umum
# ============================================================
def to_snake(s: str) -> str:
    s = re.sub(r"[\/\s\-]+", "_", s.strip())
    s = re.sub(r"(?<=[a-z0-9])([A-Z])", r"_\1", s)
    s = re.sub(r"__+", "_", s)
    return s.lower()


def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Normalisasi kolom → snake_case & buang duplikasi."""
    mapper = {c: to_snake(c) for c in df.columns}
    df = df.rename(columns=mapper)
    return df.loc[:, ~df.columns.duplicated()]


def is_numeric_series(s: pd.Series) -> bool:
    return pd.api.types.is_integer_dtype(s) or pd.api.types.is_float_dtype(s)


def n_missing(s: pd.Series) -> int:
    return int(s.isna().sum())


# ============================================================
# 3) Load train/test + normalisasi kolom
# ============================================================
train = normalize_columns(pd.read_csv(TRAIN_CSV))
test  = normalize_columns(pd.read_csv(TEST_CSV))

if "date_id" not in train.columns or "date_id" not in test.columns:
    raise ValueError("Kolom `date_id` wajib ada di train & test.")

# cast aman date_id → int64
for df in (train, test):
    if not pd.api.types.is_integer_dtype(df["date_id"]):
        # jika float tetapi bulat, tetap aman
        arr = df["date_id"].to_numpy()
        if np.isclose(arr % 1, 0).all():
            df["date_id"] = arr.astype(np.int64)
        else:
            df["date_id"] = np.round(arr).astype(np.int64)

# urutkan dan dedup
train = train.sort_values("date_id").drop_duplicates().reset_index(drop=True)
test  = test.sort_values("date_id").drop_duplicates().reset_index(drop=True)


# ============================================================
# 4) Deteksi target otomatis
# ============================================================
CAND_PATTERNS = [
    r"^target$",
    r"^y$",
    r"label",
    r"(ret|return).*fwd",
    r"forward",
    r"market_forward_excess_returns",
    r"^allocation$"
]

def looks_like_target(col: str) -> bool:
    c = col.lower()
    return any(re.search(p, c) for p in CAND_PATTERNS)

target_candidates = [c for c in train.columns
                     if c not in test.columns and looks_like_target(c) and is_numeric_series(train[c])]

TARGET_COL = target_candidates[0] if target_candidates else None


# ============================================================
# 5) Clean dataframe: NaN, Inf, kolom all-NaN, kolom konstan
# ============================================================
def clean_frame(df: pd.DataFrame) -> pd.DataFrame:
    df = df.replace([np.inf, -np.inf], np.nan)

    # kolom all-NaN
    drop_nan = [c for c in df.columns if df[c].isna().all()]
    if drop_nan:
        df = df.drop(columns=drop_nan)

    # kolom konstan
    drop_const = []
    for c in df.columns:
        if c == "date_id" or (TARGET_COL is not None and c == TARGET_COL):
            continue
        try:
            if df[c].nunique(dropna=True) <= 1:
                drop_const.append(c)
        except Exception:
            pass

    if drop_const:
        df = df.drop(columns=drop_const)

    return df

train = clean_frame(train)
test  = clean_frame(test)


# ============================================================
# 6) Feature alignment (ANTI-LEAK)
# ============================================================
num_cols_train = [
    c for c in train.columns
    if c != "date_id"
    and (TARGET_COL is None or c != TARGET_COL)
    and is_numeric_series(train[c])
]

num_cols_test = [
    c for c in test.columns
    if c != "date_id"
    and is_numeric_series(test[c])
]

shared_num = sorted(set(num_cols_train).intersection(num_cols_test))

kept_cols_train = ["date_id"] + ([TARGET_COL] if TARGET_COL else []) + shared_num
kept_cols_test  = ["date_id"] + shared_num

train_pp = train[kept_cols_train].copy()
test_pp  = test[kept_cols_test].copy()


# ============================================================
# 7) Laporan missing & simpan artefak
# ============================================================
def missing_report(df):
    rows = []
    for c in df.columns:
        nm = n_missing(df[c])
        rows.append({
            "col": c,
            "dtype": str(df[c].dtype),
            "n_missing": nm,
            "frac_missing": nm / len(df) if len(df) else 0.0,
            "n_unique": df[c].nunique(dropna=True)
        })
    return pd.DataFrame(rows).sort_values(["frac_missing", "n_missing"], ascending=[False, False])


rep_train = missing_report(train_pp)
rep_test  = missing_report(test_pp)

train_pp.to_parquet(TRAIN_PP, index=False)
test_pp.to_parquet(TEST_PP, index=False)

# head sample untuk audit
train_head_csv = f"{OUT_PRE}/_train_head.csv"
test_head_csv  = f"{OUT_PRE}/_test_head.csv"
train_pp.head(5).to_csv(train_head_csv, index=False)
test_pp.head(5).to_csv(test_head_csv, index=False)


# ============================================================
# 8) Meta JSON & progress log
# ============================================================
meta = {
    "generated_at": datetime.now().isoformat(timespec="seconds"),
    "target_col_detected": TARGET_COL,
    "n_rows": {"train": len(train_pp), "test": len(test_pp)},
    "n_shared_numeric_features": len(shared_num),
    "shared_numeric_features_sample": shared_num[:500],
    "missing_summary": {
        "train_top10": rep_train.head(10).to_dict(orient="records"),
        "test_top10":  rep_test.head(10).to_dict(orient="records"),
    },
}

Path(HYGIENE_META).write_text(json.dumps(meta, indent=2))

try:
    progress = json.loads(Path(PROGRESS).read_text()) if os.path.isfile(PROGRESS) else {}
except Exception:
    progress = {}

progress["hygiene_data_target"] = {
    "timestamp": datetime.now().isoformat(timespec="seconds"),
    "target_col": TARGET_COL,
    "train_pp": TRAIN_PP,
    "test_pp": TEST_PP,
    "n_shared_numeric_features": len(shared_num),
}

Path(PROGRESS).write_text(json.dumps(progress, indent=2))


# ============================================================
# 9) Output ringkas
# ============================================================
print("=== HIGIENE DATA & TARGET — SELESAI ===")
print(f"- Target terdeteksi : {TARGET_COL}")
print(f"- Jumlah fitur numerik (shared) : {len(shared_num)}")
print(f"- Train (clean) : {train_pp.shape} -> {TRAIN_PP}")
print(f"- Test  (clean) : {test_pp.shape}  -> {TEST_PP}")
print(f"- Meta     : {HYGIENE_META}")
print(f"- Snapshot : {train_head_csv} | {test_head_csv}")


=== HIGIENE DATA & TARGET — SELESAI ===
- Target terdeteksi : forward_returns
- Jumlah fitur numerik (shared) : 89
- Train (clean) : (9021, 91) -> /kaggle/working/hull-tactical-market-prediction/outputs/preprocessed/train_preprocessed.parquet
- Test  (clean) : (10, 90)  -> /kaggle/working/hull-tactical-market-prediction/outputs/preprocessed/test_preprocessed.parquet
- Meta     : /kaggle/working/hull-tactical-market-prediction/outputs/hygiene_meta.json
- Snapshot : /kaggle/working/hull-tactical-market-prediction/outputs/preprocessed/_train_head.csv | /kaggle/working/hull-tactical-market-prediction/outputs/preprocessed/_test_head.csv


# LightGBM Baseline

In [4]:
# ============================================
# Tahap 2 — LightGBM Baseline (Revisi Stabil)
# ============================================
import os, json, math, warnings
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import lightgbm as lgb
from tqdm.auto import tqdm

warnings.filterwarnings("ignore")

# ============================================================
# 1) Lokasi file
# ============================================================
ROOT     = "/kaggle/working/hull-tactical-market-prediction"
PRE_DIR  = f"{ROOT}/outputs/preprocessed"
OUT_SIG  = f"{ROOT}/outputs/signals"
OUT_MOD  = f"{ROOT}/outputs/models"
OUT_MET  = f"{ROOT}/outputs/metrics"
OUT_LOG  = f"{ROOT}/outputs"

TRAIN_PP = f"{PRE_DIR}/train_preprocessed.parquet"
TEST_PP  = f"{PRE_DIR}/test_preprocessed.parquet"
HYGIENE_META = f"{OUT_LOG}/hygiene_meta.json"
PROGRESS = f"{OUT_LOG}/progress.json"

for d in [OUT_SIG, OUT_MOD, OUT_MET]:
    Path(d).mkdir(parents=True, exist_ok=True)

SEED = 2025
np.random.seed(SEED)

# ============================================================
# 2) Helper umum
# ============================================================
def rmse_np(y_true, y_pred):
    return float(np.sqrt(np.mean((np.asarray(y_true)-np.asarray(y_pred))**2)))

def mae_np(y_true, y_pred):
    return float(np.mean(np.abs(np.asarray(y_true)-np.asarray(y_pred))))

def corr_safe(a, b):
    a = np.asarray(a, float)
    b = np.asarray(b, float)
    if a.std()==0 or b.std()==0:
        return 0.0
    return float(np.corrcoef(a,b)[0,1])


def build_time_folds(df, date_col="date_id", train_len=2520, valid_len=252, step=252):
    """Time-series rolling folds stabil—anti-index error."""
    dates = np.array(sorted(df[date_col].unique()))
    folds = []
    n = len(dates)

    # batasan aman
    train_len = max(252, min(train_len, n//2))
    valid_len = max(64,  min(valid_len, n//8))
    step      = max(32,  min(step, valid_len))

    for va_end in range(train_len + valid_len - 1, n, step):
        va_start = va_end - valid_len + 1
        tr_start = va_start - train_len
        if tr_start < 0:
            continue
        tr_dates = set(dates[tr_start:va_start])
        va_dates = set(dates[va_start:va_end+1])

        tr_idx = df.index[df[date_col].isin(tr_dates)].to_numpy()
        va_idx = df.index[df[date_col].isin(va_dates)].to_numpy()
        if len(tr_idx)==0 or len(va_idx)==0:
            continue
        folds.append((tr_idx, va_idx))

    return folds


def strip_problem_features(df, features, freq_threshold=0.995, std_threshold=1e-12):
    """Hindari fitur yang memicu warning LightGBM."""
    keep=[]
    dropped=[]
    for c in features:
        s=df[c]
        nun = s.nunique(dropna=False)
        if nun<=1:
            dropped.append((c,"constant"))
            continue

        vc=s.value_counts(dropna=False)
        if (vc.iloc[0]/len(s))>=freq_threshold:
            dropped.append((c,"quasi_constant"))
            continue

        if pd.api.types.is_numeric_dtype(s):
            st=float(s.std(ddof=0))
            if st < std_threshold:
                dropped.append((c,"low_variance"))
                continue

        keep.append(c)
    return keep, dropped


# ============================================================
# 3) Load data
# ============================================================
assert os.path.isfile(TRAIN_PP) and os.path.isfile(TEST_PP), "Jalankan Tahap 1 dulu."
train = pd.read_parquet(TRAIN_PP)
test  = pd.read_parquet(TEST_PP)

meta = json.loads(Path(HYGIENE_META).read_text()) if os.path.isfile(HYGIENE_META) else {}
TARGET_COL = meta.get("target_col_detected")
if TARGET_COL is None or TARGET_COL not in train.columns:
    raise ValueError("Target tidak terdeteksi di train_preprocessed.")

# ============================================================
# 4) Feature selection (after hygiene)
# ============================================================
FEATURES_RAW = [c for c in train.columns if c not in ("date_id", TARGET_COL)]
FEATURES, dropped_info = strip_problem_features(train, FEATURES_RAW)

# cast float32 → stabil di LightGBM
train[FEATURES] = train[FEATURES].astype("float32")
test[FEATURES]  = test[FEATURES].astype("float32")

# ============================================================
# 5) Build rolling folds
# ============================================================
FOLDS = build_time_folds(train)
if len(FOLDS)==0:
    # fallback aman
    dates = sorted(train["date_id"].unique())
    cut   = int(0.8*len(dates))
    tr=set(dates[:cut]); va=set(dates[cut:])
    tr_idx = train.index[train["date_id"].isin(tr)].to_numpy()
    va_idx = train.index[train["date_id"].isin(va)].to_numpy()
    FOLDS=[(tr_idx,va_idx)]

# ============================================================
# 6) LightGBM params
# ============================================================
# Matikan logger bawaan (menekan spam warning)
try:
    lgb.register_logger(lambda *args, **kwargs: None)
except:
    pass

lgb_params = dict(
    n_estimators=5000,
    learning_rate=0.018,
    num_leaves=96,
    max_depth=-1,
    min_child_samples=20,
    subsample=0.85,
    subsample_freq=1,
    colsample_bytree=0.85,
    reg_alpha=1e-3,
    reg_lambda=1e-2,
    max_bin=255,
    objective="regression",
    random_state=SEED,
    n_jobs=-1,
    feature_pre_filter=False,
    verbosity=-1
)

# ============================================================
# 7) Training per-fold
# ============================================================
oof = np.zeros(len(train), float)
test_pred = np.zeros(len(test), float)
fold_metrics=[]
fi_all=[]
X_test = test[FEATURES].copy()
y_true = train[TARGET_COL].astype(float).values

pbar = tqdm(range(len(FOLDS)), desc="Training folds", leave=True)

for k,(tr_idx,va_idx) in enumerate(FOLDS, start=1):
    X_tr, X_va = train.loc[tr_idx, FEATURES], train.loc[va_idx, FEATURES]
    y_tr, y_va = y_true[tr_idx], y_true[va_idx]

    model = lgb.LGBMRegressor(**lgb_params)
    eval_res={}

    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="rmse",
        callbacks=[
            lgb.early_stopping(300, verbose=False),
            lgb.record_evaluation(eval_res)
        ]
    )

    best_iter = model.best_iteration_ or lgb_params["n_estimators"]
    y_hat = model.predict(X_va, num_iteration=best_iter)
    oof[va_idx] = y_hat

    rm = rmse_np(y_va, y_hat)
    ma = mae_np(y_va, y_hat)
    co = corr_safe(y_va, y_hat)

    fold_metrics.append({
        "fold": k,
        "rmse": rm,
        "mae": ma,
        "corr": co,
        "best_iter": best_iter,
        "n_tr": len(tr_idx),
        "n_va": len(va_idx)
    })

    # feature importance
    try:
        fi_all.append(pd.DataFrame({
            "feature": FEATURES,
            "gain": model.booster_.feature_importance("gain", iteration=best_iter),
            "split": model.booster_.feature_importance("split", iteration=best_iter),
            "fold": k
        }))
    except:
        pass

    # test prediction accumulate
    test_pred += model.predict(X_test, num_iteration=best_iter) / len(FOLDS)

    pbar.set_postfix({"fold_rmse": f"{rm:.6f}", "best_iter": best_iter})
    pbar.update(1)

pbar.close()

# ============================================================
# 8) Overall metrics
# ============================================================
oof_rmse = rmse_np(y_true, oof)
oof_mae  = mae_np(y_true, oof)
oof_corr = corr_safe(y_true, oof)

# ============================================================
# 9) Save artifacts
# ============================================================
# OOF
oof_df = train[["date_id"]].copy()
oof_df["y_true"] = y_true
oof_df["y_pred"] = oof
oof_path = f"{OUT_SIG}/lgbm_oof.csv"
oof_df.to_csv(oof_path, index=False)

# Fold metrics
fold_df = pd.DataFrame(fold_metrics)
fold_path = f"{OUT_MET}/lgbm_fold_metrics.csv"
fold_df.to_csv(fold_path, index=False)

# Test signal
test_df = test[["date_id"]].copy()
test_df["lgbm_signal"] = test_pred
test_sig_path = f"{OUT_SIG}/test_signal_lgbm.csv"
test_df.to_csv(test_sig_path, index=False)

# FI
if fi_all:
    fi_df = pd.concat(fi_all, ignore_index=True)
    fi_path = f"{OUT_MET}/lgbm_feature_importance.csv"
    fi_df.to_csv(fi_path, index=False)
else:
    fi_path=None

# Dropped features
drop_info_path = f"{OUT_MET}/lgbm_dropped_features.json"
Path(drop_info_path).write_text(json.dumps(
    [{"feature":f, "reason":r} for f,r in dropped_info], indent=2
))

# Save last booster
try:
    last_model_path = f"{OUT_MOD}/lgbm_last_model.txt"
    model.booster_.save_model(last_model_path)
except:
    last_model_path=None

# Update progress
try:
    progress=json.loads(Path(PROGRESS).read_text())
except:
    progress={}

progress["lgbm_baseline"]={
    "timestamp": datetime.now().isoformat(timespec="seconds"),
    "features_used": len(FEATURES),
    "features_dropped": len(dropped_info),
    "folds": len(FOLDS),
    "oof_rmse": oof_rmse,
    "oof_mae": oof_mae,
    "oof_corr": oof_corr,
}
Path(PROGRESS).write_text(json.dumps(progress, indent=2))


# ============================================================
# 10) Output ringkas
# ============================================================
print("=== LIGHTGBM BASELINE — SELESAI ===")
print(f"Target          : {TARGET_COL}")
print(f"Fitur (raw)     : {len(FEATURES_RAW)} → dipakai: {len(FEATURES)}, dibuang: {len(dropped_info)}")
print(f"Folds           : {len(FOLDS)}")
print(f"OOF RMSE        : {oof_rmse:.6f}")
print(f"OOF MAE         : {oof_mae:.6f}")
print(f"OOF Corr        : {oof_corr:.6f}")
print(f"OOF CSV         : {oof_path}")
print(f"Fold metrics    : {fold_path}")
print(f"Test signal     : {test_sig_path}")
print(f"FI              : {fi_path}")
print(f"Dropped feats   : {drop_info_path}")
print(f"Last model      : {last_model_path}")


Training folds:   0%|          | 0/25 [00:00<?, ?it/s]

=== LIGHTGBM BASELINE — SELESAI ===
Target          : forward_returns
Fitur (raw)     : 89 → dipakai: 89, dibuang: 0
Folds           : 25
OOF RMSE        : 0.010518
OOF MAE         : 0.007477
OOF Corr        : 0.067584
OOF CSV         : /kaggle/working/hull-tactical-market-prediction/outputs/signals/lgbm_oof.csv
Fold metrics    : /kaggle/working/hull-tactical-market-prediction/outputs/metrics/lgbm_fold_metrics.csv
Test signal     : /kaggle/working/hull-tactical-market-prediction/outputs/signals/test_signal_lgbm.csv
FI              : /kaggle/working/hull-tactical-market-prediction/outputs/metrics/lgbm_feature_importance.csv
Dropped feats   : /kaggle/working/hull-tactical-market-prediction/outputs/metrics/lgbm_dropped_features.json
Last model      : /kaggle/working/hull-tactical-market-prediction/outputs/models/lgbm_last_model.txt


# Kalibrasi Prediksi

In [5]:
# ============================================
# Tahap 3 — Kalibrasi Prediksi (Revisi Stabil)
# ============================================
import os, json, pickle, warnings, glob
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import spearmanr

warnings.filterwarnings("ignore")

# ============================================================
# 0) Temukan folder outputs secara otomatis
# ============================================================
BASE_SEARCH = "/kaggle/working"

def _find_outputs_dir(base=BASE_SEARCH):
    """Deteksi folder `outputs` secara otomatis & akurat."""
    # 1) Override environment
    env = os.environ.get("HULL_OUT_DIR", "").strip()
    if env and os.path.isdir(env):
        return env
    
    # 2) Lokasi umum pipeline ini
    cand = os.path.join(base, "hull-tactical-market-prediction", "outputs")
    if os.path.isdir(cand):
        return cand

    # 3) Cari seluruh folder bernama "outputs"
    all_outs = glob.glob(os.path.join(base, "**", "outputs"), recursive=True)

    if not all_outs:
        raise FileNotFoundError("Tidak ada folder `outputs` di /kaggle/working.")

    # 4) Ranking berdasarkan isi (signals, metrics, oof)
    ranked = []
    for d in all_outs:
        files = glob.glob(os.path.join(d, "**", "*.csv"), recursive=True)
        score = 0
        score += 2000 if os.path.isdir(os.path.join(d, "signals")) else 0
        score += 1000 if os.path.isdir(os.path.join(d, "metrics")) else 0
        score += sum(1 for f in files if "oof" in os.path.basename(f).lower())
        ranked.append((score, d))

    ranked.sort(reverse=True)
    return ranked[0][1]

OUT_DIR = _find_outputs_dir()
CAL_DIR = os.path.join(OUT_DIR, "calibration")
Path(CAL_DIR).mkdir(parents=True, exist_ok=True)

print(f"[INFO] Outputs dir: {OUT_DIR}")

# ============================================================
# 1) Konfigurasi
# ============================================================
ID_COL = "date_id"

TARGET_CANDS = [
    "forward_returns", "target", "y_true",
    "market_forward_excess_returns"
]

PRED_CANDS = [
    "y_pred","prediction","pred","signal","lgbm_signal",
    "oof_pred","score","allocation"
]

TEST_PRED_CANDS = [
    "prediction","signal","lgbm_signal",
    "pred","score","allocation"
]

CAL_OOF_CSV   = f"{CAL_DIR}/oof_calibrated.csv"
CAL_TEST_CSV  = f"{CAL_DIR}/test_calibrated.csv"
CAL_MODEL_PKL = f"{CAL_DIR}/iso_calibrator.pkl"
CAL_SUMMARY   = f"{CAL_DIR}/calibration_summary.json"

# ============================================================
# 2) Helper metric
# ============================================================
def _metrics(y, p):
    y = np.asarray(y)
    p = np.asarray(p)

    return {
        "rmse": float(np.sqrt(mean_squared_error(y, p))),
        "mae":  float(mean_absolute_error(y, p)),
        "corr": float(np.corrcoef(y, p)[0,1]) if y.std()>0 and p.std()>0 else 0.0,
        "spearman": float(spearmanr(y, p, nan_policy="omit").correlation)
    }

def _pick_col(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

def _read_any(path):
    return pd.read_parquet(path) if path.endswith(".parquet") else pd.read_csv(path)

# ============================================================
# 3) Deteksi OOF & Test signal otomatis
# ============================================================
def _find_oof_and_test(out_dir):
    files = glob.glob(os.path.join(out_dir, "**", "*.csv"), recursive=True) + \
            glob.glob(os.path.join(out_dir, "**", "*.parquet"), recursive=True)

    if not files:
        raise FileNotFoundError("Tidak ada artefak sinyal. Jalankan baseline dulu.")

    best_oof = None
    best_test = None

    # Cari OOF
    for fp in files:
        try:
            df = _read_any(fp)
            pred = _pick_col(df, PRED_CANDS)
            tgt  = _pick_col(df, TARGET_CANDS)
            if pred is None or tgt is None:
                continue

            score = len(df)
            if ID_COL in df.columns:
                score += 50000
            if "oof" in fp.lower():
                score += 10000

            if best_oof is None or score > best_oof[0]:
                best_oof = (score, fp, pred, tgt)
        except:
            pass

    # Cari Test
    for fp in files:
        try:
            df = _read_any(fp)
            pred = _pick_col(df, TEST_PRED_CANDS)
            if pred is None or ID_COL not in df.columns:
                continue

            score = len(df)
            if "test" in fp.lower():
                score += 5000

            if best_test is None or score > best_test[0]:
                best_test = (score, fp, pred)
        except:
            pass

    if best_oof is None:
        raise FileNotFoundError("OOF tidak ditemukan.")
    if best_test is None:
        raise FileNotFoundError("Test signal tidak ditemukan.")

    return best_oof, best_test

(o_score, oof_fp, OOF_PRED, OOF_TGT), (t_score, test_fp, TEST_PRED) = _find_oof_and_test(OUT_DIR)

print(f"[INFO] OOF  : {oof_fp}")
print(f"        pred={OOF_PRED} | target={OOF_TGT}")
print(f"[INFO] Test : {test_fp}")
print(f"        pred={TEST_PRED}")

oof_df  = _read_any(oof_fp).copy()
test_df = _read_any(test_fp).copy()

x_oof = oof_df[OOF_PRED].astype("float64").to_numpy()
y_oof = oof_df[OOF_TGT].astype("float64").to_numpy()

# Noise kecil untuk stabilisasi isotonic
rng = np.random.RandomState(2025)
x_fit = x_oof + 1e-9 * rng.randn(len(x_oof))

m_raw = _metrics(y_oof, x_oof)

# ============================================================
# 4) Fit calibrator (Isotonic → fallback Linear)
# ============================================================
use_model = "isotonic"

try:
    iso = IsotonicRegression(increasing=True, out_of_bounds="clip")
    iso.fit(x_fit, y_oof)
    y_iso = iso.predict(x_fit)
    if np.std(y_iso) < 1e-12:
        raise RuntimeError("Isotonic output flat.")
    calibrator = iso
except Exception as e:
    warnings.warn(f"Isotonic gagal ({e}) → Fallback LinearRegression.")
    use_model = "linear"
    lr = LinearRegression()
    lr.fit(x_fit.reshape(-1,1), y_oof)
    calibrator = lr

def _apply(model, arr):
    arr = np.asarray(arr, float)
    return model.predict(arr) if use_model=="isotonic" else model.predict(arr.reshape(-1,1))

# ============================================================
# 5) Apply + normalisasi + clipping
# ============================================================
oof_df["calibrated"]  = _apply(calibrator, oof_df[OOF_PRED].to_numpy())
test_df["calibrated"] = _apply(calibrator, test_df[TEST_PRED].to_numpy())

# Centering
mean_oof = float(oof_df["calibrated"].mean())
oof_df["calibrated"]  -= mean_oof
test_df["calibrated"] -= mean_oof

# Soft clip 5 sigma
sd = float(oof_df["calibrated"].std(ddof=0) + 1e-12)
lo, hi = -5*sd, 5*sd

oof_df["calibrated"]  = oof_df["calibrated"].clip(lo, hi)
test_df["calibrated"] = test_df["calibrated"].clip(lo, hi)

m_cal = _metrics(y_oof, oof_df["calibrated"])

# ============================================================
# 6) Simpan artefak
# ============================================================
oof_cols  = [c for c in [ID_COL, OOF_TGT, OOF_PRED, "calibrated"] if c in oof_df.columns]
test_cols = [c for c in [ID_COL, TEST_PRED, "calibrated"] if c in test_df.columns]

oof_df[oof_cols].to_csv(CAL_OOF_CSV, index=False, float_format="%.9f")

test_df[test_cols] \
    .rename(columns={TEST_PRED:"prediction_raw",
                     "calibrated":"prediction_calibrated"}) \
    .to_csv(CAL_TEST_CSV, index=False, float_format="%.9f")

with open(CAL_MODEL_PKL, "wb") as f:
    pickle.dump({"model": calibrator, "kind": use_model}, f)

summary = {
    "generated_at": datetime.now().isoformat(timespec="seconds"),
    "outputs_dir": OUT_DIR,
    "oof_file": oof_fp,
    "test_file": test_fp,
    "oof_pred": OOF_PRED,
    "oof_target": OOF_TGT,
    "test_pred": TEST_PRED,
    "calibrator": use_model,
    "raw_metrics": m_raw,
    "calibrated_metrics": m_cal,
    "clip_sigma": 5,
    "clip_bounds": [lo, hi],
    "outputs": {
        "oof_calibrated": CAL_OOF_CSV,
        "test_calibrated": CAL_TEST_CSV,
        "calibrator_pkl": CAL_MODEL_PKL
    }
}
Path(CAL_SUMMARY).write_text(json.dumps(summary, indent=2))

print("\n=== KALIBRASI — SELESAI ===")
print(f"- Calibrator      : {use_model}")
print(f"- OOF raw         : RMSE={m_raw['rmse']:.6f} | MAE={m_raw['mae']:.6f} | Corr={m_raw['corr']:.4f}")
print(f"- OOF calibrated  : RMSE={m_cal['rmse']:.6f} | MAE={m_cal['mae']:.6f} | Corr={m_cal['corr']:.4f}")
print(f"- Files           : {CAL_OOF_CSV}, {CAL_TEST_CSV}, {CAL_MODEL_PKL}")


[INFO] Outputs dir: /kaggle/working/hull-tactical-market-prediction/outputs
[INFO] OOF  : /kaggle/working/hull-tactical-market-prediction/outputs/signals/lgbm_oof.csv
        pred=y_pred | target=y_true
[INFO] Test : /kaggle/working/hull-tactical-market-prediction/outputs/signals/test_signal_lgbm.csv
        pred=lgbm_signal

=== KALIBRASI — SELESAI ===
- Calibrator      : isotonic
- OOF raw         : RMSE=0.010518 | MAE=0.007477 | Corr=0.0676
- OOF calibrated  : RMSE=0.010512 | MAE=0.007490 | Corr=0.0930
- Files           : /kaggle/working/hull-tactical-market-prediction/outputs/calibration/oof_calibrated.csv, /kaggle/working/hull-tactical-market-prediction/outputs/calibration/test_calibrated.csv, /kaggle/working/hull-tactical-market-prediction/outputs/calibration/iso_calibrator.pkl


# Mapping Sinyal

In [6]:
# ============================================
# Tahap 4 — Mapping Sinyal → Allocation (Final Revisi Stabil)
# ============================================
import os, json, glob, warnings
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

# ============================================================
# 0) Temukan folder outputs otomatis (stabil & deterministic)
# ============================================================
BASE_SEARCH = "/kaggle/working"

def _find_outputs_dir(base=BASE_SEARCH):
    # Prioritas 1: environment override
    env = os.environ.get("HULL_OUT_DIR", "").strip()
    if env and os.path.isdir(env):
        return env

    # Prioritas 2: struktur yang umum dipakai
    cand = os.path.join(base, "hull-tactical-market-prediction", "outputs")
    if os.path.isdir(cand):
        return cand

    # Prioritas 3: scan seluruh /kaggle/working
    all_outs = glob.glob(os.path.join(base, "**", "outputs"), recursive=True)
    if not all_outs:
        raise FileNotFoundError("Folder 'outputs' tidak ditemukan. Baseline harus dijalankan dulu.")

    ranked = []
    for d in all_outs:
        files = glob.glob(os.path.join(d, "**", "*.csv"), recursive=True)
        score = 0
        # weighting stabil
        score += 2000 if os.path.isdir(os.path.join(d, "calibration")) else 0
        score += 1500 if os.path.isdir(os.path.join(d, "signals")) else 0
        score += sum(1 for f in files if "test" in f.lower()) * 20
        score += sum(1 for f in files if "oof"  in f.lower()) * 10
        ranked.append((score, d))

    ranked.sort(key=lambda x: x[0], reverse=True)
    return ranked[0][1]

OUT_DIR = _find_outputs_dir()
SIG_DIR = os.path.join(OUT_DIR, "signals")
CAL_DIR = os.path.join(OUT_DIR, "calibration")
ALLOC_DIR = os.path.join(OUT_DIR, "allocations")
Path(ALLOC_DIR).mkdir(parents=True, exist_ok=True)

print(f"[INFO] outputs dir => {OUT_DIR}")

# ============================================================
# 1) Muat sinyal test & OOF
# ============================================================
ID_COL = "date_id"

TEST_CANDS = [
    os.path.join(CAL_DIR, "test_calibrated.csv"),
    os.path.join(SIG_DIR, "test_signal_lgbm.csv"),
]

OOF_CANDS = [
    os.path.join(CAL_DIR, "oof_calibrated.csv"),
    os.path.join(SIG_DIR, "lgbm_oof.csv"),
]

def _read_first(paths):
    for p in paths:
        if os.path.isfile(p):
            try:
                return pd.read_csv(p)
            except:
                pass
    raise FileNotFoundError(f"Gagal memuat file dari kandidat: {paths}")

test_df = _read_first(TEST_CANDS).copy()
oof_df  = _read_first(OOF_CANDS).copy()

# Pastikan date_id ada
if ID_COL not in test_df.columns:
    raise ValueError(f"Kolom wajib `{ID_COL}` tidak ditemukan pada test_df.")

# deteksi kolom prediksi
def _pick(df, cands):
    for c in cands:
        if c in df.columns:
            return c
    return None

pred_test_col = _pick(test_df, [
    "prediction_calibrated", "prediction_raw",
    "prediction", "signal", "y_pred", "allocation"
])
pred_oof_col  = _pick(oof_df, [
    "calibrated", "y_pred", "prediction", "signal"
])
tgt_oof_col   = _pick(oof_df, [
    "forward_returns","target","y_true","market_forward_excess_returns"
])

if pred_test_col is None:
    raise ValueError("Tidak menemukan kolom prediksi test.")
if pred_oof_col is None:
    raise ValueError("Tidak menemukan kolom prediksi OOF.")

print(f"[INFO] test pred => {pred_test_col}")
print(f"[INFO]  oof pred => {pred_oof_col} | target={tgt_oof_col or '-'}")

# ============================================================
# 2) Parameter mapping (aman, bisa di-tuning)
# ============================================================
PARAMS = {
    "k": 0.9,
    "m": 0.0,
    "alpha": 1.25,
    "eps": 0.02,
    "lam": 0.15,
    "clip": (-1.0, 1.0),
}

# ============================================================
# 3) Statistik robust dari OOF (anti-leak)
# ============================================================
def robust_center_scale(x):
    x = np.asarray(x, float)
    med = float(np.median(x))
    mad = float(np.median(np.abs(x - med))) + 1e-12
    rstd = 1.4826 * mad
    if rstd < 1e-12:
        rstd = float(np.std(x) + 1e-12)
    return med, rstd

mu_oof, sd_oof = robust_center_scale(oof_df[pred_oof_col].astype(float).to_numpy())
print(f"[INFO] robust OOF stats: mu={mu_oof:.5f}, sd={sd_oof:.5f}")

# ============================================================
# 4) Mapping fungsi ke allocation
# ============================================================
def soft_threshold(x, thr):
    ax = np.abs(x)
    return np.sign(x) * np.maximum(ax - thr, 0.0)

def map_to_allocation(pred_s, params):
    x = pred_s.astype(float).to_numpy()

    # standardisasi robust
    z = (x - (mu_oof + params["m"])) / (sd_oof + 1e-12)

    # non-linear tanh gain
    a = params["k"] * np.tanh(params["alpha"] * z)

    # dead-band
    a = soft_threshold(a, params["eps"])

    # smoothing EMA
    lam = params["lam"]
    if lam > 0:
        ema = np.zeros_like(a)
        ema[0] = a[0]
        for t in range(1, len(a)):
            ema[t] = (1 - lam) * ema[t-1] + lam * a[t]
        a = ema

    lo, hi = params["clip"]
    return np.clip(a, lo, hi)

# ============================================================
# 5) Terapkan ke test
# ============================================================
alloc = map_to_allocation(test_df[pred_test_col], PARAMS)

alloc_out = pd.DataFrame({
    "date_id": test_df[ID_COL].astype(int),
    "allocation": alloc.astype("float32")
}).sort_values("date_id").reset_index(drop=True)

# ============================================================
# 6) Simpan artefak
# ============================================================
ALLOC_CSV = os.path.join(ALLOC_DIR, "test_allocation_stage1.csv")
PARAMS_JSON = os.path.join(ALLOC_DIR, "mapping_params_stage1.json")
SUMMARY_JSON = os.path.join(ALLOC_DIR, "mapping_summary_stage1.json")

alloc_out.to_csv(ALLOC_CSV, index=False, float_format="%.9f")
Path(PARAMS_JSON).write_text(json.dumps(PARAMS, indent=2), encoding="utf-8")

summary = {
    "generated_at": datetime.now().isoformat(timespec="seconds"),
    "test_pred_col": pred_test_col,
    "oof_pred_col": pred_oof_col,
    "oof_target_col": tgt_oof_col,
    "oof_mu": mu_oof,
    "oof_sd": sd_oof,
    "params": PARAMS,
    "allocation_stats": {
        "min": float(alloc_out["allocation"].min()),
        "max": float(alloc_out["allocation"].max()),
        "mean": float(alloc_out["allocation"].mean()),
        "std": float(alloc_out["allocation"].std(ddof=0)),
        "n": int(len(alloc_out))
    },
    "paths": {
        "allocation_csv": ALLOC_CSV,
        "params_json": PARAMS_JSON
    }
}
Path(SUMMARY_JSON).write_text(json.dumps(summary, indent=2), encoding="utf-8")

print("\n=== MAPPING SINYAL — SELESAI ===")
print(f"- Allocation CSV : {ALLOC_CSV}")
print(f"- Params JSON    : {PARAMS_JSON}")
print(f"- Range          : "
      f"min={summary['allocation_stats']['min']:.4f}, "
      f"max={summary['allocation_stats']['max']:.4f}, "
      f"mean={summary['allocation_stats']['mean']:.4f}, "
      f"std={summary['allocation_stats']['std']:.4f}")


[INFO] outputs dir => /kaggle/working/hull-tactical-market-prediction/outputs
[INFO] test pred => prediction_calibrated
[INFO]  oof pred => calibrated | target=y_true
[INFO] robust OOF stats: mu=-0.00007, sd=0.00000

=== MAPPING SINYAL — SELESAI ===
- Allocation CSV : /kaggle/working/hull-tactical-market-prediction/outputs/allocations/test_allocation_stage1.csv
- Params JSON    : /kaggle/working/hull-tactical-market-prediction/outputs/allocations/mapping_params_stage1.json
- Range          : min=-0.8800, max=-0.4594, mean=-0.6206, std=0.1153


# Backtest & Stress

In [7]:
# ============================================
# Tahap 5 — BACKTEST & STRESS TEST
# (auto-find, no look-ahead, robust)
# ============================================
import os, json, glob, math, warnings
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

# ---------- 0) Temukan outputs ----------
BASE_SEARCH = "/kaggle/working"

def _find_outputs_dir(base=BASE_SEARCH):
    env = os.environ.get("HULL_OUT_DIR", "").strip()
    if env and os.path.isdir(env):
        return env
    cand = os.path.join(base, "hull-tactical-market-prediction", "outputs")
    if os.path.isdir(cand):
        return cand
    all_outs = glob.glob(os.path.join(base, "**", "outputs"), recursive=True)
    if not all_outs:
        raise FileNotFoundError("Folder 'outputs' tidak ditemukan. Jalankan tahap sebelumnya.")
    # pilih yang “kaya” artefak
    scored = []
    for d in all_outs:
        files = glob.glob(os.path.join(d, "**", "*.csv"), recursive=True)
        score = 0
        score += 1000 if os.path.isdir(os.path.join(d, "signals")) else 0
        score += 500  if os.path.isdir(os.path.join(d, "calibration")) else 0
        score += 700  if os.path.isdir(os.path.join(d, "allocations")) else 0
        score += sum(1 for f in files if "oof" in f.lower())
        scored.append((score, d))
    scored.sort(reverse=True)
    return scored[0][1]

OUT_DIR    = _find_outputs_dir()
CAL_DIR    = os.path.join(OUT_DIR, "calibration")
ALLOC_DIR  = os.path.join(OUT_DIR, "allocations")
BT_DIR     = os.path.join(OUT_DIR, "backtests")
Path(BT_DIR).mkdir(parents=True, exist_ok=True)
print(f"[INFO] outputs: {OUT_DIR}")

# ---------- 1) Muat artefak ----------
# OOF terkalibrasi (punya kolom target & prediksi)
OOF_CANDIDATES = [
    os.path.join(CAL_DIR, "oof_calibrated.csv"),
    os.path.join(OUT_DIR, "calibration", "oof_calibrated.csv"),
]
# params mapping & summary (untuk konsistensi mu/sd)
PARAMS_JSON  = os.path.join(ALLOC_DIR, "mapping_params_stage1.json")
SUMMARY_JSON = os.path.join(ALLOC_DIR, "mapping_summary_stage1.json")

def _read_first(paths):
    for p in paths:
        if os.path.isfile(p):
            return pd.read_csv(p)
    raise FileNotFoundError(f"Tidak menemukan file pada: {paths}")

oof = _read_first(OOF_CANDIDATES).copy()
assert os.path.isfile(PARAMS_JSON), f"Tidak menemukan: {PARAMS_JSON}"
assert os.path.isfile(SUMMARY_JSON), f"Tidak menemukan: {SUMMARY_JSON}"

PARAMS = json.loads(Path(PARAMS_JSON).read_text(encoding="utf-8"))
SUMMAR = json.loads(Path(SUMMARY_JSON).read_text(encoding="utf-8"))

# deteksi kolom
ID_COL = "date_id"
tgt_col = None
for c in ["forward_returns","target","y_true","market_forward_excess_returns"]:
    if c in oof.columns: tgt_col = c; break
pred_col = None
for c in ["calibrated","y_pred","prediction","signal"]:
    if c in oof.columns: pred_col = c; break

assert tgt_col is not None, "Kolom target OOF tidak ditemukan."
assert pred_col is not None, "Kolom prediksi OOF tidak ditemukan."

# ---------- 2) Re-map OOF → allocation dengan PARAMS yang sama ----------
mu_oof = float(SUMMAR.get("oof_mu", 0.0))
sd_oof = float(SUMMAR.get("oof_sd", 1.0))
if sd_oof <= 0: sd_oof = float(oof[pred_col].std(ddof=0) + 1e-12)

def soft_threshold(x, thr):
    ax = np.abs(x)
    return np.sign(x) * np.maximum(ax - thr, 0.0)

def map_to_allocation(x: np.ndarray, params):
    # standardisasi pakai mu/sd OOF dari tahap mapping
    z = (x - (mu_oof + params["m"])) / (sd_oof + 1e-12)
    a = params["k"] * np.tanh(params["alpha"] * z)
    a = soft_threshold(a, params["eps"])
    lam = float(params["lam"])
    if lam > 0:
        ema = np.zeros_like(a)
        ema[0] = a[0]
        for t in range(1, len(a)):
            ema[t] = (1 - lam) * ema[t-1] + lam * a[t]
        a = ema
    lo, hi = params.get("clip", (-1.0, 1.0))
    return np.clip(a, lo, hi)

x = oof[pred_col].astype("float64").to_numpy()
alloc_oof = map_to_allocation(x, PARAMS)
oof["allocation"] = alloc_oof.astype("float32")

# ---------- 3) Hitung metrik kinerja ----------
TRADING_DAYS = 252

# daily return strategi
y = oof[tgt_col].astype("float64").to_numpy()
ret = (alloc_oof * y)
ret = np.nan_to_num(ret, nan=0.0, posinf=0.0, neginf=0.0)

def sharpe_annual(r):
    s = float(np.std(r, ddof=0))
    if s < 1e-12: return 0.0
    return float(np.mean(r) / s * math.sqrt(TRADING_DAYS))

def cagr(r):
    # asumsi 1 hari = 1 langkah trading
    cum = float(np.prod(1.0 + r))
    yrs = len(r) / TRADING_DAYS
    if yrs <= 0: return 0.0
    return float(cum ** (1.0 / yrs) - 1.0)

def max_drawdown(r):
    equity = np.cumprod(1.0 + r)
    peak = np.maximum.accumulate(equity)
    dd = equity / peak - 1.0
    return float(dd.min()), dd

def turnover(a):
    # rata-rata |Δ alloc|
    if len(a) <= 1: return 0.0
    return float(np.mean(np.abs(np.diff(a))))

sharpe = sharpe_annual(ret)
cagr_v = cagr(ret)
mdd, dd_series = max_drawdown(ret)
turn = turnover(alloc_oof)

# occupancy pada batas
lo, hi = PARAMS.get("clip", (-1.0, 1.0))
bound_eps = 1e-6
bound_lo = float(np.mean(alloc_oof <= lo + bound_eps) * 100.0)
bound_hi = float(np.mean(alloc_oof >= hi - bound_eps) * 100.0)

# ---------- 4) Simpan daily perf & rolling metrics ----------
# siapkan index tanggal jika ada; jika tidak, gunakan date_id
if ID_COL in oof.columns:
    idx = oof[ID_COL].astype(int).to_numpy()
else:
    idx = np.arange(len(oof), dtype=int)

daily = pd.DataFrame({
    "date_id": idx,
    "allocation": alloc_oof.astype("float32"),
    "target": y.astype("float32"),
    "strategy_ret": ret.astype("float32"),
})
daily["equity"] = (1.0 + daily["strategy_ret"]).cumprod()

DAILY_CSV = os.path.join(BT_DIR, "daily_perf.csv")
daily.to_csv(DAILY_CSV, index=False, float_format="%.9f")

# Rolling 63d
WIN = 63
def rolling_sharpe(arr, win=WIN):
    out = np.full(len(arr), np.nan, dtype=float)
    for i in range(win-1, len(arr)):
        w = arr[i-win+1:i+1]
        s = np.std(w, ddof=0)
        out[i] = 0.0 if s < 1e-12 else (np.mean(w)/s) * math.sqrt(TRADING_DAYS)
    return out

def rolling_cagr(arr, win=WIN):
    out = np.full(len(arr), np.nan, dtype=float)
    yrs_win = win / TRADING_DAYS
    for i in range(win-1, len(arr)):
        w = arr[i-win+1:i+1]
        total = np.prod(1.0 + w)
        out[i] = total ** (1.0 / yrs_win) - 1.0
    return out

roll = pd.DataFrame({
    "date_id": idx,
    "roll63_sharpe": rolling_sharpe(ret, WIN),
    "roll63_cagr":   rolling_cagr(ret, WIN),
})
ROLLING_CSV = os.path.join(BT_DIR, "rolling_63d_metrics.csv")
roll.to_csv(ROLLING_CSV, index=False, float_format="%.9f")

# ---------- 5) Stress tests ----------
def window_stats(arr, w):
    # cari window terburuk/terbaik berdasarkan return kumulatif
    if len(arr) < w: 
        return None
    best = (-1e18, -1, -1)  # (cumret, start, end)
    worst= ( 1e18, -1, -1)
    cumprod = np.ones(len(arr)+1, dtype=float)
    cumprod[1:] = np.cumprod(1.0 + arr)
    for i in range(len(arr)-w+1):
        j = i + w
        cr = cumprod[j]/cumprod[i] - 1.0
        if cr > best[0]:  best = (cr, i, j-1)
        if cr < worst[0]: worst = (cr, i, j-1)
    return {"best": {"cumret": float(best[0]), "start_idx": int(best[1]), "end_idx": int(best[2])},
            "worst":{"cumret": float(worst[0]),"start_idx": int(worst[1]),"end_idx": int(worst[2])}}

stress = {
    "window_5d":  window_stats(ret, 5),
    "window_21d": window_stats(ret, 21),
    "window_63d": window_stats(ret, 63),
}

STRESS_JSON = os.path.join(BT_DIR, "stress_tests.json")
Path(STRESS_JSON).write_text(json.dumps(stress, indent=2), encoding="utf-8")

# ---------- 6) Ringkasan ----------
SUMMARY_JSON = os.path.join(BT_DIR, "backtest_summary.json")
summary = {
    "generated_at": datetime.now().isoformat(timespec="seconds"),
    "oof_file": [p for p in OOF_CANDIDATES if os.path.isfile(p)][0],
    "params_file": PARAMS_JSON,
    "metrics": {
        "sharpe_ann": float(sharpe),
        "cagr": float(cagr_v),
        "max_drawdown": float(mdd),
        "turnover": float(turn),
        "bound_lo_%": bound_lo,
        "bound_hi_%": bound_hi,
        "days": int(len(ret)),
    },
    "paths": {
        "daily_perf_csv": DAILY_CSV,
        "rolling_63d_csv": ROLLING_CSV,
        "summary_json": SUMMARY_JSON,
        "stress_json": STRESS_JSON
    }
}
Path(SUMMARY_JSON).write_text(json.dumps(summary, indent=2), encoding="utf-8")

print("\n=== BACKTEST & STRESS TEST — SELESAI ===")
print(f"- Daily perf : {DAILY_CSV}")
print(f"- Rolling(63): {ROLLING_CSV}")
print(f"- Summary    : {SUMMARY_JSON}")
print(f"- Stress     : {STRESS_JSON}")
m = summary["metrics"]
print(f"- Days       : {m['days']}")
print(f"- Sharpe ann : {m['sharpe_ann']:.3f} | CAGR {m['cagr']*100:.3f}% | MaxDD {m['max_drawdown']*100:.2f}%")
print(f"- Turnover   : {m['turnover']:.3f} | Bound@lo {m['bound_lo_%']:.1f}% | Bound@hi {m['bound_hi_%']:.1f}%")


[INFO] outputs: /kaggle/working/hull-tactical-market-prediction/outputs

=== BACKTEST & STRESS TEST — SELESAI ===
- Daily perf : /kaggle/working/hull-tactical-market-prediction/outputs/backtests/daily_perf.csv
- Rolling(63): /kaggle/working/hull-tactical-market-prediction/outputs/backtests/rolling_63d_metrics.csv
- Summary    : /kaggle/working/hull-tactical-market-prediction/outputs/backtests/backtest_summary.json
- Stress     : /kaggle/working/hull-tactical-market-prediction/outputs/backtests/stress_tests.json
- Days       : 9021
- Sharpe ann : 0.327 | CAGR 2.014% | MaxDD -39.42%
- Turnover   : 0.030 | Bound@lo 0.0% | Bound@hi 0.0%


# Tunning

In [8]:
# ============================================
# Tahap 6 — Tuning LightGBM (Optuna, clean output)
# ============================================
import os, io, json, stat, gc, math, warnings, contextlib, glob
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import lightgbm as lgb

# --- Anti-spam: bungkam stderr (hilangkan "1 warning generated.") ---
warnings.filterwarnings("ignore")
os.environ["PYTHONWARNINGS"] = "ignore"

@contextlib.contextmanager
def suppress_stderr():
    buf = io.StringIO()
    with contextlib.redirect_stderr(buf):
        yield

def lgb_train_silent(params, dtrain, valid_sets, valid_names,
                     num_boost_round, early_stopping_rounds=200, log_period=50):
    callbacks = [
        lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False),
        lgb.log_evaluation(period=log_period),
    ]
    params = {**params, "verbosity": -1}
    with suppress_stderr():
        model = lgb.train(
            params, dtrain,
            num_boost_round=num_boost_round,
            valid_sets=valid_sets, valid_names=valid_names,
            callbacks=callbacks,
        )
    return model

# --- Optuna logging agar tidak ramai ---
try:
    import optuna
    optuna.logging.set_verbosity(optuna.logging.WARNING)
except Exception:
    optuna = None

# ---------------- Config ----------------
ROOT   = "/kaggle/working/hull-tactical-market-prediction"
INPUT  = "/kaggle/input/hull-tactical-market-prediction"
OUTDIR = f"{ROOT}/outputs/tuning_lgbm"
Path(OUTDIR).mkdir(parents=True, exist_ok=True)

ID_COL   = "date_id"
TARGET_CANDS = ["forward_returns","target","y_true","market_forward_excess_returns"]
BAN_COLS = {ID_COL, "row_id", "investment_id"} | set(TARGET_CANDS)

N_FOLDS     = 25            # rolling time-based
NUM_BOOST   = 10_000
EARLY_STOP  = 600
LOG_PERIOD  = 100           # set 0 kalau ingin tanpa progress
RSTATE      = 2025
N_TRIALS    = 40            # ubah sesuai waktu
METRIC_NAME = "corr_mean"

# --------- Load data (pakai yang sudah ada di kernel bila tersedia) ---------
def _safe_load_df(default_path_csv, fallback_var_name):
    if fallback_var_name in globals():
        return globals()[fallback_var_name].copy()
    return pd.read_csv(default_path_csv)

train = _safe_load_df(f"{INPUT}/train.csv", "train")
test  = _safe_load_df(f"{INPUT}/test.csv",  "test")

# --- Tentukan target & fitur yang tersedia ---
tgt_col = next((c for c in TARGET_CANDS if c in train.columns), None)
if tgt_col is None:
    raise ValueError(f"Tidak menemukan kolom target. Kandidat: {TARGET_CANDS}")

num_cols_train = train.select_dtypes(include=[np.number]).columns.tolist()
feat_cols = [c for c in num_cols_train if c not in BAN_COLS]

# Sinkronkan fitur dengan test (hindari KeyError)
feat_cols = [c for c in feat_cols if c in test.columns]
feat_cols = sorted(feat_cols)

# Isi NA & sort waktu
train[feat_cols] = train[feat_cols].fillna(0.0)
test[feat_cols]  = test[feat_cols].fillna(0.0)
train = train.sort_values(ID_COL).reset_index(drop=True)
test  = test.sort_values(ID_COL).reset_index(drop=True)

print(f"Fitur kandidat: {len(feat_cols)} | Target: {tgt_col}")

# --------- Time-based rolling folds ---------
def make_time_folds(df, n_folds=25, id_col=ID_COL):
    dates = np.sort(df[id_col].unique())
    # bagi tanggal menjadi n_folds segmen berurutan
    chunks = np.array_split(dates, n_folds)
    folds = []
    for seg in chunks:
        val_dates = np.array(seg)
        tr_idx = df.index[df[id_col] < val_dates.min()].to_numpy()
        va_idx = df.index[df[id_col].isin(val_dates)].to_numpy()
        if len(tr_idx) == 0 or len(va_idx) == 0:
            continue
        folds.append((tr_idx, va_idx))
    return folds

folds = make_time_folds(train, N_FOLDS, ID_COL)
print(f"Folds: {len(folds)}")

# --------- Objective util ---------
def pearson_corr(y, p):
    if y.size < 2: 
        return 0.0
    v = np.corrcoef(y, p)[0,1]
    if not np.isfinite(v):
        return 0.0
    return float(v)

def cv_score(params):
    """Kembalikan mean Pearson corr OOF untuk parameter LightGBM tertentu."""
    oof_pred = np.zeros(len(train), dtype=np.float64)
    fold_stats = []
    for f, (tr_idx, va_idx) in enumerate(folds, 1):
        X_tr, y_tr = train.loc[tr_idx, feat_cols], train.loc[tr_idx, tgt_col].astype(np.float64).values
        X_va, y_va = train.loc[va_idx, feat_cols], train.loc[va_idx, tgt_col].astype(np.float64).values

        dtr = lgb.Dataset(X_tr, label=y_tr, free_raw_data=False)
        dva = lgb.Dataset(X_va, label=y_va, free_raw_data=False)

        gbm = lgb_train_silent(
            params, dtr,
            valid_sets=[dva], valid_names=[f"fold{f:02d}"],
            num_boost_round=NUM_BOOST,
            early_stopping_rounds=EARLY_STOP,
            log_period=LOG_PERIOD,
        )
        pred = gbm.predict(X_va, num_iteration=gbm.best_iteration)
        oof_pred[va_idx] = pred

        corr = pearson_corr(y_va, pred)
        fold_stats.append(corr)

        del dtr, dva, gbm, X_tr, X_va, y_tr, y_va, pred
        gc.collect()

    score = float(np.mean(fold_stats))
    return score, oof_pred, fold_stats

# --------- Optuna search space ---------
def suggest_params(trial):
    return {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "learning_rate": trial.suggest_float("lr", 0.01, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 31, 255),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 10),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "max_depth": trial.suggest_int("max_depth", -1, 12),
        "n_jobs": -1,
        "verbosity": -1,
        "seed": RSTATE,
        "feature_fraction_bynode": 1.0,
    }

# --------- Jalankan Optuna (optional) ---------
if optuna is not None:
    def objective(trial):
        params = suggest_params(trial)
        score, _, _ = cv_score(params)
        # maximize mean corr
        return score

    study = optuna.create_study(direction="maximize", study_name="lgbm_tuning_clean")
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=False)

    best_params = suggest_params(study.best_trial)  # materialize values (tanpa distributions)
    best_score  = float(study.best_value)
else:
    # fallback: satu set params "bagus" default jika optuna tidak tersedia
    best_params = {
        "objective":"regression","metric":"rmse","boosting_type":"gbdt",
        "learning_rate":0.05,"num_leaves":127,"min_data_in_leaf":64,
        "feature_fraction":0.85,"bagging_fraction":0.85,"bagging_freq":1,
        "lambda_l1":0.0,"lambda_l2":1.0,"max_depth":-1,"n_jobs":-1,
        "verbosity":-1,"seed":RSTATE
    }
    best_score = None

# --------- CV ulang dengan best params untuk simpan artefak ---------
score, oof_pred, fold_stats = cv_score(best_params)

oof_df = train[[ID_COL, tgt_col]].copy()
oof_df["y_pred"] = oof_pred.astype(np.float32)
oof_df.to_csv(f"{OUTDIR}/lgbm_oof_tuned.csv", index=False, float_format="%.9f")

# Refit full untuk infer test
dfull = lgb.Dataset(train[feat_cols], label=train[tgt_col].astype(np.float64).values, free_raw_data=False)
gbm_full = lgb_train_silent(
    best_params, dfull,
    valid_sets=[dfull], valid_names=["full"],
    num_boost_round=NUM_BOOST,
    early_stopping_rounds=EARLY_STOP,
    log_period=0,  # refit hening
)
test_pred = gbm_full.predict(test[feat_cols], num_iteration=gbm_full.best_iteration)

test_sig = test[[ID_COL]].copy()
test_sig["prediction"] = test_pred.astype(np.float32)
test_sig.to_csv(f"{OUTDIR}/test_signal_lgbm_tuned.csv", index=False, float_format="%.9f")

# --------- Simpan ringkasan ---------
summary = {
    "generated_at": datetime.now().isoformat(timespec="seconds"),
    "target": tgt_col,
    "n_features": len(feat_cols),
    "n_folds": len(folds),
    "metric": METRIC_NAME,
    "mean_corr_oof": float(score),
    "fold_corrs": [float(x) for x in fold_stats],
    "best_params": best_params,
    "num_boost_round": NUM_BOOST,
    "early_stopping_rounds": EARLY_STOP,
    "log_period": LOG_PERIOD,
    "artifacts": {
        "oof_csv": f"{OUTDIR}/lgbm_oof_tuned.csv",
        "test_signal_csv": f"{OUTDIR}/test_signal_lgbm_tuned.csv",
        "best_params_json": f"{OUTDIR}/best_params.json",
        "model_full_txt": f"{OUTDIR}/lgbm_full_tuned.txt",
    }
}
Path(f"{OUTDIR}/best_params.json").write_text(json.dumps(summary["best_params"], indent=2), encoding="utf-8")
gbm_full.save_model(f"{OUTDIR}/lgbm_full_tuned.txt")
Path(f"{OUTDIR}/summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")

print("\n=== TUNING LGBM (CLEAN) — SELESAI ===")
if best_score is not None:
    print(f"- Optuna best corr (study): {best_score:.6f}")
print(f"- OOF mean corr (recalc)  : {score:.6f}")
print(f"- Fitur dipakai           : {len(feat_cols)}")
print(f"- OOF CSV                 : {OUTDIR}/lgbm_oof_tuned.csv")
print(f"- Test signal             : {OUTDIR}/test_signal_lgbm_tuned.csv")
print(f"- Params JSON             : {OUTDIR}/best_params.json")


Fitur kandidat: 89 | Target: forward_returns
Folds: 24
[100]	fold01's rmse: 0.00712788
[200]	fold01's rmse: 0.00712788
[300]	fold01's rmse: 0.00712788
[400]	fold01's rmse: 0.00712788
[500]	fold01's rmse: 0.00712788
[600]	fold01's rmse: 0.00712788
[100]	fold02's rmse: 0.00581698
[200]	fold02's rmse: 0.00581698
[300]	fold02's rmse: 0.00581698
[400]	fold02's rmse: 0.00581698
[500]	fold02's rmse: 0.00581698
[600]	fold02's rmse: 0.00581698
[100]	fold03's rmse: 0.00593723
[200]	fold03's rmse: 0.00593723
[300]	fold03's rmse: 0.00593723
[400]	fold03's rmse: 0.00593723
[500]	fold03's rmse: 0.00593723
[600]	fold03's rmse: 0.00593723
[100]	fold04's rmse: 0.00788853
[200]	fold04's rmse: 0.00788853
[300]	fold04's rmse: 0.00788853
[400]	fold04's rmse: 0.00788853
[500]	fold04's rmse: 0.00788853
[600]	fold04's rmse: 0.00788853
[100]	fold05's rmse: 0.0110658
[200]	fold05's rmse: 0.0110658
[300]	fold05's rmse: 0.0110658
[400]	fold05's rmse: 0.0110658
[500]	fold05's rmse: 0.0110658
[600]	fold05's rmse: 0

# Validasi

In [9]:
# ============================================
# Tahap 7 — Validasi (OOF/Test/Submission)
# ============================================
import os, json, glob, warnings
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import spearmanr

warnings.filterwarnings("ignore")

# ---------- Lokasi & artefak ----------
ROOT    = "/kaggle/working/hull-tactical-market-prediction"
INPUT   = "/kaggle/input/hull-tactical-market-prediction"
OUTDIR  = f"{ROOT}/outputs"
VALDIR  = f"{OUTDIR}/validation"
Path(VALDIR).mkdir(parents=True, exist_ok=True)

ID_COL        = "date_id"
TARGET_CANDS  = ["forward_returns","target","y_true","market_forward_excess_returns"]
PRED_CANDS    = ["calibrated","y_pred","prediction","signal","allocation","weight","pred"]
TEST_PRED_CANDS = ["prediction_calibrated","prediction","signal","y_pred","allocation","weight"]

# Artefak umum yang mungkin sudah ada
OOF_HINTS = [
    f"{OUTDIR}/calibration/oof_calibrated.csv",
    f"{OUTDIR}/tuning_lgbm/lgbm_oof_tuned.csv",
    f"{OUTDIR}/signals/lgbm_oof.csv",
]
TEST_HINTS = [
    f"{OUTDIR}/calibration/test_calibrated.csv",
    f"{OUTDIR}/tuning_lgbm/test_signal_lgbm_tuned.csv",
    f"{OUTDIR}/signals/test_signal_lgbm.csv",
]
SUB_HINTS = [
    f"{ROOT}/kaggle_evaluation/submission.csv",
    "/kaggle/working/submission.csv",
]

# ---------- Util ----------
def _metrics(y, p):
    rmse = float(np.sqrt(mean_squared_error(y, p)))
    mae  = float(mean_absolute_error(y, p))
    corr = float(np.corrcoef(y, p)[0,1]) if len(y) > 1 else float("nan")
    try:
        spr = float(spearmanr(y, p, nan_policy="omit").correlation)
    except Exception:
        spr = float("nan")
    return {"rmse": rmse, "mae": mae, "corr": corr, "spearman": spr}

def _pick_col(df, cands):
    for c in cands:
        if c in df.columns: return c
    return None

def _first_exists(paths):
    for p in paths:
        if os.path.isfile(p): return p
    return None

def _list_all_csvs(base):
    files = glob.glob(f"{base}/**/*.csv", recursive=True) + glob.glob(f"{base}/**/*.parquet", recursive=True)
    return files

def _read_any(fp):
    return pd.read_parquet(fp) if fp.endswith(".parquet") else pd.read_csv(fp)

# ---------- 1) Muat data train (untuk target & sanity) ----------
train = pd.read_csv(f"{INPUT}/train.csv")
tgt_col = next((c for c in TARGET_CANDS if c in train.columns), None)
if tgt_col is None:
    raise ValueError(f"Tidak menemukan kolom target pada train. Kandidat: {TARGET_CANDS}")

# ---------- 2) Temukan OOF & Test secara otomatis ----------
oof_fp  = _first_exists(OOF_HINTS)
test_fp = _first_exists(TEST_HINTS)

# fallback: cari seluruh CSV bila hint tidak ada (mis. nama folder berbeda)
if oof_fp is None or test_fp is None:
    for fp in _list_all_csvs(OUTDIR):
        try:
            df = _read_any(fp)
            if oof_fp is None:
                if (_pick_col(df, PRED_CANDS) is not None) and (_pick_col(df, TARGET_CANDS) is not None):
                    oof_fp = fp
            if test_fp is None:
                if (_pick_col(df, TEST_PRED_CANDS) is not None) and (ID_COL in df.columns):
                    test_fp = fp
            if oof_fp and test_fp: break
        except Exception:
            pass

if oof_fp is None:
    raise FileNotFoundError("Tidak menemukan OOF (cari di outputs/**). Jalankan baseline/tuning/kalibrasi dulu.")
if test_fp is None:
    raise FileNotFoundError("Tidak menemukan prediksi test (cari di outputs/**).")

print(f"[INFO] OOF  : {oof_fp}")
print(f"[INFO] Test : {test_fp}")

oof  = _read_any(oof_fp).copy()
test = _read_any(test_fp).copy()

# Tentukan kolom prediksi
oof_pred_col  = _pick_col(oof,  PRED_CANDS)
test_pred_col = _pick_col(test, TEST_PRED_CANDS)
if oof_pred_col is None:
    raise ValueError(f"Tidak menemukan kolom prediksi pada OOF. Kandidat: {PRED_CANDS}")
if test_pred_col is None:
    raise ValueError(f"Tidak menemukan kolom prediksi pada Test. Kandidat: {TEST_PRED_CANDS}")

# Pastikan target tersedia di OOF: jika tidak, map dari train via (date_id)
if _pick_col(oof, TARGET_CANDS) is None:
    if ID_COL in oof.columns and ID_COL in train.columns:
        oof = oof.merge(train[[ID_COL, tgt_col]], on=ID_COL, how="left", validate="m:1")
    else:
        raise ValueError("OOF tidak punya target & tidak bisa merge via date_id.")

oof_tgt_col = _pick_col(oof, TARGET_CANDS)

# ---------- 3) Validasi OOF: metrik global & stability by time ----------
oof = oof.sort_values(ID_COL).reset_index(drop=True)
y = oof[oof_tgt_col].astype("float64").to_numpy()
p = oof[oof_pred_col].astype("float64").to_numpy()
m_global = _metrics(y, p)

# Bagi menjadi N irisan waktu berimbang
N_SLICES = 10
dates = np.sort(oof[ID_COL].unique())
chunks = np.array_split(dates, N_SLICES)
slice_rows = []
for i, seg in enumerate(chunks, 1):
    sub = oof[oof[ID_COL].isin(seg)]
    my = sub[oof_tgt_col].astype("float64").to_numpy()
    mp = sub[oof_pred_col].astype("float64").to_numpy()
    mm = _metrics(my, mp)
    mm.update({
        "slice": i,
        "date_min": int(seg.min()),
        "date_max": int(seg.max()),
        "n": int(sub.shape[0]),
        "pred_mean": float(sub[oof_pred_col].mean()),
        "pred_std": float(sub[oof_pred_col].std(ddof=0))
    })
    slice_rows.append(mm)

oof_slice_df = pd.DataFrame(slice_rows)
oof_slice_df.to_csv(f"{VALDIR}/oof_slice_metrics.csv", index=False)

# ---------- 4) Drift check: distribusi prediksi OOF vs Test ----------
# Normalisasi ringan: pastikan numeric & tanpa NaN/Inf
def _clean(x):
    x = pd.Series(x, dtype="float64").replace([np.inf, -np.inf], np.nan).fillna(0.0)
    return x.to_numpy()

oo = _clean(oof[oof_pred_col])
tt = _clean(test[test_pred_col])

dist = {
    "oof": {"mean": float(np.mean(oo)), "std": float(np.std(oo, ddof=0)),
            "min": float(np.min(oo)), "max": float(np.max(oo)), "n": int(oof.shape[0])},
    "test": {"mean": float(np.mean(tt)), "std": float(np.std(tt, ddof=0)),
             "min": float(np.min(tt)), "max": float(np.max(tt)), "n": int(test.shape[0])},
    "ratio_std_test_to_oof": float((np.std(tt, ddof=0)+1e-12)/(np.std(oo, ddof=0)+1e-12)),
}

# ---------- 5) Cek submission (opsional) ----------
sub_fp = _first_exists(SUB_HINTS)
sub_info = None
if sub_fp and os.path.isfile(sub_fp):
    sub = pd.read_csv(sub_fp)
    ok_schema = (ID_COL in sub.columns) and (_pick_col(sub, ["allocation","prediction","signal"]) is not None)
    miss = None
    if ok_schema:
        # harap semua id test ada
        miss = sorted(set(test[ID_COL].unique().tolist()) - set(sub[ID_COL].unique().tolist()))
        miss = len(miss)
    sub_info = {
        "path": sub_fp,
        "ok_schema": bool(ok_schema),
        "rows": int(sub.shape[0]),
        "missing_ids_vs_test": int(miss if miss is not None else -1)
    }

# ---------- 6) Simpan ringkasan ----------
summary = {
    "generated_at": datetime.now().isoformat(timespec="seconds"),
    "files": {
        "oof": oof_fp,
        "test": test_fp,
        "submission": sub_info["path"] if sub_info else None
    },
    "columns": {
        "oof_pred": oof_pred_col,
        "oof_target": oof_tgt_col,
        "test_pred": test_pred_col
    },
    "oof_metrics_global": m_global,
    "oof_metrics_slices_csv": f"{VALDIR}/oof_slice_metrics.csv",
    "distribution_check": dist,
    "submission_check": sub_info
}
Path(f"{VALDIR}/validation_summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")

# Simpan sampel residual untuk audit cepat
res = oof[[ID_COL, oof_tgt_col, oof_pred_col]].copy()
res["residual"] = res[oof_tgt_col].astype("float64") - res[oof_pred_col].astype("float64")
res.head(2000).to_csv(f"{VALDIR}/oof_residual_sample.csv", index=False, float_format="%.9f")

print("\n=== VALIDASI — SELESAI ===")
print(f"- OOF global : RMSE={m_global['rmse']:.6f} | MAE={m_global['mae']:.6f} | Corr={m_global['corr']:.4f} | Spearman={m_global['spearman']:.4f}")
print(f"- Slice CSV  : {VALDIR}/oof_slice_metrics.csv")
print(f"- Distribusi : OOF std={dist['oof']['std']:.6f} | Test std={dist['test']['std']:.6f} | Rasio std={dist['ratio_std_test_to_oof']:.3f}")
if sub_info:
    print(f"- Submission : {sub_info['path']} | ok_schema={sub_info['ok_schema']} | rows={sub_info['rows']} | missing_ids_vs_test={sub_info['missing_ids_vs_test']}")
print(f"- Ringkasan  : {VALDIR}/validation_summary.json")


[INFO] OOF  : /kaggle/working/hull-tactical-market-prediction/outputs/calibration/oof_calibrated.csv
[INFO] Test : /kaggle/working/hull-tactical-market-prediction/outputs/calibration/test_calibrated.csv

=== VALIDASI — SELESAI ===
- OOF global : RMSE=0.010512 | MAE=0.007490 | Corr=0.0930 | Spearman=0.0555
- Slice CSV  : /kaggle/working/hull-tactical-market-prediction/outputs/validation/oof_slice_metrics.csv
- Distribusi : OOF std=0.000655 | Test std=0.000172 | Rasio std=0.262
- Ringkasan  : /kaggle/working/hull-tactical-market-prediction/outputs/validation/validation_summary.json


# Submission

In [10]:
# ============================================
# Tahap 8 — Submission (Final Anti-Error Version)
# 100% cocok format Hull Tactical (PARQUET + API Server)
# ============================================
import os, json, warnings, time
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd
import polars as pl
import kaggle_evaluation.default_inference_server

warnings.filterwarnings("ignore")

ROOT   = "/kaggle/working/hull-tactical-market-prediction"
INPUT  = "/kaggle/input/hull-tactical-market-prediction"
OUTDIR = f"{ROOT}/outputs"
KEDIR  = f"{ROOT}/kaggle_evaluation"
Path(KEDIR).mkdir(parents=True, exist_ok=True)

ID_COL     = "date_id"
SUBMIT_COL = "allocation"
CLIP_RANGE = (-1.0, 1.0)

CANDIDATES = [
    f"{OUTDIR}/mapped/test_allocation.csv",
    f"{OUTDIR}/calibration/test_calibrated.csv",
    f"{OUTDIR}/signals/test_signal_lgbm.csv",
    f"{OUTDIR}/tuning_lgbm/test_signal_lgbm_tuned.csv",
]

# -------------------------
# UTIL
# -------------------------
def _first_exists(paths):
    for p in paths:
        if os.path.isfile(p):
            return p
    return None

def _auto_pick_pred_col(df):
    for c in ["allocation","prediction_calibrated","prediction","signal","y_pred","weight","lgbm_pred"]:
        if c in df.columns:
            return c
    return None

def _read(path):
    return pd.read_parquet(path) if path.endswith(".parquet") else pd.read_csv(path)

def _standardize(X, mean, std):
    std = np.where(std == 0, 1.0, std)
    return (X - mean) / std

def _fit_ridge(X, y, l2=1e-2):
    XT = X.T
    A = XT @ X
    A += l2 * np.eye(A.shape[0])
    w = np.linalg.solve(A, XT @ y)
    return w

def _train_ridge_from_train_csv(train_csv_path: str):
    tr = pd.read_csv(train_csv_path)

    TARGET_CANDS = ["forward_returns","target","market_forward_excess_returns"]
    tgt = next((c for c in TARGET_CANDS if c in tr.columns), None)
    if tgt is None:
        raise RuntimeError("Target tidak ditemukan.")

    # hanya fitur numerik
    feat_cols = [
        c for c in tr.select_dtypes(include=[np.number]).columns 
        if c not in {ID_COL, tgt}
    ]

    X = tr[feat_cols].to_numpy(float)
    y = tr[tgt].to_numpy(float)

    mean = X.mean(axis=0)
    std  = X.std(axis=0, ddof=0)
    Xz   = _standardize(X, mean, std)

    y_mean = y.mean()
    w = _fit_ridge(Xz, y - y_mean, l2=1e-2)
    b = y_mean

    return {
        "feat_cols": feat_cols,
        "mean": mean,
        "std": std,
        "w": w,
        "b": b,
    }

def _predict_with_ridge_batch(df, ridge):
    cols = ridge["feat_cols"]
    for c in cols:
        if c not in df.columns:
            df[c] = 0.0

    X = df[cols].to_numpy(float)
    Xz = _standardize(X, ridge["mean"], ridge["std"])
    yhat = Xz @ ridge["w"] + ridge["b"]
    return np.tanh(yhat).astype("float32")


# ============================================
# 1) Build Final Submission
# ============================================
test_full = pd.read_csv(f"{INPUT}/test.csv")
test_ids  = np.sort(test_full[ID_COL].unique())

# 1a. pilih sumber prediksi terbaik
src = _first_exists(CANDIDATES)
pred_col, dfp = None, None

if src:
    try:
        dfp = _read(src)
        pred_col = _auto_pick_pred_col(dfp)
    except:
        src, pred_col, dfp = None, None, None


# 1b. Kalau tidak ada sinyal → fallback Ridge (selalu aman)
if src is None or pred_col is None:
    ridge = _train_ridge_from_train_csv(f"{INPUT}/train.csv")
    alloc = _predict_with_ridge_batch(test_full, ridge)

    sub = pd.DataFrame({
        ID_COL: test_full[ID_COL].astype("int64"),
        SUBMIT_COL: alloc
    })

else:
    # gunakan file sinyal
    dfp = dfp[[ID_COL, pred_col]].copy()
    dfp = dfp.drop_duplicates(ID_COL, keep="last")

    sub = pd.DataFrame({ID_COL: test_ids})
    sub = sub.merge(dfp.rename(columns={pred_col: SUBMIT_COL}), on=ID_COL, how="left")


# 1c. bersihkan nilai
sub[SUBMIT_COL] = sub[SUBMIT_COL].replace([np.inf, -np.inf], 0).fillna(0.0)
sub[SUBMIT_COL] = np.clip(sub[SUBMIT_COL], *CLIP_RANGE)

# 1d. format final HARUS hanya 2 kolom
sub = sub[[ID_COL, SUBMIT_COL]]
sub = sub.sort_values(ID_COL).reset_index(drop=True)
sub = sub.astype({ID_COL: "int64", SUBMIT_COL: "float32"})

# Check ukuran wajib 1260
assert sub.shape[0] == len(test_ids), "Jumlah baris submission tidak sama dengan test.csv"

# simpan parquet
SUB_PARQUET = "/kaggle/working/submission.parquet"
sub.to_parquet(SUB_PARQUET, index=False)

# arsip
sub.to_csv(f"{KEDIR}/submission.csv", index=False)
sub.to_csv(f"{KEDIR}/submission_audit.csv", index=False)

# ============================================
# 2) Inference Server (API)
# ============================================
STATE = {"ready": False}

def _ensure_model_ready():
    if STATE["ready"]:
        return
    info = _train_ridge_from_train_csv(f"{INPUT}/train.csv")
    STATE.update(info)
    STATE["ready"] = True
    print(f"[INIT] model inference siap: {len(info['feat_cols'])} fitur")

def _predict_numpy(X):
    z = _standardize(X, STATE["mean"], STATE["std"])
    p = z @ STATE["w"] + STATE["b"]
    return float(np.clip(np.tanh(p.mean()), -1.0, 1.0))

def predict(test: pl.DataFrame) -> float:
    _ensure_model_ready()
    cols = STATE["feat_cols"]
    df = test.select([pl.col(c) if c in test.columns else pl.lit(0).alias(c) for c in cols])
    X = df.to_numpy().astype("float64")
    return _predict_numpy(X)

inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

if os.getenv("KAGGLE_IS_COMPETITION_RERUN"):
    inference_server.serve()
else:
    inference_server.run_local_gateway((INPUT,))

print("=== SUBMISSION FINAL — SELESAI ===")


[INIT] model inference siap: 96 fitur
=== SUBMISSION FINAL — SELESAI ===
