# Tujuan & Artefak

In [1]:
# ============================================
# Tahap 0 — Tujuan & Artefak (Kaggle)
# ============================================
import os, json, sys, hashlib, platform, textwrap, random
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

# -------------------------------
# 1) Lokasi data & working dir
# -------------------------------
DATA_DIR  = "/kaggle/input/hull-tactical-market-prediction"
TRAIN_CSV = f"{DATA_DIR}/train.csv"
TEST_CSV  = f"{DATA_DIR}/test.csv"

ROOT      = "/kaggle/working/hull-tactical-market-prediction"
OUT_DIRS  = {
    "preprocessed": f"{ROOT}/outputs/preprocessed",
    "features":     f"{ROOT}/outputs/features",
    "signals":      f"{ROOT}/outputs/signals",
    "models":       f"{ROOT}/outputs/models",
    "allocations":  f"{ROOT}/outputs/allocations",
    "backtests":    f"{ROOT}/outputs/backtests",
    "kaggle_eval":  f"{ROOT}/kaggle_evaluation",
    "logs":         f"{ROOT}/outputs",
}
for p in OUT_DIRS.values():
    Path(p).mkdir(parents=True, exist_ok=True)

META_JSON = f"{ROOT}/outputs/project_meta.json"
PROGRESS  = f"{ROOT}/outputs/progress.json"

# -------------------------------
# 2) Tujuan proyek (ringkas)
# -------------------------------
GOALS = textwrap.dedent("""
    Tujuan:
    1) Membangun pipeline prediksi sinyal harian yang anti-leak untuk kompetisi
       Hull Tactical Market Prediction.
    2) Menghasilkan alokasi (kolom 'allocation') untuk setiap `date_id` pada test set
       yang siap submit ke Kaggle.
    3) Menjaga reprodusibilitas (seed, versi library), serta jejak artefak tiap tahap.

    Artefak inti yang akan dibuat:
    - outputs/preprocessed/   : hasil hygiene & pembersihan awal (train/test)
    - outputs/features/       : fitur turunan (risk/momentum, PCA rolling, dsb.)
    - outputs/signals/        : OOF/test signal dari model baseline/“berat”
    - outputs/models/         : model terlatih / checkpoint
    - outputs/allocations/    : mapping sinyal → bobot (train/test)
    - outputs/backtests/      : metrik backtest & stress test
    - kaggle_evaluation/      : submission.csv (+ audit & meta)
""").strip()

print(GOALS, "\n")

# -------------------------------
# 3) Util: seed & hashing
# -------------------------------
SEED = 2025
def set_seed(seed: int = 2025):
    random.seed(seed)
    np.random.seed(seed)
set_seed(SEED)

def sha1sum(path, chunksize: int = 1 << 20):
    h = hashlib.sha1()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunksize), b""):
            h.update(chunk)
    return h.hexdigest()

# -------------------------------
# 4) Ringkasan dataset
# -------------------------------
def quick_stats(csv_path: str):
    assert os.path.isfile(csv_path), f"File tidak ditemukan: {csv_path}"
    cols = None
    try:
        # upaya hemat memori: ambil hanya date_id jika ada
        try:
            s = pd.read_csv(csv_path, usecols=["date_id"])
        except Exception:
            s = pd.read_csv(csv_path, nrows=1000)  # sample ringan untuk deteksi kolom
            cols = list(s.columns)
            s = pd.read_csv(csv_path, usecols=["date_id"]) if "date_id" in s.columns else pd.read_csv(csv_path, usecols=None)

        n_rows = sum(1 for _ in open(csv_path, "rb")) - 1  # cepat & stabil
        has_date = "date_id" in s.columns
        out = {
            "path": csv_path,
            "sha1": sha1sum(csv_path),
            "rows": int(n_rows),
            "has_date_id": bool(has_date),
        }
        if has_date:
            out["date_id_min"] = int(s["date_id"].min())
            out["date_id_max"] = int(s["date_id"].max())
            out["date_id_nunique"] = int(s["date_id"].nunique())
        if cols is None:
            # jika tadi langsung usecols=date_id berhasil, tetap tampilkan header sebenarnya
            head_df = pd.read_csv(csv_path, nrows=3)
            cols = list(head_df.columns)
        out["columns"] = cols if cols is not None else list(pd.read_csv(csv_path, nrows=3).columns)
        return out
    except Exception as e:
        return {"path": csv_path, "error": repr(e)}

train_info = quick_stats(TRAIN_CSV)
test_info  = quick_stats(TEST_CSV)

print("Ringkasan DATASET")
print("-"*60)
for info in (train_info, test_info):
    print(f"File     : {info.get('path')}")
    if "error" in info:
        print(f"Status   : ERROR → {info['error']}\n")
        continue
    print(f"SHA1     : {info.get('sha1')}")
    print(f"Rows     : {info.get('rows')}")
    print(f"date_id? : {info.get('has_date_id')}")
    if info.get("has_date_id"):
        print(f"date_id  : [{info.get('date_id_min')}, {info.get('date_id_max')}] "
              f"(unique={info.get('date_id_nunique')})")
    print(f"Columns  : {info.get('columns')}\n")

# -------------------------------
# 5) Snapshot lingkungan & artefak
# -------------------------------
def safe_version(pkg_name):
    try:
        mod = __import__(pkg_name)
        return getattr(mod, "__version__", "unknown")
    except Exception:
        return "not-installed"

ENV = {
    "python": sys.version.split()[0],
    "platform": platform.platform(),
    "pandas": safe_version("pandas"),
    "numpy": safe_version("numpy"),
    "scipy": safe_version("scipy"),
    "sklearn": safe_version("sklearn"),
    "lightgbm": safe_version("lightgbm"),
    "torch": safe_version("torch"),
    "xgboost": safe_version("xgboost"),
}

ARTEFACTS = {
    "preprocessed_train": f"{OUT_DIRS['preprocessed']}/train_preprocessed.parquet",
    "preprocessed_test":  f"{OUT_DIRS['preprocessed']}/test_preprocessed.parquet",
    "features_train":     f"{OUT_DIRS['features']}/train_features.parquet",
    "features_test":      f"{OUT_DIRS['features']}/test_features.parquet",
    "baseline_oof":       f"{OUT_DIRS['signals']}/baseline_oof.csv",
    "baseline_test":      f"{OUT_DIRS['signals']}/test_signal_baseline.csv",
    "alloc_stage3_test":  f"{OUT_DIRS['allocations']}/test_allocation_stage3.csv",
    "submission":         f"{OUT_DIRS['kaggle_eval']}/submission.csv",
}

meta = {
    "generated_at": datetime.now().isoformat(timespec="seconds"),
    "seed": SEED,
    "data": {"train": train_info, "test": test_info},
    "env": ENV,
    "root": ROOT,
    "goals": GOALS,
    "artefacts": ARTEFACTS,
}
Path(META_JSON).write_text(json.dumps(meta, indent=2), encoding="utf-8")

# progress.json (append-safe)
try:
    progress = json.loads(Path(PROGRESS).read_text(encoding="utf-8")) if os.path.isfile(PROGRESS) else {}
except Exception:
    progress = {}
progress["objectives_and_artefacts"] = {
    "timestamp": datetime.now().isoformat(timespec="seconds"),
    "env": ENV,
    "artefacts": ARTEFACTS,
}
Path(PROGRESS).write_text(json.dumps(progress, indent=2), encoding="utf-8")

print("-"*60)
print(f"Meta ditulis ke : {META_JSON}")
print(f"Progress log   : {PROGRESS}")
print("Struktur OUT   :")
for k, v in OUT_DIRS.items():
    print(f"  - {k:<12} -> {v}")
print("\nTahap 0 selesai. Lanjut ke 'Higiene Data & Target'.")


Tujuan:
1) Membangun pipeline prediksi sinyal harian yang anti-leak untuk kompetisi
   Hull Tactical Market Prediction.
2) Menghasilkan alokasi (kolom 'allocation') untuk setiap `date_id` pada test set
   yang siap submit ke Kaggle.
3) Menjaga reprodusibilitas (seed, versi library), serta jejak artefak tiap tahap.

Artefak inti yang akan dibuat:
- outputs/preprocessed/   : hasil hygiene & pembersihan awal (train/test)
- outputs/features/       : fitur turunan (risk/momentum, PCA rolling, dsb.)
- outputs/signals/        : OOF/test signal dari model baseline/“berat”
- outputs/models/         : model terlatih / checkpoint
- outputs/allocations/    : mapping sinyal → bobot (train/test)
- outputs/backtests/      : metrik backtest & stress test
- kaggle_evaluation/      : submission.csv (+ audit & meta) 

Ringkasan DATASET
------------------------------------------------------------
File     : /kaggle/input/hull-tactical-market-prediction/train.csv
SHA1     : d6cc777a2d19a1c36e28ff49f6e33549

# Higiene Data & Target

In [2]:
# ============================================
# Tahap 1 — Higiene Data & Target
# ============================================
import os, json, re, math, warnings
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

# --- Lokasi (ikuti Tahap 0) ---
DATA_DIR  = "/kaggle/input/hull-tactical-market-prediction"
TRAIN_CSV = f"{DATA_DIR}/train.csv"
TEST_CSV  = f"{DATA_DIR}/test.csv"

ROOT      = "/kaggle/working/hull-tactical-market-prediction"
OUT_PRE   = f"{ROOT}/outputs/preprocessed"
OUT_LOG   = f"{ROOT}/outputs"
Path(OUT_PRE).mkdir(parents=True, exist_ok=True)

TRAIN_PP  = f"{OUT_PRE}/train_preprocessed.parquet"
TEST_PP   = f"{OUT_PRE}/test_preprocessed.parquet"
HYGIENE_META = f"{OUT_LOG}/hygiene_meta.json"
PROGRESS  = f"{OUT_LOG}/progress.json"

# --- Helper ---
def to_snake(s: str) -> str:
    s2 = re.sub(r"[/\s\-]+", "_", s.strip())
    s2 = re.sub(r"(?<=[a-z0-9])([A-Z])", r"_\1", s2)
    s2 = re.sub(r"__+", "_", s2)
    return s2.lower()

def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    mapper = {c: to_snake(c) for c in df.columns}
    df = df.rename(columns=mapper)
    # buang spasi/duplikat kolom bila ada
    df = df.loc[:, ~df.columns.duplicated()]
    return df

def is_numeric_series(s: pd.Series) -> bool:
    return pd.api.types.is_integer_dtype(s) or pd.api.types.is_float_dtype(s)

def n_missing(s: pd.Series) -> int:
    return int(s.isna().sum())

# --- 1) Load & normalisasi nama kolom ---
train = pd.read_csv(TRAIN_CSV)
test  = pd.read_csv(TEST_CSV)

train = normalize_columns(train)
test  = normalize_columns(test)

# --- 2) Pastikan date_id ada & bertipe int ---
if "date_id" not in train.columns or "date_id" not in test.columns:
    raise ValueError("Kolom `date_id` wajib ada di train & test.")

# cast aman ke int (jika float tanpa desimal)
for df in (train, test):
    if not pd.api.types.is_integer_dtype(df["date_id"]):
        if np.isclose(df["date_id"] % 1, 0).all():
            df["date_id"] = df["date_id"].astype(np.int64)
        else:
            # jika tidak bulat semua, paksa int setelah round
            df["date_id"] = np.round(df["date_id"]).astype(np.int64)

# urut & dedup per date_id (jaga-jaga)
train = train.sort_values("date_id").drop_duplicates().reset_index(drop=True)
test  = test.sort_values("date_id").drop_duplicates().reset_index(drop=True)

# --- 3) Deteksi kandidat kolom target (hanya di train, tidak ada di test) ---
# Pola umum: target, y, label, (ret|return).*fwd, .*forward.*, market_forward_excess_returns, allocation
CAND_PATTERNS = [
    r"^target$",
    r"^y$",
    r"label",
    r"(ret|return).*fwd",
    r"forward",
    r"market_forward_excess_returns",
    r"^allocation$"  # bila train punya historis allocation sbg label
]
def looks_like_target(col: str) -> bool:
    c = col.lower()
    return any(re.search(p, c) for p in CAND_PATTERNS)

target_candidates = [c for c in train.columns
                     if c not in test.columns and looks_like_target(c) and is_numeric_series(train[c])]

TARGET_COL = target_candidates[0] if len(target_candidates) > 0 else None

# --- 4) Bersihkan nilai anomali global (Inf → NaN), drop kolom buruk ---
def clean_frame(df: pd.DataFrame) -> pd.DataFrame:
    df = df.replace([np.inf, -np.inf], np.nan)
    # buang kolom yang seluruhnya NaN
    all_nan_cols = [c for c in df.columns if df[c].isna().all()]
    if all_nan_cols:
        df = df.drop(columns=all_nan_cols)
    # buang kolom konstan
    const_cols = []
    for c in df.columns:
        # jangan drop date_id & calon target
        if c == "date_id" or c == TARGET_COL:
            continue
        s = df[c]
        try:
            if s.nunique(dropna=True) <= 1:
                const_cols.append(c)
        except Exception:
            pass
    if const_cols:
        df = df.drop(columns=const_cols)
    return df

train = clean_frame(train)
test  = clean_frame(test)

# --- 5) Selaraskan fitur numerik yang sama di train & test (anti-leak) ---
#    - simpan target (jika ada) di train, tapi JANGAN ikut ke test
num_cols_train = [c for c in train.columns if c not in ("date_id",) and (TARGET_COL is None or c != TARGET_COL) and is_numeric_series(train[c])]
num_cols_test  = [c for c in test.columns  if c != "date_id" and is_numeric_series(test[c])]

shared_num = sorted(list(set(num_cols_train).intersection(set(num_cols_test))))
kept_cols_train = ["date_id"] + ([TARGET_COL] if TARGET_COL else []) + shared_num
kept_cols_test  = ["date_id"] + shared_num

train_pp = train[kept_cols_train].copy()
test_pp  = test[kept_cols_test].copy()

# --- 6) Laporan ringkas higiene ---
def missing_report(df):
    rep = []
    for c in df.columns:
        rep.append({
            "col": c,
            "dtype": str(df[c].dtype),
            "n_missing": n_missing(df[c]),
            "frac_missing": float(n_missing(df[c]) / len(df)) if len(df) else 0.0,
            "n_unique": int(df[c].nunique(dropna=True))
        })
    return pd.DataFrame(rep).sort_values(["frac_missing","n_missing","col"], ascending=[False, False, True])

rep_train = missing_report(train_pp)
rep_test  = missing_report(test_pp)

# --- 7) Simpan artefak ---
train_pp.to_parquet(TRAIN_PP, index=False)
test_pp.to_parquet(TEST_PP, index=False)

# CSV kecil untuk audit cepat (5 baris pertama)
train_head_csv = f"{OUT_PRE}/_train_head.csv"
test_head_csv  = f"{OUT_PRE}/_test_head.csv"
train_pp.head(5).to_csv(train_head_csv, index=False)
test_pp.head(5).to_csv(test_head_csv, index=False)

# Meta JSON
meta = {
    "generated_at": datetime.now().isoformat(timespec="seconds"),
    "input": {"train_csv": TRAIN_CSV, "test_csv": TEST_CSV},
    "target_col_detected": TARGET_COL,
    "n_rows": {"train": int(len(train_pp)), "test": int(len(test_pp))},
    "n_shared_numeric_features": len(shared_num),
    "shared_numeric_features": shared_num[:2000],  # batasi panjang
    "dropped_because_const_or_allnan": list(set(train.columns) ^ set(train_pp.columns)) + list(set(test.columns) ^ set(test_pp.columns)),
    "missing_summary": {
        "train_top10": rep_train.head(10).to_dict(orient="records"),
        "test_top10":  rep_test.head(10).to_dict(orient="records"),
    }
}
Path(HYGIENE_META).write_text(json.dumps(meta, indent=2), encoding="utf-8")

# progress.json (append-safe)
try:
    progress = json.loads(Path(PROGRESS).read_text(encoding="utf-8")) if os.path.isfile(PROGRESS) else {}
except Exception:
    progress = {}
progress["hygiene_data_target"] = {
    "timestamp": datetime.now().isoformat(timespec="seconds"),
    "target_col": TARGET_COL,
    "train_pp": TRAIN_PP,
    "test_pp": TEST_PP,
    "n_shared_numeric_features": len(shared_num),
}
Path(PROGRESS).write_text(json.dumps(progress, indent=2), encoding="utf-8")

# --- 8) Output ringkas ---
print("=== HIGIENE DATA & TARGET — SELESAI ===")
print(f"- Target terdeteksi : {TARGET_COL}")
print(f"- Fitur numerik     : {len(shared_num)} kolom (shared train-test)")
print(f"- Train (clean)     : {train_pp.shape} -> {TRAIN_PP}")
print(f"- Test  (clean)     : {test_pp.shape}  -> {TEST_PP}")
print(f"- Meta hygiene      : {HYGIENE_META}")
print(f"- Snapshot head     : {train_head_csv} | {test_head_csv}")


=== HIGIENE DATA & TARGET — SELESAI ===
- Target terdeteksi : forward_returns
- Fitur numerik     : 89 kolom (shared train-test)
- Train (clean)     : (9021, 91) -> /kaggle/working/hull-tactical-market-prediction/outputs/preprocessed/train_preprocessed.parquet
- Test  (clean)     : (10, 90)  -> /kaggle/working/hull-tactical-market-prediction/outputs/preprocessed/test_preprocessed.parquet
- Meta hygiene      : /kaggle/working/hull-tactical-market-prediction/outputs/hygiene_meta.json
- Snapshot head     : /kaggle/working/hull-tactical-market-prediction/outputs/preprocessed/_train_head.csv | /kaggle/working/hull-tactical-market-prediction/outputs/preprocessed/_test_head.csv


# LightGBM Baseline

In [3]:
# ============================================
# Tahap 2 — LightGBM Baseline (Revisi Full)
# - Anti-spam warning & indikator training
# ============================================
import os, json, math, warnings
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import lightgbm as lgb
from tqdm.auto import tqdm

ROOT     = "/kaggle/working/hull-tactical-market-prediction"
PRE_DIR  = f"{ROOT}/outputs/preprocessed"
OUT_SIG  = f"{ROOT}/outputs/signals"
OUT_MOD  = f"{ROOT}/outputs/models"
OUT_MET  = f"{ROOT}/outputs/metrics"
OUT_LOG  = f"{ROOT}/outputs"

TRAIN_PP = f"{PRE_DIR}/train_preprocessed.parquet"
TEST_PP  = f"{PRE_DIR}/test_preprocessed.parquet"
HYGIENE_META = f"{OUT_LOG}/hygiene_meta.json"
PROGRESS = f"{OUT_LOG}/progress.json"

for d in [OUT_SIG, OUT_MOD, OUT_MET]:
    Path(d).mkdir(parents=True, exist_ok=True)

SEED = 2025
np.random.seed(SEED)

# ---------- Utils ----------
def rmse_np(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

def mae_np(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return float(np.mean(np.abs(y_true - y_pred)))

def corr_safe(a, b):
    a = np.asarray(a, dtype=float); b = np.asarray(b, dtype=float)
    if a.std() == 0 or b.std() == 0:
        return 0.0
    return float(np.corrcoef(a, b)[0, 1])

def build_time_folds(df, date_col="date_id", train_len=2520, valid_len=252, step=252):
    dates = np.array(sorted(df[date_col].unique()))
    nD = len(dates)
    folds = []
    train_len = min(train_len, max(1, nD // 2))
    valid_len = min(valid_len, max(1, nD // 10))
    step = max(1, min(step, valid_len))
    for va_end in range(train_len + valid_len - 1, nD, step):
        va_start = va_end - valid_len + 1
        tr_start = va_start - train_len
        if tr_start < 0: 
            continue
        tr_dates = set(dates[tr_start:va_start])
        va_dates = set(dates[va_start:va_end+1])
        tr_idx = df.index[df[date_col].isin(tr_dates)].to_numpy()
        va_idx = df.index[df[date_col].isin(va_dates)].to_numpy()
        if len(tr_idx) == 0 or len(va_idx) == 0:
            continue
        folds.append((tr_idx, va_idx))
    return folds

def strip_problem_features(df, features, freq_threshold=0.995, std_threshold=1e-12):
    """
    Buang fitur yang sering memicu 'No further splits':
    - konstan (nunique <= 1)
    - quasi-konstan (proporsi nilai modus >= freq_threshold)
    - varians sangat kecil (std < std_threshold)
    """
    keep = []
    dropped = []
    for c in features:
        s = df[c]
        # nunique dihitung dengan NaN dianggap kategori sendiri
        nun = s.nunique(dropna=False)
        if nun <= 1:
            dropped.append((c, "constant"))
            continue
        # quasi-constant
        vc = s.value_counts(dropna=False)
        if (vc.iloc[0] / len(s)) >= freq_threshold:
            dropped.append((c, "quasi_constant"))
            continue
        # very low variance (numerik saja)
        try:
            if pd.api.types.is_numeric_dtype(s) and float(s.std(ddof=0)) < std_threshold:
                dropped.append((c, "low_variance"))
                continue
        except Exception:
            pass
        keep.append(c)
    return keep, dropped

# ---------- Load ----------
assert os.path.isfile(TRAIN_PP) and os.path.isfile(TEST_PP), "Jalankan tahap Higiene dulu."
train = pd.read_parquet(TRAIN_PP)
test  = pd.read_parquet(TEST_PP)

meta = json.loads(Path(HYGIENE_META).read_text(encoding="utf-8")) if os.path.isfile(HYGIENE_META) else {}
TARGET_COL = meta.get("target_col_detected", None)
if TARGET_COL is None or TARGET_COL not in train.columns:
    raise ValueError("Target tidak terdeteksi. Pastikan tahap Higiene sudah berjalan.")

# ---------- Feature set (dengan penyaringan anti-warning) ----------
FEATURES_RAW = [c for c in train.columns if c not in ("date_id", TARGET_COL)]
FEATURES, dropped_info = strip_problem_features(train, FEATURES_RAW)

# cast ke float32 untuk stabilitas & kecepatan
train[FEATURES] = train[FEATURES].astype("float32")
test[FEATURES]  = test[FEATURES].astype("float32")

# ---------- Folds ----------
FOLDS = build_time_folds(train, date_col="date_id", train_len=2520, valid_len=252, step=252)
if len(FOLDS) == 0:
    dates = sorted(train["date_id"].unique())
    cut = int(0.8 * len(dates))
    tr_idx = train.index[train["date_id"].isin(set(dates[:cut]))].to_numpy()
    va_idx = train.index[train["date_id"].isin(set(dates[cut:]))].to_numpy()
    FOLDS = [(tr_idx, va_idx)]

# ---------- Params ----------
# Nonaktifkan logger bawaan (supaya warning internal LightGBM tidak keluar)
try:
    lgb.register_logger(lambda *args, **kwargs: None)
except Exception:
    pass

lgb_params = dict(
    n_estimators=6000,
    learning_rate=0.02,
    num_leaves=96,
    max_depth=-1,
    min_child_samples=20,
    subsample=0.85,
    subsample_freq=1,
    colsample_bytree=0.85,
    reg_alpha=1e-3,
    reg_lambda=1e-2,
    max_bin=255,
    objective="regression",
    random_state=SEED,
    n_jobs=-1,
    # kurangi output bawaan
    verbosity=-1,
    feature_pre_filter=False,  # hindari prefilter agresif yang bisa memicu kondisi 'no split'
)

# ---------- Train per-fold (dengan indikator) ----------
oof = np.zeros(len(train), dtype=float)
test_pred = np.zeros(len(test), dtype=float)
fold_metrics = []
fi_all = []

X_test = test[FEATURES].copy()
y_true = train[TARGET_COL].astype(float).values

fold_pbar = tqdm(range(len(FOLDS)), desc="Folds", leave=True)
for k, (tr_idx, va_idx) in enumerate(FOLDS, start=1):
    X_tr = train.loc[tr_idx, FEATURES]
    y_tr = y_true[tr_idx]
    X_va = train.loc[va_idx, FEATURES]
    y_va = y_true[va_idx]

    model = lgb.LGBMRegressor(**lgb_params)

    # progress bar per-iter
    iters_total = lgb_params["n_estimators"]
    iter_pbar = tqdm(total=iters_total, desc=f"Fold {k}/{len(FOLDS)} | training", leave=False)

    eval_results = {}
    def tqdm_callback(env: lgb.callback.CallbackEnv):
        # Update bar ke iterasi terkini
        if env.iteration > iter_pbar.n:
            iter_pbar.update(env.iteration - iter_pbar.n)
        # Tampilkan RMSE val setiap 100 iter
        if env.iteration % 100 == 0 and len(env.evaluation_result_list) > 0:
            # ('valid_0', 'rmse', value, is_higher_better)
            for name, metric, value, _ in env.evaluation_result_list:
                if name == "valid_0" and metric.lower() == "rmse":
                    iter_pbar.set_postfix({"val_rmse": f"{value:.6f}"})
        # close jika sudah selesai (ketika early stopping)
        if env.iteration >= env.end_iteration:
            iter_pbar.close()
    tqdm_callback.order = 10  # jalankan belakangan

    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="rmse",
        callbacks=[
            lgb.early_stopping(stopping_rounds=400, verbose=False),
            lgb.record_evaluation(eval_results),
            tqdm_callback
        ]
    )

    best_iter = int(model.best_iteration_) if model.best_iteration_ is not None else lgb_params["n_estimators"]
    y_hat = model.predict(X_va, num_iteration=best_iter)
    oof[va_idx] = y_hat

    rmse = rmse_np(y_va, y_hat)
    mae  = mae_np(y_va, y_hat)
    pcorr = corr_safe(y_va, y_hat)

    fold_metrics.append({
        "fold": k,
        "n_tr": int(len(tr_idx)),
        "n_va": int(len(va_idx)),
        "rmse": rmse,
        "mae": mae,
        "pearson": pcorr,
        "best_iter": best_iter,
        "dropped_features": len(dropped_info)
    })

    # simpan learning curve per-fold
    lc_path = f"{OUT_MET}/lgbm_learning_curve_fold{k}.csv"
    try:
        # eval_results: {'valid_0': {'rmse': [..]}}
        lc = pd.DataFrame({
            "iter": np.arange(1, len(eval_results["valid_0"]["rmse"]) + 1),
            "rmse": eval_results["valid_0"]["rmse"]
        })
        lc.to_csv(lc_path, index=False)
    except Exception:
        pass

    # feature importance
    try:
        fi_all.append(pd.DataFrame({
            "feature": FEATURES,
            "importance_gain": model.booster_.feature_importance(importance_type="gain", iteration=best_iter),
            "importance_split": model.booster_.feature_importance(importance_type="split", iteration=best_iter),
            "fold": k
        }))
    except Exception:
        pass

    # Test prediction (average across folds)
    test_pred += model.predict(X_test, num_iteration=best_iter) / len(FOLDS)

    fold_pbar.set_postfix({
        "last_val_rmse": f"{rmse:.6f}",
        "best_iter": best_iter
    })
    fold_pbar.update(1)

fold_pbar.close()

# ---------- Metrics OOF ----------
oof_rmse = rmse_np(y_true, oof)
oof_mae  = mae_np(y_true, oof)
oof_corr = corr_safe(y_true, oof)

# ---------- Save artifacts ----------
oof_df = train[["date_id"]].copy()
oof_df["y_true"] = y_true
oof_df["y_pred"] = oof
oof_path = f"{OUT_SIG}/lgbm_oof.csv"
oof_df.to_csv(oof_path, index=False, float_format="%.9f")

fold_df = pd.DataFrame(fold_metrics)
fold_path = f"{OUT_MET}/lgbm_fold_metrics.csv"
fold_df.to_csv(fold_path, index=False, float_format="%.9f")

test_df = test[["date_id"]].copy()
test_df["lgbm_signal"] = test_pred.astype(float)
test_sig_path = f"{OUT_SIG}/test_signal_lgbm.csv"
test_df.to_csv(test_sig_path, index=False, float_format="%.9f")

if fi_all:
    fi_df = pd.concat(fi_all, ignore_index=True)
    fi_path = f"{OUT_MET}/lgbm_feature_importance.csv"
    fi_df.to_csv(fi_path, index=False)
else:
    fi_path = None

# Simpan info fitur yang dibuang
drop_info_path = f"{OUT_MET}/lgbm_dropped_features.json"
Path(drop_info_path).write_text(json.dumps(
    [{"feature": f, "reason": r} for f, r in dropped_info], indent=2
), encoding="utf-8")

# Simpan booster terakhir (opsional)
try:
    last_model_path = f"{OUT_MOD}/lgbm_baseline_last.txt"
    model.booster_.save_model(last_model_path)
except Exception:
    last_model_path = None

# Progress log
try:
    progress = json.loads(Path(PROGRESS).read_text(encoding="utf-8")) if os.path.isfile(PROGRESS) else {}
except Exception:
    progress = {}
progress["lgbm_baseline_revised"] = {
    "timestamp": datetime.now().isoformat(timespec="seconds"),
    "features_raw": len(FEATURES_RAW),
    "features_used": len(FEATURES),
    "features_dropped": len(dropped_info),
    "folds": len(FOLDS),
    "oof_rmse": oof_rmse,
    "oof_mae": oof_mae,
    "oof_corr": oof_corr,
    "oof_path": oof_path,
    "fold_metrics": fold_path,
    "test_signal": test_sig_path,
    "feature_importance": fi_path,
    "dropped_features_json": drop_info_path,
    "model_last": last_model_path
}
Path(PROGRESS).write_text(json.dumps(progress, indent=2), encoding="utf-8")

print("=== LIGHTGBM BASELINE (REVISI) — SELESAI ===")
print(f"- Target        : {TARGET_COL}")
print(f"- Fitur (raw)   : {len(FEATURES_RAW)} | dipakai: {len(FEATURES)} | dibuang: {len(dropped_info)}")
print(f"- Folds         : {len(FOLDS)} (rolling time-based)")
print(f"- OOF           : RMSE={oof_rmse:.6f} | MAE={oof_mae:.6f} | Corr={oof_corr:.6f}")
print(f"- OOF csv       : {oof_path}")
print(f"- Fold metrics  : {fold_path}")
print(f"- LearningCurve : {OUT_MET}/lgbm_learning_curve_fold*.csv")
print(f"- Test signal   : {test_sig_path}")
print(f"- FI            : {fi_path}")
print(f"- Dropped feats : {drop_info_path}")
print(f"- Model last    : {last_model_path}")


Folds:   0%|          | 0/25 [00:00<?, ?it/s]

Fold 1/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 2/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 3/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 4/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 5/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 6/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 7/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 8/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 9/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 10/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 11/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 12/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 13/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 14/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 15/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 16/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 17/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 18/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 19/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 20/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 21/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 22/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 23/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 24/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

Fold 25/25 | training:   0%|          | 0/6000 [00:00<?, ?it/s]

=== LIGHTGBM BASELINE (REVISI) — SELESAI ===
- Target        : forward_returns
- Fitur (raw)   : 89 | dipakai: 89 | dibuang: 0
- Folds         : 25 (rolling time-based)
- OOF           : RMSE=0.010521 | MAE=0.007477 | Corr=0.065187
- OOF csv       : /kaggle/working/hull-tactical-market-prediction/outputs/signals/lgbm_oof.csv
- Fold metrics  : /kaggle/working/hull-tactical-market-prediction/outputs/metrics/lgbm_fold_metrics.csv
- LearningCurve : /kaggle/working/hull-tactical-market-prediction/outputs/metrics/lgbm_learning_curve_fold*.csv
- Test signal   : /kaggle/working/hull-tactical-market-prediction/outputs/signals/test_signal_lgbm.csv
- FI            : /kaggle/working/hull-tactical-market-prediction/outputs/metrics/lgbm_feature_importance.csv
- Dropped feats : /kaggle/working/hull-tactical-market-prediction/outputs/metrics/lgbm_dropped_features.json
- Model last    : /kaggle/working/hull-tactical-market-prediction/outputs/models/lgbm_baseline_last.txt


# Kalibrasi Prediksi

In [4]:
# ============================================
# Tahap 3 — Kalibrasi Prediksi (auto-find outputs)
# ============================================
import os, json, pickle, warnings, glob
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import spearmanr

# ---------- 0) Temukan folder outputs secara otomatis ----------
BASE_SEARCH = "/kaggle/working"

def _find_outputs_dir(base=BASE_SEARCH):
    # Prioritas: ENV override -> pattern spesifik -> recursive glob
    env = os.environ.get("HULL_OUT_DIR", "").strip()
    if env and os.path.isdir(env):
        return env
    # Coba lokasi paling umum untuk notebook ini
    cand = os.path.join(base, "hull-tactical-market-prediction", "outputs")
    if os.path.isdir(cand):
        return cand
    # Fallback: cari semua folder bernama "outputs" di bawah /kaggle/working
    all_outs = glob.glob(os.path.join(base, "**", "outputs"), recursive=True)
    # Pilih yang berisi subfolder "signals" atau file *oof*.csv
    scored = []
    for d in all_outs:
        files = glob.glob(os.path.join(d, "**", "*.csv"), recursive=True)
        score = 0
        score += 1000 if os.path.isdir(os.path.join(d, "signals")) else 0
        score += 500 if os.path.isdir(os.path.join(d, "metrics")) else 0
        score += sum(1 for f in files if "oof" in os.path.basename(f).lower())
        scored.append((score, d))
    if scored:
        scored.sort(reverse=True)
        return scored[0][1]
    raise FileNotFoundError("Folder 'outputs' tidak ditemukan di bawah /kaggle/working. "
                            "Jalankan baseline terlebih dahulu.")

OUT_DIR = _find_outputs_dir()
CAL_DIR = os.path.join(OUT_DIR, "calibration")
Path(CAL_DIR).mkdir(parents=True, exist_ok=True)
print(f"[INFO] Using outputs dir: {OUT_DIR}")

# ---------- 1) Konfigurasi ----------
ID_COL = "date_id"
TARGET_CANDS = ["forward_returns","target","y_true","market_forward_excess_returns"]
PRED_CANDS   = ["y_pred","prediction","pred","signal","lgbm_pred","oof_pred","score","weight","allocation"]
TEST_PRED_CANDS = ["prediction","signal","y_pred","lgbm_pred","allocation","weight"]

CAL_OOF_CSV   = f"{CAL_DIR}/oof_calibrated.csv"
CAL_TEST_CSV  = f"{CAL_DIR}/test_calibrated.csv"
CAL_MODEL_PKL = f"{CAL_DIR}/iso_calibrator.pkl"
CAL_SUMMARY   = f"{CAL_DIR}/calibration_summary.json"

# ---------- 2) Util ----------
def _metrics(y, p):
    rmse = float(np.sqrt(mean_squared_error(y, p)))
    mae  = float(mean_absolute_error(y, p))
    corr = float(np.corrcoef(y, p)[0,1]) if len(y) > 1 else float("nan")
    try:
        spear = float(spearmanr(y, p, nan_policy="omit").correlation)
    except Exception:
        spear = float("nan")
    return {"rmse": rmse, "mae": mae, "corr": corr, "spearman": spear}

def _pick_col(df, cands):
    for c in cands:
        if c in df.columns:
            return c
    return None

def _list_files(base):
    return (glob.glob(os.path.join(base, "**", "*.csv"), recursive=True)
            + glob.glob(os.path.join(base, "**", "*.parquet"), recursive=True))

def _read_any(path):
    return pd.read_parquet(path) if path.endswith(".parquet") else pd.read_csv(path)

def _find_oof_and_test(out_dir: str):
    files = _list_files(out_dir)
    if not files:
        raise FileNotFoundError(f"Tidak ada artefak .csv/.parquet di {out_dir}.")

    oof_cand, test_cand = None, None

    # OOF: harus ada target + pred (idealnya punya date_id)
    for fp in files:
        try:
            df = _read_any(fp)
            pred = _pick_col(df, PRED_CANDS)
            tgt  = _pick_col(df, TARGET_CANDS)
            if pred is not None and tgt is not None:
                score = len(df) + (50000 if ID_COL in df.columns else 0) + (100 if "oof" in fp.lower() else 0)
                if (oof_cand is None) or (score > oof_cand[0]):
                    oof_cand = (score, fp, pred, tgt, (ID_COL in df.columns))
        except Exception:
            pass

    # Test: harus ada pred + date_id
    for fp in files:
        try:
            df = _read_any(fp)
            pred = _pick_col(df, TEST_PRED_CANDS)
            if pred is not None and (ID_COL in df.columns):
                score = len(df) + (100 if "test" in fp.lower() else 0)
                if (test_cand is None) or (score > test_cand[0]):
                    test_cand = (score, fp, pred)
        except Exception:
            pass

    if oof_cand is None:
        raise FileNotFoundError("OOF tidak ditemukan. Pastikan baseline menyimpan OOF (lihat log baseline).")
    if test_cand is None:
        raise FileNotFoundError("Test signal tidak ditemukan. Pastikan baseline menyimpan prediksi test.")

    return oof_cand, test_cand

# ---------- 3) Temukan & muat ----------
(o_score, oof_fp, OOF_PRED, OOF_TGT, has_id), (t_score, test_fp, TEST_PRED) = _find_oof_and_test(OUT_DIR)

print(f"[INFO] OOF file : {oof_fp}\n        pred={OOF_PRED} | target={OOF_TGT} | has_id={has_id}")
print(f"[INFO] Test file: {test_fp}\n        pred={TEST_PRED}")

oof_df  = _read_any(oof_fp).copy()
test_df = _read_any(test_fp).copy()

x_oof = oof_df[OOF_PRED].astype("float64").to_numpy()
y_oof = oof_df[OOF_TGT].astype("float64").to_numpy()

rng = np.random.RandomState(2025)
x_fit = x_oof + 1e-9 * rng.randn(len(x_oof))  # stabilisasi isotonic
m_raw = _metrics(y_oof, x_oof)

# ---------- 4) Fit calibrator (Isotonic → Linear fallback) ----------
use_model = "isotonic"
try:
    iso = IsotonicRegression(increasing=True, out_of_bounds="clip")
    iso.fit(x_fit, y_oof)
    if np.std(iso.predict(x_fit)) < 1e-12:
        raise RuntimeError("Isotonic output flat.")
    calibrator = iso
except Exception as e:
    warnings.warn(f"Gagal isotonic ({e}); fallback LinearRegression.")
    use_model = "linear"
    lr = LinearRegression()
    lr.fit(x_fit.reshape(-1,1), y_oof)
    calibrator = lr

def _apply(model, arr):
    return model.predict(arr) if use_model=="isotonic" else model.predict(arr.reshape(-1,1))

# ---------- 5) Apply + normalisasi ----------
oof_df["calibrated"]  = _apply(calibrator, oof_df[OOF_PRED].astype("float64").to_numpy())
test_df["calibrated"] = _apply(calibrator, test_df[TEST_PRED].astype("float64").to_numpy())

# Center ke mean OOF (umum untuk return)
oof_mean = float(oof_df["calibrated"].mean())
oof_df["calibrated"]  = oof_df["calibrated"]  - oof_mean
test_df["calibrated"] = test_df["calibrated"] - oof_mean

# Soft clip 5σ berdasarkan OOF
sd = float(oof_df["calibrated"].std(ddof=0) + 1e-12)
lo, hi = -5.0*sd, 5.0*sd
oof_df["calibrated"]  = oof_df["calibrated"].clip(lo, hi)
test_df["calibrated"] = test_df["calibrated"].clip(lo, hi)

m_cal = _metrics(y_oof, oof_df["calibrated"].to_numpy())

# ---------- 6) Simpan artefak ----------
oof_cols  = ([ID_COL] if ID_COL in oof_df.columns else []) + [OOF_TGT, OOF_PRED, "calibrated"]
test_cols = ([ID_COL] if ID_COL in test_df.columns else []) + [TEST_PRED, "calibrated"]

oof_df[oof_cols].to_csv(CAL_OOF_CSV, index=False, float_format="%.9f")
test_df[test_cols].rename(columns={TEST_PRED:"prediction_raw",
                                   "calibrated":"prediction_calibrated"}).to_csv(
    CAL_TEST_CSV, index=False, float_format="%.9f"
)

with open(CAL_MODEL_PKL, "wb") as f:
    pickle.dump({"model": calibrator, "kind": use_model}, f)

summary = {
    "generated_at": datetime.now().isoformat(timespec="seconds"),
    "outputs_dir": OUT_DIR,
    "oof_file": oof_fp,
    "test_file": test_fp,
    "oof_pred_col": OOF_PRED,
    "oof_target_col": OOF_TGT,
    "test_pred_col": TEST_PRED,
    "calibrator": use_model,
    "oof_metrics_raw": m_raw,
    "oof_metrics_calibrated": m_cal,
    "clip_sigma": 5.0,
    "clip_bounds": [lo, hi],
    "outputs": {
        "oof_calibrated_csv": CAL_OOF_CSV,
        "test_calibrated_csv": CAL_TEST_CSV,
        "calibrator_pkl": CAL_MODEL_PKL
    }
}
Path(CAL_SUMMARY).write_text(json.dumps(summary, indent=2), encoding="utf-8")

print("\n=== KALIBRASI PREDIKSI — SELESAI ===")
print(f"- Calibrator      : {use_model}")
print(f"- OOF (raw)       : RMSE={m_raw['rmse']:.6f} | MAE={m_raw['mae']:.6f} | Corr={m_raw['corr']:.4f} | Spearman={m_raw['spearman']:.4f}")
print(f"- OOF (calibrated): RMSE={m_cal['rmse']:.6f} | MAE={m_cal['mae']:.6f} | Corr={m_cal['corr']:.4f} | Spearman={m_cal['spearman']:.4f}")
print(f"- Files           : {CAL_OOF_CSV}, {CAL_TEST_CSV}, {CAL_MODEL_PKL}")


[INFO] Using outputs dir: /kaggle/working/hull-tactical-market-prediction/outputs
[INFO] OOF file : /kaggle/working/hull-tactical-market-prediction/outputs/signals/lgbm_oof.csv
        pred=y_pred | target=y_true | has_id=True
[INFO] Test file: /kaggle/working/hull-tactical-market-prediction/outputs/signals/lgbm_oof.csv
        pred=y_pred

=== KALIBRASI PREDIKSI — SELESAI ===
- Calibrator      : isotonic
- OOF (raw)       : RMSE=0.010521 | MAE=0.007477 | Corr=0.0652 | Spearman=0.0323
- OOF (calibrated): RMSE=0.010513 | MAE=0.007492 | Corr=0.0886 | Spearman=0.0552
- Files           : /kaggle/working/hull-tactical-market-prediction/outputs/calibration/oof_calibrated.csv, /kaggle/working/hull-tactical-market-prediction/outputs/calibration/test_calibrated.csv, /kaggle/working/hull-tactical-market-prediction/outputs/calibration/iso_calibrator.pkl


# Mapping Sinyal

In [5]:
# ============================================
# Tahap 4 — Mapping Sinyal → Allocation
# (auto-find artefak, robust, dengan logging)
# ============================================
import os, json, glob, math, warnings
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

# ---------- 0) Temukan folder outputs ----------
BASE_SEARCH = "/kaggle/working"

def _find_outputs_dir(base=BASE_SEARCH):
    env = os.environ.get("HULL_OUT_DIR", "").strip()
    if env and os.path.isdir(env):
        return env
    cand = os.path.join(base, "hull-tactical-market-prediction", "outputs")
    if os.path.isdir(cand):
        return cand
    all_outs = glob.glob(os.path.join(base, "**", "outputs"), recursive=True)
    if not all_outs:
        raise FileNotFoundError("Folder 'outputs' tidak ditemukan. Jalankan baseline terlebih dahulu.")
    # pilih yang paling “kaya” artefak
    scored = []
    for d in all_outs:
        files = glob.glob(os.path.join(d, "**", "*.csv"), recursive=True)
        score = 0
        score += 1000 if os.path.isdir(os.path.join(d, "signals")) else 0
        score += 500 if os.path.isdir(os.path.join(d, "calibration")) else 0
        score += sum(1 for f in files if "test" in f.lower())
        scored.append((score, d))
    scored.sort(reverse=True)
    return scored[0][1]

OUT_DIR   = _find_outputs_dir()
SIG_DIR   = os.path.join(OUT_DIR, "signals")
CAL_DIR   = os.path.join(OUT_DIR, "calibration")
ALLOC_DIR = os.path.join(OUT_DIR, "allocations")
Path(ALLOC_DIR).mkdir(parents=True, exist_ok=True)
print(f"[INFO] outputs: {OUT_DIR}")

# ---------- 1) Muat sinyal (test) & OOF ----------
ID_COL = "date_id"
# prioritas: hasil kalibrasi → sinyal mentah baseline
TEST_CANDS = [
    os.path.join(CAL_DIR, "test_calibrated.csv"),
    os.path.join(SIG_DIR, "test_signal_lgbm.csv"),
    os.path.join(OUT_DIR, "calibration", "test_calibrated.csv"),
]
OOF_CANDS = [
    os.path.join(CAL_DIR, "oof_calibrated.csv"),
    os.path.join(SIG_DIR, "lgbm_oof.csv"),
]

def _read_first(paths):
    for p in paths:
        if os.path.isfile(p):
            return pd.read_csv(p)
    raise FileNotFoundError(f"Tidak menemukan file pada: {paths}")

test_df = _read_first(TEST_CANDS).copy()
oof_df  = _read_first(OOF_CANDS).copy()

# deteksi kolom prediksi
def _pick(df, cands):
    for c in cands:
        if c in df.columns: return c
    return None

pred_test_col = _pick(test_df, ["prediction_calibrated","prediction_raw","prediction","signal","y_pred","allocation","weight"])
pred_oof_col  = _pick(oof_df,  ["calibrated","y_pred","prediction","signal"])
tgt_oof_col   = _pick(oof_df,  ["forward_returns","target","y_true","market_forward_excess_returns"])

assert ID_COL in test_df.columns, f"Kolom {ID_COL} wajib ada di test."
assert pred_test_col is not None, "Tidak menemukan kolom prediksi test."
assert pred_oof_col  is not None, "Tidak menemukan kolom prediksi OOF."

print(f"[INFO] test pred col : {pred_test_col}")
print(f"[INFO] oof  pred col : {pred_oof_col} | tgt={tgt_oof_col or '-'}")

# ---------- 2) Parameter mapping ----------
# Heuristik awal yang aman (bisa dituning di tahap Tuning):
PARAMS = dict(
    k=0.9,        # skala akhir (amplitudo posisi)
    m=0.0,        # pergeseran (bias) sebelum standardisasi
    alpha=1.25,   # “gain” non-linear utk tanh
    eps=0.02,     # dead-band (soft-threshold) pada sinyal akhir
    lam=0.15,     # smoothing EMA pada allocation (0..1), makin kecil = makin halus
    clip=(-1.0, 1.0),  # batas keras
)

# ---------- 3) Statistik robust dari OOF ----------
def robust_center_scale(x: np.ndarray):
    x = np.asarray(x, float)
    med = float(np.median(x))
    mad = float(np.median(np.abs(x - med))) + 1e-12
    # konversi MAD → ~std (1.4826 * MAD), fallback ke std jika outlier sedikit
    rstd = 1.4826 * mad
    s = rstd if rstd > 1e-9 else float(np.std(x, ddof=0) + 1e-12)
    return med, s

mu_oof, sd_oof = robust_center_scale(oof_df[pred_oof_col].astype("float64").to_numpy())
print(f"[INFO] OOF center/scale (robust): mu={mu_oof:.6g}, sd={sd_oof:.6g}")

# ---------- 4) Mapping fungsi ----------
def soft_threshold(x, thr):
    # shrink ke nol di sekitar 0 (dead-band)
    ax = np.abs(x)
    return np.sign(x) * np.maximum(ax - thr, 0.0)

def map_to_allocation(pred_series: pd.Series, params: dict):
    x = pred_series.astype("float64").to_numpy()
    # 1) bias shift (m), 2) standardisasi pakai statistik OOF (tanpa look-ahead)
    z = (x - (mu_oof + params["m"])) / (sd_oof + 1e-12)
    # 3) non-linear squashing → tanh
    a = params["k"] * np.tanh(params["alpha"] * z)
    # 4) dead-band
    a = soft_threshold(a, params["eps"])
    # 5) smoothing EMA (mengurangi turnover)
    lam = float(params["lam"])
    if lam > 0:
        ema = np.zeros_like(a)
        ema[0] = a[0]
        for t in range(1, len(a)):
            ema[t] = (1 - lam) * ema[t-1] + lam * a[t]
        a = ema
    # 6) clip keras
    lo, hi = params["clip"]
    a = np.clip(a, lo, hi)
    return a

# ---------- 5) Terapkan ke test ----------
alloc = map_to_allocation(test_df[pred_test_col], PARAMS)
out = pd.DataFrame({
    "date_id": test_df[ID_COL].astype(int),
    "allocation": alloc.astype("float32")
}).sort_values("date_id").reset_index(drop=True)

# ---------- 6) Simpan + summary ----------
ALLOC_CSV   = os.path.join(ALLOC_DIR, "test_allocation_stage1.csv")
PARAMS_JSON = os.path.join(ALLOC_DIR, "mapping_params_stage1.json")
SUMMARY_JSON= os.path.join(ALLOC_DIR, "mapping_summary_stage1.json")

out.to_csv(ALLOC_CSV, index=False, float_format="%.9f")
Path(PARAMS_JSON).write_text(json.dumps(PARAMS, indent=2), encoding="utf-8")

summary = {
    "generated_at": datetime.now().isoformat(timespec="seconds"),
    "outputs_dir": OUT_DIR,
    "test_pred_col": pred_test_col,
    "oof_pred_col": pred_oof_col,
    "oof_target_col": tgt_oof_col,
    "oof_mu": mu_oof, "oof_sd": sd_oof,
    "params": PARAMS,
    "allocation_stats": {
        "min": float(out["allocation"].min()),
        "max": float(out["allocation"].max()),
        "mean": float(out["allocation"].mean()),
        "std": float(out["allocation"].std(ddof=0)),
        "n": int(len(out))
    },
    "paths": {"allocation_csv": ALLOC_CSV, "params_json": PARAMS_JSON}
}
Path(SUMMARY_JSON).write_text(json.dumps(summary, indent=2), encoding="utf-8")

print("\n=== MAPPING SINYAL — SELESAI ===")
print(f"- File allocation : {ALLOC_CSV}")
print(f"- Param JSON      : {PARAMS_JSON}")
print(f"- Stats           : "
      f"min={summary['allocation_stats']['min']:.3f} | "
      f"max={summary['allocation_stats']['max']:.3f} | "
      f"mean={summary['allocation_stats']['mean']:.3f} | "
      f"std={summary['allocation_stats']['std']:.3f} | "
      f"n={summary['allocation_stats']['n']}")


[INFO] outputs: /kaggle/working/hull-tactical-market-prediction/outputs
[INFO] test pred col : prediction_calibrated
[INFO] oof  pred col : calibrated | tgt=y_true
[INFO] OOF center/scale (robust): mu=-0.000107642, sd=0.000704521

=== MAPPING SINYAL — SELESAI ===
- File allocation : /kaggle/working/hull-tactical-market-prediction/outputs/allocations/test_allocation_stage1.csv
- Param JSON      : /kaggle/working/hull-tactical-market-prediction/outputs/allocations/mapping_params_stage1.json
- Stats           : min=-0.869 | max=0.852 | mean=0.056 | std=0.241 | n=9021


# Backtest & Stress

In [6]:
# ============================================
# Tahap 5 — BACKTEST & STRESS TEST
# (auto-find, no look-ahead, robust)
# ============================================
import os, json, glob, math, warnings
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

# ---------- 0) Temukan outputs ----------
BASE_SEARCH = "/kaggle/working"

def _find_outputs_dir(base=BASE_SEARCH):
    env = os.environ.get("HULL_OUT_DIR", "").strip()
    if env and os.path.isdir(env):
        return env
    cand = os.path.join(base, "hull-tactical-market-prediction", "outputs")
    if os.path.isdir(cand):
        return cand
    all_outs = glob.glob(os.path.join(base, "**", "outputs"), recursive=True)
    if not all_outs:
        raise FileNotFoundError("Folder 'outputs' tidak ditemukan. Jalankan tahap sebelumnya.")
    # pilih yang “kaya” artefak
    scored = []
    for d in all_outs:
        files = glob.glob(os.path.join(d, "**", "*.csv"), recursive=True)
        score = 0
        score += 1000 if os.path.isdir(os.path.join(d, "signals")) else 0
        score += 500  if os.path.isdir(os.path.join(d, "calibration")) else 0
        score += 700  if os.path.isdir(os.path.join(d, "allocations")) else 0
        score += sum(1 for f in files if "oof" in f.lower())
        scored.append((score, d))
    scored.sort(reverse=True)
    return scored[0][1]

OUT_DIR    = _find_outputs_dir()
CAL_DIR    = os.path.join(OUT_DIR, "calibration")
ALLOC_DIR  = os.path.join(OUT_DIR, "allocations")
BT_DIR     = os.path.join(OUT_DIR, "backtests")
Path(BT_DIR).mkdir(parents=True, exist_ok=True)
print(f"[INFO] outputs: {OUT_DIR}")

# ---------- 1) Muat artefak ----------
# OOF terkalibrasi (punya kolom target & prediksi)
OOF_CANDIDATES = [
    os.path.join(CAL_DIR, "oof_calibrated.csv"),
    os.path.join(OUT_DIR, "calibration", "oof_calibrated.csv"),
]
# params mapping & summary (untuk konsistensi mu/sd)
PARAMS_JSON  = os.path.join(ALLOC_DIR, "mapping_params_stage1.json")
SUMMARY_JSON = os.path.join(ALLOC_DIR, "mapping_summary_stage1.json")

def _read_first(paths):
    for p in paths:
        if os.path.isfile(p):
            return pd.read_csv(p)
    raise FileNotFoundError(f"Tidak menemukan file pada: {paths}")

oof = _read_first(OOF_CANDIDATES).copy()
assert os.path.isfile(PARAMS_JSON), f"Tidak menemukan: {PARAMS_JSON}"
assert os.path.isfile(SUMMARY_JSON), f"Tidak menemukan: {SUMMARY_JSON}"

PARAMS = json.loads(Path(PARAMS_JSON).read_text(encoding="utf-8"))
SUMMAR = json.loads(Path(SUMMARY_JSON).read_text(encoding="utf-8"))

# deteksi kolom
ID_COL = "date_id"
tgt_col = None
for c in ["forward_returns","target","y_true","market_forward_excess_returns"]:
    if c in oof.columns: tgt_col = c; break
pred_col = None
for c in ["calibrated","y_pred","prediction","signal"]:
    if c in oof.columns: pred_col = c; break

assert tgt_col is not None, "Kolom target OOF tidak ditemukan."
assert pred_col is not None, "Kolom prediksi OOF tidak ditemukan."

# ---------- 2) Re-map OOF → allocation dengan PARAMS yang sama ----------
mu_oof = float(SUMMAR.get("oof_mu", 0.0))
sd_oof = float(SUMMAR.get("oof_sd", 1.0))
if sd_oof <= 0: sd_oof = float(oof[pred_col].std(ddof=0) + 1e-12)

def soft_threshold(x, thr):
    ax = np.abs(x)
    return np.sign(x) * np.maximum(ax - thr, 0.0)

def map_to_allocation(x: np.ndarray, params):
    # standardisasi pakai mu/sd OOF dari tahap mapping
    z = (x - (mu_oof + params["m"])) / (sd_oof + 1e-12)
    a = params["k"] * np.tanh(params["alpha"] * z)
    a = soft_threshold(a, params["eps"])
    lam = float(params["lam"])
    if lam > 0:
        ema = np.zeros_like(a)
        ema[0] = a[0]
        for t in range(1, len(a)):
            ema[t] = (1 - lam) * ema[t-1] + lam * a[t]
        a = ema
    lo, hi = params.get("clip", (-1.0, 1.0))
    return np.clip(a, lo, hi)

x = oof[pred_col].astype("float64").to_numpy()
alloc_oof = map_to_allocation(x, PARAMS)
oof["allocation"] = alloc_oof.astype("float32")

# ---------- 3) Hitung metrik kinerja ----------
TRADING_DAYS = 252

# daily return strategi
y = oof[tgt_col].astype("float64").to_numpy()
ret = (alloc_oof * y)
ret = np.nan_to_num(ret, nan=0.0, posinf=0.0, neginf=0.0)

def sharpe_annual(r):
    s = float(np.std(r, ddof=0))
    if s < 1e-12: return 0.0
    return float(np.mean(r) / s * math.sqrt(TRADING_DAYS))

def cagr(r):
    # asumsi 1 hari = 1 langkah trading
    cum = float(np.prod(1.0 + r))
    yrs = len(r) / TRADING_DAYS
    if yrs <= 0: return 0.0
    return float(cum ** (1.0 / yrs) - 1.0)

def max_drawdown(r):
    equity = np.cumprod(1.0 + r)
    peak = np.maximum.accumulate(equity)
    dd = equity / peak - 1.0
    return float(dd.min()), dd

def turnover(a):
    # rata-rata |Δ alloc|
    if len(a) <= 1: return 0.0
    return float(np.mean(np.abs(np.diff(a))))

sharpe = sharpe_annual(ret)
cagr_v = cagr(ret)
mdd, dd_series = max_drawdown(ret)
turn = turnover(alloc_oof)

# occupancy pada batas
lo, hi = PARAMS.get("clip", (-1.0, 1.0))
bound_eps = 1e-6
bound_lo = float(np.mean(alloc_oof <= lo + bound_eps) * 100.0)
bound_hi = float(np.mean(alloc_oof >= hi - bound_eps) * 100.0)

# ---------- 4) Simpan daily perf & rolling metrics ----------
# siapkan index tanggal jika ada; jika tidak, gunakan date_id
if ID_COL in oof.columns:
    idx = oof[ID_COL].astype(int).to_numpy()
else:
    idx = np.arange(len(oof), dtype=int)

daily = pd.DataFrame({
    "date_id": idx,
    "allocation": alloc_oof.astype("float32"),
    "target": y.astype("float32"),
    "strategy_ret": ret.astype("float32"),
})
daily["equity"] = (1.0 + daily["strategy_ret"]).cumprod()

DAILY_CSV = os.path.join(BT_DIR, "daily_perf.csv")
daily.to_csv(DAILY_CSV, index=False, float_format="%.9f")

# Rolling 63d
WIN = 63
def rolling_sharpe(arr, win=WIN):
    out = np.full(len(arr), np.nan, dtype=float)
    for i in range(win-1, len(arr)):
        w = arr[i-win+1:i+1]
        s = np.std(w, ddof=0)
        out[i] = 0.0 if s < 1e-12 else (np.mean(w)/s) * math.sqrt(TRADING_DAYS)
    return out

def rolling_cagr(arr, win=WIN):
    out = np.full(len(arr), np.nan, dtype=float)
    yrs_win = win / TRADING_DAYS
    for i in range(win-1, len(arr)):
        w = arr[i-win+1:i+1]
        total = np.prod(1.0 + w)
        out[i] = total ** (1.0 / yrs_win) - 1.0
    return out

roll = pd.DataFrame({
    "date_id": idx,
    "roll63_sharpe": rolling_sharpe(ret, WIN),
    "roll63_cagr":   rolling_cagr(ret, WIN),
})
ROLLING_CSV = os.path.join(BT_DIR, "rolling_63d_metrics.csv")
roll.to_csv(ROLLING_CSV, index=False, float_format="%.9f")

# ---------- 5) Stress tests ----------
def window_stats(arr, w):
    # cari window terburuk/terbaik berdasarkan return kumulatif
    if len(arr) < w: 
        return None
    best = (-1e18, -1, -1)  # (cumret, start, end)
    worst= ( 1e18, -1, -1)
    cumprod = np.ones(len(arr)+1, dtype=float)
    cumprod[1:] = np.cumprod(1.0 + arr)
    for i in range(len(arr)-w+1):
        j = i + w
        cr = cumprod[j]/cumprod[i] - 1.0
        if cr > best[0]:  best = (cr, i, j-1)
        if cr < worst[0]: worst = (cr, i, j-1)
    return {"best": {"cumret": float(best[0]), "start_idx": int(best[1]), "end_idx": int(best[2])},
            "worst":{"cumret": float(worst[0]),"start_idx": int(worst[1]),"end_idx": int(worst[2])}}

stress = {
    "window_5d":  window_stats(ret, 5),
    "window_21d": window_stats(ret, 21),
    "window_63d": window_stats(ret, 63),
}

STRESS_JSON = os.path.join(BT_DIR, "stress_tests.json")
Path(STRESS_JSON).write_text(json.dumps(stress, indent=2), encoding="utf-8")

# ---------- 6) Ringkasan ----------
SUMMARY_JSON = os.path.join(BT_DIR, "backtest_summary.json")
summary = {
    "generated_at": datetime.now().isoformat(timespec="seconds"),
    "oof_file": [p for p in OOF_CANDIDATES if os.path.isfile(p)][0],
    "params_file": PARAMS_JSON,
    "metrics": {
        "sharpe_ann": float(sharpe),
        "cagr": float(cagr_v),
        "max_drawdown": float(mdd),
        "turnover": float(turn),
        "bound_lo_%": bound_lo,
        "bound_hi_%": bound_hi,
        "days": int(len(ret)),
    },
    "paths": {
        "daily_perf_csv": DAILY_CSV,
        "rolling_63d_csv": ROLLING_CSV,
        "summary_json": SUMMARY_JSON,
        "stress_json": STRESS_JSON
    }
}
Path(SUMMARY_JSON).write_text(json.dumps(summary, indent=2), encoding="utf-8")

print("\n=== BACKTEST & STRESS TEST — SELESAI ===")
print(f"- Daily perf : {DAILY_CSV}")
print(f"- Rolling(63): {ROLLING_CSV}")
print(f"- Summary    : {SUMMARY_JSON}")
print(f"- Stress     : {STRESS_JSON}")
m = summary["metrics"]
print(f"- Days       : {m['days']}")
print(f"- Sharpe ann : {m['sharpe_ann']:.3f} | CAGR {m['cagr']*100:.3f}% | MaxDD {m['max_drawdown']*100:.2f}%")
print(f"- Turnover   : {m['turnover']:.3f} | Bound@lo {m['bound_lo_%']:.1f}% | Bound@hi {m['bound_hi_%']:.1f}%")


[INFO] outputs: /kaggle/working/hull-tactical-market-prediction/outputs

=== BACKTEST & STRESS TEST — SELESAI ===
- Daily perf : /kaggle/working/hull-tactical-market-prediction/outputs/backtests/daily_perf.csv
- Rolling(63): /kaggle/working/hull-tactical-market-prediction/outputs/backtests/rolling_63d_metrics.csv
- Summary    : /kaggle/working/hull-tactical-market-prediction/outputs/backtests/backtest_summary.json
- Stress     : /kaggle/working/hull-tactical-market-prediction/outputs/backtests/stress_tests.json
- Days       : 9021
- Sharpe ann : 0.560 | CAGR 2.812% | MaxDD -14.05%
- Turnover   : 0.023 | Bound@lo 0.0% | Bound@hi 0.0%


# Tunning

In [7]:
# ============================================
# Tahap 6 — Tuning LightGBM (Optuna, clean output)
# ============================================
import os, io, json, stat, gc, math, warnings, contextlib, glob
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
import lightgbm as lgb

# --- Anti-spam: bungkam stderr (hilangkan "1 warning generated.") ---
warnings.filterwarnings("ignore")
os.environ["PYTHONWARNINGS"] = "ignore"

@contextlib.contextmanager
def suppress_stderr():
    buf = io.StringIO()
    with contextlib.redirect_stderr(buf):
        yield

def lgb_train_silent(params, dtrain, valid_sets, valid_names,
                     num_boost_round, early_stopping_rounds=200, log_period=50):
    callbacks = [
        lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False),
        lgb.log_evaluation(period=log_period),
    ]
    params = {**params, "verbosity": -1}
    with suppress_stderr():
        model = lgb.train(
            params, dtrain,
            num_boost_round=num_boost_round,
            valid_sets=valid_sets, valid_names=valid_names,
            callbacks=callbacks,
        )
    return model

# --- Optuna logging agar tidak ramai ---
try:
    import optuna
    optuna.logging.set_verbosity(optuna.logging.WARNING)
except Exception:
    optuna = None

# ---------------- Config ----------------
ROOT   = "/kaggle/working/hull-tactical-market-prediction"
INPUT  = "/kaggle/input/hull-tactical-market-prediction"
OUTDIR = f"{ROOT}/outputs/tuning_lgbm"
Path(OUTDIR).mkdir(parents=True, exist_ok=True)

ID_COL   = "date_id"
TARGET_CANDS = ["forward_returns","target","y_true","market_forward_excess_returns"]
BAN_COLS = {ID_COL, "row_id", "investment_id"} | set(TARGET_CANDS)

N_FOLDS     = 25            # rolling time-based
NUM_BOOST   = 10_000
EARLY_STOP  = 600
LOG_PERIOD  = 100           # set 0 kalau ingin tanpa progress
RSTATE      = 2025
N_TRIALS    = 40            # ubah sesuai waktu
METRIC_NAME = "corr_mean"

# --------- Load data (pakai yang sudah ada di kernel bila tersedia) ---------
def _safe_load_df(default_path_csv, fallback_var_name):
    if fallback_var_name in globals():
        return globals()[fallback_var_name].copy()
    return pd.read_csv(default_path_csv)

train = _safe_load_df(f"{INPUT}/train.csv", "train")
test  = _safe_load_df(f"{INPUT}/test.csv",  "test")

# --- Tentukan target & fitur yang tersedia ---
tgt_col = next((c for c in TARGET_CANDS if c in train.columns), None)
if tgt_col is None:
    raise ValueError(f"Tidak menemukan kolom target. Kandidat: {TARGET_CANDS}")

num_cols_train = train.select_dtypes(include=[np.number]).columns.tolist()
feat_cols = [c for c in num_cols_train if c not in BAN_COLS]

# Sinkronkan fitur dengan test (hindari KeyError)
feat_cols = [c for c in feat_cols if c in test.columns]
feat_cols = sorted(feat_cols)

# Isi NA & sort waktu
train[feat_cols] = train[feat_cols].fillna(0.0)
test[feat_cols]  = test[feat_cols].fillna(0.0)
train = train.sort_values(ID_COL).reset_index(drop=True)
test  = test.sort_values(ID_COL).reset_index(drop=True)

print(f"Fitur kandidat: {len(feat_cols)} | Target: {tgt_col}")

# --------- Time-based rolling folds ---------
def make_time_folds(df, n_folds=25, id_col=ID_COL):
    dates = np.sort(df[id_col].unique())
    # bagi tanggal menjadi n_folds segmen berurutan
    chunks = np.array_split(dates, n_folds)
    folds = []
    for seg in chunks:
        val_dates = np.array(seg)
        tr_idx = df.index[df[id_col] < val_dates.min()].to_numpy()
        va_idx = df.index[df[id_col].isin(val_dates)].to_numpy()
        if len(tr_idx) == 0 or len(va_idx) == 0:
            continue
        folds.append((tr_idx, va_idx))
    return folds

folds = make_time_folds(train, N_FOLDS, ID_COL)
print(f"Folds: {len(folds)}")

# --------- Objective util ---------
def pearson_corr(y, p):
    if y.size < 2: 
        return 0.0
    v = np.corrcoef(y, p)[0,1]
    if not np.isfinite(v):
        return 0.0
    return float(v)

def cv_score(params):
    """Kembalikan mean Pearson corr OOF untuk parameter LightGBM tertentu."""
    oof_pred = np.zeros(len(train), dtype=np.float64)
    fold_stats = []
    for f, (tr_idx, va_idx) in enumerate(folds, 1):
        X_tr, y_tr = train.loc[tr_idx, feat_cols], train.loc[tr_idx, tgt_col].astype(np.float64).values
        X_va, y_va = train.loc[va_idx, feat_cols], train.loc[va_idx, tgt_col].astype(np.float64).values

        dtr = lgb.Dataset(X_tr, label=y_tr, free_raw_data=False)
        dva = lgb.Dataset(X_va, label=y_va, free_raw_data=False)

        gbm = lgb_train_silent(
            params, dtr,
            valid_sets=[dva], valid_names=[f"fold{f:02d}"],
            num_boost_round=NUM_BOOST,
            early_stopping_rounds=EARLY_STOP,
            log_period=LOG_PERIOD,
        )
        pred = gbm.predict(X_va, num_iteration=gbm.best_iteration)
        oof_pred[va_idx] = pred

        corr = pearson_corr(y_va, pred)
        fold_stats.append(corr)

        del dtr, dva, gbm, X_tr, X_va, y_tr, y_va, pred
        gc.collect()

    score = float(np.mean(fold_stats))
    return score, oof_pred, fold_stats

# --------- Optuna search space ---------
def suggest_params(trial):
    return {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "learning_rate": trial.suggest_float("lr", 0.01, 0.2, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 31, 255),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 10),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "max_depth": trial.suggest_int("max_depth", -1, 12),
        "n_jobs": -1,
        "verbosity": -1,
        "seed": RSTATE,
        "feature_fraction_bynode": 1.0,
    }

# --------- Jalankan Optuna (optional) ---------
if optuna is not None:
    def objective(trial):
        params = suggest_params(trial)
        score, _, _ = cv_score(params)
        # maximize mean corr
        return score

    study = optuna.create_study(direction="maximize", study_name="lgbm_tuning_clean")
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=False)

    best_params = suggest_params(study.best_trial)  # materialize values (tanpa distributions)
    best_score  = float(study.best_value)
else:
    # fallback: satu set params "bagus" default jika optuna tidak tersedia
    best_params = {
        "objective":"regression","metric":"rmse","boosting_type":"gbdt",
        "learning_rate":0.05,"num_leaves":127,"min_data_in_leaf":64,
        "feature_fraction":0.85,"bagging_fraction":0.85,"bagging_freq":1,
        "lambda_l1":0.0,"lambda_l2":1.0,"max_depth":-1,"n_jobs":-1,
        "verbosity":-1,"seed":RSTATE
    }
    best_score = None

# --------- CV ulang dengan best params untuk simpan artefak ---------
score, oof_pred, fold_stats = cv_score(best_params)

oof_df = train[[ID_COL, tgt_col]].copy()
oof_df["y_pred"] = oof_pred.astype(np.float32)
oof_df.to_csv(f"{OUTDIR}/lgbm_oof_tuned.csv", index=False, float_format="%.9f")

# Refit full untuk infer test
dfull = lgb.Dataset(train[feat_cols], label=train[tgt_col].astype(np.float64).values, free_raw_data=False)
gbm_full = lgb_train_silent(
    best_params, dfull,
    valid_sets=[dfull], valid_names=["full"],
    num_boost_round=NUM_BOOST,
    early_stopping_rounds=EARLY_STOP,
    log_period=0,  # refit hening
)
test_pred = gbm_full.predict(test[feat_cols], num_iteration=gbm_full.best_iteration)

test_sig = test[[ID_COL]].copy()
test_sig["prediction"] = test_pred.astype(np.float32)
test_sig.to_csv(f"{OUTDIR}/test_signal_lgbm_tuned.csv", index=False, float_format="%.9f")

# --------- Simpan ringkasan ---------
summary = {
    "generated_at": datetime.now().isoformat(timespec="seconds"),
    "target": tgt_col,
    "n_features": len(feat_cols),
    "n_folds": len(folds),
    "metric": METRIC_NAME,
    "mean_corr_oof": float(score),
    "fold_corrs": [float(x) for x in fold_stats],
    "best_params": best_params,
    "num_boost_round": NUM_BOOST,
    "early_stopping_rounds": EARLY_STOP,
    "log_period": LOG_PERIOD,
    "artifacts": {
        "oof_csv": f"{OUTDIR}/lgbm_oof_tuned.csv",
        "test_signal_csv": f"{OUTDIR}/test_signal_lgbm_tuned.csv",
        "best_params_json": f"{OUTDIR}/best_params.json",
        "model_full_txt": f"{OUTDIR}/lgbm_full_tuned.txt",
    }
}
Path(f"{OUTDIR}/best_params.json").write_text(json.dumps(summary["best_params"], indent=2), encoding="utf-8")
gbm_full.save_model(f"{OUTDIR}/lgbm_full_tuned.txt")
Path(f"{OUTDIR}/summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")

print("\n=== TUNING LGBM (CLEAN) — SELESAI ===")
if best_score is not None:
    print(f"- Optuna best corr (study): {best_score:.6f}")
print(f"- OOF mean corr (recalc)  : {score:.6f}")
print(f"- Fitur dipakai           : {len(feat_cols)}")
print(f"- OOF CSV                 : {OUTDIR}/lgbm_oof_tuned.csv")
print(f"- Test signal             : {OUTDIR}/test_signal_lgbm_tuned.csv")
print(f"- Params JSON             : {OUTDIR}/best_params.json")


Fitur kandidat: 89 | Target: forward_returns
Folds: 24
[100]	fold01's rmse: 0.00712788
[200]	fold01's rmse: 0.00712788
[300]	fold01's rmse: 0.00712788
[400]	fold01's rmse: 0.00712788
[500]	fold01's rmse: 0.00712788
[600]	fold01's rmse: 0.00712788
[100]	fold02's rmse: 0.00581698
[200]	fold02's rmse: 0.00581698
[300]	fold02's rmse: 0.00581698
[400]	fold02's rmse: 0.00581698
[500]	fold02's rmse: 0.00581698
[600]	fold02's rmse: 0.00581698
[100]	fold03's rmse: 0.00593723
[200]	fold03's rmse: 0.00593723
[300]	fold03's rmse: 0.00593723
[400]	fold03's rmse: 0.00593723
[500]	fold03's rmse: 0.00593723
[600]	fold03's rmse: 0.00593723
[100]	fold04's rmse: 0.00786643
[200]	fold04's rmse: 0.00786722
[300]	fold04's rmse: 0.00786857
[400]	fold04's rmse: 0.00786738
[500]	fold04's rmse: 0.00786956
[600]	fold04's rmse: 0.00786968
[700]	fold04's rmse: 0.0078689
[800]	fold04's rmse: 0.00786513
[900]	fold04's rmse: 0.00786185
[1000]	fold04's rmse: 0.00786087
[1100]	fold04's rmse: 0.00786631
[1200]	fold04's 

# Validasi

In [8]:
# ============================================
# Tahap 7 — Validasi (OOF/Test/Submission)
# ============================================
import os, json, glob, warnings
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.stats import spearmanr

warnings.filterwarnings("ignore")

# ---------- Lokasi & artefak ----------
ROOT    = "/kaggle/working/hull-tactical-market-prediction"
INPUT   = "/kaggle/input/hull-tactical-market-prediction"
OUTDIR  = f"{ROOT}/outputs"
VALDIR  = f"{OUTDIR}/validation"
Path(VALDIR).mkdir(parents=True, exist_ok=True)

ID_COL        = "date_id"
TARGET_CANDS  = ["forward_returns","target","y_true","market_forward_excess_returns"]
PRED_CANDS    = ["calibrated","y_pred","prediction","signal","allocation","weight","pred"]
TEST_PRED_CANDS = ["prediction_calibrated","prediction","signal","y_pred","allocation","weight"]

# Artefak umum yang mungkin sudah ada
OOF_HINTS = [
    f"{OUTDIR}/calibration/oof_calibrated.csv",
    f"{OUTDIR}/tuning_lgbm/lgbm_oof_tuned.csv",
    f"{OUTDIR}/signals/lgbm_oof.csv",
]
TEST_HINTS = [
    f"{OUTDIR}/calibration/test_calibrated.csv",
    f"{OUTDIR}/tuning_lgbm/test_signal_lgbm_tuned.csv",
    f"{OUTDIR}/signals/test_signal_lgbm.csv",
]
SUB_HINTS = [
    f"{ROOT}/kaggle_evaluation/submission.csv",
    "/kaggle/working/submission.csv",
]

# ---------- Util ----------
def _metrics(y, p):
    rmse = float(np.sqrt(mean_squared_error(y, p)))
    mae  = float(mean_absolute_error(y, p))
    corr = float(np.corrcoef(y, p)[0,1]) if len(y) > 1 else float("nan")
    try:
        spr = float(spearmanr(y, p, nan_policy="omit").correlation)
    except Exception:
        spr = float("nan")
    return {"rmse": rmse, "mae": mae, "corr": corr, "spearman": spr}

def _pick_col(df, cands):
    for c in cands:
        if c in df.columns: return c
    return None

def _first_exists(paths):
    for p in paths:
        if os.path.isfile(p): return p
    return None

def _list_all_csvs(base):
    files = glob.glob(f"{base}/**/*.csv", recursive=True) + glob.glob(f"{base}/**/*.parquet", recursive=True)
    return files

def _read_any(fp):
    return pd.read_parquet(fp) if fp.endswith(".parquet") else pd.read_csv(fp)

# ---------- 1) Muat data train (untuk target & sanity) ----------
train = pd.read_csv(f"{INPUT}/train.csv")
tgt_col = next((c for c in TARGET_CANDS if c in train.columns), None)
if tgt_col is None:
    raise ValueError(f"Tidak menemukan kolom target pada train. Kandidat: {TARGET_CANDS}")

# ---------- 2) Temukan OOF & Test secara otomatis ----------
oof_fp  = _first_exists(OOF_HINTS)
test_fp = _first_exists(TEST_HINTS)

# fallback: cari seluruh CSV bila hint tidak ada (mis. nama folder berbeda)
if oof_fp is None or test_fp is None:
    for fp in _list_all_csvs(OUTDIR):
        try:
            df = _read_any(fp)
            if oof_fp is None:
                if (_pick_col(df, PRED_CANDS) is not None) and (_pick_col(df, TARGET_CANDS) is not None):
                    oof_fp = fp
            if test_fp is None:
                if (_pick_col(df, TEST_PRED_CANDS) is not None) and (ID_COL in df.columns):
                    test_fp = fp
            if oof_fp and test_fp: break
        except Exception:
            pass

if oof_fp is None:
    raise FileNotFoundError("Tidak menemukan OOF (cari di outputs/**). Jalankan baseline/tuning/kalibrasi dulu.")
if test_fp is None:
    raise FileNotFoundError("Tidak menemukan prediksi test (cari di outputs/**).")

print(f"[INFO] OOF  : {oof_fp}")
print(f"[INFO] Test : {test_fp}")

oof  = _read_any(oof_fp).copy()
test = _read_any(test_fp).copy()

# Tentukan kolom prediksi
oof_pred_col  = _pick_col(oof,  PRED_CANDS)
test_pred_col = _pick_col(test, TEST_PRED_CANDS)
if oof_pred_col is None:
    raise ValueError(f"Tidak menemukan kolom prediksi pada OOF. Kandidat: {PRED_CANDS}")
if test_pred_col is None:
    raise ValueError(f"Tidak menemukan kolom prediksi pada Test. Kandidat: {TEST_PRED_CANDS}")

# Pastikan target tersedia di OOF: jika tidak, map dari train via (date_id)
if _pick_col(oof, TARGET_CANDS) is None:
    if ID_COL in oof.columns and ID_COL in train.columns:
        oof = oof.merge(train[[ID_COL, tgt_col]], on=ID_COL, how="left", validate="m:1")
    else:
        raise ValueError("OOF tidak punya target & tidak bisa merge via date_id.")

oof_tgt_col = _pick_col(oof, TARGET_CANDS)

# ---------- 3) Validasi OOF: metrik global & stability by time ----------
oof = oof.sort_values(ID_COL).reset_index(drop=True)
y = oof[oof_tgt_col].astype("float64").to_numpy()
p = oof[oof_pred_col].astype("float64").to_numpy()
m_global = _metrics(y, p)

# Bagi menjadi N irisan waktu berimbang
N_SLICES = 10
dates = np.sort(oof[ID_COL].unique())
chunks = np.array_split(dates, N_SLICES)
slice_rows = []
for i, seg in enumerate(chunks, 1):
    sub = oof[oof[ID_COL].isin(seg)]
    my = sub[oof_tgt_col].astype("float64").to_numpy()
    mp = sub[oof_pred_col].astype("float64").to_numpy()
    mm = _metrics(my, mp)
    mm.update({
        "slice": i,
        "date_min": int(seg.min()),
        "date_max": int(seg.max()),
        "n": int(sub.shape[0]),
        "pred_mean": float(sub[oof_pred_col].mean()),
        "pred_std": float(sub[oof_pred_col].std(ddof=0))
    })
    slice_rows.append(mm)

oof_slice_df = pd.DataFrame(slice_rows)
oof_slice_df.to_csv(f"{VALDIR}/oof_slice_metrics.csv", index=False)

# ---------- 4) Drift check: distribusi prediksi OOF vs Test ----------
# Normalisasi ringan: pastikan numeric & tanpa NaN/Inf
def _clean(x):
    x = pd.Series(x, dtype="float64").replace([np.inf, -np.inf], np.nan).fillna(0.0)
    return x.to_numpy()

oo = _clean(oof[oof_pred_col])
tt = _clean(test[test_pred_col])

dist = {
    "oof": {"mean": float(np.mean(oo)), "std": float(np.std(oo, ddof=0)),
            "min": float(np.min(oo)), "max": float(np.max(oo)), "n": int(oof.shape[0])},
    "test": {"mean": float(np.mean(tt)), "std": float(np.std(tt, ddof=0)),
             "min": float(np.min(tt)), "max": float(np.max(tt)), "n": int(test.shape[0])},
    "ratio_std_test_to_oof": float((np.std(tt, ddof=0)+1e-12)/(np.std(oo, ddof=0)+1e-12)),
}

# ---------- 5) Cek submission (opsional) ----------
sub_fp = _first_exists(SUB_HINTS)
sub_info = None
if sub_fp and os.path.isfile(sub_fp):
    sub = pd.read_csv(sub_fp)
    ok_schema = (ID_COL in sub.columns) and (_pick_col(sub, ["allocation","prediction","signal"]) is not None)
    miss = None
    if ok_schema:
        # harap semua id test ada
        miss = sorted(set(test[ID_COL].unique().tolist()) - set(sub[ID_COL].unique().tolist()))
        miss = len(miss)
    sub_info = {
        "path": sub_fp,
        "ok_schema": bool(ok_schema),
        "rows": int(sub.shape[0]),
        "missing_ids_vs_test": int(miss if miss is not None else -1)
    }

# ---------- 6) Simpan ringkasan ----------
summary = {
    "generated_at": datetime.now().isoformat(timespec="seconds"),
    "files": {
        "oof": oof_fp,
        "test": test_fp,
        "submission": sub_info["path"] if sub_info else None
    },
    "columns": {
        "oof_pred": oof_pred_col,
        "oof_target": oof_tgt_col,
        "test_pred": test_pred_col
    },
    "oof_metrics_global": m_global,
    "oof_metrics_slices_csv": f"{VALDIR}/oof_slice_metrics.csv",
    "distribution_check": dist,
    "submission_check": sub_info
}
Path(f"{VALDIR}/validation_summary.json").write_text(json.dumps(summary, indent=2), encoding="utf-8")

# Simpan sampel residual untuk audit cepat
res = oof[[ID_COL, oof_tgt_col, oof_pred_col]].copy()
res["residual"] = res[oof_tgt_col].astype("float64") - res[oof_pred_col].astype("float64")
res.head(2000).to_csv(f"{VALDIR}/oof_residual_sample.csv", index=False, float_format="%.9f")

print("\n=== VALIDASI — SELESAI ===")
print(f"- OOF global : RMSE={m_global['rmse']:.6f} | MAE={m_global['mae']:.6f} | Corr={m_global['corr']:.4f} | Spearman={m_global['spearman']:.4f}")
print(f"- Slice CSV  : {VALDIR}/oof_slice_metrics.csv")
print(f"- Distribusi : OOF std={dist['oof']['std']:.6f} | Test std={dist['test']['std']:.6f} | Rasio std={dist['ratio_std_test_to_oof']:.3f}")
if sub_info:
    print(f"- Submission : {sub_info['path']} | ok_schema={sub_info['ok_schema']} | rows={sub_info['rows']} | missing_ids_vs_test={sub_info['missing_ids_vs_test']}")
print(f"- Ringkasan  : {VALDIR}/validation_summary.json")


[INFO] OOF  : /kaggle/working/hull-tactical-market-prediction/outputs/calibration/oof_calibrated.csv
[INFO] Test : /kaggle/working/hull-tactical-market-prediction/outputs/calibration/test_calibrated.csv

=== VALIDASI — SELESAI ===
- OOF global : RMSE=0.010513 | MAE=0.007492 | Corr=0.0886 | Spearman=0.0552
- Slice CSV  : /kaggle/working/hull-tactical-market-prediction/outputs/validation/oof_slice_metrics.csv
- Distribusi : OOF std=0.000705 | Test std=0.000705 | Rasio std=1.000
- Ringkasan  : /kaggle/working/hull-tactical-market-prediction/outputs/validation/validation_summary.json


# Submission

In [9]:
# ============================================
# Tahap 8 — Submission (robust & idempotent)
# ============================================
import os, json, warnings, glob
from pathlib import Path
from datetime import datetime
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

# --- Lokasi proyek ---
ROOT   = "/kaggle/working/hull-tactical-market-prediction"
INPUT  = "/kaggle/input/hull-tactical-market-prediction"
OUTDIR = f"{ROOT}/outputs"
KEDIR  = f"{ROOT}/kaggle_evaluation"
Path(KEDIR).mkdir(parents=True, exist_ok=True)

ID_COL = "date_id"
SUBMIT_COL = "allocation"
FLOAT_FMT = "%.9f"
CLIP_RANGE = (-1.0, 1.0)

# --- Kandidat sumber prediksi (urutan prioritas) ---
CANDIDATES = [
    f"{OUTDIR}/mapped/test_allocation.csv",              # dari Mapping Sinyal
    f"{OUTDIR}/calibration/test_calibrated.csv",         # dari Kalibrasi (akan diubah ke 'allocation')
    f"{OUTDIR}/signals/test_signal_lgbm.csv",            # baseline test signal
    f"{OUTDIR}/tuning_lgbm/test_signal_lgbm_tuned.csv",  # hasil tuning (jika ada)
]

# --- Util ---
def _first_exists(paths):
    for p in paths:
        if os.path.isfile(p):
            return p
    return None

def _auto_pick_pred_col(df):
    # urutan kandidat kolom prediksi
    for c in ["allocation","weight","prediction_calibrated","prediction","signal","y_pred","lgbm_pred","score"]:
        if c in df.columns:
            return c
    return None

def _read(path):
    return pd.read_parquet(path) if path.endswith(".parquet") else pd.read_csv(path)

def _save_csv(df, path):
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False, float_format=FLOAT_FMT)

# --- 1) Muat test.csv untuk referensi ID ---
test_csv = f"{INPUT}/test.csv"
assert os.path.isfile(test_csv), f"Tidak menemukan {test_csv}"
test = pd.read_csv(test_csv)[[ID_COL]].copy()
test_ids = pd.Series(sorted(test[ID_COL].unique()))

# --- 2) Cari sumber prediksi terbaik ---
src = _first_exists(CANDIDATES)
if src is None:
    # fallback: cari apa pun di outputs yang punya pred + ID
    for fp in glob.glob(f"{OUTDIR}/**/*.*", recursive=True):
        if fp.endswith((".csv",".parquet")):
            try:
                df = _read(fp)
                if (ID_COL in df.columns) and (_auto_pick_pred_col(df) is not None):
                    src = fp
                    break
            except Exception:
                pass

assert src is not None, "Tidak menemukan file prediksi test di outputs/. Jalankan tahap sebelumnya."
dfp = _read(src).copy()
pred_col = _auto_pick_pred_col(dfp)
assert pred_col is not None, f"Tidak menemukan kolom prediksi pada {src}"

# --- 3) Bangun submission yang selaras dengan test ---
sub = dfp[[ID_COL, pred_col]].rename(columns={pred_col: SUBMIT_COL}).copy()

# sort & dedup by last (jaga-jaga)
sub = sub.sort_values(ID_COL).drop_duplicates(ID_COL, keep="last").reset_index(drop=True)

# selaraskan ke ID test (inner join + isi 0 untuk yang hilang)
sub = test_ids.to_frame(name=ID_COL).merge(sub, on=ID_COL, how="left")
x = sub[SUBMIT_COL].astype("float64").to_numpy()
# NaN/Inf guard
x = np.nan_to_num(x, nan=0.0, posinf=0.0, neginf=0.0)
# clip aman
if CLIP_RANGE is not None:
    lo, hi = CLIP_RANGE
    x = np.clip(x, lo, hi)
sub[SUBMIT_COL] = x.astype("float32")

# --- 4) Simpan file submission & audit ---
submission_main = "/kaggle/working/submission.csv"
_save_csv(sub[[ID_COL, SUBMIT_COL]], submission_main)

# salinan + audit di kaggle_evaluation
submission_copy = f"{KEDIR}/submission.csv"
audit_path      = f"{KEDIR}/submission_audit.csv"
meta_path       = f"{KEDIR}/submission_meta.json"

_save_csv(sub[[ID_COL, SUBMIT_COL]], submission_copy)

# Audit gabungkan sumber mentah (jika berbeda nama kolom)
audit = sub.merge(dfp[[ID_COL, pred_col]], on=ID_COL, how="left")
audit = audit.rename(columns={pred_col: "prediction_source"})
_save_csv(audit, audit_path)

meta = {
    "generated_at": datetime.now().isoformat(timespec="seconds"),
    "source_file": src,
    "source_pred_col": pred_col,
    "submission_rows": int(sub.shape[0]),
    "clip_range": list(CLIP_RANGE) if CLIP_RANGE else None,
    "stats": {
        "min": float(sub[SUBMIT_COL].min()),
        "max": float(sub[SUBMIT_COL].max()),
        "mean": float(sub[SUBMIT_COL].mean()),
        "std": float(sub[SUBMIT_COL].std(ddof=0)),
        "n_unique_ids": int(sub[ID_COL].nunique()),
    },
    "paths": {
        "submission_main": submission_main,
        "submission_copy": submission_copy,
        "audit": audit_path
    }
}
Path(meta_path).write_text(json.dumps(meta, indent=2), encoding="utf-8")

# --- 5) Ringkasan ringkas di output ---
print("=== SUBMISSION — SELESAI ===")
print(f"- Sumber      : {src} (pred_col='{pred_col}')")
print(f"- Baris       : {meta['submission_rows']}")
print(f"- Stats       : mean={meta['stats']['mean']:.6f} | std={meta['stats']['std']:.6f} | min={meta['stats']['min']:.6f} | max={meta['stats']['max']:.6f}")
print(f"- File utama  : {submission_main}")
print(f"- Salinan     : {submission_copy}")
print(f"- Audit       : {audit_path}")


=== SUBMISSION — SELESAI ===
- Sumber      : /kaggle/working/hull-tactical-market-prediction/outputs/calibration/test_calibrated.csv (pred_col='prediction_calibrated')
- Baris       : 10
- Stats       : mean=-0.000108 | std=0.000000 | min=-0.000108 | max=-0.000108
- File utama  : /kaggle/working/submission.csv
- Salinan     : /kaggle/working/hull-tactical-market-prediction/kaggle_evaluation/submission.csv
- Audit       : /kaggle/working/hull-tactical-market-prediction/kaggle_evaluation/submission_audit.csv
