In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mallorn-dataset/sample_submission.csv
/kaggle/input/mallorn-dataset/test_log.csv
/kaggle/input/mallorn-dataset/train_log.csv
/kaggle/input/mallorn-dataset/split_17/train_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_17/test_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_01/train_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_01/test_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_02/train_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_02/test_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_08/train_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_08/test_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_04/train_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_04/test_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_07/train_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_07/test_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_15/train_full_lightcurves.csv
/kaggle/i

 # Kaggle CPU Environment Setup

In [2]:
# ============================================================
# STAGE 0 — Kaggle CPU Environment Setup (ONE CELL, SAFE + COHESIVE) — REVISI FULL v5
# Fix utama v5:
# - Quick LC numeric check dibuat "COLUMN-AWARE":
#     * Flux: TOLERANT (banyak NaN -> WARN + dicatat, bukan error)
#     * Time(MJD) & Flux_err: tetap ketat (NaN besar -> error)
# - Hanya error jika indikasi parsing rusak: semua/hampir semua Flux NaN
# - Buat report ringkas NaN head per 40 file (untuk audit)
# ============================================================

import os, sys, gc, json, time, random, hashlib, warnings
from pathlib import Path

import numpy as np
import pandas as pd

# ----------------------------
# 0) Quiet + deterministic
# ----------------------------
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

SEED = 2025
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

# ----------------------------
# 1) CPU thread limits (anti-freeze on Kaggle CPU)
# ----------------------------
THREADS = 2
os.environ.setdefault("OMP_NUM_THREADS", str(THREADS))
os.environ.setdefault("OPENBLAS_NUM_THREADS", str(THREADS))
os.environ.setdefault("MKL_NUM_THREADS", str(THREADS))
os.environ.setdefault("VECLIB_MAXIMUM_THREADS", str(THREADS))
os.environ.setdefault("NUMEXPR_NUM_THREADS", str(THREADS))
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")
os.environ.setdefault("TF_NUM_INTRAOP_THREADS", str(THREADS))
os.environ.setdefault("TF_NUM_INTEROP_THREADS", "1")

try:
    import torch
    torch.manual_seed(SEED)
    torch.set_num_threads(THREADS)
    torch.set_num_interop_threads(1)
except Exception:
    torch = None

# ----------------------------
# 2) PATHS
# ----------------------------
DATA_ROOT = Path("/kaggle/input/mallorn-dataset")
PATHS = {
    "DATA_ROOT": DATA_ROOT,
    "SAMPLE_SUB": DATA_ROOT / "sample_submission.csv",
    "TRAIN_LOG":  DATA_ROOT / "train_log.csv",
    "TEST_LOG":   DATA_ROOT / "test_log.csv",
    "SPLITS":     [DATA_ROOT / f"split_{i:02d}" for i in range(1, 21)],
}

# ----------------------------
# 3) WORKDIR (versioned run)
# ----------------------------
WORKDIR = Path("/kaggle/working")
BASE_RUN_DIR = WORKDIR / "mallorn_run"
BASE_RUN_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------
# 4) CONFIG / TOGGLES
# ----------------------------
CFG = {
    # Core plan (stage berikutnya)
    "USE_TRANSFORMER": True,
    "USE_GBDT": True,
    "USE_ENSEMBLE": True,
    "USE_THRESHOLD_TUNING": True,

    "USE_DEEXTINCT": True,
    "USE_RESTFRAME_TIME": True,
    "USE_SNR_FEATURES": True,
    "USE_ASINH_FLUX": True,

    "L_MAX": 256,
    "TRUNC_POLICY": "smart_band_peak",
    "PAD_POLICY": "dynamic",

    "N_FOLDS": 5,
    "CV_STRATIFY": True,
    "CV_USE_SPLIT_COL": True,

    "SNR_DET_THR": 3.0,
    "MIN_FLUXERR": 1e-6,

    # Stage 0 checks
    "QUICK_LC_SCHEMA_CHECK": True,
    "SAMPLE_ID_CROSSCHECK": True,
    "SAMPLE_ID_PER_SPLIT": 5,
    "MAX_CHUNKS_PER_FILE": 6,
    "CHUNK_ROWS": 200_000,

    "LC_HEAD_ROWS": 2000,

    # Per-column NA policy untuk quick check (head-only)
    # - Time(MJD), Flux_err harus hampir bersih
    # - Flux boleh NaN lebih banyak (anggap missing obs) -> WARN saja
    "LC_NA_POLICY": {
        "Time (MJD)": {"max_na_abs": 3, "max_na_frac": 0.002, "fail_if_all_na": True, "hard_fail": True},
        "Flux_err":   {"max_na_abs": 3, "max_na_frac": 0.002, "fail_if_all_na": True, "hard_fail": True},

        # Flux: toleran. Hanya fail jika hampir semua NaN (indikasi parsing rusak)
        "Flux":       {"max_na_abs": 10_000, "max_na_frac": 0.80, "fail_if_all_na": True, "hard_fail": False,
                       "fail_if_na_frac_ge": 0.98},
    }
}

def _hash_cfg(d: dict) -> str:
    s = json.dumps(d, sort_keys=True)
    return hashlib.sha256(s.encode("utf-8")).hexdigest()[:10]

CFG_HASH = _hash_cfg(CFG)
RUN_TAG = time.strftime("%Y%m%d_%H%M%S")
RUN_DIR = BASE_RUN_DIR / f"run_{RUN_TAG}_{CFG_HASH}"

ART_DIR   = RUN_DIR / "artifacts"
CACHE_DIR = RUN_DIR / "cache"
OOF_DIR   = RUN_DIR / "oof"
SUB_DIR   = RUN_DIR / "submissions"
LOG_DIR   = RUN_DIR / "logs"
EMB_DIR   = CACHE_DIR / "embeddings"
FEAT_DIR  = CACHE_DIR / "features"

for d in [RUN_DIR, ART_DIR, CACHE_DIR, OOF_DIR, SUB_DIR, LOG_DIR, EMB_DIR, FEAT_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# ----------------------------
# 5) Hard guards: files must exist
# ----------------------------
def _must_exist(p: Path, what: str):
    if not p.exists():
        raise FileNotFoundError(f"[MISSING] {what}: {p}")

_must_exist(PATHS["SAMPLE_SUB"], "sample_submission.csv")
_must_exist(PATHS["TRAIN_LOG"],  "train_log.csv")
_must_exist(PATHS["TEST_LOG"],   "test_log.csv")

missing_splits = [s for s in PATHS["SPLITS"] if not s.exists()]
if missing_splits:
    sample = "\n".join(str(x) for x in missing_splits[:5])
    raise FileNotFoundError(f"Some split folders are missing (showing up to 5):\n{sample}")

bad = []
for sd in PATHS["SPLITS"]:
    tr = sd / "train_full_lightcurves.csv"
    te = sd / "test_full_lightcurves.csv"
    if (not tr.exists()) or (not te.exists()):
        bad.append((sd.name, tr.exists(), te.exists()))
if bad:
    msg = "\n".join([f"- {name}: train={tr_ok}, test={te_ok}" for name, tr_ok, te_ok in bad[:10]])
    raise FileNotFoundError("Some split lightcurve files are missing (showing up to 10):\n" + msg)

# ----------------------------
# 6) CSV read defaults (stabil untuk angka kosong)
# ----------------------------
SAFE_NA_VALUES = ["", " ", "NA", "NaN", "nan", "NULL", "null", "None", "none"]
SAFE_READ_KW = dict(low_memory=False, na_values=SAFE_NA_VALUES, keep_default_na=True)

def _norm_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [c.strip() for c in df.columns]
    return df

# ----------------------------
# 7) Load metadata kecil (sample + logs)
# ----------------------------
df_sub = pd.read_csv(PATHS["SAMPLE_SUB"], dtype={"object_id": "string"}, **SAFE_READ_KW)
df_sub = _norm_cols(df_sub)
if not {"object_id", "prediction"}.issubset(df_sub.columns):
    raise ValueError(f"sample_submission columns must include object_id,prediction. Found: {list(df_sub.columns)}")

df_train_log = pd.read_csv(PATHS["TRAIN_LOG"], dtype={"object_id": "string", "split": "string"}, **SAFE_READ_KW)
df_test_log  = pd.read_csv(PATHS["TEST_LOG"],  dtype={"object_id": "string", "split": "string"}, **SAFE_READ_KW)
df_train_log = _norm_cols(df_train_log)
df_test_log  = _norm_cols(df_test_log)

# ----------------------------
# 8) Validate log schemas + normalize split + unify Z_err
# ----------------------------
need_train = {"object_id", "EBV", "Z", "split", "target"}
need_test  = {"object_id", "EBV", "Z", "split"}  # Z_err opsional

missing_train = sorted(list(need_train - set(df_train_log.columns)))
missing_test  = sorted(list(need_test  - set(df_test_log.columns)))
if missing_train:
    raise ValueError(f"train_log.csv missing required columns: {missing_train}")
if missing_test:
    raise ValueError(f"test_log.csv missing required columns: {missing_test}")

for col in ["EBV", "Z"]:
    df_train_log[col] = pd.to_numeric(df_train_log[col], errors="coerce")
    df_test_log[col]  = pd.to_numeric(df_test_log[col],  errors="coerce")
    if df_train_log[col].isna().any():
        raise ValueError(f"train_log {col} has NaN after numeric coercion: {int(df_train_log[col].isna().sum())}")
    if df_test_log[col].isna().any():
        raise ValueError(f"test_log {col} has NaN after numeric coercion: {int(df_test_log[col].isna().sum())}")

if "Z_err" in df_test_log.columns:
    df_test_log["Z_err"] = pd.to_numeric(df_test_log["Z_err"], errors="coerce")
else:
    df_test_log["Z_err"] = np.nan

if "Z_err" in df_train_log.columns:
    df_train_log["Z_err"] = pd.to_numeric(df_train_log["Z_err"], errors="coerce")
else:
    df_train_log["Z_err"] = np.nan

df_train_log["has_zerr"] = 0
df_test_log["has_zerr"]  = (~df_test_log["Z_err"].isna()).astype("int8")
df_train_log["Z_err"] = df_train_log["Z_err"].fillna(0.0)
df_test_log["Z_err"]  = df_test_log["Z_err"].fillna(0.0)

valid_split_names = {f"split_{i:02d}" for i in range(1, 21)}
def _normalize_split(x):
    if pd.isna(x):
        return ""
    s = str(x).strip()
    if not s:
        return ""
    if s.isdigit():
        return f"split_{int(s):02d}"
    s2 = s.lower().replace("-", "_").replace(" ", "_")
    if s2.startswith("split_"):
        tail = s2.split("split_", 1)[1].strip("_")
        if tail.isdigit():
            return f"split_{int(tail):02d}"
        return s2
    return s

df_train_log["split"] = df_train_log["split"].map(_normalize_split)
df_test_log["split"]  = df_test_log["split"].map(_normalize_split)

bad_train_split = sorted(set(df_train_log["split"]) - valid_split_names)
bad_test_split  = sorted(set(df_test_log["split"])  - valid_split_names)
if bad_train_split:
    raise ValueError(f"train_log has invalid split values (examples): {bad_train_split[:10]}")
if bad_test_split:
    raise ValueError(f"test_log has invalid split values (examples): {bad_test_split[:10]}")

if df_train_log["object_id"].duplicated().any():
    raise ValueError(f"train_log duplicated object_id: {int(df_train_log['object_id'].duplicated().sum())}")
if df_test_log["object_id"].duplicated().any():
    raise ValueError(f"test_log duplicated object_id: {int(df_test_log['object_id'].duplicated().sum())}")

df_train_log["target"] = pd.to_numeric(df_train_log["target"], errors="coerce")
if df_train_log["target"].isna().any():
    raise ValueError(f"train_log target has NaN after numeric coercion: {int(df_train_log['target'].isna().sum())}")
uniq_t = set(pd.unique(df_train_log["target"]).tolist())
if not uniq_t.issubset({0, 1}):
    raise ValueError(f"train_log target must be binary 0/1. Found: {sorted(list(uniq_t))}")

train_ids = set(df_train_log["object_id"].astype("string"))
test_ids  = set(df_test_log["object_id"].astype("string"))
intersect = train_ids & test_ids
if intersect:
    raise ValueError(f"train_log and test_log share object_id (should be disjoint). Examples: {list(intersect)[:5]}")

sub_ids = set(df_sub["object_id"].astype("string"))
missing_in_test = sub_ids - test_ids
missing_in_sub  = test_ids - sub_ids
if missing_in_test:
    raise ValueError(f"sample_submission has object_id not in test_log (up to 5): {list(missing_in_test)[:5]}")
if missing_in_sub:
    raise ValueError(f"test_log has object_id not in sample_submission (up to 5): {list(missing_in_sub)[:5]}")

# ----------------------------
# 9) Build split manifest (40 CSV index) + save
# ----------------------------
split_rows = []
for i in range(1, 21):
    sp = f"split_{i:02d}"
    sd = DATA_ROOT / sp
    split_rows.append({
        "split": sp,
        "dir": str(sd),
        "train_csv": str(sd / "train_full_lightcurves.csv"),
        "test_csv":  str(sd / "test_full_lightcurves.csv"),
        "train_exists": (sd / "train_full_lightcurves.csv").exists(),
        "test_exists":  (sd / "test_full_lightcurves.csv").exists(),
        "n_train_objects_log": int((df_train_log["split"] == sp).sum()),
        "n_test_objects_log":  int((df_test_log["split"]  == sp).sum()),
    })
df_split_manifest = pd.DataFrame(split_rows)
manifest_path = LOG_DIR / "split_manifest.csv"
df_split_manifest.to_csv(manifest_path, index=False)

# ----------------------------
# 10) OPTIONAL: quick lightcurve schema check (tolerant numeric)
# ----------------------------
LC_EXPECT = {"object_id", "Time (MJD)", "Flux", "Flux_err", "Filter"}
LC_FILTER_OK = set(list("ugrizy"))

def _na_policy(col: str):
    pol = CFG.get("LC_NA_POLICY", {})
    return pol.get(col, {"max_na_abs": 0, "max_na_frac": 0.0, "fail_if_all_na": True, "hard_fail": True})

def _warn_or_raise_numeric(dfh: pd.DataFrame, col: str, csv_path: Path, report_row: dict):
    pol = _na_policy(col)
    vv = pd.to_numeric(dfh[col], errors="coerce")
    n = len(vv)
    n_na = int(vv.isna().sum())
    frac = (n_na / max(n, 1)) if n > 0 else 1.0

    report_row[f"na_{col}"] = n_na
    report_row[f"nafrac_{col}"] = float(frac)

    if n == 0:
        raise ValueError(f"[LC NUM] Empty head in {csv_path.name}")

    if pol.get("fail_if_all_na", True) and n_na == n:
        ex = dfh[col].astype(str).head(5).tolist()
        raise ValueError(f"[LC NUM] All values non-numeric/NaN in {csv_path.name} col={col}. Examples: {ex}")

    # special: Flux only fail if NA fraction extremely high (indikasi parsing rusak)
    fail_if_ge = pol.get("fail_if_na_frac_ge", None)
    if fail_if_ge is not None and frac >= float(fail_if_ge) and n >= 200:
        ex = dfh.loc[vv.isna(), col].astype(str).head(5).tolist()
        raise ValueError(
            f"[LC NUM] Suspicious: {csv_path.name} col={col} NA frac {frac*100:.2f}% (>= {float(fail_if_ge)*100:.1f}%). "
            f"Examples: {ex}"
        )

    # normal tolerance logic
    max_abs = int(pol.get("max_na_abs", 0))
    max_frac = float(pol.get("max_na_frac", 0.0))
    hard_fail = bool(pol.get("hard_fail", True))

    if n_na > 0:
        ex = dfh.loc[vv.isna(), col].astype(str).head(5).tolist()
        if hard_fail and (n_na > max_abs) and (frac > max_frac):
            raise ValueError(
                f"[LC NUM] Too many non-numeric/NaN in {csv_path.name} col={col}: "
                f"{n_na}/{n} ({frac*100:.2f}%). Examples: {ex}"
            )
        else:
            print(f"[WARN] {csv_path.name} col={col}: {n_na}/{n} ({frac*100:.2f}%) non-numeric/NaN in head. Examples: {ex}")

    return vv

def _quick_lc_check(csv_path: Path, n_head: int, report_row: dict):
    dfh = pd.read_csv(csv_path, nrows=n_head, **SAFE_READ_KW)
    dfh = _norm_cols(dfh)

    miss = sorted(list(LC_EXPECT - set(dfh.columns)))
    if miss:
        raise ValueError(f"[LC SCHEMA] Missing columns in {csv_path.name}: {miss}")

    filt = dfh["Filter"].astype("string").str.strip().str.lower()
    filt = filt[~filt.isna()]
    badf = sorted(list(set(filt.unique().tolist()) - LC_FILTER_OK))
    if badf:
        raise ValueError(f"[LC FILTER] Invalid filter values in {csv_path.name}: {badf[:10]}")

    _ = _warn_or_raise_numeric(dfh, "Time (MJD)", csv_path, report_row)
    flux = _warn_or_raise_numeric(dfh, "Flux", csv_path, report_row)
    ferr = _warn_or_raise_numeric(dfh, "Flux_err", csv_path, report_row)

    # flux_err should be >= 0 (ignore NaN)
    ferr2 = ferr.dropna()
    if len(ferr2) > 0 and (ferr2 < 0).any():
        raise ValueError(f"[LC NUM] Negative Flux_err in head of {csv_path.name}")

def _sample_id_crosscheck(csv_path: Path, want_ids: set, chunk_rows: int, max_chunks: int):
    if not want_ids:
        return True
    remaining = set(want_ids)
    read_ok = False
    for i, chunk in enumerate(pd.read_csv(csv_path, usecols=["object_id"], chunksize=chunk_rows, **SAFE_READ_KW)):
        read_ok = True
        found = set(chunk["object_id"].astype("string")) & remaining
        remaining -= found
        if not remaining:
            return True
        if i + 1 >= max_chunks:
            break
    if not read_ok:
        raise ValueError(f"[LC READ] Could not read {csv_path}")
    if remaining:
        print(f"[WARN] Sample ID crosscheck limited-scan miss for {csv_path.name}: missing {len(remaining)} ids (scan cap).")
    return True

# Run checks for all 40 files; write report
lc_report = []
if CFG["QUICK_LC_SCHEMA_CHECK"] or CFG["SAMPLE_ID_CROSSCHECK"]:
    for sp in sorted(valid_split_names):
        sd  = DATA_ROOT / sp
        trp = sd / "train_full_lightcurves.csv"
        tep = sd / "test_full_lightcurves.csv"

        row_tr = {"split": sp, "kind": "train", "file": str(trp)}
        row_te = {"split": sp, "kind": "test",  "file": str(tep)}

        if CFG["QUICK_LC_SCHEMA_CHECK"]:
            _quick_lc_check(trp, int(CFG["LC_HEAD_ROWS"]), row_tr)
            _quick_lc_check(tep, int(CFG["LC_HEAD_ROWS"]), row_te)

        if CFG["SAMPLE_ID_CROSSCHECK"]:
            k = int(CFG["SAMPLE_ID_PER_SPLIT"])
            tr_ids = df_train_log.loc[df_train_log["split"] == sp, "object_id"].astype("string")
            te_ids = df_test_log.loc[df_test_log["split"]  == sp, "object_id"].astype("string")
            samp_tr = set(tr_ids.sample(n=min(k, len(tr_ids)), random_state=SEED).tolist()) if len(tr_ids) else set()
            samp_te = set(te_ids.sample(n=min(k, len(te_ids)), random_state=SEED).tolist()) if len(te_ids) else set()
            _sample_id_crosscheck(trp, samp_tr, int(CFG["CHUNK_ROWS"]), int(CFG["MAX_CHUNKS_PER_FILE"]))
            _sample_id_crosscheck(tep, samp_te, int(CFG["CHUNK_ROWS"]), int(CFG["MAX_CHUNKS_PER_FILE"]))

        lc_report.append(row_tr)
        lc_report.append(row_te)

lc_report_path = LOG_DIR / "lc_head_na_report.csv"
if lc_report:
    pd.DataFrame(lc_report).to_csv(lc_report_path, index=False)

# ----------------------------
# 11) Summaries
# ----------------------------
pos = int((df_train_log["target"] == 1).sum())
neg = int((df_train_log["target"] == 0).sum())
tot = int(len(df_train_log))

def _q(x, qs=(0.01, 0.5, 0.99)):
    x = np.asarray(x, dtype=float)
    return [float(np.quantile(x, q)) for q in qs]

df_split_stats = (
    df_train_log.groupby("split")["target"]
    .agg(["count","sum"])
    .rename(columns={"count":"n_train","sum":"n_pos"})
    .reset_index()
)
df_split_stats["pos_rate"] = df_split_stats["n_pos"] / df_split_stats["n_train"].clip(lower=1)
df_split_stats = df_split_stats.sort_values("split")
split_stats_path = LOG_DIR / "train_split_stats.csv"
df_split_stats.to_csv(split_stats_path, index=False)

print("ENV OK (Kaggle CPU)")
print(f"- Python: {sys.version.split()[0]}")
print(f"- Numpy:  {np.__version__}")
print(f"- Pandas: {pd.__version__}")
if torch is not None:
    print(f"- Torch:  {torch.__version__} | CUDA available: {torch.cuda.is_available()}")
else:
    print("- Torch:  not available")

print("\nRUN OK")
print(f"- RUN_DIR:  {RUN_DIR}")
print(f"- CFG_HASH: {CFG_HASH}")
print(f"- manifest: {manifest_path}")
if lc_report:
    print(f"- lc head NA report: {lc_report_path}")

print("\nDATA OK")
print(f"- train_log objects: {len(df_train_log):,} | pos(TDE)={pos:,} | neg={neg:,} | pos%={(pos/max(tot,1))*100:.2f}%")
print(f"- test_log objects:  {len(df_test_log):,}")
print(f"- sample_submission rows: {len(df_sub):,}")
print(f"- splits: 20 folders | lightcurve files: 40 CSV (train+test)")
print(f"- split stats saved: {split_stats_path}")

print("\nMETA SNAPSHOT (train vs test)")
print(f"- EBV quantiles train [1%,50%,99%]: {_q(df_train_log['EBV'].values)} | test: {_q(df_test_log['EBV'].values)}")
print(f"- Z   quantiles train [1%,50%,99%]: {_q(df_train_log['Z'].values)}   | test: {_q(df_test_log['Z'].values)}")
print(f"- has_zerr test %: {float(df_test_log['has_zerr'].mean()*100):.2f}% (train selalu 0)")

# ----------------------------
# 12) Save config snapshot
# ----------------------------
snap = {
    "SEED": SEED,
    "CFG_HASH": CFG_HASH,
    "RUN_TAG": RUN_TAG,
    "DATA_ROOT": str(DATA_ROOT),
    "RUN_DIR": str(RUN_DIR),
    "THREADS": {k: os.environ.get(k, "") for k in [
        "OMP_NUM_THREADS","OPENBLAS_NUM_THREADS","MKL_NUM_THREADS","NUMEXPR_NUM_THREADS",
        "TF_NUM_INTRAOP_THREADS","TF_NUM_INTEROP_THREADS"
    ]},
    "CFG": CFG,
    "FILES": {
        "sample_submission": str(PATHS["SAMPLE_SUB"]),
        "train_log": str(PATHS["TRAIN_LOG"]),
        "test_log": str(PATHS["TEST_LOG"]),
        "split_manifest": str(manifest_path),
        "lc_head_na_report": str(lc_report_path) if lc_report else "",
    }
}
cfg_path_json = RUN_DIR / "config_stage0.json"
with open(cfg_path_json, "w", encoding="utf-8") as f:
    json.dump(snap, f, indent=2)
print(f"\nSaved config: {cfg_path_json}")

# ----------------------------
# 13) Export to globals
# ----------------------------
globals().update({
    "SEED": SEED,
    "THREADS": THREADS,
    "CFG": CFG,
    "CFG_HASH": CFG_HASH,
    "PATHS": PATHS,
    "DATA_ROOT": DATA_ROOT,
    "WORKDIR": WORKDIR,
    "RUN_DIR": RUN_DIR,
    "ART_DIR": ART_DIR,
    "CACHE_DIR": CACHE_DIR,
    "EMB_DIR": EMB_DIR,
    "FEAT_DIR": FEAT_DIR,
    "OOF_DIR": OOF_DIR,
    "SUB_DIR": SUB_DIR,
    "LOG_DIR": LOG_DIR,
    "df_sub": df_sub,
    "df_train_log": df_train_log,
    "df_test_log": df_test_log,
    "df_split_manifest": df_split_manifest,
    "df_split_stats": df_split_stats,
})

gc.collect()

[WARN] train_full_lightcurves.csv col=Flux: 1/2000 (0.05%) non-numeric/NaN in head. Examples: ['nan']
[WARN] train_full_lightcurves.csv col=Flux: 4/2000 (0.20%) non-numeric/NaN in head. Examples: ['nan', 'nan', 'nan', 'nan']
[WARN] test_full_lightcurves.csv col=Flux: 2/2000 (0.10%) non-numeric/NaN in head. Examples: ['nan', 'nan']
[WARN] test_full_lightcurves.csv col=Flux: 1/2000 (0.05%) non-numeric/NaN in head. Examples: ['nan']
[WARN] train_full_lightcurves.csv col=Flux: 190/2000 (9.50%) non-numeric/NaN in head. Examples: ['nan', 'nan', 'nan', 'nan', 'nan']
[WARN] train_full_lightcurves.csv col=Flux: 1/2000 (0.05%) non-numeric/NaN in head. Examples: ['nan']
[WARN] test_full_lightcurves.csv col=Flux: 1/2000 (0.05%) non-numeric/NaN in head. Examples: ['nan']
[WARN] train_full_lightcurves.csv col=Flux: 2/2000 (0.10%) non-numeric/NaN in head. Examples: ['nan', 'nan']
[WARN] train_full_lightcurves.csv col=Flux: 2/2000 (0.10%) non-numeric/NaN in head. Examples: ['nan', 'nan']
[WARN] train_

186

# Verify Dataset Paths & Split Discovery

In [3]:
# ============================================================
# STAGE 1 — Split Routing + Lightcurve Micro-Profiling (ONE CELL, CPU-SAFE) — REVISI FULL v2
# Fokus:
# - Pakai globals STAGE 0: PATHS, df_train_log, df_test_log, (opsional) CFG, LOG_DIR, RUN_DIR
# - TIDAK load full lightcurves: hanya header + sample kecil + limited chunk scan
# - Validasi:
#   * split folders + 40 CSV ada
#   * schema kolom minimal sesuai
#   * filter values wajar (ugrizy), toleran NaN/blank
#   * numeric sample check:
#       - Time (MJD), Flux_err: relatif ketat (kalau parah -> error)
#       - Flux: toleran (boleh NaN lebih banyak -> warn)
#   * crosscheck object_id (sample dari logs) benar-benar muncul di CSV (limited scan)
# - Output penting utk F1 tinggi (dipakai stage berikutnya):
#   * split_routing.csv: daftar 40 file + count objects log
#   * lc_sample_stats.csv: missing rate/head stats per file (buat tuning cleaning/clipping)
# ============================================================

import re, gc, json
from pathlib import Path
import numpy as np
import pandas as pd

# ----------------------------
# 0) Require STAGE 0 globals
# ----------------------------
need0 = ["PATHS", "df_train_log", "df_test_log"]
for need in need0:
    if need not in globals():
        raise RuntimeError(f"Missing `{need}`. Jalankan STAGE 0 dulu.")

DATA_ROOT = Path(PATHS["DATA_ROOT"])
SPLIT_DIRS = {p.name: p for p in PATHS["SPLITS"]}  # split_01..split_20 -> Path
VALID_SPLITS = {f"split_{i:02d}" for i in range(1, 21)}

# Optional dirs from stage 0
RUN_DIR = Path(globals().get("RUN_DIR", "/kaggle/working/mallorn_run"))
LOG_DIR = Path(globals().get("LOG_DIR", RUN_DIR / "logs"))
LOG_DIR.mkdir(parents=True, exist_ok=True)

# Optional CFG
CFG_LOCAL = globals().get("CFG", {})
SEED = int(globals().get("SEED", 2025))

# ----------------------------
# 1) Safe read config (konsisten dengan STAGE 0)
# ----------------------------
SAFE_NA_VALUES = ["", " ", "NA", "NaN", "nan", "NULL", "null", "None", "none"]
SAFE_READ_KW = dict(low_memory=False, na_values=SAFE_NA_VALUES, keep_default_na=True)

# sampling knobs (CPU-safe)
HEAD_ROWS = int(CFG_LOCAL.get("LC_HEAD_ROWS", 2000))          # untuk numeric/head checks
FILTER_ROWS = int(CFG_LOCAL.get("LC_HEAD_ROWS", 2000))        # reuse
SAMPLE_ID_PER_SPLIT = int(CFG_LOCAL.get("SAMPLE_ID_PER_SPLIT", 5))
CHUNK_ROWS = int(CFG_LOCAL.get("CHUNK_ROWS", 200_000))
MAX_CHUNKS_PER_FILE = int(CFG_LOCAL.get("MAX_CHUNKS_PER_FILE", 6))

# ----------------------------
# 2) Helpers
# ----------------------------
REQ_LC_COLS = {"object_id", "Time (MJD)", "Flux", "Flux_err", "Filter"}
ALLOWED_FILTERS = {"u", "g", "r", "i", "z", "y"}

def _norm_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [c.strip() for c in df.columns]
    return df

def normalize_split_name(x) -> str:
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return ""
    s = str(x).strip()
    if not s:
        return ""
    s2 = s.lower().replace("-", "_").replace(" ", "_")
    if s2.isdigit():
        return f"split_{int(s2):02d}"
    m = re.fullmatch(r"split_(\d{1,2})", s2)
    if m:
        return f"split_{int(m.group(1)):02d}"
    m = re.fullmatch(r"split(\d{1,2})", s2)
    if m:
        return f"split_{int(m.group(1)):02d}"
    return s2

def sizeof_mb(p: Path) -> float:
    try:
        return p.stat().st_size / (1024**2)
    except Exception:
        return float("nan")

def read_header_cols(p: Path):
    df0 = pd.read_csv(p, nrows=0, **SAFE_READ_KW)
    return [c.strip() for c in df0.columns]

def sample_filter_values(p: Path, nrows: int = 500):
    df = pd.read_csv(p, usecols=["Filter"], nrows=nrows, **SAFE_READ_KW)
    v = df["Filter"].astype("string").str.strip().str.lower()
    v = v[~v.isna()]
    return sorted(set(v.tolist()))

def _numeric_sample_report(p: Path, nrows: int):
    """
    Baca sample kecil untuk numeric coercion.
    - Time & Flux_err: kalau terlalu banyak NaN -> error (indikasi parsing rusak)
    - Flux: toleran, hanya dicatat.
    """
    usecols = ["Time (MJD)", "Flux", "Flux_err"]
    dfh = pd.read_csv(p, usecols=usecols, nrows=nrows, **SAFE_READ_KW)
    dfh = _norm_cols(dfh)

    out = {"n_sample": int(len(dfh))}
    if len(dfh) == 0:
        out.update({"time_na_frac": 1.0, "flux_na_frac": 1.0, "ferr_na_frac": 1.0})
        return out

    t = pd.to_numeric(dfh["Time (MJD)"], errors="coerce")
    f = pd.to_numeric(dfh["Flux"], errors="coerce")
    e = pd.to_numeric(dfh["Flux_err"], errors="coerce")

    out["time_na_frac"] = float(t.isna().mean())
    out["flux_na_frac"] = float(f.isna().mean())
    out["ferr_na_frac"] = float(e.isna().mean())

    # quick stats (ignore NaN)
    if (~t.isna()).any():
        out["time_min"] = float(t.min())
        out["time_max"] = float(t.max())
    if (~f.isna()).any():
        out["flux_neg_frac"] = float((f.dropna() < 0).mean())
    if (~e.isna()).any():
        out["ferr_min"] = float(e.dropna().min())
        out["ferr_p50"] = float(e.dropna().median())

    return out

def _sample_id_presence(csv_path: Path, want_ids: set, chunk_rows: int, max_chunks: int):
    """
    Limited scan untuk memastikan beberapa object_id dari log benar-benar ada di file.
    Return: found_count, missing_ids(set)
    """
    if not want_ids:
        return 0, set()
    remaining = set(want_ids)
    found = set()
    read_ok = False

    for i, chunk in enumerate(pd.read_csv(csv_path, usecols=["object_id"], chunksize=chunk_rows, **SAFE_READ_KW)):
        read_ok = True
        ids = set(chunk["object_id"].astype("string"))
        hit = remaining & ids
        if hit:
            found |= hit
            remaining -= hit
        if not remaining:
            break
        if i + 1 >= max_chunks:
            break

    if not read_ok:
        raise ValueError(f"[LC READ] Could not read {csv_path}")
    return len(found), remaining

# ----------------------------
# 3) Normalize split col in logs (idempotent, see STAGE 0)
# ----------------------------
for df, name in [(df_train_log, "train_log"), (df_test_log, "test_log")]:
    if "split" not in df.columns:
        raise ValueError(f"{name} missing 'split' column.")
    df["split"] = df["split"].map(normalize_split_name)

bad_train_split = sorted(set(df_train_log["split"].unique()) - VALID_SPLITS)
bad_test_split  = sorted(set(df_test_log["split"].unique())  - VALID_SPLITS)
if bad_train_split:
    raise ValueError(f"train_log has invalid split values (examples): {bad_train_split[:10]}")
if bad_test_split:
    raise ValueError(f"test_log has invalid split values (examples): {bad_test_split[:10]}")

# ----------------------------
# 4) Verify disk splits set
# ----------------------------
disk_splits = set(SPLIT_DIRS.keys())
if disk_splits != VALID_SPLITS:
    missing = sorted(list(VALID_SPLITS - disk_splits))
    extra   = sorted(list(disk_splits - VALID_SPLITS))
    msg = []
    if missing:
        msg.append(f"Missing split folders: {missing[:10]}")
    if extra:
        msg.append(f"Unexpected split folders: {extra[:10]}")
    raise RuntimeError("Split folder set mismatch.\n" + "\n".join(msg))

# ----------------------------
# 5) Build routing manifest (40 file) + validate exists
# ----------------------------
train_counts = df_train_log["split"].value_counts().to_dict()
test_counts  = df_test_log["split"].value_counts().to_dict()

routing_rows = []
missing_files = []

for sp in sorted(VALID_SPLITS):
    sd = SPLIT_DIRS[sp]
    tr = sd / "train_full_lightcurves.csv"
    te = sd / "test_full_lightcurves.csv"

    if not tr.exists(): missing_files.append(str(tr))
    if not te.exists(): missing_files.append(str(te))

    routing_rows.append({
        "split": sp,
        "train_csv": str(tr),
        "test_csv": str(te),
        "train_mb": sizeof_mb(tr),
        "test_mb": sizeof_mb(te),
        "n_train_objects_log": int(train_counts.get(sp, 0)),
        "n_test_objects_log": int(test_counts.get(sp, 0)),
    })

if missing_files:
    raise FileNotFoundError("Some lightcurve files missing (showing up to 10):\n" + "\n".join(missing_files[:10]))

df_routing = pd.DataFrame(routing_rows)
routing_path = LOG_DIR / "split_routing.csv"
df_routing.to_csv(routing_path, index=False)

# ----------------------------
# 6) Header-only schema check (strict for required cols)
# ----------------------------
schema_issues = []
for sp in sorted(VALID_SPLITS):
    sd = SPLIT_DIRS[sp]
    tr = sd / "train_full_lightcurves.csv"
    te = sd / "test_full_lightcurves.csv"

    cols_tr = read_header_cols(tr)
    cols_te = read_header_cols(te)

    miss_tr = sorted(list(REQ_LC_COLS - set(cols_tr)))
    miss_te = sorted(list(REQ_LC_COLS - set(cols_te)))
    if miss_tr or miss_te:
        schema_issues.append((sp, miss_tr, miss_te, cols_tr, cols_te))

if schema_issues:
    sp, miss_tr, miss_te, cols_tr, cols_te = schema_issues[0]
    raise ValueError(
        "Lightcurve column mismatch detected.\n"
        f"Example split: {sp}\n"
        f"Missing in train_full_lightcurves.csv: {miss_tr}\n"
        f"Missing in test_full_lightcurves.csv : {miss_te}\n"
        f"Train columns: {cols_tr}\n"
        f"Test columns : {cols_te}\n"
    )

# ----------------------------
# 7) Tiny filter sanity + numeric sample profiling + ID crosscheck
# ----------------------------
stats_rows = []
warn_count = 0

# numeric policy
# - Time & Flux_err: fail kalau >2% NaN di sample (indikasi parsing kacau)
MAX_TIME_NA_FRAC = 0.02
MAX_FERR_NA_FRAC = 0.02
# - Flux: TOLERANT (hanya dicatat)

rng = np.random.default_rng(SEED)

for sp in sorted(VALID_SPLITS):
    sd = SPLIT_DIRS[sp]
    for kind in ["train", "test"]:
        p = sd / f"{kind}_full_lightcurves.csv"

        # filter values from small sample
        vals = sample_filter_values(p, nrows=min(500, FILTER_ROWS))
        bad = sorted([v for v in vals if (v not in ALLOWED_FILTERS)])
        # tolerate "nan" already removed; only bad true strings remain
        if bad:
            raise ValueError(
                f"Unexpected Filter values detected: split={sp} kind={kind} bad={bad[:10]} all_sampled={vals[:20]}"
            )

        # numeric sample report
        nr = _numeric_sample_report(p, nrows=HEAD_ROWS)
        # hard fail if Time/Flux_err parsing too broken
        if nr.get("time_na_frac", 0.0) > MAX_TIME_NA_FRAC:
            raise ValueError(f"[LC NUM] Time(MJD) NaN too high in sample: split={sp} kind={kind} frac={nr['time_na_frac']:.3f}")
        if nr.get("ferr_na_frac", 0.0) > MAX_FERR_NA_FRAC:
            raise ValueError(f"[LC NUM] Flux_err NaN too high in sample: split={sp} kind={kind} frac={nr['ferr_na_frac']:.3f}")
        # Flux: warn only
        if nr.get("flux_na_frac", 0.0) > 0:
            warn_count += 1

        # sample ID crosscheck (limited scan)
        if kind == "train":
            ids = df_train_log.loc[df_train_log["split"] == sp, "object_id"].astype("string")
        else:
            ids = df_test_log.loc[df_test_log["split"] == sp, "object_id"].astype("string")

        k = min(SAMPLE_ID_PER_SPLIT, len(ids))
        want = set(ids.sample(n=k, random_state=SEED).tolist()) if k > 0 else set()
        found_n, missing_ids = _sample_id_presence(p, want, CHUNK_ROWS, MAX_CHUNKS_PER_FILE)

        # jika miss banyak sekali untuk split ini, itu red-flag
        miss_frac = (len(missing_ids) / max(len(want), 1)) if want else 0.0
        if want and miss_frac >= 0.8:
            raise ValueError(
                f"[LC ID] Severe mismatch: split={sp} kind={kind} missing {len(missing_ids)}/{len(want)} "
                f"within limited scan. Example missing: {list(missing_ids)[:3]}"
            )
        if want and missing_ids:
            # warn kalau ada yang miss tapi tidak parah (scan cap bisa jadi penyebab)
            print(f"[WARN] ID crosscheck limited-scan miss: split={sp} kind={kind} missing {len(missing_ids)}/{len(want)}")

        stats_rows.append({
            "split": sp,
            "kind": kind,
            "file": str(p),
            "file_mb": sizeof_mb(p),
            "n_sample": nr.get("n_sample", 0),
            "time_na_frac": nr.get("time_na_frac", np.nan),
            "flux_na_frac": nr.get("flux_na_frac", np.nan),
            "ferr_na_frac": nr.get("ferr_na_frac", np.nan),
            "time_min": nr.get("time_min", np.nan),
            "time_max": nr.get("time_max", np.nan),
            "flux_neg_frac": nr.get("flux_neg_frac", np.nan),
            "ferr_min": nr.get("ferr_min", np.nan),
            "ferr_p50": nr.get("ferr_p50", np.nan),
            "filter_sample": ",".join(vals[:10]),
            "id_check_k": int(len(want)),
            "id_found": int(found_n),
            "id_missing": int(len(missing_ids)),
        })

df_lc_stats = pd.DataFrame(stats_rows)
lc_stats_path = LOG_DIR / "lc_sample_stats.csv"
df_lc_stats.to_csv(lc_stats_path, index=False)

# ----------------------------
# 8) Summary prints
# ----------------------------
print("STAGE 1 OK — SPLIT ROUTING READY")
print(f"- DATA_ROOT: {DATA_ROOT}")
print(f"- splits on disk: {len(VALID_SPLITS)} (split_01..split_20)")
print(f"- routing saved: {routing_path}")
print(f"- lc sample stats saved: {lc_stats_path}")

# object counts ringkas
print("\nOBJECT COUNTS (from logs)")
for sp in sorted(VALID_SPLITS):
    print(f"- {sp}: train={int(train_counts.get(sp,0)):,} | test={int(test_counts.get(sp,0)):,}")

# highlight worst flux_na_frac (optional info)
worst = (
    df_lc_stats.sort_values("flux_na_frac", ascending=False)
    .head(6)[["split","kind","flux_na_frac","time_na_frac","ferr_na_frac","file_mb"]]
)
print("\nWORST SAMPLE (highest flux_na_frac in head sample)")
print(worst.to_string(index=False))

# ----------------------------
# 9) Export to globals
# ----------------------------
SPLIT_LIST = [f"split_{i:02d}" for i in range(1, 21)]
globals().update({
    "DATA_ROOT": DATA_ROOT,
    "SPLIT_DIRS": SPLIT_DIRS,
    "SPLIT_LIST": SPLIT_LIST,
    "df_split_routing": df_routing,
    "df_lc_sample_stats": df_lc_stats,
})

gc.collect()
print("\nStage 1 complete: splits verified + routing/stats exported.")


STAGE 1 OK — SPLIT ROUTING READY
- DATA_ROOT: /kaggle/input/mallorn-dataset
- splits on disk: 20 (split_01..split_20)
- routing saved: /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/logs/split_routing.csv
- lc sample stats saved: /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/logs/lc_sample_stats.csv

OBJECT COUNTS (from logs)
- split_01: train=155 | test=364
- split_02: train=170 | test=414
- split_03: train=138 | test=338
- split_04: train=145 | test=332
- split_05: train=165 | test=375
- split_06: train=155 | test=374
- split_07: train=165 | test=398
- split_08: train=162 | test=387
- split_09: train=128 | test=289
- split_10: train=144 | test=331
- split_11: train=146 | test=325
- split_12: train=155 | test=353
- split_13: train=143 | test=379
- split_14: train=154 | test=351
- split_15: train=158 | test=342
- split_16: train=155 | test=354
- split_17: train=153 | test=351
- split_18: train=152 | test=345
- split_19: train=147 | test=375
- split_20: train=15

# Load and Validate Train/Test Logs

In [4]:
# ============================================================
# STAGE 2 — Clean Meta Logs + CV Fold Assignment (ONE CELL, CPU-SAFE) — REVISI FULL v2
# Output:
#   * df_train_meta, df_test_meta (index=object_id)
#   * id2split_train, id2split_test
#   * artifacts/train_meta.(parquet|csv), test_meta.(parquet|csv)
#   * artifacts/split_stats.csv
#   * artifacts/train_folds.csv
#   * artifacts/id2split_train.json, artifacts/id2split_test.json
# Notes:
# - Tidak load full lightcurves.
# - Clipping/transform pakai TRAIN statistics saja (anti leakage).
# - Fold strategy: stratified within each split -> fold global stabil.
# ============================================================

import re, gc, json, warnings
from pathlib import Path

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# ----------------------------
# 0) Require STAGE 0/1 globals
# ----------------------------
for need in ["PATHS", "ART_DIR", "SPLIT_DIRS", "CFG", "SEED"]:
    if need not in globals():
        raise RuntimeError(f"Missing `{need}`. Jalankan STAGE 0 & STAGE 1 dulu.")

TRAIN_LOG_PATH = Path(PATHS["TRAIN_LOG"])
TEST_LOG_PATH  = Path(PATHS["TEST_LOG"])
ART_DIR = Path(ART_DIR)
ART_DIR.mkdir(parents=True, exist_ok=True)

SEED = int(SEED)
N_FOLDS = int(CFG.get("N_FOLDS", 5))

VALID_SPLITS = {f"split_{i:02d}" for i in range(1, 21)}
disk_splits = set(SPLIT_DIRS.keys())
if disk_splits != VALID_SPLITS:
    raise RuntimeError("SPLIT_DIRS tidak lengkap / mismatch. Jalankan ulang STAGE 1.")

# konsisten dengan STAGE 0/1
SAFE_NA_VALUES = ["", " ", "NA", "NaN", "nan", "NULL", "null", "None", "none"]
SAFE_READ_KW = dict(low_memory=False, na_values=SAFE_NA_VALUES, keep_default_na=True)

# ----------------------------
# 1) Helpers
# ----------------------------
def normalize_split_name(x) -> str:
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return ""
    s = str(x).strip()
    if not s:
        return ""
    s2 = s.lower().replace("-", "_").replace(" ", "_")
    if s2.isdigit():
        return f"split_{int(s2):02d}"
    m = re.fullmatch(r"split_(\d{1,2})", s2)
    if m:
        return f"split_{int(m.group(1)):02d}"
    m = re.fullmatch(r"split(\d{1,2})", s2)
    if m:
        return f"split_{int(m.group(1)):02d}"
    return s2

def _norm_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [c.strip() for c in df.columns]
    return df

def _coerce_float32(df: pd.DataFrame, col: str):
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce").astype("float32")

def _safe_clip(series: pd.Series, lo: float, hi: float) -> pd.Series:
    # series float32 -> clip float32
    return series.clip(lower=np.float32(lo), upper=np.float32(hi)).astype("float32")

# ----------------------------
# 2) Load logs (fresh read; stabil)
# ----------------------------
dtype_log = {"object_id": "string", "split": "string"}
df_train = pd.read_csv(TRAIN_LOG_PATH, dtype=dtype_log, **SAFE_READ_KW)
df_test  = pd.read_csv(TEST_LOG_PATH,  dtype=dtype_log, **SAFE_READ_KW)

df_train = _norm_cols(df_train)
df_test  = _norm_cols(df_test)

# ----------------------------
# 3) Required columns check
# ----------------------------
req_common = {"object_id", "split", "EBV", "Z"}
req_train  = req_common | {"target"}
req_test   = req_common

miss_train = sorted(list(req_train - set(df_train.columns)))
miss_test  = sorted(list(req_test  - set(df_test.columns)))
if miss_train:
    raise ValueError(f"train_log.csv missing required columns: {miss_train} | found={list(df_train.columns)}")
if miss_test:
    raise ValueError(f"test_log.csv missing required columns: {miss_test} | found={list(df_test.columns)}")

# ----------------------------
# 4) Basic cleaning (id + split canonical)
# ----------------------------
df_train["object_id"] = df_train["object_id"].astype("string").str.strip()
df_test["object_id"]  = df_test["object_id"].astype("string").str.strip()

df_train["split"] = df_train["split"].astype("string").map(normalize_split_name)
df_test["split"]  = df_test["split"].astype("string").map(normalize_split_name)

bad_train_split = sorted(set(df_train["split"].unique()) - VALID_SPLITS)
bad_test_split  = sorted(set(df_test["split"].unique())  - VALID_SPLITS)
if bad_train_split:
    raise ValueError(f"train_log has invalid split values (examples): {bad_train_split[:10]}")
if bad_test_split:
    raise ValueError(f"test_log has invalid split values (examples): {bad_test_split[:10]}")

bad_train_disk = sorted([s for s in set(df_train["split"].unique()) if s not in disk_splits])
bad_test_disk  = sorted([s for s in set(df_test["split"].unique())  if s not in disk_splits])
if bad_train_disk:
    raise FileNotFoundError(f"train_log references unknown split(s) not on disk: {bad_train_disk[:10]}")
if bad_test_disk:
    raise FileNotFoundError(f"test_log references unknown split(s) not on disk: {bad_test_disk[:10]}")

# ----------------------------
# 5) Ensure Z_err exists + numeric coercion (float32)
# ----------------------------
if "Z_err" not in df_train.columns:
    df_train["Z_err"] = np.nan
if "Z_err" not in df_test.columns:
    df_test["Z_err"] = np.nan

for c in ["EBV", "Z", "Z_err"]:
    _coerce_float32(df_train, c)
    _coerce_float32(df_test, c)

# ----------------------------
# 6) Duplicates / overlap checks (hard fail)
# ----------------------------
if df_train["object_id"].duplicated().any():
    ex = df_train.loc[df_train["object_id"].duplicated(), "object_id"].head(5).tolist()
    raise ValueError(f"Duplicated object_id in train_log (examples): {ex}")
if df_test["object_id"].duplicated().any():
    ex = df_test.loc[df_test["object_id"].duplicated(), "object_id"].head(5).tolist()
    raise ValueError(f"Duplicated object_id in test_log (examples): {ex}")

overlap = set(df_train["object_id"].tolist()) & set(df_test["object_id"].tolist())
if overlap:
    raise ValueError(f"object_id overlap between train_log and test_log (examples): {list(overlap)[:5]}")

# ----------------------------
# 7) Target validation (train)
# ----------------------------
df_train["target"] = pd.to_numeric(df_train["target"], errors="coerce")
if df_train["target"].isna().any():
    raise ValueError(f"train_log target has NaN after coercion: {int(df_train['target'].isna().sum())} rows.")
uniq_t = set(pd.unique(df_train["target"]).tolist())
if not uniq_t.issubset({0, 1}):
    raise ValueError(f"train_log target must be binary 0/1. Found: {sorted(list(uniq_t))}")
df_train["target"] = df_train["target"].astype("int8")

# ----------------------------
# 8) Missing flags + robust fills
# ----------------------------
for df in [df_train, df_test]:
    df["EBV_missing"] = df["EBV"].isna().astype("int8")
    df["Z_missing"]   = df["Z"].isna().astype("int8")
    df["Zerr_missing"]= df["Z_err"].isna().astype("int8")

# EBV: fill NaN -> 0.0 (fisik masuk akal sebagai default aman)
df_train["EBV"] = df_train["EBV"].fillna(np.float32(0.0)).astype("float32")
df_test["EBV"]  = df_test["EBV"].fillna(np.float32(0.0)).astype("float32")

# Z: fill NaN -> median per split (fallback global median)
def _fill_z_by_split(df: pd.DataFrame) -> pd.Series:
    z = df["Z"]
    if not z.isna().any():
        return z.astype("float32")
    z2 = z.copy()
    z2 = z2.fillna(df.groupby("split")["Z"].transform("median"))
    gmed = float(np.nanmedian(z.values)) if np.isfinite(z.values).any() else 0.0
    z2 = z2.fillna(np.float32(gmed))
    return z2.astype("float32")

df_train["Z"] = _fill_z_by_split(df_train)
df_test["Z"]  = _fill_z_by_split(df_test)

# Z_err: train biasanya blank -> 0.0; test ada
df_train["Z_err"] = df_train["Z_err"].fillna(np.float32(0.0)).astype("float32")
df_test["Z_err"]  = df_test["Z_err"].fillna(np.float32(0.0)).astype("float32")

# flags yang sering dipakai model
df_train["has_zerr"] = np.int8(0)   # train: spec-z
df_test["has_zerr"]  = np.int8(1)   # test : photo-z error tersedia
df_train["is_photoz"] = np.int8(0)
df_test["is_photoz"]  = np.int8(1)

# ----------------------------
# 9) Train-based clipping (anti outlier) + derived meta features
# ----------------------------
# clip pakai TRAIN saja -> apply ke train & test
EBV_LO, EBV_HI = np.quantile(df_train["EBV"].values.astype(float), [0.001, 0.999])
Z_LO,   Z_HI   = np.quantile(df_train["Z"].values.astype(float),   [0.001, 0.999])

df_train["EBV_clip"] = _safe_clip(df_train["EBV"], EBV_LO, EBV_HI)
df_test["EBV_clip"]  = _safe_clip(df_test["EBV"],  EBV_LO, EBV_HI)

df_train["Z_clip"] = _safe_clip(df_train["Z"], Z_LO, Z_HI)
df_test["Z_clip"]  = _safe_clip(df_test["Z"],  Z_LO, Z_HI)

# log transform (lebih stabil untuk model tabular)
df_train["log1pZ"] = np.log1p(df_train["Z_clip"].astype("float32")).astype("float32")
df_test["log1pZ"]  = np.log1p(df_test["Z_clip"].astype("float32")).astype("float32")

# zerr relative (test only meaningful, tapi aman diset untuk train=0)
eps = np.float32(1e-6)
df_train["zerr_rel"] = (df_train["Z_err"] / (df_train["Z_clip"] + eps)).astype("float32")
df_test["zerr_rel"]  = (df_test["Z_err"]  / (df_test["Z_clip"]  + eps)).astype("float32")

# split_id (numeric encoding stabil)
split2id = {f"split_{i:02d}": i for i in range(1, 21)}
df_train["split_id"] = df_train["split"].map(split2id).astype("int16")
df_test["split_id"]  = df_test["split"].map(split2id).astype("int16")

# ----------------------------
# 10) CV fold assignment: stratified within each split (stabil untuk OOF + threshold)
# ----------------------------
# Strategi:
# - Untuk tiap split: pisahkan pos/neg, shuffle, lalu bagikan merata ke fold 0..K-1
# - Hasil: setiap fold mendapatkan “campuran” dari semua split dan proporsi kelas lebih stabil.
df_train["fold"] = -1

rng = np.random.default_rng(SEED)
for sp in sorted(VALID_SPLITS):
    idx = df_train.index[df_train["split"] == sp].to_numpy()
    if len(idx) == 0:
        continue

    y = df_train.loc[idx, "target"].to_numpy()
    pos_idx = idx[y == 1]
    neg_idx = idx[y == 0]

    rng.shuffle(pos_idx)
    rng.shuffle(neg_idx)

    # round-robin assignment
    for j, ii in enumerate(pos_idx):
        df_train.at[ii, "fold"] = int(j % N_FOLDS)
    for j, ii in enumerate(neg_idx):
        # offset agar tidak semua kelas jatuh di fold yang sama di split kecil
        df_train.at[ii, "fold"] = int((j + 1) % N_FOLDS)

# hard check
if (df_train["fold"] < 0).any():
    n_bad = int((df_train["fold"] < 0).sum())
    raise RuntimeError(f"Fold assignment gagal: ada {n_bad} baris fold=-1")

# ----------------------------
# 11) Build meta tables (index=object_id)
# ----------------------------
# Simpan kolom yang benar-benar akan dipakai berikutnya.
keep_train = [
    "object_id","split","split_id",
    "EBV","EBV_clip","Z","Z_clip","log1pZ","Z_err","zerr_rel",
    "EBV_missing","Z_missing","Zerr_missing","has_zerr","is_photoz",
    "fold","target"
]
keep_test = [
    "object_id","split","split_id",
    "EBV","EBV_clip","Z","Z_clip","log1pZ","Z_err","zerr_rel",
    "EBV_missing","Z_missing","Zerr_missing","has_zerr","is_photoz"
]

# optional: SpecType untuk analisis (train only)
if "SpecType" in df_train.columns:
    keep_train.append("SpecType")

df_train_meta = df_train[keep_train].copy()
df_test_meta  = df_test[keep_test].copy()

df_train_meta = df_train_meta.set_index("object_id", drop=True).sort_index()
df_test_meta  = df_test_meta.set_index("object_id", drop=True).sort_index()

id2split_train = df_train_meta["split"].to_dict()
id2split_test  = df_test_meta["split"].to_dict()

# ----------------------------
# 12) Save artifacts (parquet preferred, csv fallback)
# ----------------------------
train_pq = ART_DIR / "train_meta.parquet"
test_pq  = ART_DIR / "test_meta.parquet"
train_csv = ART_DIR / "train_meta.csv"
test_csv  = ART_DIR / "test_meta.csv"

saved_train = saved_test = None
try:
    df_train_meta.to_parquet(train_pq, index=True)
    df_test_meta.to_parquet(test_pq, index=True)
    saved_train, saved_test = str(train_pq), str(test_pq)
except Exception:
    df_train_meta.to_csv(train_csv, index=True)
    df_test_meta.to_csv(test_csv, index=True)
    saved_train, saved_test = str(train_csv), str(test_csv)

# split stats (train/test counts + pos rate)
split_stats = pd.DataFrame({
    "train_objects": df_train_meta["split"].value_counts().reindex(sorted(VALID_SPLITS)).fillna(0).astype(int),
    "test_objects":  df_test_meta["split"].value_counts().reindex(sorted(VALID_SPLITS)).fillna(0).astype(int),
})
split_stats.index.name = "split"

pos_by_split = df_train_meta.groupby("split")["target"].sum().reindex(sorted(VALID_SPLITS)).fillna(0).astype(int)
split_stats["train_pos"] = pos_by_split.values
split_stats["train_pos_rate"] = (split_stats["train_pos"] / split_stats["train_objects"].clip(lower=1)).astype("float32")

split_stats_path = ART_DIR / "split_stats.csv"
split_stats.to_csv(split_stats_path)

# folds report
fold_path = ART_DIR / "train_folds.csv"
df_train_meta.reset_index()[["object_id","split","fold","target"]].to_csv(fold_path, index=False)

# id2split json
with open(ART_DIR / "id2split_train.json", "w", encoding="utf-8") as f:
    json.dump(id2split_train, f)
with open(ART_DIR / "id2split_test.json", "w", encoding="utf-8") as f:
    json.dump(id2split_test, f)

# ----------------------------
# 13) Print summary
# ----------------------------
pos = int((df_train_meta["target"] == 1).sum())
neg = int((df_train_meta["target"] == 0).sum())
tot = int(len(df_train_meta))

# class imbalance helper (buat model stage lanjut)
pos_rate = pos / max(tot, 1)
scale_pos_weight = float(neg / max(pos, 1))

print("STAGE 2 OK — META READY (clean + folds)")
print(f"- train objects: {tot:,} | pos(TDE)={pos:,} | neg={neg:,} | pos%={pos_rate*100:.3f}%")
print(f"- test objects : {len(df_test_meta):,}")
print(f"- saved train  : {saved_train}")
print(f"- saved test   : {saved_test}")
print(f"- saved stats  : {split_stats_path}")
print(f"- saved folds  : {fold_path}")
print(f"- scale_pos_weight (neg/pos): {scale_pos_weight:.3f}")

print("\nTRAIN-BASED CLIP RANGES")
print(f"- EBV clip: [{float(EBV_LO):.6f}, {float(EBV_HI):.6f}]")
print(f"- Z   clip: [{float(Z_LO):.6f}, {float(Z_HI):.6f}]")

# fold sanity
fold_tab = df_train_meta.reset_index().groupby("fold")["target"].agg(["count","sum"]).rename(columns={"sum":"pos"})
fold_tab["pos_rate"] = fold_tab["pos"] / fold_tab["count"].clip(lower=1)
print("\nFOLD BALANCE (count/pos/pos_rate)")
print(fold_tab.to_string())

# ----------------------------
# 14) Export globals
# ----------------------------
globals().update({
    "df_train_meta": df_train_meta,
    "df_test_meta": df_test_meta,
    "id2split_train": id2split_train,
    "id2split_test": id2split_test,
    "split_stats": split_stats,
    "split2id": split2id,
    "scale_pos_weight": scale_pos_weight,
})

gc.collect()


STAGE 2 OK — META READY (clean + folds)
- train objects: 3,043 | pos(TDE)=148 | neg=2,895 | pos%=4.864%
- test objects : 7,135
- saved train  : /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/train_meta.parquet
- saved test   : /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/test_meta.parquet
- saved stats  : /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/split_stats.csv
- saved folds  : /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/train_folds.csv
- scale_pos_weight (neg/pos): 19.561

TRAIN-BASED CLIP RANGES
- EBV clip: [0.005042, 0.581790]
- Z   clip: [0.044923, 4.032352]

FOLD BALANCE (count/pos/pos_rate)
      count  pos  pos_rate
fold                      
0       607   38  0.062603
1       619   33  0.053312
2       612   29  0.047386
3       606   26  0.042904
4       599   22  0.036728


64

# Lightcurve Loading Strategy

In [5]:
# ============================================================
# STAGE 3 — Robust Lightcurve Loader Utilities (ONE CELL, Kaggle CPU-SAFE) — REVISI FULL v2
# - Split-wise file mapping + chunked reader utilities (no full concat)
# - Tahan terhadap "nan" string / mixed dtype pada Flux
# - Builds:
#   * SPLIT_FILES: {split_XX: {"train": Path, "test": Path}}
#   * train_ids_by_split / test_ids_by_split
#   * iter_lightcurve_chunks(): generator chunk normalized
#   * load_object_lightcurve(): per-object extraction (streaming) + optional early-stop heuristic
# - Saves:
#   * artifacts/split_file_manifest.csv
#   * artifacts/object_counts_by_split.csv
# ============================================================

import gc, re
from pathlib import Path
import numpy as np
import pandas as pd

# ----------------------------
# 0) Require previous stages
# ----------------------------
for need in ["SPLIT_DIRS", "SPLIT_LIST", "df_train_meta", "df_test_meta", "ART_DIR", "CFG", "SEED"]:
    if need not in globals():
        raise RuntimeError(f"Missing `{need}`. Jalankan STAGE 0 -> STAGE 1 -> STAGE 2 dulu.")

ART_DIR = Path(ART_DIR)
ART_DIR.mkdir(parents=True, exist_ok=True)

SEED = int(SEED)
MIN_FLUXERR = float(CFG.get("MIN_FLUXERR", 1e-6))

# konsisten dengan STAGE 0/2
SAFE_NA_VALUES = ["", " ", "NA", "NaN", "nan", "NULL", "null", "None", "none"]
SAFE_READ_KW = dict(low_memory=False, na_values=SAFE_NA_VALUES, keep_default_na=True)

REQ_LC_KEYS = ["object_id", "mjd", "flux", "flux_err", "filter"]
ALLOWED_FILTERS = {"u", "g", "r", "i", "z", "y"}
FILTER_ORDER = {"u":0, "g":1, "r":2, "i":3, "z":4, "y":5}

# ----------------------------
# 1) Build split file mapping (train/test lightcurves)
# ----------------------------
SPLIT_FILES = {}
for s in SPLIT_LIST:
    sd = Path(SPLIT_DIRS[s])
    tr = sd / "train_full_lightcurves.csv"
    te = sd / "test_full_lightcurves.csv"
    if (not tr.exists()) or (not te.exists()):
        raise FileNotFoundError(f"Missing lightcurve file(s) in {sd}: train={tr.exists()} test={te.exists()}")
    SPLIT_FILES[s] = {"train": tr, "test": te}

# Save split file manifest
manifest = []
for s in SPLIT_LIST:
    p_tr = SPLIT_FILES[s]["train"]
    p_te = SPLIT_FILES[s]["test"]
    manifest.append({
        "split": s,
        "train_path": str(p_tr),
        "test_path": str(p_te),
        "train_mb": float(p_tr.stat().st_size) / (1024**2),
        "test_mb":  float(p_te.stat().st_size) / (1024**2),
    })
df_manifest = pd.DataFrame(manifest).sort_values("split")
manifest_path = ART_DIR / "split_file_manifest.csv"
df_manifest.to_csv(manifest_path, index=False)

# ----------------------------
# 2) Build object routing by split (fast, vectorized)
# ----------------------------
# df_train_meta/index = object_id, column split exists
train_ids_by_split = {s: df_train_meta.index[df_train_meta["split"] == s].astype(str).tolist() for s in SPLIT_LIST}
test_ids_by_split  = {s: df_test_meta.index[df_test_meta["split"] == s].astype(str).tolist()  for s in SPLIT_LIST}

df_counts = pd.DataFrame({
    "split": SPLIT_LIST,
    "train_objects": [len(train_ids_by_split[s]) for s in SPLIT_LIST],
    "test_objects":  [len(test_ids_by_split[s]) for s in SPLIT_LIST],
})
counts_path = ART_DIR / "object_counts_by_split.csv"
df_counts.to_csv(counts_path, index=False)

# ----------------------------
# 3) Robust header mapping -> canonical columns
# ----------------------------
_LC_CFG_CACHE = {}  # (split_name, which) -> cfg dict

def _canon_col(x: str) -> str:
    # canonicalize header name for matching
    s = str(x).strip().lower()
    s = s.replace("\ufeff", "")  # BOM safety
    s = re.sub(r"\s+", "", s)    # remove whitespace
    s = s.replace("(", "").replace(")", "")
    s = s.replace("-", "_")
    return s

def _build_lc_read_cfg(p: Path):
    """
    Robust mapping based on header only.
    Accepts variants:
      object_id, Object_ID
      Time (MJD), Time(MJD), time_mjd, mjd, time
      Flux, flux
      Flux_err, FluxErr, flux_err
      Filter, filter
    """
    h = pd.read_csv(p, nrows=0, **SAFE_READ_KW)
    orig_cols = list(h.columns)

    # map canonical -> original col
    c2o = {}
    for c in orig_cols:
        k = _canon_col(c)
        if k not in c2o:
            c2o[k] = c

    # locate required columns
    obj_col = c2o.get("object_id", None)

    # time candidates (canonical forms)
    time_col = None
    for k in ["time_mjd", "timemjd", "mjd", "time"]:
        if k in c2o:
            time_col = c2o[k]
            break

    flux_col = c2o.get("flux", None)

    # flux_err candidates
    ferr_col = None
    for k in ["flux_err", "fluxerr", "fluxerror"]:
        if k in c2o:
            ferr_col = c2o[k]
            break

    filt_col = c2o.get("filter", None)

    missing = []
    if obj_col is None:  missing.append("object_id")
    if time_col is None: missing.append("Time (MJD)")
    if flux_col is None: missing.append("Flux")
    if ferr_col is None: missing.append("Flux_err")
    if filt_col is None: missing.append("Filter")
    if missing:
        raise ValueError(
            f"Missing required lightcurve columns in {p.name}: {missing}. "
            f"Header sample: {orig_cols[:20]}"
        )

    usecols = [obj_col, time_col, flux_col, ferr_col, filt_col]
    rename = {obj_col:"object_id", time_col:"mjd", flux_col:"flux", ferr_col:"flux_err", filt_col:"filter"}

    # dtype: string untuk id/filter; numeric biarkan parser -> nanti coercion robust
    dtypes = {obj_col:"string", filt_col:"string"}

    return {"usecols": usecols, "dtype": dtypes, "rename": rename}

def _normalize_lc_chunk(df: pd.DataFrame, drop_bad_filter: bool = True, drop_bad_mjd: bool = True):
    """
    Normalize to canonical columns:
      object_id (string), mjd (float32), flux (float32), flux_err (float32), filter (string lower)
    Rules:
      - filter di-lower + strip; invalid filter -> NaN, bisa drop
      - mjd/flux/flux_err -> to_numeric(errors='coerce')
      - flux boleh NaN (missing obs) -> tetap dipertahankan
      - flux_err dipaksa >= MIN_FLUXERR untuk nilai non-NaN
      - drop row jika object_id kosong
    """
    # ensure column set
    df = df[["object_id","mjd","flux","flux_err","filter"]].copy()

    # clean strings
    df["object_id"] = df["object_id"].astype("string").str.strip()
    df["filter"] = df["filter"].astype("string").str.strip().str.lower()

    # normalize invalid filter -> <NA>
    # note: string dtype uses <NA>
    df.loc[~df["filter"].isin(list(ALLOWED_FILTERS)), "filter"] = pd.NA

    # numeric coercion
    df["mjd"] = pd.to_numeric(df["mjd"], errors="coerce").astype("float32")
    df["flux"] = pd.to_numeric(df["flux"], errors="coerce").astype("float32")
    df["flux_err"] = pd.to_numeric(df["flux_err"], errors="coerce").astype("float32")

    # enforce min flux_err for stability (ignore NaN)
    fe = df["flux_err"]
    if MIN_FLUXERR > 0:
        df.loc[fe.notna() & (fe < MIN_FLUXERR), "flux_err"] = np.float32(MIN_FLUXERR)

    # drop rows with empty id
    df = df[df["object_id"].notna() & (df["object_id"] != "")]

    # optional drops
    if drop_bad_filter:
        df = df[df["filter"].notna()]
    if drop_bad_mjd:
        df = df[df["mjd"].notna()]

    return df[REQ_LC_KEYS]

# ----------------------------
# 4) Chunked readers (core strategy)
# ----------------------------
def iter_lightcurve_chunks(
    split_name: str,
    which: str,
    chunksize: int = 400_000,
    drop_bad_filter: bool = True,
    drop_bad_mjd: bool = True
):
    """
    Stream read a split lightcurve CSV in chunks.
    Yields normalized chunks with columns:
      object_id, mjd, flux, flux_err, filter

    Notes:
    - flux boleh NaN (missing), akan diproses di stage tokenization.
    - filter invalid dibuang (default).
    """
    if split_name not in SPLIT_FILES:
        raise KeyError(f"Unknown split_name={split_name}.")
    if which not in ("train", "test"):
        raise ValueError("which must be 'train' or 'test'")

    p = SPLIT_FILES[split_name][which]
    key = (split_name, which)
    if key not in _LC_CFG_CACHE:
        _LC_CFG_CACHE[key] = _build_lc_read_cfg(p)

    cfg = _LC_CFG_CACHE[key]

    reader = pd.read_csv(
        p,
        usecols=cfg["usecols"],
        dtype=cfg["dtype"],          # only string enforced
        chunksize=int(chunksize),
        **SAFE_READ_KW
    )
    for chunk in reader:
        chunk = chunk.rename(columns=cfg["rename"])
        yield _normalize_lc_chunk(chunk, drop_bad_filter=drop_bad_filter, drop_bad_mjd=drop_bad_mjd)

def load_object_lightcurve(
    object_id: str,
    which: str,
    chunksize: int = 400_000,
    sort_time: bool = True,
    max_chunks: int = None,
    stop_after_found_block: bool = True
):
    """
    Debug-safe per-object extraction by streaming the relevant split file.
    Heuristic stop:
      - If file tends to be grouped by object_id, stop after we have found rows
        and then we observe a full chunk with zero hits.
      - This is optional and safe for debugging, not for training loop.
    """
    object_id = str(object_id).strip()

    if which == "train":
        if object_id not in df_train_meta.index:
            raise KeyError(f"object_id not found in df_train_meta: {object_id}")
        split_name = str(df_train_meta.loc[object_id, "split"])
    elif which == "test":
        if object_id not in df_test_meta.index:
            raise KeyError(f"object_id not found in df_test_meta: {object_id}")
        split_name = str(df_test_meta.loc[object_id, "split"])
    else:
        raise ValueError("which must be 'train' or 'test'")

    pieces = []
    seen = 0
    found_any = False
    last_hit = False

    for ch in iter_lightcurve_chunks(split_name, which, chunksize=chunksize):
        seen += 1
        sub = ch[ch["object_id"] == object_id]
        hit = (len(sub) > 0)
        if hit:
            pieces.append(sub)
            found_any = True

        # heuristic stop: sudah pernah ketemu, lalu satu chunk penuh tidak ada hit
        if stop_after_found_block and found_any and last_hit and (not hit):
            break
        last_hit = hit

        if max_chunks is not None and seen >= int(max_chunks):
            break

    if not pieces:
        out = pd.DataFrame(columns=REQ_LC_KEYS)
    else:
        out = pd.concat(pieces, ignore_index=True)
        if sort_time and len(out) > 1:
            # stable sort
            out["filter_ord"] = out["filter"].map(FILTER_ORDER).astype("int16")
            out = out.sort_values(["mjd", "filter_ord"], kind="mergesort").drop(columns=["filter_ord"]).reset_index(drop=True)

    return out

# ----------------------------
# 5) Quick smoke test (very light)
# ----------------------------
_smoke_splits = ["split_01", "split_08", "split_17"]
for s in _smoke_splits:
    if len(train_ids_by_split.get(s, [])) == 0 or len(test_ids_by_split.get(s, [])) == 0:
        raise RuntimeError(f"Split {s} has 0 objects in train/test log (unexpected).")

    ch_tr = next(iter_lightcurve_chunks(s, "train", chunksize=50_000))
    ch_te = next(iter_lightcurve_chunks(s, "test",  chunksize=50_000))

    if list(ch_tr.columns) != REQ_LC_KEYS:
        raise RuntimeError(f"Train chunk schema mismatch in {s}: {list(ch_tr.columns)}")
    if list(ch_te.columns) != REQ_LC_KEYS:
        raise RuntimeError(f"Test chunk schema mismatch in {s}: {list(ch_te.columns)}")

    badf_tr = sorted(set(ch_tr["filter"].dropna().unique()) - ALLOWED_FILTERS)
    badf_te = sorted(set(ch_te["filter"].dropna().unique()) - ALLOWED_FILTERS)
    if badf_tr or badf_te:
        raise ValueError(f"Unexpected filter values in smoke chunk split={s}: train_bad={badf_tr} test_bad={badf_te}")

print("STAGE 3 OK — LIGHTCURVE LOADING UTILITIES READY")
print(f"- Saved manifest: {manifest_path}")
print(f"- Saved counts  : {counts_path}")
print("- Ready for next stage: split-wise preprocessing + sequence/token building.")

# ----------------------------
# 6) Export globals for next stages
# ----------------------------
globals().update({
    "SPLIT_FILES": SPLIT_FILES,
    "train_ids_by_split": train_ids_by_split,
    "test_ids_by_split": test_ids_by_split,
    "iter_lightcurve_chunks": iter_lightcurve_chunks,
    "load_object_lightcurve": load_object_lightcurve,
    "REQ_LC_KEYS": REQ_LC_KEYS,
    "ALLOWED_FILTERS": ALLOWED_FILTERS,
    "FILTER_ORDER": FILTER_ORDER,
})

gc.collect()


STAGE 3 OK — LIGHTCURVE LOADING UTILITIES READY
- Saved manifest: /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/split_file_manifest.csv
- Saved counts  : /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/object_counts_by_split.csv
- Ready for next stage: split-wise preprocessing + sequence/token building.


303

# Photometric Cleaning (De-extinction + Negative Flux Safe Transform)

In [6]:
# ============================================================
# STAGE 4 — Photometric Cleaning (FORCE OVERWRITE) — REVISI FULL v6.1
# FIX:
# - Tidak pakai numpy.astype("string") (error). Pakai pd.array(..., dtype="string")
# - FORCE overwrite: wipe lc_clean_mag lalu rebuild
# - Atomic write (.tmp -> rename)
# ============================================================

import gc, json, warnings, time, shutil
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# ----------------------------
# 0) Require previous stages
# ----------------------------
for need in ["iter_lightcurve_chunks", "df_train_meta", "df_test_meta", "ART_DIR", "SPLIT_LIST"]:
    if need not in globals():
        raise RuntimeError(f"Missing `{need}`. Jalankan STAGE 0 -> 1 -> 2 -> 3 dulu.")

ART_DIR = Path(ART_DIR)

# ----------------------------
# 1) Settings
# ----------------------------
CHUNKSIZE   = 350_000
ERR_EPS     = 1e-6
SNR_DET     = 3.0
DET_SIGMA   = 3.0

MIN_FLUX_POS_UJY   = 1e-6
MAG_MIN, MAG_MAX   = -10.0, 50.0
MAGERR_FLOOR_DET   = 1e-3
MAGERR_FLOOR_ND    = 0.75
MAGERR_CAP         = 10.0

WRITE_FORMAT = "parquet"
ONLY_SPLITS  = None
KEEP_FLUX_DEBUG = False
DROP_BAD_TIME_ROWS = True

# FORCE overwrite
REBUILD_MODE = "wipe_all"  # "wipe_all" (recommended) | "wipe_parts_only"

# ----------------------------
# 2) Extinction coefficients (placeholder; ganti kalau punya nilai resmi)
# ----------------------------
EXT_RLAMBDA = {"u": 4.8, "g": 3.6, "r": 2.7, "i": 2.1, "z": 1.6, "y": 1.3}
BAND2ID = {"u": 0, "g": 1, "r": 2, "i": 3, "z": 4, "y": 5}
ID2BAND = {v: k for k, v in BAND2ID.items()}

EBV_TRAIN_SER = df_train_meta["EBV"]
EBV_TEST_SER  = df_test_meta["EBV"]

MAG_ZP = float(2.5 * np.log10(3631e6))  # ~23.9

# ----------------------------
# 3) Output root + WIPE (with safety guard)
# ----------------------------
LC_CLEAN_DIR = ART_DIR / "lc_clean_mag"

# safety: ensure LC_CLEAN_DIR is under ART_DIR
art_abs = ART_DIR.resolve()
lc_abs  = LC_CLEAN_DIR.resolve()
if str(art_abs) not in str(lc_abs):
    raise RuntimeError("Safety guard: LC_CLEAN_DIR bukan turunan ART_DIR. Abort.")

if REBUILD_MODE == "wipe_all":
    if LC_CLEAN_DIR.exists():
        shutil.rmtree(LC_CLEAN_DIR, ignore_errors=True)
    LC_CLEAN_DIR.mkdir(parents=True, exist_ok=True)
elif REBUILD_MODE == "wipe_parts_only":
    LC_CLEAN_DIR.mkdir(parents=True, exist_ok=True)
else:
    raise ValueError("REBUILD_MODE must be 'wipe_all' or 'wipe_parts_only'")

# ----------------------------
# 4) Atomic writer
# ----------------------------
def _atomic_write_parquet(df: pd.DataFrame, out_path: Path):
    tmp = out_path.with_name(out_path.name + ".tmp")
    df.to_parquet(tmp, index=False)
    tmp.replace(out_path)

def _atomic_write_csv_gz(df: pd.DataFrame, out_path: Path):
    out_path = out_path.with_suffix(".csv.gz")
    tmp = out_path.with_name(out_path.name + ".tmp")
    df.to_csv(tmp, index=False, compression="gzip")
    tmp.replace(out_path)
    return out_path

def write_part(df: pd.DataFrame, out_path: Path, fmt: str):
    out_path.parent.mkdir(parents=True, exist_ok=True)
    if fmt == "parquet":
        try:
            _atomic_write_parquet(df, out_path)
            return "parquet", out_path
        except Exception as e:
            alt = _atomic_write_csv_gz(df, out_path)
            return f"csv.gz (fallback from parquet: {type(e).__name__})", alt
    elif fmt == "csv.gz":
        alt = _atomic_write_csv_gz(df, out_path)
        return "csv.gz", alt
    else:
        raise ValueError("fmt must be 'parquet' or 'csv.gz'")

# ----------------------------
# 5) Core cleaning (NaN/negative-safe)
# ----------------------------
def clean_chunk_to_mag(ch: pd.DataFrame, ebv_ser: pd.Series):
    # object_id as pandas StringDtype (safe)
    oid = ch["object_id"].astype("string").to_numpy(copy=False)

    mjd = ch["mjd"].to_numpy(dtype=np.float32, copy=False)

    flux = ch["flux"].to_numpy(dtype=np.float32, copy=False)
    err  = ch["flux_err"].to_numpy(dtype=np.float32, copy=False)

    # sanitize err
    err = np.nan_to_num(err, nan=np.float32(ERR_EPS), posinf=np.float32(ERR_EPS), neginf=np.float32(ERR_EPS))
    err = np.maximum(err, np.float32(ERR_EPS))

    # sanitize flux: keep NaN as NaN, but force inf -> NaN
    flux = flux.astype(np.float32, copy=False)
    flux[~np.isfinite(flux)] = np.float32(np.nan)

    # filter normalize
    filt = ch["filter"].astype("string").to_numpy(copy=False)
    filt = np.char.lower(np.char.strip(filt.astype(str)))

    # band_id
    band_id = np.full(len(ch), -1, dtype=np.int8)
    for b, bid in BAND2ID.items():
        band_id[filt == b] = np.int8(bid)
    if np.any(band_id < 0):
        bad = sorted(set(filt[band_id < 0].tolist()))
        raise ValueError(f"Unknown filter values encountered (example up to 10): {bad[:10]}")

    # EBV lookup
    ebv = ch["object_id"].map(ebv_ser).fillna(0.0).to_numpy(dtype=np.float32)
    ebv[~np.isfinite(ebv)] = np.float32(0.0)

    # R_lambda lookup
    rlam = np.zeros(len(ch), dtype=np.float32)
    for b, rv in EXT_RLAMBDA.items():
        rlam[filt == b] = np.float32(rv)

    A = (rlam * ebv).astype(np.float32)
    mul = np.power(np.float32(10.0), (np.float32(0.4) * A)).astype(np.float32)

    flux_deext = (flux * mul).astype(np.float32)
    err_deext  = (err  * mul).astype(np.float32)

    # snr (NaN flux -> snr=0)
    snr = np.zeros_like(err_deext, dtype=np.float32)
    okf = np.isfinite(flux_deext)
    snr[okf] = (flux_deext[okf] / np.maximum(err_deext[okf], np.float32(ERR_EPS))).astype(np.float32)

    detected = (snr > np.float32(SNR_DET)).astype(np.int8)

    nan_flux_rows = int((~okf).sum())
    if nan_flux_rows:
        detected[~okf] = np.int8(0)
        snr[~okf] = np.float32(0.0)

    flux_detlim = (np.float32(DET_SIGMA) * err_deext).astype(np.float32)

    flux_for_mag = np.where(
        detected == 1,
        np.maximum(flux_deext, np.float32(MIN_FLUX_POS_UJY)),
        np.maximum(flux_detlim, np.float32(MIN_FLUX_POS_UJY)),
    ).astype(np.float32)

    mag = (np.float32(MAG_ZP) - np.float32(2.5) * np.log10(flux_for_mag)).astype(np.float32)
    mag = np.clip(mag, np.float32(MAG_MIN), np.float32(MAG_MAX)).astype(np.float32)

    mag_err = (np.float32(1.0857362) * (err_deext / flux_for_mag)).astype(np.float32)
    mag_err = np.clip(mag_err, np.float32(MAGERR_FLOOR_DET), np.float32(MAGERR_CAP)).astype(np.float32)

    if MAGERR_FLOOR_ND is not None and float(MAGERR_FLOOR_ND) > 0:
        mag_err = np.where(
            detected == 1,
            mag_err,
            np.maximum(mag_err, np.float32(MAGERR_FLOOR_ND))
        ).astype(np.float32)

    out = pd.DataFrame({
        "object_id": pd.array(oid, dtype="string"),
        "mjd": mjd.astype(np.float32, copy=False),
        "band_id": band_id.astype(np.int8, copy=False),
        "mag": mag.astype(np.float32, copy=False),
        "mag_err": mag_err.astype(np.float32, copy=False),
        "snr": snr.astype(np.float32, copy=False),
        "detected": detected.astype(np.int8, copy=False),
    })

    dropped_time = 0
    if DROP_BAD_TIME_ROWS:
        t = out["mjd"].to_numpy(dtype=np.float32, copy=False)
        keep = np.isfinite(t)
        dropped_time = int((~keep).sum())
        if dropped_time:
            out = out[keep]

    if KEEP_FLUX_DEBUG:
        out["flux_deext"] = pd.Series(np.nan_to_num(flux_deext, nan=0.0), dtype="float32")
        out["err_deext"]  = pd.Series(err_deext, dtype="float32")

    return out, dropped_time, nan_flux_rows

# ----------------------------
# 6) Process split-wise
# ----------------------------
splits_to_use = ONLY_SPLITS if (ONLY_SPLITS is not None) else SPLIT_LIST

summary_rows, manifest_rows = [], []

def _wipe_parts_dir(out_dir: Path):
    if out_dir.exists():
        for pat in ["part_*.parquet", "part_*.csv.gz", "*.tmp"]:
            for f in out_dir.glob(pat):
                try:
                    f.unlink()
                except Exception:
                    pass

def process_split(split_name: str, which: str):
    ebv_ser = EBV_TRAIN_SER if which == "train" else EBV_TEST_SER
    out_dir = LC_CLEAN_DIR / split_name / which
    out_dir.mkdir(parents=True, exist_ok=True)

    if REBUILD_MODE == "wipe_parts_only":
        _wipe_parts_dir(out_dir)

    t0 = time.time()
    part_idx = 0
    n_rows_total = 0
    n_neg_proxy = 0
    n_det = 0
    n_finite_mag = 0
    mag_min = np.inf
    mag_max = -np.inf
    dropped_time_total = 0
    nan_flux_total = 0

    for ch in iter_lightcurve_chunks(split_name, which, chunksize=CHUNKSIZE):
        cleaned, dropped_time, nan_flux = clean_chunk_to_mag(ch, ebv_ser)

        dropped_time_total += dropped_time
        nan_flux_total += nan_flux

        n_rows = int(len(cleaned))
        n_rows_total += n_rows

        snr_arr = cleaned["snr"].to_numpy(dtype=np.float32, copy=False)
        n_neg_proxy += int((snr_arr < 0).sum())

        det_arr = cleaned["detected"].to_numpy(dtype=np.int8, copy=False)
        n_det += int(det_arr.sum())

        mag_arr = cleaned["mag"].to_numpy(dtype=np.float32, copy=False)
        fin = np.isfinite(mag_arr)
        n_finite_mag += int(fin.sum())
        if fin.any():
            mag_min = float(min(mag_min, float(np.min(mag_arr[fin]))))
            mag_max = float(max(mag_max, float(np.max(mag_arr[fin]))))

        out_path = out_dir / f"part_{part_idx:04d}.parquet"
        used_fmt, final_path = write_part(cleaned, out_path, WRITE_FORMAT)

        manifest_rows.append({
            "split": split_name,
            "which": which,
            "part": part_idx,
            "path": str(final_path),
            "rows": n_rows,
            "format": used_fmt,
        })

        part_idx += 1
        del cleaned, ch
        if part_idx % 10 == 0:
            gc.collect()

    dt = time.time() - t0
    summary_rows.append({
        "split": split_name,
        "which": which,
        "parts": part_idx,
        "rows": n_rows_total,
        "neg_flux_frac_proxy": (n_neg_proxy / max(n_rows_total, 1)),
        "det_frac_snr_gt_thr": (n_det / max(n_rows_total, 1)),
        "finite_mag_frac": (n_finite_mag / max(n_rows_total, 1)),
        "mag_min": (mag_min if np.isfinite(mag_min) else np.nan),
        "mag_max": (mag_max if np.isfinite(mag_max) else np.nan),
        "dropped_time_rows": int(dropped_time_total),
        "nan_flux_rows": int(nan_flux_total),
        "sec": float(dt),
    })

    print(
        f"[Stage 4] {split_name}/{which}: parts={part_idx} | rows={n_rows_total:,} | "
        f"det%={100*(n_det/max(n_rows_total,1)):.2f}% | "
        f"nan_flux={nan_flux_total:,} | drop_time={dropped_time_total:,} | "
        f"mag_range=[{(mag_min if np.isfinite(mag_min) else np.nan):.2f}, {(mag_max if np.isfinite(mag_max) else np.nan):.2f}] | "
        f"time={dt:.1f}s"
    )

print(f"[Stage 4] REBUILD_MODE={REBUILD_MODE} | Writing to: {LC_CLEAN_DIR}")
for s in splits_to_use:
    process_split(s, "train")
    process_split(s, "test")

# ----------------------------
# 7) Save manifests + summary + config
# ----------------------------
df_manifest = pd.DataFrame(manifest_rows)
df_summary  = pd.DataFrame(summary_rows)

manifest_path = LC_CLEAN_DIR / "lc_clean_mag_manifest.csv"
summary_path  = LC_CLEAN_DIR / "lc_clean_mag_summary.csv"

df_manifest.to_csv(manifest_path, index=False)
df_summary.to_csv(summary_path, index=False)

cfg_path = LC_CLEAN_DIR / "photometric_config_mag.json"
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump({
        "EXT_RLAMBDA": EXT_RLAMBDA,
        "SNR_DET": float(SNR_DET),
        "DET_SIGMA": float(DET_SIGMA),
        "ERR_EPS": float(ERR_EPS),
        "MIN_FLUX_POS_UJY": float(MIN_FLUX_POS_UJY),
        "MAG_ZP": float(MAG_ZP),
        "MAG_MIN": float(MAG_MIN),
        "MAG_MAX": float(MAG_MAX),
        "MAGERR_FLOOR_DET": float(MAGERR_FLOOR_DET),
        "MAGERR_FLOOR_ND": float(MAGERR_FLOOR_ND),
        "MAGERR_CAP": float(MAGERR_CAP),
        "CHUNKSIZE": int(CHUNKSIZE),
        "WRITE_FORMAT": str(WRITE_FORMAT),
        "ONLY_SPLITS": list(splits_to_use),
        "KEEP_FLUX_DEBUG": bool(KEEP_FLUX_DEBUG),
        "DROP_BAD_TIME_ROWS": bool(DROP_BAD_TIME_ROWS),
        "REBUILD_MODE": str(REBUILD_MODE),
    }, f, indent=2)

print("\n[Stage 4] Done.")
print(f"- LC_CLEAN_DIR : {LC_CLEAN_DIR}")
print(f"- Saved manifest: {manifest_path}")
print(f"- Saved summary : {summary_path}")
print(f"- Saved config  : {cfg_path}")

# ----------------------------
# 8) Helper for next stages
# ----------------------------
def get_clean_parts(split_name: str, which: str):
    m = df_manifest[(df_manifest["split"] == split_name) & (df_manifest["which"] == which)].sort_values("part")
    return m["path"].astype(str).tolist()

globals().update({
    "EXT_RLAMBDA": EXT_RLAMBDA,
    "BAND2ID": BAND2ID,
    "ID2BAND": ID2BAND,
    "MAG_ZP": MAG_ZP,
    "LC_CLEAN_DIR": LC_CLEAN_DIR,
    "lc_clean_mag_manifest": df_manifest,
    "lc_clean_mag_summary": df_summary,
    "get_clean_parts": get_clean_parts,
})

gc.collect()


[Stage 4] REBUILD_MODE=wipe_all | Writing to: /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/lc_clean_mag
[Stage 4] split_01/train: parts=1 | rows=26,324 | det%=19.34% | nan_flux=11 | drop_time=0 | mag_range=[19.72, 25.96] | time=0.1s
[Stage 4] split_01/test: parts=1 | rows=59,235 | det%=23.02% | nan_flux=23 | drop_time=0 | mag_range=[19.61, 26.20] | time=0.2s
[Stage 4] split_02/train: parts=1 | rows=25,609 | det%=24.45% | nan_flux=6 | drop_time=0 | mag_range=[20.10, 26.04] | time=0.1s
[Stage 4] split_02/test: parts=1 | rows=71,229 | det%=21.69% | nan_flux=8 | drop_time=0 | mag_range=[18.77, 26.32] | time=0.3s
[Stage 4] split_03/train: parts=1 | rows=21,676 | det%=21.65% | nan_flux=5 | drop_time=0 | mag_range=[20.17, 26.23] | time=0.1s
[Stage 4] split_03/test: parts=1 | rows=53,751 | det%=21.90% | nan_flux=8 | drop_time=0 | mag_range=[19.61, 26.37] | time=0.2s
[Stage 4] split_04/train: parts=1 | rows=22,898 | det%=21.11% | nan_flux=12 | drop_time=0 | mag_range=[20

93

# Sequence Tokenization (Event-based Tokens)

In [7]:
# ============================================================
# STAGE 5 — Sequence Tokenization (Event-based Tokens) (ONE CELL, Kaggle CPU-SAFE)
# REVISI FULL v5 (FIX PATH SYNC + HARDENED + BUCKET ROBUST)
#
# FIX UTAMA:
# - Auto-find STAGE 4 manifest (lc_clean_mag_manifest.csv) dari run manapun.
# - Sync path BENAR:
#     manifest: .../run_*/artifacts/lc_clean_mag/lc_clean_mag_manifest.csv
#     LC_CLEAN_DIR = manifest.parent
#     ART_DIR      = LC_CLEAN_DIR.parent
#     RUN_DIR      = ART_DIR.parent
# - Validasi semua part path ada (tidak hanya manifest-nya).
#
# OUTPUT:
# - artifacts/seq_tokens/split_XX/{train|test}/shard_*.npz
# - artifacts/seq_tokens/seq_manifest_{train|test}.csv
# - artifacts/seq_tokens/seq_build_stats.csv
# - artifacts/seq_tokens/seq_config.json
# ============================================================

import gc, json, warnings, time
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# ----------------------------
# 0) Require minimal globals
# ----------------------------
for need in ["ART_DIR", "df_train_meta", "df_test_meta"]:
    if need not in globals():
        raise RuntimeError(f"Missing `{need}`. Jalankan minimal STAGE 0 + STAGE 2 dulu (ART_DIR + meta).")

ART_DIR = Path(ART_DIR)

# ----------------------------
# 1) Helpers
# ----------------------------
def _safe_string_series(s: pd.Series) -> pd.Series:
    # Pandas 2.x aman pakai "string"; tapi fallback kalau environment aneh
    try:
        return s.astype("string").str.strip()
    except Exception:
        return s.astype(str).str.strip()

def _find_stage4_manifest(art_dir: Path) -> Path | None:
    # 1) try current ART_DIR
    cand = art_dir / "lc_clean_mag" / "lc_clean_mag_manifest.csv"
    if cand.exists():
        return cand

    # 2) scan all runs
    root = Path("/kaggle/working/mallorn_run")
    if not root.exists():
        return None

    cands = list(root.glob("run_*/artifacts/lc_clean_mag/lc_clean_mag_manifest.csv"))
    if not cands:
        cands = list(root.glob("run_*/**/lc_clean_mag_manifest.csv"))

    if not cands:
        return None

    cands = sorted(cands, key=lambda p: p.stat().st_mtime, reverse=True)
    return cands[0]

def _sync_dirs_from_manifest(manifest_csv: Path):
    # manifest: .../run_*/artifacts/lc_clean_mag/lc_clean_mag_manifest.csv
    lc_clean_dir = manifest_csv.parent                  # .../lc_clean_mag
    art_dir_new  = lc_clean_dir.parent                  # .../artifacts
    run_dir_new  = art_dir_new.parent                   # .../run_*
    return run_dir_new, art_dir_new, lc_clean_dir

# ----------------------------
# 2) Locate STAGE 4 output (robust)
# ----------------------------
manifest_csv = _find_stage4_manifest(ART_DIR)
if manifest_csv is None:
    root = Path("/kaggle/working/mallorn_run")
    runs = sorted([p.name for p in root.glob("run_*") if p.is_dir()])[-15:] if root.exists() else []
    raise RuntimeError(
        "Output STAGE 4 (lc_clean_mag_manifest.csv) tidak ditemukan.\n"
        f"- ART_DIR saat ini: {ART_DIR}\n"
        f"- Expected: {ART_DIR/'lc_clean_mag'}\n"
        f"- Runs available (last 15): {runs}\n"
        "Solusi: pastikan STAGE 4 benar-benar selesai dan menulis artifacts/lc_clean_mag."
    )

RUN_DIR, ART_DIR, LC_CLEAN_DIR = _sync_dirs_from_manifest(manifest_csv)

print("STAGE 5 ROUTING SYNC OK")
print(f"- RUN_DIR      : {RUN_DIR}")
print(f"- ART_DIR      : {ART_DIR}")
print(f"- LC_CLEAN_DIR : {LC_CLEAN_DIR}")
print(f"- manifest_csv : {manifest_csv}")

# ----------------------------
# 3) Load & validate Stage4 manifest
# ----------------------------
_df_clean_manifest = pd.read_csv(manifest_csv)
_df_clean_manifest.columns = [c.strip() for c in _df_clean_manifest.columns]

need_cols = {"split", "which", "part", "path"}
miss = sorted(list(need_cols - set(_df_clean_manifest.columns)))
if miss:
    raise RuntimeError(f"Manifest STAGE 4 missing columns: {miss} | cols={list(_df_clean_manifest.columns)}")

# validate part files exist
paths = _df_clean_manifest["path"].astype(str).tolist()
missing_paths = [p for p in paths if not Path(p).exists()]
if missing_paths:
    ex = missing_paths[:10]
    raise RuntimeError(
        "Ada file part STAGE 4 yang hilang (manifest ada tapi file tidak ada).\n"
        f"Missing count={len(missing_paths)} | contoh={ex}\n"
        "Solusi: rerun STAGE 4 dengan mode rebuild/wipe untuk regenerasi cache."
    )

def get_clean_parts(split_name: str, which: str):
    m = _df_clean_manifest[(_df_clean_manifest["split"] == split_name) & (_df_clean_manifest["which"] == which)]
    if m.empty:
        return []
    return m.sort_values("part")["path"].astype(str).tolist()

# ----------------------------
# 4) Recover SPLIT_LIST + routing ids if missing
# ----------------------------
if "SPLIT_LIST" not in globals() or not isinstance(SPLIT_LIST, (list, tuple)) or len(SPLIT_LIST) == 0:
    splits = sorted(set(df_train_meta["split"].astype(str).tolist()) | set(df_test_meta["split"].astype(str).tolist()))
    SPLIT_LIST = splits if splits else [f"split_{i:02d}" for i in range(1, 21)]

if "train_ids_by_split" not in globals() or not isinstance(train_ids_by_split, dict):
    train_ids_by_split = {s: [] for s in SPLIT_LIST}
    for oid, sp in df_train_meta["split"].items():
        train_ids_by_split[str(sp)].append(str(oid))

if "test_ids_by_split" not in globals() or not isinstance(test_ids_by_split, dict):
    test_ids_by_split = {s: [] for s in SPLIT_LIST}
    for oid, sp in df_test_meta["split"].items():
        test_ids_by_split[str(sp)].append(str(oid))

# ----------------------------
# 5) Settings
# ----------------------------
ONLY_SPLITS = None                 # None=all
REBUILD_MODE = "wipe_all"          # "wipe_all" or "reuse_if_exists"

COMPRESS_NPZ = False
SHARD_MAX_OBJECTS = 1500

SNR_TANH_SCALE = 10.0
TIME_CLIP_MAX_DAYS = None
DROP_BAD_TIME_ROWS = True

L_MAX = int(CFG.get("L_MAX", 256)) if "CFG" in globals() else 256
TRUNC_POLICY = str(CFG.get("TRUNC_POLICY", "smart")) if "CFG" in globals() else "smart"  # smart/head/none
KEEP_DET_FRAC = float(CFG.get("KEEP_DET_FRAC", 0.70)) if "CFG" in globals() else 0.70
KEEP_EDGE = True
USE_RESTFRAME_TIME = bool(CFG.get("USE_RESTFRAME_TIME", True)) if "CFG" in globals() else True

NUM_BUCKETS = 64

SEQ_DIR = Path(ART_DIR) / "seq_tokens"
SEQ_DIR.mkdir(parents=True, exist_ok=True)

TOKEN_MODE = None
FEATURE_NAMES = None
FEATURE_DIM = None

BASE_COLS = {"object_id", "mjd", "band_id", "snr", "detected"}
MODE_COLS = {"mag": {"mag", "mag_err"}, "asinh": {"flux_asinh", "err_log1p"}}

# ----------------------------
# 6) Reader for cleaned parts
# ----------------------------
def _read_clean_part(path: str) -> pd.DataFrame:
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"Clean part missing: {p}")

    if p.suffix == ".parquet":
        df = pd.read_parquet(p)
    elif p.name.endswith(".csv.gz"):
        df = pd.read_csv(p, compression="gzip")
    else:
        df = pd.read_csv(p)

    df.columns = [c.strip() for c in df.columns]

    global TOKEN_MODE, FEATURE_NAMES, FEATURE_DIM
    if TOKEN_MODE is None:
        cols = set(df.columns)
        if BASE_COLS.issubset(cols) and MODE_COLS["mag"].issubset(cols):
            TOKEN_MODE = "mag"
            FEATURE_NAMES = ["t_rel_log", "dt_log", "mag", "mag_err_log", "snr_tanh", "detected"]
        elif BASE_COLS.issubset(cols) and MODE_COLS["asinh"].issubset(cols):
            TOKEN_MODE = "asinh"
            FEATURE_NAMES = ["t_rel_log", "dt_log", "flux_asinh", "err_log1p", "snr_tanh", "detected"]
        else:
            raise RuntimeError(
                "Cannot detect cleaned schema.\n"
                f"Found cols={list(df.columns)}\n"
                "Expected MAG or ASINH schema from STAGE 4."
            )
        FEATURE_DIM = len(FEATURE_NAMES)

    req = set(BASE_COLS) | set(MODE_COLS[TOKEN_MODE])
    miss = sorted(list(req - set(df.columns)))
    if miss:
        raise RuntimeError(f"Clean part missing columns: {miss} | file={p}")

    df["object_id"] = _safe_string_series(df["object_id"])
    df["mjd"] = pd.to_numeric(df["mjd"], errors="coerce").astype(np.float32)
    df["band_id"] = pd.to_numeric(df["band_id"], errors="coerce").astype(np.int16)
    df["snr"] = pd.to_numeric(df["snr"], errors="coerce").astype(np.float32)
    df["detected"] = pd.to_numeric(df["detected"], errors="coerce").fillna(0).astype(np.int8)

    if TOKEN_MODE == "mag":
        df["mag"] = pd.to_numeric(df["mag"], errors="coerce").astype(np.float32)
        df["mag_err"] = pd.to_numeric(df["mag_err"], errors="coerce").astype(np.float32)
    else:
        df["flux_asinh"] = pd.to_numeric(df["flux_asinh"], errors="coerce").astype(np.float32)
        df["err_log1p"] = pd.to_numeric(df["err_log1p"], errors="coerce").astype(np.float32)

    if DROP_BAD_TIME_ROWS:
        df = df[np.isfinite(df["mjd"].to_numpy())]

    return df

# ----------------------------
# 7) Truncation
# ----------------------------
def _smart_truncate(mjd, det, snr, Lmax: int):
    n = len(mjd)
    if n <= Lmax:
        return np.arange(n, dtype=np.int64)

    idx_all = np.arange(n, dtype=np.int64)
    keep = set()
    if KEEP_EDGE:
        keep.add(0); keep.add(n - 1)

    det_idx = idx_all[det.astype(bool)]
    k_det = int(max(0, min(len(det_idx), int(np.floor(Lmax * KEEP_DET_FRAC)))))
    if k_det > 0 and len(det_idx) > 0:
        score = np.abs(snr[det_idx])
        top = det_idx[np.argsort(-score)[:k_det]]
        for i in top.tolist():
            keep.add(int(i))

    if len(keep) < Lmax:
        rem = [i for i in idx_all.tolist() if i not in keep]
        need = Lmax - len(keep)
        if rem and need > 0:
            pick = np.linspace(0, len(rem) - 1, num=need, dtype=int)
            for p in pick.tolist():
                keep.add(int(rem[p]))

    out = np.array(sorted(keep), dtype=np.int64)
    if len(out) > Lmax:
        pos = np.linspace(0, len(out) - 1, num=Lmax, dtype=int)
        out = out[pos]
    return out

# ----------------------------
# 8) Build tokens per object
# ----------------------------
def build_object_tokens(df_obj: pd.DataFrame, z_val: float = 0.0):
    if df_obj is None or df_obj.empty:
        return None, None, 0, 0

    mjd = df_obj["mjd"].to_numpy(dtype=np.float32, copy=False)
    band = df_obj["band_id"].to_numpy(dtype=np.int16, copy=False)
    snr  = df_obj["snr"].to_numpy(dtype=np.float32, copy=False)
    det  = df_obj["detected"].to_numpy(dtype=np.int8, copy=False)

    order = np.lexsort((band, mjd))
    mjd = mjd[order]; band = band[order]; snr = snr[order]; det = det[order]

    z = float(z_val) if (z_val is not None and np.isfinite(z_val)) else 0.0
    denom = (1.0 + max(z, 0.0)) if USE_RESTFRAME_TIME else 1.0

    t0 = mjd[0]
    t_rel = (mjd - t0) / np.float32(denom)
    dt = np.empty_like(t_rel); dt[0] = 0.0
    if len(t_rel) > 1:
        dt[1:] = np.maximum(t_rel[1:] - t_rel[:-1], 0.0)

    if TIME_CLIP_MAX_DAYS is not None:
        mx = np.float32(TIME_CLIP_MAX_DAYS)
        t_rel = np.clip(t_rel, 0.0, mx)
        dt    = np.clip(dt,    0.0, mx)

    t_rel_log = np.log1p(t_rel).astype(np.float32)
    dt_log    = np.log1p(dt).astype(np.float32)

    snr = np.nan_to_num(snr, nan=0.0, posinf=0.0, neginf=0.0)
    snr_tanh = np.tanh(snr / np.float32(SNR_TANH_SCALE)).astype(np.float32)
    det_f = det.astype(np.float32)

    if TOKEN_MODE == "mag":
        mag = df_obj["mag"].to_numpy(dtype=np.float32, copy=False)[order]
        mag_err = df_obj["mag_err"].to_numpy(dtype=np.float32, copy=False)[order]
        mag = np.nan_to_num(mag, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
        mag_err = np.nan_to_num(mag_err, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
        mag_err = np.maximum(mag_err, np.float32(0.0))
        mag_err_log = np.log1p(mag_err).astype(np.float32)
        X = np.stack([t_rel_log, dt_log, mag, mag_err_log, snr_tanh, det_f], axis=1).astype(np.float32)
    else:
        flux = df_obj["flux_asinh"].to_numpy(dtype=np.float32, copy=False)[order]
        elog = df_obj["err_log1p"].to_numpy(dtype=np.float32, copy=False)[order]
        flux = np.nan_to_num(flux, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
        elog = np.nan_to_num(elog, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
        X = np.stack([t_rel_log, dt_log, flux, elog, snr_tanh, det_f], axis=1).astype(np.float32)

    L0 = int(X.shape[0])
    if L_MAX and int(L_MAX) > 0 and X.shape[0] > int(L_MAX):
        if TRUNC_POLICY == "smart":
            keep = _smart_truncate(mjd, det, snr, int(L_MAX))
        elif TRUNC_POLICY == "head":
            keep = np.arange(int(L_MAX), dtype=np.int64)
        else:  # "none"
            keep = np.arange(X.shape[0], dtype=np.int64)

        if len(keep) != X.shape[0]:
            X = X[keep]
            band = band[keep]

            # recompute dt_log (stabil)
            sel_mjd = mjd[keep]
            sel_t = (sel_mjd - sel_mjd[0]) / np.float32(denom)
            sel_dt = np.empty_like(sel_t); sel_dt[0] = 0.0
            if len(sel_t) > 1:
                sel_dt[1:] = np.maximum(sel_t[1:] - sel_t[:-1], 0.0)
            X[:, 1] = np.log1p(sel_dt).astype(np.float32)

    return X, band.astype(np.int8), L0, int(X.shape[0])

# ----------------------------
# 9) Shard writer
# ----------------------------
def save_shard(out_path: Path, object_ids, X_concat, B_concat, offsets):
    out_path.parent.mkdir(parents=True, exist_ok=True)
    obj_arr = np.asarray(object_ids, dtype="S")  # bytes
    if COMPRESS_NPZ:
        np.savez_compressed(out_path, object_id=obj_arr, x=X_concat, band=B_concat, offsets=offsets)
    else:
        np.savez(out_path, object_id=obj_arr, x=X_concat, band=B_concat, offsets=offsets)

# ----------------------------
# 10) Robust builder: bucketize -> groupby object -> shard
# ----------------------------
def build_sequences_bucket(split_name: str, which: str, expected_ids: set, out_dir: Path, num_buckets: int = 64):
    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
    except Exception as e:
        raise RuntimeError("pyarrow tidak tersedia. Di Kaggle biasanya ada.") from e

    parts = get_clean_parts(split_name, which)
    if not parts:
        raise RuntimeError(f"Tidak ada cleaned parts untuk {split_name}/{which}. Cek STAGE 4 output.")

    tmp_dir = Path(ART_DIR) / "tmp_seq_buckets" / split_name / which
    tmp_dir.mkdir(parents=True, exist_ok=True)

    writers = {}

    def bucket_idx(series_objid: pd.Series) -> np.ndarray:
        h = pd.util.hash_pandas_object(series_objid, index=False).to_numpy(dtype=np.uint64, copy=False)
        return (h % np.uint64(num_buckets)).astype(np.int16)

    kept_rows = 0
    t0 = time.time()

    for p in parts:
        df = _read_clean_part(p)
        if df.empty:
            continue

        df = df[df["object_id"].isin(expected_ids)]
        if df.empty:
            continue

        kept_rows += int(len(df))
        bidx = bucket_idx(df["object_id"])
        df["_b"] = bidx

        for b in np.unique(bidx):
            sub = df[df["_b"] == b].drop(columns=["_b"])
            if sub.empty:
                continue
            fp = tmp_dir / f"bucket_{int(b):03d}.parquet"
            table = pa.Table.from_pandas(sub, preserve_index=False)
            if int(b) not in writers:
                writers[int(b)] = pq.ParquetWriter(fp, table.schema, compression="snappy")
            writers[int(b)].write_table(table)

        del df
        gc.collect()

    for w in writers.values():
        w.close()

    meta = df_train_meta if which == "train" else df_test_meta

    manifest_rows = []
    shard_idx = 0
    batch_obj_ids, batch_X, batch_B, batch_len = [], [], [], []
    built_ids = set()

    len_before, len_after = [], []

    def flush_shard_local():
        nonlocal shard_idx, batch_obj_ids, batch_X, batch_B, batch_len, manifest_rows
        if not batch_obj_ids:
            return
        lengths = np.asarray(batch_len, dtype=np.int64)
        offsets = np.zeros(len(lengths) + 1, dtype=np.int64)
        offsets[1:] = np.cumsum(lengths)

        Xc = np.concatenate(batch_X, axis=0).astype(np.float32)
        Bc = np.concatenate(batch_B, axis=0).astype(np.int8)

        shard_path = out_dir / f"shard_{shard_idx:04d}.npz"
        save_shard(shard_path, batch_obj_ids, Xc, Bc, offsets)

        for i, oid in enumerate(batch_obj_ids):
            manifest_rows.append({
                "object_id": oid,
                "split": split_name,
                "which": which,
                "shard": str(shard_path),
                "start": int(offsets[i]),
                "length": int(lengths[i]),
            })

        shard_idx += 1
        batch_obj_ids, batch_X, batch_B, batch_len = [], [], [], []
        gc.collect()

    for bf in sorted(tmp_dir.glob("bucket_*.parquet")):
        dfb = pd.read_parquet(bf)
        if dfb.empty:
            try: bf.unlink()
            except Exception: pass
            continue

        # group-by object_id
        for oid, g in dfb.groupby("object_id", sort=False):
            oid = str(oid)
            if oid in built_ids:
                continue

            z_val = float(meta.loc[oid, "Z"]) if (USE_RESTFRAME_TIME and oid in meta.index) else 0.0

            L0 = int(len(g))
            X, B, lb, la = build_object_tokens(g, z_val=z_val)
            if X is None:
                continue

            len_before.append(lb)
            len_after.append(la)

            batch_obj_ids.append(oid)
            batch_X.append(X)
            batch_B.append(B)
            batch_len.append(X.shape[0])
            built_ids.add(oid)

            if len(batch_obj_ids) >= SHARD_MAX_OBJECTS:
                flush_shard_local()

        try: bf.unlink()
        except Exception: pass
        del dfb
        gc.collect()

    flush_shard_local()

    try: tmp_dir.rmdir()
    except Exception:
        pass

    st = {
        "kept_rows": int(kept_rows),
        "built_objects": int(len(built_ids)),
        "len_before_mean": float(np.mean(len_before)) if len_before else 0.0,
        "len_before_p95": float(np.quantile(len_before, 0.95)) if len_before else 0.0,
        "len_after_mean": float(np.mean(len_after)) if len_after else 0.0,
        "len_after_p95": float(np.quantile(len_after, 0.95)) if len_after else 0.0,
        "truncated_frac": float(np.mean([a < b for a, b in zip(len_after, len_before)])) if len_before else 0.0,
        "time_s": float(time.time() - t0),
    }
    return manifest_rows, st

# ----------------------------
# 11) RUN
# ----------------------------
splits_to_run = ONLY_SPLITS if (ONLY_SPLITS is not None) else SPLIT_LIST
all_manifest_train, all_manifest_test, split_run_stats = [], [], []

def expected_set_for(split_name: str, which: str) -> set:
    return set(train_ids_by_split.get(split_name, [])) if which == "train" else set(test_ids_by_split.get(split_name, []))

for split_name in splits_to_run:
    for which in ["train", "test"]:
        out_dir = SEQ_DIR / split_name / which
        out_dir.mkdir(parents=True, exist_ok=True)

        expected_ids = expected_set_for(split_name, which)
        if len(expected_ids) == 0:
            raise RuntimeError(f"Expected ids empty for {split_name}/{which}.")

        # rebuild handling
        shard_exists = any(out_dir.glob("shard_*.npz"))
        if REBUILD_MODE == "reuse_if_exists" and shard_exists:
            print(f"\n[Stage 5] SKIP (exists): {split_name}/{which}")
            continue
        else:
            for f in out_dir.glob("shard_*.npz"):
                try: f.unlink()
                except Exception: pass

        print(f"\n[Stage 5] {split_name}/{which} | expected={len(expected_ids):,} | L_MAX={L_MAX} | TRUNC={TRUNC_POLICY}")

        manifest_rows, st = build_sequences_bucket(
            split_name=split_name,
            which=which,
            expected_ids=expected_ids,
            out_dir=out_dir,
            num_buckets=NUM_BUCKETS
        )

        built = st["built_objects"]
        if built != len(expected_ids):
            missing = len(expected_ids) - built
            raise RuntimeError(
                f"[Stage 5] Mismatch {split_name}/{which}: built={built:,} expected={len(expected_ids):,} missing={missing:,}\n"
                "Jika mismatch terjadi, biasanya ada object_id yang tidak muncul di cleaned parts (STAGE 4) atau file part korup."
            )

        print(f"[Stage 5] OK: built={built:,} | kept_rows={st['kept_rows']:,} | "
              f"len_mean {st['len_before_mean']:.1f}->{st['len_after_mean']:.1f} | "
              f"p95 {st['len_before_p95']:.1f}->{st['len_after_p95']:.1f} | "
              f"trunc%={st['truncated_frac']*100:.1f}% | "
              f"time={st['time_s']:.2f}s | mode={TOKEN_MODE}")

        split_run_stats.append({"split": split_name, "which": which, **st})

        if which == "train":
            all_manifest_train.extend(manifest_rows)
        else:
            all_manifest_test.extend(manifest_rows)

        gc.collect()

# ----------------------------
# 12) Save manifests + stats + config
# ----------------------------
df_m_train = pd.DataFrame(all_manifest_train).sort_values(["split", "shard", "start"]).reset_index(drop=True)
df_m_test  = pd.DataFrame(all_manifest_test).sort_values(["split", "shard", "start"]).reset_index(drop=True)

mtrain_path = SEQ_DIR / "seq_manifest_train.csv"
mtest_path  = SEQ_DIR / "seq_manifest_test.csv"
df_m_train.to_csv(mtrain_path, index=False)
df_m_test.to_csv(mtest_path, index=False)

df_stats = pd.DataFrame(split_run_stats)
stats_path = SEQ_DIR / "seq_build_stats.csv"
df_stats.to_csv(stats_path, index=False)

cfg = {
    "token_mode": TOKEN_MODE,
    "feature_names": FEATURE_NAMES,
    "feature_dim": int(FEATURE_DIM),
    "snr_tanh_scale": float(SNR_TANH_SCALE),
    "time_clip_max_days": None if TIME_CLIP_MAX_DAYS is None else float(TIME_CLIP_MAX_DAYS),
    "compress_npz": bool(COMPRESS_NPZ),
    "shard_max_objects": int(SHARD_MAX_OBJECTS),
    "num_buckets": int(NUM_BUCKETS),
    "L_MAX": int(L_MAX),
    "TRUNC_POLICY": str(TRUNC_POLICY),
    "KEEP_DET_FRAC": float(KEEP_DET_FRAC),
    "USE_RESTFRAME_TIME": bool(USE_RESTFRAME_TIME),
    "REBUILD_MODE": str(REBUILD_MODE),
    "RUN_DIR_USED": str(RUN_DIR),
    "ART_DIR_USED": str(ART_DIR),
    "LC_CLEAN_DIR_USED": str(LC_CLEAN_DIR),
    "manifest_csv": str(manifest_csv),
}
cfg_path = SEQ_DIR / "seq_config.json"
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump(cfg, f, indent=2)

print("\n[Stage 5] DONE")
print(f"- token_mode : {TOKEN_MODE}")
print(f"- features   : {FEATURE_NAMES}")
print(f"- Saved: {mtrain_path} (rows={len(df_m_train):,})")
print(f"- Saved: {mtest_path}  (rows={len(df_m_test):,})")
print(f"- Saved: {stats_path}")
print(f"- Saved: {cfg_path}")

# ----------------------------
# 13) Smoke test
# ----------------------------
def load_sequence(object_id: str, which: str):
    object_id = str(object_id).strip()
    m = df_m_train if which == "train" else df_m_test
    row = m[m["object_id"] == object_id]
    if row.empty:
        raise KeyError(f"object_id not found in seq manifest ({which}): {object_id}")
    r = row.iloc[0]
    data = np.load(r["shard"], allow_pickle=False)
    start = int(r["start"]); length = int(r["length"])
    X = data["x"][start:start+length]
    B = data["band"][start:start+length]
    return X, B

_smoke_oid = str(df_train_meta.index[0])
X_sm, B_sm = load_sequence(_smoke_oid, "train")
print(f"\n[Stage 5] Smoke test object_id={_smoke_oid}")
print(f"- seq_len={len(X_sm)} | X_shape={X_sm.shape} | bands_unique={sorted(set(B_sm.tolist()))}")

# ----------------------------
# 14) Export globals
# ----------------------------
globals().update({
    "RUN_DIR": RUN_DIR,
    "ART_DIR": ART_DIR,
    "LC_CLEAN_DIR": LC_CLEAN_DIR,
    "SEQ_DIR": SEQ_DIR,
    "seq_manifest_train": df_m_train,
    "seq_manifest_test": df_m_test,
    "SEQ_FEATURE_NAMES": FEATURE_NAMES,
    "SEQ_FEATURE_DIM": int(FEATURE_DIM),
    "SEQ_TOKEN_MODE": TOKEN_MODE,
    "get_clean_parts": get_clean_parts,
    "load_sequence": load_sequence,
})

gc.collect()


STAGE 5 ROUTING SYNC OK
- RUN_DIR      : /kaggle/working/mallorn_run/run_20260102_184148_9f34156418
- ART_DIR      : /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts
- LC_CLEAN_DIR : /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/lc_clean_mag
- manifest_csv : /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/lc_clean_mag/lc_clean_mag_manifest.csv

[Stage 5] split_01/train | expected=155 | L_MAX=256 | TRUNC=smart_band_peak
[Stage 5] OK: built=155 | kept_rows=26,324 | len_mean 169.8->169.8 | p95 191.7->191.7 | trunc%=0.0% | time=8.76s | mode=mag

[Stage 5] split_01/test | expected=364 | L_MAX=256 | TRUNC=smart_band_peak
[Stage 5] OK: built=364 | kept_rows=59,235 | len_mean 162.7->162.7 | p95 193.8->193.8 | trunc%=0.0% | time=8.90s | mode=mag

[Stage 5] split_02/train | expected=170 | L_MAX=256 | TRUNC=smart_band_peak
[Stage 5] OK: built=170 | kept_rows=25,609 | len_mean 150.6->150.6 | p95 195.5->195.5 | trunc%=0.0% | time=8.37s

55

# Sequence Length Policy (Padding, Truncation, Windowing)

In [8]:
# ============================================================
# STAGE 6 — Sequence Length Policy (Padding, Truncation, Windowing)
# ONE CELL, Kaggle CPU-SAFE — REVISI FULL v2 (MAG/ASINH COMPAT, HARDENED)
#
# Output:
# - artifacts/fixed_seq/{train|test}_{X|B|M}.dat  (memmap)
# - artifacts/fixed_seq/{train|test}_ids.npy
# - artifacts/fixed_seq/train_y.npy
# - artifacts/fixed_seq/{train|test}_origlen.npy, {train|test}_winstart.npy, {train|test}_winend.npy
# - artifacts/fixed_seq/length_policy_config.json
# ============================================================

import gc, json, time, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# ----------------------------
# 0) Require previous stages
# ----------------------------
for need in ["seq_manifest_train", "seq_manifest_test", "SEQ_FEATURE_NAMES",
             "df_train_meta", "df_test_meta", "ART_DIR"]:
    if need not in globals():
        raise RuntimeError(f"Missing `{need}`. Jalankan STAGE 0 -> 1 -> 2 -> 3 -> 4 -> 5 dulu.")

ART_DIR = Path(ART_DIR)

m_train = seq_manifest_train.copy()
m_test  = seq_manifest_test.copy()

SEQ_FEATURE_NAMES = list(SEQ_FEATURE_NAMES)
feat = {name: i for i, name in enumerate(SEQ_FEATURE_NAMES)}

# ----------------------------
# 0b) Detect token_mode (MAG vs ASINH)
# ----------------------------
SEQ_TOKEN_MODE = globals().get("SEQ_TOKEN_MODE", None)
if SEQ_TOKEN_MODE is None:
    if ("flux_asinh" in feat) and ("err_log1p" in feat):
        SEQ_TOKEN_MODE = "asinh"
    elif ("mag" in feat) and ("mag_err_log" in feat):
        SEQ_TOKEN_MODE = "mag"
    else:
        raise ValueError(
            "Cannot infer SEQ_TOKEN_MODE from SEQ_FEATURE_NAMES.\n"
            f"SEQ_FEATURE_NAMES={SEQ_FEATURE_NAMES}\n"
            "Expected either (flux_asinh, err_log1p) or (mag, mag_err_log)."
        )

REQ_COMMON = ["t_rel_log", "dt_log", "snr_tanh", "detected"]
for k in REQ_COMMON:
    if k not in feat:
        raise ValueError(f"SEQ_FEATURE_NAMES must include '{k}'. Found: {SEQ_FEATURE_NAMES}")

if SEQ_TOKEN_MODE == "asinh":
    if "flux_asinh" not in feat:
        raise ValueError("token_mode=asinh requires 'flux_asinh'.")
    SCORE_VALUE_FEAT = "flux_asinh"
elif SEQ_TOKEN_MODE == "mag":
    if "mag" not in feat:
        raise ValueError("token_mode=mag requires 'mag'.")
    SCORE_VALUE_FEAT = "mag"
else:
    raise ValueError(f"Unknown SEQ_TOKEN_MODE={SEQ_TOKEN_MODE}")

print(f"[Stage 6] token_mode={SEQ_TOKEN_MODE} | score_value_feat={SCORE_VALUE_FEAT} | F={len(SEQ_FEATURE_NAMES)}")

# ----------------------------
# 1) Settings
# ----------------------------
FORCE_MAX_LEN = None         # e.g. 256 (kalau mau paksa)
MAXLEN_CAPS = (256, 384, 512) # CPU-safe choices

# Score weights (bisa kamu tweak)
W_SNR = 1.00
W_VAL = 0.35
W_DET = 0.25

# Padding policy
PAD_BAND_ID = 0              # aman (tetap kompatibel kalau band 0..5)
SHIFT_BAND_IDS = False       # True => simpan band+1 untuk token asli, pad=0 (butuh kompatibilitas downstream)

# Build policy
REBUILD_MODE = "wipe_all"    # "wipe_all" atau "reuse_if_exists"
DTYPE_X = np.float32         # jangan ganti ke fp16 dulu kecuali kamu yakin

# ----------------------------
# 2) Inspect length distribution -> choose MAX_LEN
# ----------------------------
def describe_lengths(m: pd.DataFrame, name: str):
    L = m["length"].to_numpy(dtype=np.int32, copy=False)
    q = np.percentile(L, [0, 1, 5, 10, 25, 50, 75, 90, 95, 98, 99, 100])
    print(f"\n{name} length stats")
    print(f"- n_objects={len(L):,} | min={int(q[0])} | p50={int(q[5])} | p90={int(q[7])} | p95={int(q[8])} | p99={int(q[10])} | max={int(q[-1])}")
    return q

q_tr = describe_lengths(m_train, "TRAIN")
q_te = describe_lengths(m_test,  "TEST")

p95 = int(max(q_tr[8], q_te[8]))
if FORCE_MAX_LEN is not None:
    MAX_LEN = int(FORCE_MAX_LEN)
else:
    if p95 <= 256:
        MAX_LEN = 256
    elif p95 <= 384:
        MAX_LEN = 384
    else:
        MAX_LEN = 512

if MAX_LEN not in MAXLEN_CAPS and FORCE_MAX_LEN is None:
    # fallback to closest cap
    MAX_LEN = int(min(MAXLEN_CAPS, key=lambda x: abs(x - MAX_LEN)))

print(f"\n[Stage 6] MAX_LEN={MAX_LEN} (based on p95={p95})")

# ----------------------------
# 3) Window scoring (adaptive)
# ----------------------------
def _brightness_proxy_from_mag(mag: np.ndarray) -> np.ndarray:
    mag = np.nan_to_num(mag, nan=np.float32(0.0), posinf=np.float32(0.0), neginf=np.float32(0.0)).astype(np.float32, copy=False)
    if mag.size == 0:
        return np.zeros_like(mag, dtype=np.float32)
    med = np.float32(np.median(mag))
    br = np.maximum(med - mag, np.float32(0.0))
    br = np.log1p(br).astype(np.float32, copy=False)
    return br

def _score_tokens(X: np.ndarray) -> np.ndarray:
    snr = np.abs(X[:, feat["snr_tanh"]]).astype(np.float32, copy=False)
    det = X[:, feat["detected"]].astype(np.float32, copy=False)

    if SEQ_TOKEN_MODE == "asinh":
        val = np.abs(X[:, feat["flux_asinh"]]).astype(np.float32, copy=False)
    else:
        mag = X[:, feat["mag"]].astype(np.float32, copy=False)
        val = _brightness_proxy_from_mag(mag)

    score = (np.float32(W_SNR) * snr) + (np.float32(W_VAL) * val) + (np.float32(W_DET) * det)
    score = np.nan_to_num(score, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32, copy=False)
    return score

def select_best_window(score: np.ndarray, max_len: int) -> tuple[int, int]:
    """
    Pilih window kontigu panjang max_len yang memaksimalkan sum(score).
    O(L) pakai prefix sum.
    """
    L = int(score.shape[0])
    if L <= max_len:
        return 0, L

    # prefix sum
    cs = np.empty(L + 1, dtype=np.float32)
    cs[0] = 0.0
    np.cumsum(score, out=cs[1:])

    # window sums: cs[i+max_len]-cs[i], i=0..L-max_len
    ws = cs[max_len:] - cs[:-max_len]
    if not np.isfinite(ws).any():
        start = (L - max_len) // 2
    else:
        start = int(np.argmax(ws))
    end = start + max_len
    return start, end

def pad_to_fixed(X: np.ndarray, B: np.ndarray, max_len: int):
    """
    Returns:
      Xp: (max_len, F) float32
      Bp: (max_len,) int8
      Mp: (max_len,) int8  (1=real token)
      orig_len, win_start, win_end
    """
    L = int(X.shape[0])
    F = int(X.shape[1])

    Xp = np.zeros((max_len, F), dtype=DTYPE_X)
    Bp = np.full((max_len,), PAD_BAND_ID, dtype=np.int8)
    Mp = np.zeros((max_len,), dtype=np.int8)

    if L <= 0:
        return Xp, Bp, Mp, 0, 0, 0

    if L <= max_len:
        Xw = X
        Bw = B
        ws, we = 0, L
    else:
        sc = _score_tokens(X)
        ws, we = select_best_window(sc, max_len=max_len)
        Xw = X[ws:we]
        Bw = B[ws:we]

    lw = int(Xw.shape[0])
    Xp[:lw] = Xw.astype(DTYPE_X, copy=False)

    if SHIFT_BAND_IDS:
        # token band 1..K, pad=0
        Bp[:lw] = (Bw.astype(np.int16, copy=False) + 1).astype(np.int8)
    else:
        Bp[:lw] = Bw.astype(np.int8, copy=False)

    Mp[:lw] = 1
    return Xp, Bp, Mp, L, int(ws), int(we)

# ----------------------------
# 4) Fixed cache builder (process per shard once)
# ----------------------------
FIX_DIR = Path(ART_DIR) / "fixed_seq"
FIX_DIR.mkdir(parents=True, exist_ok=True)

# choose id ordering
train_ids = [str(x) for x in df_train_meta.index.to_list()]

# y column robust
_y_col = None
for cand in ["target", "y", "label", "class", "target_id"]:
    if cand in df_train_meta.columns:
        _y_col = cand
        break
if _y_col is None:
    raise RuntimeError(f"Cannot find target column in df_train_meta. cols={list(df_train_meta.columns)[:30]}")

y_train = pd.to_numeric(df_train_meta[_y_col], errors="coerce").fillna(0).astype(np.int16).to_numpy(copy=False)

# test ordering
test_ids = None
if "df_sub" in globals() and isinstance(globals()["df_sub"], pd.DataFrame) and "object_id" in df_sub.columns:
    test_ids = df_sub["object_id"].astype(str).str.strip().to_list()
else:
    # try sample_submission.csv from PATHS if exists
    if "PATHS" in globals() and isinstance(PATHS, dict):
        sp = PATHS.get("sample_submission", None) or PATHS.get("sample_sub", None)
        if sp and Path(sp).exists():
            _tmp = pd.read_csv(sp)
            if "object_id" in _tmp.columns:
                test_ids = _tmp["object_id"].astype(str).str.strip().to_list()
    if test_ids is None:
        test_ids = [str(x) for x in df_test_meta.index.to_list()]

# strict unique ids
if len(set(train_ids)) != len(train_ids):
    raise RuntimeError("train_ids contains duplicates. Check df_train_meta.index.")
if len(set(test_ids)) != len(test_ids):
    raise RuntimeError("test_ids contains duplicates. Check ordering source (df_sub/sample_sub/df_test_meta).")

train_row = {oid: i for i, oid in enumerate(train_ids)}
test_row  = {oid: i for i, oid in enumerate(test_ids)}

NTR = len(train_ids)
NTE = len(test_ids)
F = len(SEQ_FEATURE_NAMES)

# estimate disk usage
def _gb(nbytes): return float(nbytes) / (1024**3)
size_tr = NTR * MAX_LEN * F * np.dtype(DTYPE_X).itemsize
size_te = NTE * MAX_LEN * F * np.dtype(DTYPE_X).itemsize
print(f"\n[Stage 6] Memmap X sizes approx: train={_gb(size_tr):.2f} GB | test={_gb(size_te):.2f} GB | dtype={DTYPE_X}")

# memmap paths
train_X_path = FIX_DIR / "train_X.dat"
train_B_path = FIX_DIR / "train_B.dat"
train_M_path = FIX_DIR / "train_M.dat"
test_X_path  = FIX_DIR / "test_X.dat"
test_B_path  = FIX_DIR / "test_B.dat"
test_M_path  = FIX_DIR / "test_M.dat"

train_len_path = FIX_DIR / "train_origlen.npy"
train_ws_path  = FIX_DIR / "train_winstart.npy"
train_we_path  = FIX_DIR / "train_winend.npy"
test_len_path  = FIX_DIR / "test_origlen.npy"
test_ws_path   = FIX_DIR / "test_winstart.npy"
test_we_path   = FIX_DIR / "test_winend.npy"

# ----------------------------
# 4b) Rebuild handling
# ----------------------------
def _all_exist(paths):
    return all(Path(p).exists() for p in paths)

reuse_paths = [
    train_X_path, train_B_path, train_M_path,
    test_X_path, test_B_path, test_M_path,
    FIX_DIR / "train_ids.npy", FIX_DIR / "test_ids.npy", FIX_DIR / "train_y.npy",
    train_len_path, train_ws_path, train_we_path,
    test_len_path, test_ws_path, test_we_path,
    FIX_DIR / "length_policy_config.json"
]

if REBUILD_MODE == "reuse_if_exists" and _all_exist(reuse_paths):
    print("[Stage 6] REUSE (exists): fixed_seq cache already present.")
    globals().update({
        "FIX_DIR": FIX_DIR, "MAX_LEN": MAX_LEN,
        "FIX_TRAIN_X_PATH": train_X_path, "FIX_TRAIN_B_PATH": train_B_path, "FIX_TRAIN_M_PATH": train_M_path,
        "FIX_TEST_X_PATH": test_X_path,  "FIX_TEST_B_PATH": test_B_path,  "FIX_TEST_M_PATH": test_M_path,
        "FIX_TRAIN_Y_PATH": FIX_DIR / "train_y.npy",
        "FIX_TRAIN_IDS_PATH": FIX_DIR / "train_ids.npy",
        "FIX_TEST_IDS_PATH": FIX_DIR / "test_ids.npy",
        "FIX_POLICY_CFG_PATH": FIX_DIR / "length_policy_config.json",
        "SEQ_TOKEN_MODE": SEQ_TOKEN_MODE,
    })
    raise SystemExit

# ----------------------------
# 5) Create memmaps
# ----------------------------
Xtr = np.memmap(train_X_path, dtype=DTYPE_X, mode="w+", shape=(NTR, MAX_LEN, F))
Btr = np.memmap(train_B_path, dtype=np.int8,  mode="w+", shape=(NTR, MAX_LEN))
Mtr = np.memmap(train_M_path, dtype=np.int8,  mode="w+", shape=(NTR, MAX_LEN))

Xte = np.memmap(test_X_path, dtype=DTYPE_X, mode="w+", shape=(NTE, MAX_LEN, F))
Bte = np.memmap(test_B_path, dtype=np.int8,  mode="w+", shape=(NTE, MAX_LEN))
Mte = np.memmap(test_M_path, dtype=np.int8,  mode="w+", shape=(NTE, MAX_LEN))

origlen_tr = np.zeros((NTR,), dtype=np.int32)
winstart_tr = np.zeros((NTR,), dtype=np.int32)
winend_tr   = np.zeros((NTR,), dtype=np.int32)

origlen_te = np.zeros((NTE,), dtype=np.int32)
winstart_te = np.zeros((NTE,), dtype=np.int32)
winend_te   = np.zeros((NTE,), dtype=np.int32)

filled_tr = np.zeros((NTR,), dtype=np.uint8)
filled_te = np.zeros((NTE,), dtype=np.uint8)

# ----------------------------
# 6) Fill memmaps per shard (fast path)
# ----------------------------
def process_manifest_into_memmap(m: pd.DataFrame, which: str):
    if which == "train":
        row_map = train_row
        Xmm, Bmm, Mmm = Xtr, Btr, Mtr
        origlen, ws_arr, we_arr = origlen_tr, winstart_tr, winend_tr
        filled_mask = filled_tr
        expected_n = NTR
    else:
        row_map = test_row
        Xmm, Bmm, Mmm = Xte, Bte, Mte
        origlen, ws_arr, we_arr = origlen_te, winstart_te, winend_te
        filled_mask = filled_te
        expected_n = NTE

    # strict columns
    for c in ["object_id", "shard", "start", "length"]:
        if c not in m.columns:
            raise RuntimeError(f"Manifest missing column '{c}'. cols={list(m.columns)}")

    # check shard paths exist
    shard_paths = m["shard"].astype(str).unique().tolist()
    miss_sh = [p for p in shard_paths if not Path(p).exists()]
    if miss_sh:
        raise RuntimeError(f"Missing shard files ({which}): count={len(miss_sh)} | ex={miss_sh[:5]}")

    filled = 0
    dup = 0
    empty = 0

    t0 = time.time()
    # deterministic order: sort shard paths
    for shard_path in sorted(shard_paths):
        g = m[m["shard"].astype(str) == shard_path]
        if g.empty:
            continue

        data = np.load(shard_path, allow_pickle=False)
        x_all = data["x"]
        b_all = data["band"]

        # map object_id -> row index in fixed order
        oids = g["object_id"].astype(str).to_numpy()
        idxs = pd.Series(oids).map(row_map).to_numpy()  # object dtype with NaN

        # filter valid idx
        valid = np.isfinite(idxs)
        if not valid.any():
            del data
            continue

        idxs = idxs[valid].astype(np.int64)
        starts = g["start"].to_numpy(dtype=np.int64, copy=False)[valid]
        lens   = g["length"].to_numpy(dtype=np.int64, copy=False)[valid]
        oids_v = oids[valid]

        for oid, idx, st, ln in zip(oids_v, idxs, starts, lens):
            if ln <= 0:
                empty += 1
                continue
            if filled_mask[idx]:
                dup += 1
                continue

            X = x_all[st:st+ln]
            B = b_all[st:st+ln]
            Xp, Bp, Mp, L0, ws, we = pad_to_fixed(X, B, max_len=MAX_LEN)

            Xmm[idx, :, :] = Xp
            Bmm[idx, :] = Bp
            Mmm[idx, :] = Mp
            origlen[idx] = int(L0)
            ws_arr[idx] = int(ws)
            we_arr[idx] = int(we)
            filled_mask[idx] = 1
            filled += 1

        del data
        if filled % 2000 == 0:
            gc.collect()

    elapsed = time.time() - t0
    st = {"filled": int(filled), "dup_skipped": int(dup), "empty_len": int(empty), "time_s": float(elapsed), "expected": int(expected_n)}
    return st

print("\n[Stage 6] Building fixed cache (TRAIN)...")
st_tr = process_manifest_into_memmap(m_train, "train")
print(f"[Stage 6] TRAIN filled={st_tr['filled']:,}/{st_tr['expected']:,} | dup={st_tr['dup_skipped']:,} | empty={st_tr['empty_len']:,} | time={st_tr['time_s']:.2f}s")

print("\n[Stage 6] Building fixed cache (TEST)...")
st_te = process_manifest_into_memmap(m_test, "test")
print(f"[Stage 6] TEST  filled={st_te['filled']:,}/{st_te['expected']:,} | dup={st_te['dup_skipped']:,} | empty={st_te['empty_len']:,} | time={st_te['time_s']:.2f}s")

# flush memmaps
Xtr.flush(); Btr.flush(); Mtr.flush()
Xte.flush(); Bte.flush(); Mte.flush()

# ----------------------------
# 7) Hard sanity: must be 100% filled
# ----------------------------
miss_tr = np.where(filled_tr == 0)[0]
miss_te = np.where(filled_te == 0)[0]
if len(miss_tr) > 0:
    ex = [train_ids[i] for i in miss_tr[:10]]
    raise RuntimeError(f"[Stage 6] TRAIN missing filled rows: {len(miss_tr):,}/{NTR:,} | ex={ex}")
if len(miss_te) > 0:
    ex = [test_ids[i] for i in miss_te[:10]]
    raise RuntimeError(f"[Stage 6] TEST missing filled rows: {len(miss_te):,}/{NTE:,} | ex={ex}")

# ----------------------------
# 8) Save ids + y + meta arrays
# ----------------------------
np.save(FIX_DIR / "train_ids.npy", np.asarray(train_ids, dtype="S"))
np.save(FIX_DIR / "test_ids.npy",  np.asarray(test_ids, dtype="S"))
np.save(FIX_DIR / "train_y.npy",   y_train)

np.save(train_len_path, origlen_tr)
np.save(train_ws_path,  winstart_tr)
np.save(train_we_path,  winend_tr)

np.save(test_len_path,  origlen_te)
np.save(test_ws_path,   winstart_te)
np.save(test_we_path,   winend_te)

# ----------------------------
# 9) Quick sanity samples
# ----------------------------
def sanity_samples(which: str, n_show: int = 3, seed: int = 2025):
    rng = np.random.default_rng(seed)
    if which == "train":
        Xmm, Bmm, Mmm = Xtr, Btr, Mtr
        ids = train_ids
        ol = origlen_tr
    else:
        Xmm, Bmm, Mmm = Xte, Bte, Mte
        ids = test_ids
        ol = origlen_te

    idxs = rng.choice(len(ids), size=min(n_show, len(ids)), replace=False)
    print(f"\n[Stage 6] Sanity samples ({which}):")
    for i in idxs:
        kept = int(Mmm[i].sum())
        bands = sorted(set(Bmm[i, :kept].tolist())) if kept > 0 else []
        print(f"- idx={i} oid={ids[i]} orig_len={int(ol[i])} kept={kept} bands_unique={bands}")

sanity_samples("train", 3)
sanity_samples("test", 3)

# ----------------------------
# 10) Save config
# ----------------------------
policy_cfg = {
    "token_mode": SEQ_TOKEN_MODE,
    "max_len": int(MAX_LEN),
    "feature_names": list(SEQ_FEATURE_NAMES),
    "score_weights": {"W_SNR": float(W_SNR), "W_VAL": float(W_VAL), "W_DET": float(W_DET)},
    "score_value_feat": SCORE_VALUE_FEAT,
    "window_policy": "best_contiguous_window_by_max_sum(score)",
    "padding": {"PAD_BAND_ID": int(PAD_BAND_ID), "SHIFT_BAND_IDS": bool(SHIFT_BAND_IDS)},
    "dtype_X": str(DTYPE_X),
    "order": {
        "train": "df_train_meta.index",
        "test": ("df_sub.object_id" if ("df_sub" in globals() and isinstance(df_sub, pd.DataFrame) and "object_id" in df_sub.columns) else "df_test_meta.index / sample_submission fallback"),
        "y_col": str(_y_col),
    },
    "stats": {"train": st_tr, "test": st_te},
    "files": {
        "train_X": str(train_X_path), "train_B": str(train_B_path), "train_M": str(train_M_path),
        "test_X": str(test_X_path),   "test_B": str(test_B_path),   "test_M": str(test_M_path),
        "train_y": str(FIX_DIR / "train_y.npy"),
        "train_ids": str(FIX_DIR / "train_ids.npy"),
        "test_ids": str(FIX_DIR / "test_ids.npy"),
        "train_origlen": str(train_len_path), "train_winstart": str(train_ws_path), "train_winend": str(train_we_path),
        "test_origlen": str(test_len_path),   "test_winstart": str(test_ws_path),   "test_winend": str(test_we_path),
    }
}
cfg_path = FIX_DIR / "length_policy_config.json"
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump(policy_cfg, f, indent=2)

print("\n[Stage 6] DONE")
print(f"- FIX_DIR: {FIX_DIR}")
print(f"- Saved config: {cfg_path}")

# Export globals
globals().update({
    "FIX_DIR": FIX_DIR,
    "MAX_LEN": MAX_LEN,
    "FIX_TRAIN_X_PATH": train_X_path,
    "FIX_TRAIN_B_PATH": train_B_path,
    "FIX_TRAIN_M_PATH": train_M_path,
    "FIX_TEST_X_PATH": test_X_path,
    "FIX_TEST_B_PATH": test_B_path,
    "FIX_TEST_M_PATH": test_M_path,
    "FIX_TRAIN_Y_PATH": FIX_DIR / "train_y.npy",
    "FIX_TRAIN_IDS_PATH": FIX_DIR / "train_ids.npy",
    "FIX_TEST_IDS_PATH": FIX_DIR / "test_ids.npy",
    "FIX_POLICY_CFG_PATH": cfg_path,
    "SEQ_TOKEN_MODE": SEQ_TOKEN_MODE,
})

gc.collect()


[Stage 6] token_mode=mag | score_value_feat=mag | F=6

TRAIN length stats
- n_objects=3,043 | min=17 | p50=150 | p90=183 | p95=194 | p99=908 | max=1164

TEST length stats
- n_objects=7,135 | min=18 | p50=152 | p90=183 | p95=193 | p99=990 | max=1186

[Stage 6] MAX_LEN=256 (based on p95=194)

[Stage 6] Memmap X sizes approx: train=0.02 GB | test=0.04 GB | dtype=<class 'numpy.float32'>

[Stage 6] Building fixed cache (TRAIN)...
[Stage 6] TRAIN filled=3,043/3,043 | dup=0 | empty=0 | time=0.22s

[Stage 6] Building fixed cache (TEST)...
[Stage 6] TEST  filled=7,135/7,135 | dup=0 | empty=0 | time=0.44s

[Stage 6] Sanity samples (train):
- idx=1360 oid=gwilwileth_adel_amloth orig_len=157 kept=157 bands_unique=[0, 1, 2, 3, 4, 5]
- idx=3020 oid=vin_araf_gwador orig_len=151 kept=151 bands_unique=[0, 1, 2, 3, 4, 5]
- idx=3025 oid=ylf_alph_mindon orig_len=167 kept=167 bands_unique=[0, 1, 2, 3, 4, 5]

[Stage 6] Sanity samples (test):
- idx=3191 oid=rom_bellas_lebdas orig_len=142 kept=142 bands_uniqu

616

# CV Split (Object-Level, Stratified)

In [9]:
# ============================================================
# STAGE 7 — CV Split (Object-Level, Stratified) (ONE CELL, Kaggle CPU-SAFE) — REVISI FULL v2
#
# Tujuan:
# - Buat split CV di level object_id (bukan per baris lightcurve)
# - Konsisten dengan urutan TRAIN yang dipakai di STAGE 6 (fixed_seq/train_ids.npy)
#
# Upgrade v2:
# - Auto-find FIX_DIR/train_ids.npy (kalau FIX_DIR tidak ada tapi folder fixed_seq ada di ART_DIR)
# - Robust decode bytes->str + strip
# - Target column robust (auto-detect: target/y/label/class)
# - Group column robust (auto-detect: split/split_id/split_name) jika USE_GROUP_BY_SPLIT=True
# - n_splits adaptif + ENFORCE_MIN_POS_PER_FOLD (turun otomatis sampai valid)
# - Fallback otomatis: kalau StratifiedGroupKFold gagal/ tidak tersedia -> StratifiedKFold (opsional)
# - Validasi keras: missing ids, duplikasi, fold tanpa kelas, fold_assign -1
#
# Output:
# - artifacts/cv/cv_folds.csv
# - artifacts/cv/cv_folds.npz   (train_idx_f + val_idx_f)
# - artifacts/cv/cv_report.txt
# - artifacts/cv/cv_config.json
# - globals: fold_assign, folds, n_splits, CV_DIR
# ============================================================

import gc, json, os, time
from pathlib import Path
import numpy as np
import pandas as pd

# ----------------------------
# 0) Require minimal globals
# ----------------------------
for need in ["df_train_meta", "ART_DIR"]:
    if need not in globals():
        raise RuntimeError(f"Missing `{need}`. Jalankan STAGE 2 dulu (df_train_meta & ART_DIR).")

SEED = int(globals().get("SEED", 2025))
ART_DIR = Path(ART_DIR)

# ----------------------------
# 1) CV Settings
# ----------------------------
DEFAULT_SPLITS = 5
FORCE_N_SPLITS = None              # set int kalau mau paksa (mis. 3), else None
MIN_POS_PER_FOLD = 3               # stabilitas; 3–10 umum
ENFORCE_MIN_POS_PER_FOLD = True    # kalau True: n_splits otomatis diturunkan sampai min_pos>=MIN_POS_PER_FOLD (atau minimal valid 1)
USE_GROUP_BY_SPLIT = False         # True => prefer StratifiedGroupKFold (groups=df_train_meta["split"])
AUTO_FALLBACK_GROUP = True         # True => kalau group-cv tidak bisa, fallback ke StratifiedKFold

print(f"[Stage 7] seed={SEED} | default_splits={DEFAULT_SPLITS} | MIN_POS_PER_FOLD={MIN_POS_PER_FOLD} "
      f"| enforce_minpos={ENFORCE_MIN_POS_PER_FOLD} | group_by_split={USE_GROUP_BY_SPLIT} | fallback_group={AUTO_FALLBACK_GROUP}")

# ----------------------------
# 2) Determine train_ids ordering (prefer fixed cache from STAGE 6)
# ----------------------------
train_ids = None
order_source = "df_train_meta.index"

def _decode_ids(arr) -> list:
    out = []
    for x in arr.tolist():
        if isinstance(x, (bytes, bytearray)):
            s = x.decode("utf-8", errors="ignore")
        else:
            s = str(x)
        out.append(s.strip())
    return out

# try: FIX_DIR/train_ids.npy
p_ids = None
if "FIX_DIR" in globals():
    p = Path(globals()["FIX_DIR"]) / "train_ids.npy"
    if p.exists():
        p_ids = p

# try: ART_DIR/fixed_seq/train_ids.npy
if p_ids is None:
    p = ART_DIR / "fixed_seq" / "train_ids.npy"
    if p.exists():
        p_ids = p

if p_ids is not None:
    raw = np.load(p_ids, allow_pickle=False)
    train_ids = _decode_ids(raw)
    order_source = str(p_ids)

if train_ids is None:
    # fallback
    train_ids = [str(x).strip() for x in df_train_meta.index.astype(str).tolist()]
    order_source = "df_train_meta.index"

# uniqueness check
if len(train_ids) != len(set(train_ids)):
    s = pd.Series(train_ids)
    dup = s[s.duplicated()].iloc[:10].tolist()
    raise RuntimeError(f"[Stage 7] train_ids has duplicates (examples): {dup}")

# ensure all ids exist in df_train_meta
missing_in_meta = [oid for oid in train_ids if oid not in df_train_meta.index]
if missing_in_meta:
    raise RuntimeError(f"[Stage 7] Some train_ids not found in df_train_meta (examples): {missing_in_meta[:10]}")

# ----------------------------
# 3) Robust target column
# ----------------------------
target_col = None
for cand in ["target", "y", "label", "class", "is_tde", "binary_target"]:
    if cand in df_train_meta.columns:
        target_col = cand
        break
if target_col is None:
    raise RuntimeError(f"[Stage 7] Cannot find target column in df_train_meta. cols(sample)={list(df_train_meta.columns)[:30]}")

y = pd.to_numeric(df_train_meta.loc[train_ids, target_col], errors="coerce").fillna(0).astype(np.int16).to_numpy(copy=False)
# force to 0/1 if it looks like boolean-ish
y = (y > 0).astype(np.int8)

N = len(train_ids)
pos = int((y == 1).sum())
neg = int((y == 0).sum())
if pos == 0 or neg == 0:
    raise RuntimeError(f"[Stage 7] Invalid class distribution: pos={pos}, neg={neg}. Cannot do stratified CV.")

# ----------------------------
# 4) Optional groups (by split)
# ----------------------------
groups = None
group_col = None
if USE_GROUP_BY_SPLIT:
    # robust group column detection
    for cand in ["split", "split_id", "split_name", "split_idx"]:
        if cand in df_train_meta.columns:
            group_col = cand
            break
    if group_col is None:
        if not AUTO_FALLBACK_GROUP:
            raise RuntimeError("[Stage 7] USE_GROUP_BY_SPLIT=True but no split column found in df_train_meta.")
        print("[Stage 7] WARN: split column not found; fallback to StratifiedKFold.")
        USE_GROUP_BY_SPLIT = False
    else:
        groups = df_train_meta.loc[train_ids, group_col].astype(str).to_numpy()

# ----------------------------
# 5) Choose n_splits safely + auto-adjust
# ----------------------------
max_splits_by_pos = pos
max_splits_by_neg = neg
max_splits_by_minpos = max(1, pos // max(int(MIN_POS_PER_FOLD), 1))

n0 = min(DEFAULT_SPLITS, max_splits_by_pos, max_splits_by_neg, max_splits_by_minpos)
if FORCE_N_SPLITS is not None:
    n0 = int(FORCE_N_SPLITS)

if n0 < 2:
    raise RuntimeError(
        f"[Stage 7] Too few samples for CV. pos={pos}, neg={neg}, MIN_POS_PER_FOLD={MIN_POS_PER_FOLD} => n_splits={n0}. "
        "Turunkan MIN_POS_PER_FOLD atau pakai holdout."
    )

print(f"[Stage 7] Initial n_splits candidate={n0} | N={N:,} pos={pos:,} neg={neg:,} pos%={pos/max(N,1)*100:.6f}% | order_source={order_source}")

# ----------------------------
# 6) Build folds (sklearn) with robust fallback
# ----------------------------
try:
    from sklearn.model_selection import StratifiedKFold
    try:
        from sklearn.model_selection import StratifiedGroupKFold
    except Exception:
        StratifiedGroupKFold = None
except Exception as e:
    raise RuntimeError("scikit-learn is not available in this environment.") from e

def _try_split(k: int, use_group: bool):
    """Return (ok, cv_type, fold_assign, folds, per_fold_posneg)"""
    fold_assign = np.full(N, -1, dtype=np.int16)
    folds = []
    per = []

    if use_group:
        if StratifiedGroupKFold is None:
            return (False, "StratifiedGroupKFold(unavailable)", None, None, None)
        splitter = StratifiedGroupKFold(n_splits=k, shuffle=True, random_state=SEED)
        split_iter = splitter.split(np.zeros(N), y, groups=groups)
        cv_type = f"StratifiedGroupKFold({group_col})"
    else:
        splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=SEED)
        split_iter = splitter.split(np.zeros(N), y)
        cv_type = "StratifiedKFold"

    try:
        for fold, (tr_idx, val_idx) in enumerate(split_iter):
            fold_assign[val_idx] = fold
            yf = y[val_idx]
            pf = int((yf == 1).sum())
            nf = int((yf == 0).sum())
            per.append((len(val_idx), pf, nf))
            folds.append({
                "fold": int(fold),
                "train_idx": tr_idx.astype(np.int32),
                "val_idx": val_idx.astype(np.int32),
            })
    except Exception as e:
        return (False, f"{cv_type} (error: {type(e).__name__})", None, None, None)

    if (fold_assign < 0).any():
        return (False, f"{cv_type} (unassigned)", None, None, None)

    # hard check: each fold must have pos>=1 and neg>=1
    for (_, pf, nf) in per:
        if pf == 0 or nf == 0:
            return (False, f"{cv_type} (empty class in fold)", None, None, None)

    return (True, cv_type, fold_assign, folds, per)

# attempt with decreasing k
best = None
used_group = bool(USE_GROUP_BY_SPLIT)

for k in range(n0, 1, -1):
    ok, cv_type, fa, folds, per = _try_split(k, use_group=used_group)
    if not ok and used_group and AUTO_FALLBACK_GROUP:
        # try fallback non-group for same k
        ok2, cv_type2, fa2, folds2, per2 = _try_split(k, use_group=False)
        if ok2:
            ok, cv_type, fa, folds, per = ok2, cv_type2, fa2, folds2, per2
            used_group = False

    if not ok:
        continue

    min_pos_seen = min(pf for (_, pf, _) in per) if per else 0
    if ENFORCE_MIN_POS_PER_FOLD and (min_pos_seen < MIN_POS_PER_FOLD) and (FORCE_N_SPLITS is None):
        # k too big for minpos requirement, keep searching smaller k
        continue

    best = (k, cv_type, fa, folds, per, min_pos_seen)
    break

# if enforce failed completely, pick the first valid that has pos>=1 per fold (warn)
if best is None:
    for k in range(n0, 1, -1):
        ok, cv_type, fa, folds, per = _try_split(k, use_group=bool(USE_GROUP_BY_SPLIT))
        if not ok and USE_GROUP_BY_SPLIT and AUTO_FALLBACK_GROUP:
            ok, cv_type, fa, folds, per = _try_split(k, use_group=False)
        if ok:
            min_pos_seen = min(pf for (_, pf, _) in per) if per else 0
            best = (k, cv_type, fa, folds, per, min_pos_seen)
            print(f"[Stage 7] NOTE: Could not satisfy MIN_POS_PER_FOLD={MIN_POS_PER_FOLD}. Using k={k} with min_pos={min_pos_seen}.")
            break

if best is None:
    raise RuntimeError("[Stage 7] Failed to build a valid stratified CV split. Try smaller DEFAULT_SPLITS / FORCE_N_SPLITS, or disable group_by_split.")

n_splits, cv_type, fold_assign, folds, per, min_pos_seen = best

# ----------------------------
# 7) Report + validation lines
# ----------------------------
lines = []
lines.append(f"CV={cv_type} n_splits={n_splits} seed={SEED}")
lines.append(f"Order source: {order_source}")
lines.append(f"Target column: {target_col}")
lines.append(f"Total: N={N} | pos={pos} | neg={neg} | pos%={pos/max(N,1)*100:.6f}%")
if USE_GROUP_BY_SPLIT:
    lines.append(f"Group col requested: {group_col} | used_group={('Group' in cv_type)}")
lines.append("Per-fold distribution (val):")

ok = True
for f in range(n_splits):
    idx = np.where(fold_assign == f)[0]
    yf = y[idx]
    pf = int((yf == 1).sum())
    nf = int((yf == 0).sum())
    lines.append(f"- fold {f}: n={len(idx):6d} | pos={pf:5d} | neg={nf:6d} | pos%={(pf/max(len(idx),1))*100:9.6f}%")
    if pf == 0 or nf == 0:
        ok = False

if not ok:
    raise RuntimeError("[Stage 7] A fold has pos=0 or neg=0 after selection (should not happen).")

if min_pos_seen < MIN_POS_PER_FOLD:
    lines.append(f"NOTE: min positives in a fold = {min_pos_seen} (< MIN_POS_PER_FOLD={MIN_POS_PER_FOLD}). "
                 "Threshold/F1 tuning bisa noisy; pertimbangkan n_splits lebih kecil.")

print(f"[Stage 7] FINAL: n_splits={n_splits} | cv_type={cv_type} | min_pos_in_fold={min_pos_seen}")

# ----------------------------
# 8) Save artifacts
# ----------------------------
CV_DIR = ART_DIR / "cv"
CV_DIR.mkdir(parents=True, exist_ok=True)

df_folds = pd.DataFrame({"object_id": train_ids, "fold": fold_assign.astype(int)})
folds_csv = CV_DIR / "cv_folds.csv"
df_folds.to_csv(folds_csv, index=False)

npz_path = CV_DIR / "cv_folds.npz"
npz_kwargs = {}
for f in range(n_splits):
    npz_kwargs[f"train_idx_{f}"] = folds[f]["train_idx"]
    npz_kwargs[f"val_idx_{f}"]   = folds[f]["val_idx"]
np.savez(npz_path, **npz_kwargs)

report_path = CV_DIR / "cv_report.txt"
with open(report_path, "w", encoding="utf-8") as f:
    f.write("\n".join(lines) + "\n")

cfg_path = CV_DIR / "cv_config.json"
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump(
        {
            "seed": SEED,
            "n_splits": int(n_splits),
            "cv_type": cv_type,
            "min_pos_per_fold": int(MIN_POS_PER_FOLD),
            "enforce_min_pos_per_fold": bool(ENFORCE_MIN_POS_PER_FOLD),
            "use_group_by_split_requested": bool(globals().get("USE_GROUP_BY_SPLIT", USE_GROUP_BY_SPLIT)),
            "order_source": order_source,
            "target_col": target_col,
            "group_col": group_col,
            "artifacts": {
                "folds_csv": str(folds_csv),
                "folds_npz": str(npz_path),
                "report_txt": str(report_path),
            },
        },
        f,
        indent=2,
    )

print("\n[Stage 7] CV split OK")
print(f"- Saved: {folds_csv}")
print(f"- Saved: {npz_path}")
print(f"- Saved: {report_path}")
print(f"- Saved: {cfg_path}")
print("\n".join(lines[-(n_splits + 4):]))

# ----------------------------
# 9) Export globals for next stage
# ----------------------------
globals().update({
    "CV_DIR": CV_DIR,
    "n_splits": int(n_splits),
    "train_ids_ordered": train_ids,
    "y_ordered": y,
    "fold_assign": fold_assign,
    "folds": folds,
    "CV_FOLDS_CSV": folds_csv,
    "CV_FOLDS_NPZ": npz_path,
    "CV_CFG_PATH": cfg_path,
    "CV_TYPE": cv_type,
})

gc.collect()


[Stage 7] seed=2025 | default_splits=5 | MIN_POS_PER_FOLD=3 | enforce_minpos=True | group_by_split=False | fallback_group=True
[Stage 7] Initial n_splits candidate=5 | N=3,043 pos=148 neg=2,895 pos%=4.863621% | order_source=/kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/fixed_seq/train_ids.npy
[Stage 7] FINAL: n_splits=5 | cv_type=StratifiedKFold | min_pos_in_fold=29

[Stage 7] CV split OK
- Saved: /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/cv/cv_folds.csv
- Saved: /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/cv/cv_folds.npz
- Saved: /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/cv/cv_report.txt
- Saved: /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/cv/cv_config.json
Order source: /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/fixed_seq/train_ids.npy
Target column: target
Total: N=3043 | pos=148 | neg=2895 | pos%=4.863621%
Per-fold distribution (va

33

# Train Model (CPU-Safe Configuration)

In [10]:
# ============================================================
# STAGE 8 — Train Multiband Event Transformer (CPU-Safe) — REVISI FULL v3 (BOOST + NO-LEAK)
#
# Upgrade v3 (dibanding v2):
# - Tambah AGG_SEQ_FEATURES: statistik token per object (global + per-band) => biasanya naik performa signifikan
# - Early stopping by ROC-AUC (lebih stabil untuk imbalance), tetap log loss + F1@0.5
# - OneCycleLR scheduler + grad_accum remainder aman
# - Optional WeightedRandomSampler (imbalance fix) + light augmentation (token-drop + noise)
# - Robust target column detection
# - CKPT_DIR/OOF_DIR/LOG_DIR auto-create tetap
# ============================================================

import os, gc, json, math, time, warnings
from pathlib import Path

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)

# ----------------------------
# 0) Require minimal previous stages
# ----------------------------
need_min = ["FIX_DIR","MAX_LEN","SEQ_FEATURE_NAMES","df_train_meta","n_splits","folds"]
for k in need_min:
    if k not in globals():
        raise RuntimeError(f"Missing `{k}`. Jalankan STAGE 0..7 dulu dengan urutan benar.")

# ----------------------------
# 0a) Resolve train_ids ordering + labels (robust)
# ----------------------------
def _decode_ids(arr):
    out = []
    for x in arr.tolist():
        if isinstance(x, (bytes, bytearray)):
            s = x.decode("utf-8", errors="ignore")
        else:
            s = str(x)
        out.append(s.strip())
    return out

# ordering
if "train_ids_ordered" in globals() and globals()["train_ids_ordered"] is not None:
    train_ids = list(globals()["train_ids_ordered"])
else:
    p = Path(globals()["FIX_DIR"]) / "train_ids.npy"
    if p.exists():
        raw = np.load(p, allow_pickle=False)
        train_ids = _decode_ids(raw if raw.dtype.kind in ("S","O") else raw.astype(str))
    else:
        train_ids = df_train_meta.index.astype(str).tolist()

# target column robust
target_col = None
for cand in ["target","y","label","class","is_tde","binary_target"]:
    if cand in df_train_meta.columns:
        target_col = cand
        break
if target_col is None:
    raise RuntimeError(f"Cannot find target column in df_train_meta. cols(sample)={list(df_train_meta.columns)[:40]}")

y = pd.to_numeric(df_train_meta.loc[train_ids, target_col], errors="coerce").fillna(0).astype(np.int16).to_numpy()
y = (y > 0).astype(np.int8)

# ----------------------------
# 0b) Ensure output dirs exist
# ----------------------------
if "RUN_DIR" in globals() and globals()["RUN_DIR"] is not None:
    RUN_DIR = Path(globals()["RUN_DIR"])
else:
    if "ART_DIR" in globals() and globals()["ART_DIR"] is not None:
        RUN_DIR = Path(globals()["ART_DIR"]).parent
    else:
        RUN_DIR = Path("/kaggle/working/mallorn_run")

ART_DIR = Path(globals().get("ART_DIR", RUN_DIR / "artifacts"))
ART_DIR.mkdir(parents=True, exist_ok=True)

CKPT_DIR = Path(globals().get("CKPT_DIR", RUN_DIR / "checkpoints"))
OOF_DIR  = Path(globals().get("OOF_DIR",  RUN_DIR / "oof"))
LOG_DIR  = Path(globals().get("LOG_DIR",  RUN_DIR / "logs"))
CKPT_DIR.mkdir(parents=True, exist_ok=True)
OOF_DIR.mkdir(parents=True, exist_ok=True)
LOG_DIR.mkdir(parents=True, exist_ok=True)

globals().update({"RUN_DIR": RUN_DIR, "ART_DIR": ART_DIR, "CKPT_DIR": CKPT_DIR, "OOF_DIR": OOF_DIR, "LOG_DIR": LOG_DIR})

# ----------------------------
# 1) Torch imports + CPU safety
# ----------------------------
try:
    import torch
    import torch.nn as nn
except Exception as e:
    raise RuntimeError("PyTorch tidak tersedia di environment ini.") from e

SEED = int(globals().get("SEED", 2025))
torch.manual_seed(SEED)
np.random.seed(SEED)

device = torch.device("cpu")

# thread guard
try:
    torch.set_num_threads(int(os.environ.get("OMP_NUM_THREADS", "2")))
    torch.set_num_interop_threads(1)
except Exception:
    pass

# sklearn metrics for AUC
try:
    from sklearn.metrics import roc_auc_score
except Exception as e:
    raise RuntimeError("scikit-learn metrics tidak tersedia.") from e

# ----------------------------
# 2) Open memmaps (fixed seq) — NO RAM load
# ----------------------------
FIX_DIR = Path(globals()["FIX_DIR"])
N = len(train_ids)
L = int(globals()["MAX_LEN"])
SEQ_FEATURE_NAMES = list(globals()["SEQ_FEATURE_NAMES"])
Fdim = len(SEQ_FEATURE_NAMES)
feat = {n:i for i,n in enumerate(SEQ_FEATURE_NAMES)}

train_X_path = FIX_DIR / "train_X.dat"
train_B_path = FIX_DIR / "train_B.dat"
train_M_path = FIX_DIR / "train_M.dat"

for p in [train_X_path, train_B_path, train_M_path]:
    if not p.exists():
        raise FileNotFoundError(f"Missing fixed cache file: {p}. Pastikan STAGE 6 sukses.")

X_mm = np.memmap(train_X_path, dtype=np.float32, mode="r", shape=(N, L, Fdim))
B_mm = np.memmap(train_B_path, dtype=np.int8,   mode="r", shape=(N, L))
M_mm = np.memmap(train_M_path, dtype=np.int8,   mode="r", shape=(N, L))

# detect token mode
SEQ_TOKEN_MODE = globals().get("SEQ_TOKEN_MODE", None)
if SEQ_TOKEN_MODE is None:
    if ("mag" in feat) and ("mag_err_log" in feat):
        SEQ_TOKEN_MODE = "mag"
    elif ("flux_asinh" in feat) and ("err_log1p" in feat):
        SEQ_TOKEN_MODE = "asinh"
    else:
        raise RuntimeError(f"Cannot infer token_mode from features: {SEQ_FEATURE_NAMES}")

# required
for k in ["snr_tanh","detected"]:
    if k not in feat:
        raise RuntimeError(f"Feature '{k}' not found in SEQ_FEATURE_NAMES.")

VAL_FEAT = "mag" if SEQ_TOKEN_MODE == "mag" else "flux_asinh"
if VAL_FEAT not in feat:
    raise RuntimeError(f"Feature '{VAL_FEAT}' missing for token_mode={SEQ_TOKEN_MODE}.")

# ----------------------------
# 3) Build RAW meta global features (NO scaling)
# ----------------------------
BASE_G_COLS = ["Z","Z_err","EBV","Z_missing","Z_err_missing","EBV_missing","is_photoz"]
for c in BASE_G_COLS:
    if c not in df_train_meta.columns:
        df_train_meta[c] = 0.0

G_meta = df_train_meta.loc[train_ids, BASE_G_COLS].copy()
for c in BASE_G_COLS:
    G_meta[c] = pd.to_numeric(G_meta[c], errors="coerce").fillna(0.0).astype(np.float32)

G_meta_np = G_meta.to_numpy(dtype=np.float32, copy=False)

with open(Path(LOG_DIR)/"global_meta_cols.json", "w", encoding="utf-8") as f:
    json.dump({"cols": BASE_G_COLS}, f, indent=2)

# ----------------------------
# 3b) NEW: Sequence aggregate features (global + per-band) — BIG BOOST in many cases
# ----------------------------
USE_AGG_SEQ_FEATURES = True
N_BANDS = 6

def _safe_div(a, b):
    return a / np.maximum(b, 1.0)

def build_agg_seq_features(X_mm, B_mm, M_mm, chunk=2048):
    """
    Returns agg (N, agg_dim) float32.
    Global:
      - tok_count, det_frac, mean_abs_snr, max_abs_snr,
      - value stats:
         mag  : mean_mag, std_mag, min_mag
         asinh: mean_abs_flux, std_flux, max_abs_flux
    Per-band (for each band 0..5):
      - count_b, det_frac_b, mean_abs_snr_b, mean_val_b
    """
    snr_i = feat["snr_tanh"]
    det_i = feat["detected"]
    val_i = feat[VAL_FEAT]

    agg_list = []
    for start in range(0, N, chunk):
        end = min(N, start+chunk)
        Xc = np.asarray(X_mm[start:end])  # (B,L,F)
        Bc = np.asarray(B_mm[start:end])  # (B,L)
        Mc = np.asarray(M_mm[start:end])  # (B,L)

        real = (Mc == 1)
        tok_count = real.sum(axis=1).astype(np.float32)  # (B,)

        snr = np.abs(Xc[:, :, snr_i]).astype(np.float32)
        det = (Xc[:, :, det_i] > 0.5).astype(np.float32)
        val = Xc[:, :, val_i].astype(np.float32)

        # mask apply
        snr_r = snr * real
        det_r = det * real

        det_frac = _safe_div(det_r.sum(axis=1), tok_count)
        mean_abs_snr = _safe_div(snr_r.sum(axis=1), tok_count)
        max_abs_snr = np.where(tok_count > 0, (snr * real).max(axis=1), 0.0).astype(np.float32)

        if SEQ_TOKEN_MODE == "mag":
            # for mag: keep only real tokens
            val_r = np.where(real, val, np.nan)
            mean_val = np.nanmean(val_r, axis=1).astype(np.float32)
            std_val  = np.nanstd(val_r, axis=1).astype(np.float32)
            min_val  = np.nanmin(val_r, axis=1).astype(np.float32)
            mean_val = np.nan_to_num(mean_val, nan=0.0).astype(np.float32)
            std_val  = np.nan_to_num(std_val,  nan=0.0).astype(np.float32)
            min_val  = np.nan_to_num(min_val,  nan=0.0).astype(np.float32)
            global_val_feats = np.stack([mean_val, std_val, min_val], axis=1)
        else:
            # asinh: abs flux is more informative
            aval = np.abs(val)
            aval_r = aval * real
            mean_aval = _safe_div(aval_r.sum(axis=1), tok_count)
            # std flux (non-abs) on real tokens
            val_r = np.where(real, val, np.nan)
            std_val = np.nanstd(val_r, axis=1).astype(np.float32)
            max_aval = np.where(tok_count > 0, (aval * real).max(axis=1), 0.0).astype(np.float32)
            std_val = np.nan_to_num(std_val, nan=0.0).astype(np.float32)
            global_val_feats = np.stack([mean_aval.astype(np.float32), std_val, max_aval], axis=1)

        # per-band features
        per_band = []
        for b in range(N_BANDS):
            bm = (Bc == b) & real
            cnt = bm.sum(axis=1).astype(np.float32)
            detb = (det * bm).sum(axis=1).astype(np.float32)
            snrb = (snr * bm).sum(axis=1).astype(np.float32)

            det_frac_b = _safe_div(detb, cnt)
            mean_abs_snr_b = _safe_div(snrb, cnt)

            if SEQ_TOKEN_MODE == "mag":
                val_b = np.where(bm, val, np.nan)
                mean_val_b = np.nanmean(val_b, axis=1).astype(np.float32)
                mean_val_b = np.nan_to_num(mean_val_b, nan=0.0).astype(np.float32)
            else:
                aval_b = np.abs(val) * bm
                mean_val_b = _safe_div(aval_b.sum(axis=1).astype(np.float32), cnt)

            per_band.append(np.stack([cnt, det_frac_b, mean_abs_snr_b, mean_val_b], axis=1))

        per_band = np.concatenate(per_band, axis=1).astype(np.float32)

        glob = np.stack([tok_count, det_frac, mean_abs_snr, max_abs_snr], axis=1).astype(np.float32)
        agg = np.concatenate([glob, global_val_feats.astype(np.float32), per_band], axis=1).astype(np.float32)
        agg_list.append(agg)

        del Xc, Bc, Mc
        if (start // chunk) % 3 == 0:
            gc.collect()

    agg_all = np.concatenate(agg_list, axis=0).astype(np.float32)
    return agg_all

if USE_AGG_SEQ_FEATURES:
    print("[Stage 8] Building AGG sequence features (one-time)...")
    t0 = time.time()
    G_seq_np = build_agg_seq_features(X_mm, B_mm, M_mm, chunk=2048)
    print(f"[Stage 8] AGG built: shape={G_seq_np.shape} | time={time.time()-t0:.1f}s")
else:
    G_seq_np = np.zeros((N,0), dtype=np.float32)

# final global raw matrix
G_raw_np = np.concatenate([G_meta_np, G_seq_np], axis=1).astype(np.float32)
g_dim = int(G_raw_np.shape[1])

with open(Path(LOG_DIR)/"global_feature_spec.json", "w", encoding="utf-8") as f:
    json.dump(
        {
            "meta_cols": BASE_G_COLS,
            "use_agg_seq": bool(USE_AGG_SEQ_FEATURES),
            "token_mode": SEQ_TOKEN_MODE,
            "val_feat": VAL_FEAT,
            "agg_dim": int(G_seq_np.shape[1]),
            "total_g_dim": int(g_dim),
            "per_band_block": ["count","det_frac","mean_abs_snr","mean_val"],
            "global_block": ["tok_count","det_frac","mean_abs_snr","max_abs_snr","val_stat1","val_stat2","val_stat3"],
        },
        f,
        indent=2,
    )

# ----------------------------
# 4) Dataset / Loader (num_workers=0) + optional augmentation
# ----------------------------
AUG_TOKENDROP_P = 0.05     # 0.0 disable
AUG_VALUE_NOISE = 0.01     # 0.0 disable (mag or flux noise, small)

class MemmapSeqDataset(torch.utils.data.Dataset):
    def __init__(self, idx, X_mm, B_mm, M_mm, G_scaled_np, y=None, train_mode=False):
        self.idx = np.asarray(idx, dtype=np.int32)
        self.X_mm = X_mm
        self.B_mm = B_mm
        self.M_mm = M_mm
        self.G = G_scaled_np
        self.y = None if y is None else np.asarray(y, dtype=np.int8)
        self.train_mode = bool(train_mode)
        self.rng = np.random.default_rng(SEED + (123 if train_mode else 0))

    def __len__(self):
        return len(self.idx)

    def __getitem__(self, i):
        j = int(self.idx[i])
        X = np.array(self.X_mm[j], copy=False)  # (L,F)
        B = np.array(self.B_mm[j], copy=False)  # (L,)
        M = np.array(self.M_mm[j], copy=False)  # (L,)
        G = np.array(self.G[j], copy=False)     # (g_dim,)

        if self.train_mode:
            # token dropout: randomly mask out a small fraction of real tokens (avoid all-pad)
            if AUG_TOKENDROP_P and AUG_TOKENDROP_P > 0:
                real = (M == 1)
                if real.any():
                    drop = (self.rng.random(real.shape[0]) < AUG_TOKENDROP_P) & real
                    # keep at least 1 token
                    if drop.all():
                        drop[self.rng.integers(0, real.sum())] = False
                    M = M.copy()
                    M[drop] = 0

            # small value noise on real tokens
            if AUG_VALUE_NOISE and AUG_VALUE_NOISE > 0:
                vi = feat[VAL_FEAT]
                real = (M == 1)
                if real.any():
                    X = X.copy()
                    noise = self.rng.normal(0.0, AUG_VALUE_NOISE, size=real.sum()).astype(np.float32)
                    X[real, vi] = (X[real, vi] + noise).astype(np.float32)

        Xt = torch.from_numpy(X.astype(np.float32, copy=False))
        Bt = torch.from_numpy(B.astype(np.int64, copy=False))
        Mt = torch.from_numpy(M.astype(np.int64, copy=False))
        Gt = torch.from_numpy(G.astype(np.float32, copy=False))

        if self.y is None:
            return Xt, Bt, Mt, Gt

        yy = float(self.y[j])
        return Xt, Bt, Mt, Gt, torch.tensor(yy, dtype=torch.float32)

def make_loader(ds, batch_size, shuffle, sampler=None):
    return torch.utils.data.DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=(sampler is None and shuffle),
        sampler=sampler,
        num_workers=0,
        pin_memory=False,
        drop_last=False,
    )

# ----------------------------
# 5) Model — slightly stronger + stable pooling
# ----------------------------
class MultibandEventTransformer(nn.Module):
    def __init__(self, feat_dim, max_len, n_bands=6, d_model=160, n_heads=4, n_layers=3, ff_mult=2, dropout=0.12, g_dim=0):
        super().__init__()
        self.n_bands = n_bands
        self.d_model = d_model
        self.max_len = max_len

        self.x_proj = nn.Sequential(
            nn.Linear(feat_dim, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
        )
        self.band_emb = nn.Embedding(n_bands, d_model)

        self.pos_emb = nn.Parameter(torch.zeros(1, max_len, d_model))
        nn.init.normal_(self.pos_emb, mean=0.0, std=0.02)

        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=int(d_model * ff_mult),
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)

        # pooling: attn + mean mix (lebih stabil)
        self.attn = nn.Linear(d_model, 1)
        self.pool_ln = nn.LayerNorm(d_model)

        # global features projection
        g_out = max(32, d_model // 2)
        self.g_proj = nn.Sequential(
            nn.Linear(g_dim, g_out),
            nn.GELU(),
            nn.Dropout(dropout),
        )

        self.head = nn.Sequential(
            nn.Linear(d_model + g_out, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, 1),
        )

    def forward(self, X, band_id, mask, G):
        X = X.to(torch.float32)
        band_id = band_id.clamp(0, self.n_bands - 1).to(torch.long)
        mask = mask.to(torch.long)

        pad_mask = (mask == 0)  # True=pad
        # ALL-PAD guard
        all_pad = pad_mask.all(dim=1)
        if all_pad.any():
            pad_mask = pad_mask.clone()
            pad_mask[all_pad, 0] = False

        h = self.x_proj(X) + self.band_emb(band_id) + self.pos_emb[:, :X.shape[1], :]
        h = self.encoder(h, src_key_padding_mask=pad_mask)

        # attn pooling
        a = self.attn(h).squeeze(-1)
        a = a.masked_fill(pad_mask, -1e9)
        w = torch.softmax(a, dim=1)
        pooled_attn = torch.sum(h * w.unsqueeze(-1), dim=1)

        # mean pooling on valid tokens
        valid = (~pad_mask).to(h.dtype).unsqueeze(-1)
        denom = valid.sum(dim=1).clamp_min(1.0)
        pooled_mean = (h * valid).sum(dim=1) / denom

        pooled = 0.6 * pooled_attn + 0.4 * pooled_mean
        pooled = self.pool_ln(pooled)

        g = self.g_proj(G.to(torch.float32))
        z = torch.cat([pooled, g], dim=1)
        return self.head(z).squeeze(-1)

# ----------------------------
# 6) Training config (CPU safe, but stronger default)
# ----------------------------
CFG = {
    "d_model": 160,
    "n_heads": 4,
    "n_layers": 3,
    "ff_mult": 2,
    "dropout": 0.12,

    "batch_size": 16,
    "grad_accum": 2,

    "epochs": 14,
    "lr": 5e-4,
    "weight_decay": 0.02,

    "patience": 4,            # early stop by AUC
    "max_grad_norm": 1.0,

    "use_weighted_sampler": True,  # imbalance
    "label_smoothing": 0.03,        # small smoothing for stability
    "scheduler": "onecycle",
}

# auto soften for long seq
if L >= 512:
    CFG["d_model"] = 128
    CFG["n_heads"] = 4
    CFG["n_layers"] = 2
    CFG["batch_size"] = 12
    CFG["grad_accum"] = 2
    CFG["lr"] = 4e-4

cfg_path = Path(LOG_DIR) / "train_cfg_stage8.json"
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump(CFG, f, indent=2)

pos_all = int((y == 1).sum())
neg_all = int((y == 0).sum())
print("[Stage 8] TRAIN CONFIG (CPU)")
print(f"- N={N:,} | pos={pos_all:,} | neg={neg_all:,} | pos%={pos_all/max(N,1)*100:.6f}%")
print(f"- token_mode={SEQ_TOKEN_MODE} | val_feat={VAL_FEAT} | g_dim={g_dim} | use_agg_seq={USE_AGG_SEQ_FEATURES}")
print(f"- Model: d_model={CFG['d_model']} heads={CFG['n_heads']} layers={CFG['n_layers']} dropout={CFG['dropout']}")
print(f"- Batch={CFG['batch_size']} grad_accum={CFG['grad_accum']} epochs={CFG['epochs']} lr={CFG['lr']}")
print(f"- WeightedSampler={CFG['use_weighted_sampler']} | label_smoothing={CFG['label_smoothing']}")
print(f"- CKPT_DIR={CKPT_DIR}")
print(f"- OOF_DIR ={OOF_DIR}")
print(f"- LOG_DIR ={LOG_DIR}")

# ----------------------------
# 7) Helpers
# ----------------------------
def sigmoid_np(x):
    x = np.clip(x, -50, 50)
    return 1.0 / (1.0 + np.exp(-x))

def f1_binary(y_true, y_pred01):
    y_true = y_true.astype(np.int32)
    y_pred01 = y_pred01.astype(np.int32)
    tp = int(((y_true == 1) & (y_pred01 == 1)).sum())
    fp = int(((y_true == 0) & (y_pred01 == 1)).sum())
    fn = int(((y_true == 1) & (y_pred01 == 0)).sum())
    if tp == 0:
        return 0.0
    prec = tp / max(tp + fp, 1)
    rec  = tp / max(tp + fn, 1)
    if prec + rec == 0:
        return 0.0
    return float(2 * prec * rec / (prec + rec))

@torch.no_grad()
def eval_model(model, loader, criterion):
    model.eval()
    losses, logits_all, y_all = [], [], []
    for batch in loader:
        Xb, Bb, Mb, Gb, yb = batch
        Xb = Xb.to(device); Bb = Bb.to(device); Mb = Mb.to(device); Gb = Gb.to(device); yb = yb.to(device)
        logit = model(Xb, Bb, Mb, Gb)
        loss = criterion(logit, yb)
        losses.append(float(loss.item()))
        logits_all.append(logit.detach().cpu().numpy())
        y_all.append(yb.detach().cpu().numpy())
    logits_all = np.concatenate(logits_all, axis=0) if logits_all else np.zeros((0,), dtype=np.float32)
    y_all = np.concatenate(y_all, axis=0).astype(np.int8) if y_all else np.zeros((0,), dtype=np.int8)
    probs = sigmoid_np(logits_all)
    pred01 = (probs >= 0.5).astype(np.int8)
    f1 = f1_binary(y_all, pred01)
    auc = float(roc_auc_score(y_all, probs)) if (len(np.unique(y_all)) == 2) else float("nan")
    return float(np.mean(losses) if losses else np.nan), probs, y_all, f1, auc

# fold-wise scaler (NO leakage)
def fit_scaler_fold(G_raw_np, tr_idx):
    X = G_raw_np[tr_idx]
    mean = X.mean(axis=0).astype(np.float32)
    std  = X.std(axis=0).astype(np.float32)
    std  = np.where(std < 1e-6, 1.0, std).astype(np.float32)
    return mean, std

def apply_scaler(G_raw_np, mean, std):
    return ((G_raw_np - mean) / std).astype(np.float32)

# ----------------------------
# 8) CV Train
# ----------------------------
oof_prob = np.zeros((N,), dtype=np.float32)
fold_metrics = []

all_idx = np.arange(N, dtype=np.int32)
n_splits = int(globals()["n_splits"])

start_time = time.time()

for fold_info in globals()["folds"]:
    fold = int(fold_info["fold"])
    val_idx = np.asarray(fold_info["val_idx"], dtype=np.int32)

    val_mask = np.zeros(N, dtype=bool)
    val_mask[val_idx] = True
    tr_idx = all_idx[~val_mask]

    # fold-wise pos_weight (train only)
    y_tr = y[tr_idx]
    pos = int((y_tr == 1).sum())
    neg = int((y_tr == 0).sum())
    if pos == 0:
        raise RuntimeError(f"[Stage 8] Fold {fold}: no positives in training split.")
    pos_weight = float(neg / max(pos, 1))
    pos_weight_t = torch.tensor([pos_weight], dtype=torch.float32, device=device)

    # label smoothing for BCE
    ls = float(CFG["label_smoothing"])
    def smooth(yb):
        if ls <= 0:
            return yb
        return yb * (1.0 - ls) + 0.5 * ls

    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_t)

    print(f"\n[Stage 8] FOLD {fold}/{n_splits-1} | train={len(tr_idx):,} val={len(val_idx):,} | pos={pos:,} neg={neg:,} | pos_weight={pos_weight:.4f}")

    # fold-wise scaler (NO leakage)
    g_mean, g_std = fit_scaler_fold(G_raw_np, tr_idx)
    G_fold_z = apply_scaler(G_raw_np, g_mean, g_std)

    # datasets
    ds_tr = MemmapSeqDataset(tr_idx, X_mm, B_mm, M_mm, G_fold_z, y=y, train_mode=True)
    ds_va = MemmapSeqDataset(val_idx, X_mm, B_mm, M_mm, G_fold_z, y=y, train_mode=False)

    # optional weighted sampler (train only)
    sampler = None
    if bool(CFG["use_weighted_sampler"]):
        # weights: inverse freq in TRAIN split
        w = np.ones((len(tr_idx),), dtype=np.float32)
        ytr_local = y[tr_idx]
        # heavier for positives
        w[ytr_local == 1] = float(neg / max(pos, 1))
        w = torch.from_numpy(w)
        sampler = torch.utils.data.WeightedRandomSampler(weights=w, num_samples=len(tr_idx), replacement=True)

    dl_tr = make_loader(ds_tr, batch_size=int(CFG["batch_size"]), shuffle=True, sampler=sampler)
    dl_va = make_loader(ds_va, batch_size=int(CFG["batch_size"]), shuffle=False)

    model = MultibandEventTransformer(
        feat_dim=Fdim,
        max_len=L,
        n_bands=6,
        d_model=int(CFG["d_model"]),
        n_heads=int(CFG["n_heads"]),
        n_layers=int(CFG["n_layers"]),
        ff_mult=int(CFG["ff_mult"]),
        dropout=float(CFG["dropout"]),
        g_dim=g_dim,
    ).to(device)

    opt = torch.optim.AdamW(model.parameters(), lr=float(CFG["lr"]), weight_decay=float(CFG["weight_decay"]))

    # scheduler
    scheduler = None
    if str(CFG.get("scheduler","")).lower() == "onecycle":
        steps_per_epoch = max(len(dl_tr), 1)
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            opt,
            max_lr=float(CFG["lr"]),
            epochs=int(CFG["epochs"]),
            steps_per_epoch=steps_per_epoch,
            pct_start=0.1,
            anneal_strategy="cos",
            div_factor=10.0,
            final_div_factor=50.0,
        )

    best_val_auc = -1e9
    best_val_loss = float("inf")
    best_epoch = -1
    best_probs = None
    patience_left = int(CFG["patience"])

    grad_accum = int(CFG["grad_accum"])

    for epoch in range(1, int(CFG["epochs"]) + 1):
        model.train()
        opt.zero_grad(set_to_none=True)

        total_loss_true = 0.0
        n_batches = 0
        accum = 0

        for batch in dl_tr:
            Xb, Bb, Mb, Gb, yb = batch
            Xb = Xb.to(device); Bb = Bb.to(device); Mb = Mb.to(device); Gb = Gb.to(device); yb = yb.to(device)

            yb_s = smooth(yb)

            logit = model(Xb, Bb, Mb, Gb)
            loss = criterion(logit, yb_s)

            total_loss_true += float(loss.item())
            n_batches += 1

            (loss / float(grad_accum)).backward()
            accum += 1

            if accum == grad_accum:
                if CFG["max_grad_norm"] is not None:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), float(CFG["max_grad_norm"]))
                opt.step()
                opt.zero_grad(set_to_none=True)
                accum = 0
                if scheduler is not None:
                    scheduler.step()

        # remainder step (IMPORTANT)
        if accum > 0:
            if CFG["max_grad_norm"] is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), float(CFG["max_grad_norm"]))
            opt.step()
            opt.zero_grad(set_to_none=True)
            if scheduler is not None:
                scheduler.step()

        train_loss = total_loss_true / max(n_batches, 1)

        # validate
        val_loss, probs, y_val, f1_05, val_auc = eval_model(model, dl_va, criterion)

        improved = (val_auc > best_val_auc + 1e-6) or (math.isnan(best_val_auc) and not math.isnan(val_auc))
        # fallback tie-breaker by loss
        if (not improved) and (abs(val_auc - best_val_auc) <= 1e-6) and (val_loss < best_val_loss - 1e-6):
            improved = True

        if improved:
            best_val_auc = float(val_auc)
            best_val_loss = float(val_loss)
            best_epoch = int(epoch)
            best_probs = probs.copy()

            ckpt_path = CKPT_DIR / f"fold_{fold}.pt"
            torch.save(
                {
                    "fold": fold,
                    "epoch": epoch,
                    "model_state": model.state_dict(),
                    "cfg": CFG,
                    "seq_feature_names": SEQ_FEATURE_NAMES,
                    "max_len": L,
                    "token_mode": SEQ_TOKEN_MODE,
                    "val_feat": VAL_FEAT,
                    "global_meta_cols": BASE_G_COLS,
                    "use_agg_seq_features": bool(USE_AGG_SEQ_FEATURES),
                    "global_scaler": {"mean": g_mean, "std": g_std},
                    "pos_weight": pos_weight,
                },
                ckpt_path,
            )
            patience_left = int(CFG["patience"])
        else:
            patience_left -= 1

        lr_now = opt.param_groups[0]["lr"]
        print(f"  epoch {epoch:02d} | lr={lr_now:.2e} | train_loss={train_loss:.5f} | val_loss={val_loss:.5f} | val_auc={val_auc:.5f} | f1@0.5={f1_05:.4f} | best_ep={best_epoch} | pat={patience_left}")

        if patience_left <= 0:
            break

    if best_probs is None:
        raise RuntimeError(f"Fold {fold}: best_probs is None (unexpected).")

    # fill OOF
    oof_prob[val_idx] = best_probs.astype(np.float32)

    pred01 = (best_probs >= 0.5).astype(np.int8)
    best_f1_05 = f1_binary(y[val_idx], pred01)

    fold_metrics.append({
        "fold": fold,
        "val_size": int(len(val_idx)),
        "best_epoch": int(best_epoch),
        "best_val_auc": float(best_val_auc),
        "best_val_loss": float(best_val_loss),
        "f1_at_0p5": float(best_f1_05),
        "pos_weight": float(pos_weight),
        "g_dim": int(g_dim),
        "use_agg_seq": bool(USE_AGG_SEQ_FEATURES),
    })

    del model, opt, ds_tr, ds_va, dl_tr, dl_va, G_fold_z
    gc.collect()

elapsed = time.time() - start_time

# ----------------------------
# 9) Save OOF artifacts + summary
# ----------------------------
oof_path_npy = OOF_DIR / "oof_prob.npy"
np.save(oof_path_npy, oof_prob)

df_oof = pd.DataFrame({"object_id": train_ids, "target": y.astype(int), "oof_prob": oof_prob.astype(np.float32)})
oof_path_csv = OOF_DIR / "oof_prob.csv"
df_oof.to_csv(oof_path_csv, index=False)

metrics_path = OOF_DIR / "fold_metrics.json"
with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump({"fold_metrics": fold_metrics, "elapsed_sec": float(elapsed)}, f, indent=2)

# overall quick metrics
oof_pred01 = (oof_prob >= 0.5).astype(np.int8)
oof_f1_05 = f1_binary(y, oof_pred01)
oof_auc = float(roc_auc_score(y, oof_prob)) if (len(np.unique(y)) == 2) else float("nan")

print("\n[Stage 8] CV TRAIN DONE")
print(f"- elapsed: {elapsed/60:.2f} min")
print(f"- OOF saved: {oof_path_npy}")
print(f"- OOF saved: {oof_path_csv}")
print(f"- fold metrics: {metrics_path}")
print(f"- OOF AUC (rough): {oof_auc:.5f}")
print(f"- OOF F1@0.5 (rough): {oof_f1_05:.4f}")

globals().update({
    "oof_prob": oof_prob,
    "OOF_PROB_PATH": oof_path_npy,
    "OOF_CSV_PATH": oof_path_csv,
    "FOLD_METRICS_PATH": metrics_path,
    "TRAIN_CFG_PATH": cfg_path,
})

gc.collect()


[Stage 8] Building AGG sequence features (one-time)...


  mean_val_b = np.nanmean(val_b, axis=1).astype(np.float32)


[Stage 8] AGG built: shape=(3043, 31) | time=0.3s
[Stage 8] TRAIN CONFIG (CPU)
- N=3,043 | pos=148 | neg=2,895 | pos%=4.863621%
- token_mode=mag | val_feat=mag | g_dim=38 | use_agg_seq=True
- Model: d_model=160 heads=4 layers=3 dropout=0.12
- Batch=16 grad_accum=2 epochs=14 lr=0.0005
- WeightedSampler=True | label_smoothing=0.03
- CKPT_DIR=/kaggle/working/mallorn_run/run_20260102_184148_9f34156418/checkpoints
- OOF_DIR =/kaggle/working/mallorn_run/run_20260102_184148_9f34156418/oof
- LOG_DIR =/kaggle/working/mallorn_run/run_20260102_184148_9f34156418/logs

[Stage 8] FOLD 0/4 | train=2,434 val=609 | pos=118 neg=2,316 | pos_weight=19.6271


  Xt = torch.from_numpy(X.astype(np.float32, copy=False))


  epoch 01 | lr=1.80e-04 | train_loss=3.02638 | val_loss=2.76462 | val_auc=0.66177 | f1@0.5=0.0939 | best_ep=1 | pat=4
  epoch 02 | lr=4.20e-04 | train_loss=1.86852 | val_loss=2.62025 | val_auc=0.68192 | f1@0.5=0.0980 | best_ep=2 | pat=4
  epoch 03 | lr=5.00e-04 | train_loss=1.72584 | val_loss=2.06933 | val_auc=0.70351 | f1@0.5=0.1128 | best_ep=3 | pat=4
  epoch 04 | lr=4.97e-04 | train_loss=1.53122 | val_loss=2.08821 | val_auc=0.74145 | f1@0.5=0.1198 | best_ep=4 | pat=4
  epoch 05 | lr=4.90e-04 | train_loss=1.45095 | val_loss=2.03033 | val_auc=0.77012 | f1@0.5=0.1210 | best_ep=5 | pat=4
  epoch 06 | lr=4.80e-04 | train_loss=1.34805 | val_loss=1.59334 | val_auc=0.80455 | f1@0.5=0.1405 | best_ep=6 | pat=4
  epoch 07 | lr=4.66e-04 | train_loss=1.30704 | val_loss=1.37585 | val_auc=0.82372 | f1@0.5=0.1443 | best_ep=7 | pat=4
  epoch 08 | lr=4.48e-04 | train_loss=1.24240 | val_loss=1.75375 | val_auc=0.82637 | f1@0.5=0.1408 | best_ep=8 | pat=4
  epoch 09 | lr=4.28e-04 | train_loss=1.14137 | 



  epoch 01 | lr=1.80e-04 | train_loss=3.03170 | val_loss=2.78285 | val_auc=0.67617 | f1@0.5=0.0939 | best_ep=1 | pat=4
  epoch 02 | lr=4.20e-04 | train_loss=1.80916 | val_loss=2.12647 | val_auc=0.67340 | f1@0.5=0.1093 | best_ep=1 | pat=3
  epoch 03 | lr=5.00e-04 | train_loss=1.71554 | val_loss=2.47786 | val_auc=0.70104 | f1@0.5=0.1085 | best_ep=3 | pat=4
  epoch 04 | lr=4.97e-04 | train_loss=1.50153 | val_loss=2.11699 | val_auc=0.74116 | f1@0.5=0.1210 | best_ep=4 | pat=4
  epoch 05 | lr=4.90e-04 | train_loss=1.41878 | val_loss=2.12937 | val_auc=0.77444 | f1@0.5=0.1307 | best_ep=5 | pat=4
  epoch 06 | lr=4.80e-04 | train_loss=1.37165 | val_loss=1.98347 | val_auc=0.79591 | f1@0.5=0.1379 | best_ep=6 | pat=4
  epoch 07 | lr=4.66e-04 | train_loss=1.24760 | val_loss=2.10138 | val_auc=0.80558 | f1@0.5=0.1357 | best_ep=7 | pat=4
  epoch 08 | lr=4.48e-04 | train_loss=1.21571 | val_loss=2.05527 | val_auc=0.81071 | f1@0.5=0.1461 | best_ep=8 | pat=4
  epoch 09 | lr=4.28e-04 | train_loss=1.17871 | 



  epoch 01 | lr=1.80e-04 | train_loss=3.30461 | val_loss=2.77594 | val_auc=0.62642 | f1@0.5=0.0939 | best_ep=1 | pat=4
  epoch 02 | lr=4.20e-04 | train_loss=1.88812 | val_loss=2.71278 | val_auc=0.65889 | f1@0.5=0.1012 | best_ep=2 | pat=4
  epoch 03 | lr=5.00e-04 | train_loss=1.69895 | val_loss=2.33578 | val_auc=0.71255 | f1@0.5=0.1154 | best_ep=3 | pat=4
  epoch 04 | lr=4.97e-04 | train_loss=1.53493 | val_loss=1.82583 | val_auc=0.81007 | f1@0.5=0.1247 | best_ep=4 | pat=4
  epoch 05 | lr=4.90e-04 | train_loss=1.41205 | val_loss=1.82155 | val_auc=0.83765 | f1@0.5=0.1319 | best_ep=5 | pat=4
  epoch 06 | lr=4.80e-04 | train_loss=1.38400 | val_loss=1.73573 | val_auc=0.86868 | f1@0.5=0.1333 | best_ep=6 | pat=4
  epoch 07 | lr=4.66e-04 | train_loss=1.31153 | val_loss=1.93138 | val_auc=0.89194 | f1@0.5=0.1313 | best_ep=7 | pat=4
  epoch 08 | lr=4.48e-04 | train_loss=1.23989 | val_loss=1.36608 | val_auc=0.89194 | f1@0.5=0.1567 | best_ep=8 | pat=4
  epoch 09 | lr=4.28e-04 | train_loss=1.21021 | 



  epoch 01 | lr=1.80e-04 | train_loss=3.09341 | val_loss=2.82513 | val_auc=0.72205 | f1@0.5=0.0911 | best_ep=1 | pat=4
  epoch 02 | lr=4.20e-04 | train_loss=1.83907 | val_loss=2.35031 | val_auc=0.71568 | f1@0.5=0.1010 | best_ep=1 | pat=3
  epoch 03 | lr=5.00e-04 | train_loss=1.61819 | val_loss=2.55919 | val_auc=0.74129 | f1@0.5=0.1090 | best_ep=3 | pat=4
  epoch 04 | lr=4.97e-04 | train_loss=1.52580 | val_loss=1.84636 | val_auc=0.84962 | f1@0.5=0.1191 | best_ep=4 | pat=4
  epoch 05 | lr=4.90e-04 | train_loss=1.39777 | val_loss=1.85208 | val_auc=0.85451 | f1@0.5=0.1207 | best_ep=5 | pat=4
  epoch 06 | lr=4.80e-04 | train_loss=1.33287 | val_loss=1.78760 | val_auc=0.87928 | f1@0.5=0.1264 | best_ep=6 | pat=4
  epoch 07 | lr=4.66e-04 | train_loss=1.33138 | val_loss=1.32653 | val_auc=0.87827 | f1@0.5=0.1440 | best_ep=6 | pat=3
  epoch 08 | lr=4.48e-04 | train_loss=1.27359 | val_loss=1.40111 | val_auc=0.88678 | f1@0.5=0.1472 | best_ep=8 | pat=4
  epoch 09 | lr=4.28e-04 | train_loss=1.17156 | 



  epoch 01 | lr=1.80e-04 | train_loss=3.23834 | val_loss=2.99323 | val_auc=0.62027 | f1@0.5=0.0911 | best_ep=1 | pat=4
  epoch 02 | lr=4.20e-04 | train_loss=1.83392 | val_loss=2.52247 | val_auc=0.66708 | f1@0.5=0.0961 | best_ep=2 | pat=4
  epoch 03 | lr=5.00e-04 | train_loss=1.65972 | val_loss=2.25719 | val_auc=0.70484 | f1@0.5=0.1081 | best_ep=3 | pat=4
  epoch 04 | lr=4.97e-04 | train_loss=1.59363 | val_loss=2.20872 | val_auc=0.77428 | f1@0.5=0.1131 | best_ep=4 | pat=4
  epoch 05 | lr=4.90e-04 | train_loss=1.43721 | val_loss=1.53585 | val_auc=0.82211 | f1@0.5=0.1217 | best_ep=5 | pat=4
  epoch 06 | lr=4.80e-04 | train_loss=1.32790 | val_loss=1.66908 | val_auc=0.83277 | f1@0.5=0.1339 | best_ep=6 | pat=4
  epoch 07 | lr=4.66e-04 | train_loss=1.29979 | val_loss=1.38749 | val_auc=0.84754 | f1@0.5=0.1429 | best_ep=7 | pat=4
  epoch 08 | lr=4.48e-04 | train_loss=1.23394 | val_loss=1.28281 | val_auc=0.85302 | f1@0.5=0.1518 | best_ep=8 | pat=4
  epoch 09 | lr=4.28e-04 | train_loss=1.18229 | 

33

# OOF Prediction + Threshold Tuning

In [11]:
# ============================================================
# STAGE 9 — OOF Prediction + Threshold Tuning (ONE CELL, Kaggle CPU-SAFE)
# REVISI FULL v3 (ALIGN SUPER ROBUST + MULTI-METRIC)
#
# Upgrade v3:
# - Auto-detect target column in df_train_meta (target/y/label/class/...)
# - Prefer oof_prob.csv (object_id + oof_prob) for safest alignment
# - Clean oof_prob NaN/inf + clip [0,1]
# - Threshold tuning for: F1, Accuracy, Balanced Accuracy, MCC (+ Precision/Recall)
# - Exports multiple BEST thresholds, not only F1
# ============================================================

import gc, json, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)

# ----------------------------
# 0) Require previous stages
# ----------------------------
need = ["OOF_DIR", "df_train_meta"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing `{k}`. Jalankan STAGE 0..8 dulu.")

OOF_DIR = Path(OOF_DIR)
OOF_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------
# Helper: robust stringify id
# ----------------------------
def _to_str_list(ids):
    out = []
    for x in ids:
        if isinstance(x, (bytes, np.bytes_)):
            out.append(x.decode("utf-8", errors="ignore").strip())
        else:
            out.append(str(x).strip())
    return out

# ----------------------------
# Helper: robust load float32 1D
# ----------------------------
def _as_1d_float32(arr):
    a = np.asarray(arr)
    if a.dtype == object and a.ndim == 0:
        try:
            a = np.asarray(a.item())
        except Exception:
            pass
    a = np.asarray(a, dtype=np.float32)
    if a.ndim == 0:
        return a
    if a.ndim > 1:
        a = a.reshape(-1)
    return a

# ----------------------------
# Detect target column in df_train_meta
# ----------------------------
def _detect_target_col(df):
    for cand in ["target","y","label","class","is_tde","binary_target"]:
        if cand in df.columns:
            return cand
    return None

TARGET_COL = _detect_target_col(df_train_meta)
if TARGET_COL is None:
    raise RuntimeError(
        "Cannot detect target column in df_train_meta. "
        f"Columns sample: {list(df_train_meta.columns)[:60]}"
    )

# ----------------------------
# Load OOF (prefer CSV for safest alignment)
# ----------------------------
def _load_oof():
    # 1) CSV (best alignment)
    pcsv = OOF_DIR / "oof_prob.csv"
    if pcsv.exists():
        df = pd.read_csv(pcsv)
        if ("object_id" in df.columns) and ("oof_prob" in df.columns):
            df["object_id"] = df["object_id"].astype(str).str.strip()
            prob = _as_1d_float32(df["oof_prob"].to_numpy())
            return df["object_id"].tolist(), prob, "csv"

    # 2) globals
    if "oof_prob" in globals():
        prob = _as_1d_float32(globals()["oof_prob"])
        if isinstance(prob, np.ndarray) and prob.ndim != 0:
            # need ids
            if "train_ids_ordered" in globals():
                ids = _to_str_list(list(globals()["train_ids_ordered"]))
                return ids, prob, "globals(train_ids_ordered)"
            # fallback to df_train_meta order if same length
            if len(prob) == len(df_train_meta):
                return df_train_meta.index.astype(str).tolist(), prob, "globals(df_train_meta.index)"

    # 3) npy
    pnpy = OOF_DIR / "oof_prob.npy"
    if pnpy.exists():
        prob = _as_1d_float32(np.load(pnpy, allow_pickle=False))
        if "train_ids_ordered" in globals():
            ids = _to_str_list(list(globals()["train_ids_ordered"]))
            return ids, prob, "npy(train_ids_ordered)"
        if len(prob) == len(df_train_meta):
            return df_train_meta.index.astype(str).tolist(), prob, "npy(df_train_meta.index)"

    raise FileNotFoundError("OOF prob not found (csv/globals/npy). Jalankan STAGE 8 dulu.")

train_ids, oof_prob, src = _load_oof()

# guard scalar
if not isinstance(oof_prob, np.ndarray) or oof_prob.ndim == 0:
    raise TypeError(
        f"Invalid oof_prob (scalar). Type={type(oof_prob)} ndim={getattr(oof_prob,'ndim',None)}."
    )

# sanitize prob
oof_prob = np.nan_to_num(oof_prob, nan=0.0, posinf=1.0, neginf=0.0).astype(np.float32)
oof_prob = np.clip(oof_prob, 0.0, 1.0).astype(np.float32)

# validate ids existence in meta
missing = [oid for oid in train_ids if oid not in df_train_meta.index]
if missing:
    raise KeyError(f"OOF ids not in df_train_meta (examples): {missing[:10]} | missing_n={len(missing)}")

# load y aligned
y = pd.to_numeric(df_train_meta.loc[train_ids, TARGET_COL], errors="coerce").fillna(0).astype(np.int16).to_numpy()
y = (y > 0).astype(np.int8)

if len(oof_prob) != len(y):
    raise RuntimeError(f"Length mismatch: oof_prob={len(oof_prob)} vs y={len(y)}")

uy = set(np.unique(y).tolist())
if not uy.issubset({0, 1}):
    raise ValueError(f"y must be binary 0/1. Found: {sorted(list(uy))}")

N = int(len(y))
pos = int((y == 1).sum())
neg = int((y == 0).sum())

print(f"[Stage 9] Loaded OOF from: {src}")
print(f"[Stage 9] N={N:,} | pos={pos:,} | neg={neg:,} | pos%={pos/max(N,1)*100:.6f}% | target_col={TARGET_COL}")

# ----------------------------
# 1) Metrics
# ----------------------------
def _counts(y_true, y_pred01):
    y_true = y_true.astype(np.int32)
    y_pred01 = y_pred01.astype(np.int32)
    tp = int(((y_true == 1) & (y_pred01 == 1)).sum())
    fp = int(((y_true == 0) & (y_pred01 == 1)).sum())
    fn = int(((y_true == 1) & (y_pred01 == 0)).sum())
    tn = int(((y_true == 0) & (y_pred01 == 0)).sum())
    return tp, fp, fn, tn

def f1_prec_rec(tp, fp, fn):
    prec = tp / max(tp + fp, 1)
    rec  = tp / max(tp + fn, 1)
    if tp == 0 or (prec + rec) == 0:
        return 0.0, float(prec), float(rec)
    f1 = 2 * prec * rec / (prec + rec)
    return float(f1), float(prec), float(rec)

def accuracy(tp, fp, fn, tn):
    return float((tp + tn) / max(tp + fp + fn + tn, 1))

def balanced_accuracy(tp, fp, fn, tn):
    tpr = tp / max(tp + fn, 1)
    tnr = tn / max(tn + fp, 1)
    return float(0.5 * (tpr + tnr))

def mcc(tp, fp, fn, tn):
    num = tp * tn - fp * fn
    den = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if den <= 0:
        return 0.0
    return float(num / np.sqrt(den))

# ----------------------------
# 2) Threshold candidates (grid + quantiles + unique-prob sampling)
# ----------------------------
grid = np.concatenate([
    np.linspace(0.00, 0.10, 41),
    np.linspace(0.10, 0.90, 161),
    np.linspace(0.90, 1.00, 41),
]).astype(np.float32)

qs = np.linspace(0.001, 0.999, 999, dtype=np.float32)
quant_thr = np.quantile(oof_prob, qs).astype(np.float32)

# add a small sample of unique probs (if too many, subsample)
uniq = np.unique(oof_prob)
if len(uniq) > 4000:
    take = np.linspace(0, len(uniq)-1, 4000, dtype=int)
    uniq = uniq[take].astype(np.float32)

thr_candidates = np.unique(np.clip(np.concatenate([grid, quant_thr, uniq]), 0.0, 1.0)).astype(np.float32)

# ----------------------------
# 3) Sweep
# ----------------------------
rows = []
best_f1  = {"thr": 0.5, "score": -1.0, "rec": -1.0, "fp": 10**18}
best_acc = {"thr": 0.5, "score": -1.0, "bacc": -1.0}
best_bac = {"thr": 0.5, "score": -1.0, "acc": -1.0}
best_mcc = {"thr": 0.5, "score": -1.0}

for thr in thr_candidates:
    pred = (oof_prob >= thr).astype(np.int8)
    tp, fp, fn, tn = _counts(y, pred)

    f1, prec, rec = f1_prec_rec(tp, fp, fn)
    acc  = accuracy(tp, fp, fn, tn)
    bacc = balanced_accuracy(tp, fp, fn, tn)
    mmc  = mcc(tp, fp, fn, tn)
    pos_pred = int(pred.sum())

    rows.append((float(thr), f1, prec, rec, acc, bacc, mmc, tp, fp, fn, tn, pos_pred))

    # best F1 (tie: higher recall, then lower fp)
    if (f1 > best_f1["score"] + 1e-12) or (
        abs(f1 - best_f1["score"]) <= 1e-12 and rec > best_f1["rec"] + 1e-12
    ) or (
        abs(f1 - best_f1["score"]) <= 1e-12 and abs(rec - best_f1["rec"]) <= 1e-12 and fp < best_f1["fp"]
    ):
        best_f1.update({"thr": float(thr), "score": float(f1), "rec": float(rec), "fp": int(fp)})

    # best ACC (tie: higher BACC)
    if (acc > best_acc["score"] + 1e-12) or (
        abs(acc - best_acc["score"]) <= 1e-12 and bacc > best_acc["bacc"] + 1e-12
    ):
        best_acc.update({"thr": float(thr), "score": float(acc), "bacc": float(bacc)})

    # best BACC (tie: higher ACC)
    if (bacc > best_bac["score"] + 1e-12) or (
        abs(bacc - best_bac["score"]) <= 1e-12 and acc > best_bac["acc"] + 1e-12
    ):
        best_bac.update({"thr": float(thr), "score": float(bacc), "acc": float(acc)})

    # best MCC
    if (mmc > best_mcc["score"] + 1e-12):
        best_mcc.update({"thr": float(thr), "score": float(mmc)})

thr_table = pd.DataFrame(
    rows,
    columns=["thr","f1","precision","recall","accuracy","balanced_accuracy","mcc","tp","fp","fn","tn","pos_pred"]
)

# baselines
def _eval_at(thr):
    pred = (oof_prob >= float(thr)).astype(np.int8)
    tp, fp, fn, tn = _counts(y, pred)
    f1, prec, rec = f1_prec_rec(tp, fp, fn)
    acc  = accuracy(tp, fp, fn, tn)
    bacc = balanced_accuracy(tp, fp, fn, tn)
    mmc  = mcc(tp, fp, fn, tn)
    return {
        "thr": float(thr), "f1": float(f1), "precision": float(prec), "recall": float(rec),
        "accuracy": float(acc), "balanced_accuracy": float(bacc), "mcc": float(mmc),
        "tp": int(tp), "fp": int(fp), "fn": int(fn), "tn": int(tn), "pos_pred": int(pred.sum())
    }

base05 = _eval_at(0.5)

BEST_THR_F1  = float(best_f1["thr"])
BEST_THR_ACC = float(best_acc["thr"])
BEST_THR_BACC= float(best_bac["thr"])
BEST_THR_MCC = float(best_mcc["thr"])

best_f1_full   = _eval_at(BEST_THR_F1)
best_acc_full  = _eval_at(BEST_THR_ACC)
best_bac_full  = _eval_at(BEST_THR_BACC)
best_mcc_full  = _eval_at(BEST_THR_MCC)

# ----------------------------
# 4) Save artifacts
# ----------------------------
out_json = OOF_DIR / "threshold_tuning.json"
out_txt  = OOF_DIR / "threshold_report.txt"
out_csv  = OOF_DIR / "threshold_table_top500.csv"

payload = {
    "source": src,
    "target_col": TARGET_COL,
    "n_objects": int(N),
    "pos": int(pos),
    "neg": int(neg),

    "baseline_thr_0p5": base05,

    "best_thr_f1": best_f1_full,
    "best_thr_accuracy": best_acc_full,
    "best_thr_balanced_accuracy": best_bac_full,
    "best_thr_mcc": best_mcc_full,
}

with open(out_json, "w", encoding="utf-8") as f:
    json.dump(payload, f, indent=2)

# sort tables for convenience
top_f1   = thr_table.sort_values(["f1","recall","precision"], ascending=[False, False, False]).head(50).reset_index(drop=True)
top_acc  = thr_table.sort_values(["accuracy","balanced_accuracy","f1"], ascending=[False, False, False]).head(50).reset_index(drop=True)
top_bacc = thr_table.sort_values(["balanced_accuracy","accuracy","f1"], ascending=[False, False, False]).head(50).reset_index(drop=True)
top_mcc  = thr_table.sort_values(["mcc","f1","balanced_accuracy"], ascending=[False, False, False]).head(50).reset_index(drop=True)

# save a big top slice sorted by f1
thr_table.sort_values(["f1","recall","precision"], ascending=[False, False, False]).head(500).to_csv(out_csv, index=False)

lines = []
lines.append("OOF Threshold Tuning Report (v3)")
lines.append(f"- source={src}")
lines.append(f"- target_col={TARGET_COL}")
lines.append(f"- N={N} | pos={pos} | neg={neg} | pos%={pos/max(N,1)*100:.6f}%")
lines.append("")
lines.append("Baseline @ thr=0.5")
lines.append(f"- F1={base05['f1']:.6f} | P={base05['precision']:.6f} | R={base05['recall']:.6f} | ACC={base05['accuracy']:.6f} | BACC={base05['balanced_accuracy']:.6f} | MCC={base05['mcc']:.6f}")
lines.append(f"- tp={base05['tp']} fp={base05['fp']} fn={base05['fn']} tn={base05['tn']} | pos_pred={base05['pos_pred']}")
lines.append("")
lines.append(f"BEST-F1   @ thr={best_f1_full['thr']:.6f} | F1={best_f1_full['f1']:.6f} | P={best_f1_full['precision']:.6f} | R={best_f1_full['recall']:.6f} | pos_pred={best_f1_full['pos_pred']}")
lines.append(f"BEST-ACC  @ thr={best_acc_full['thr']:.6f} | ACC={best_acc_full['accuracy']:.6f} | BACC={best_acc_full['balanced_accuracy']:.6f} | F1={best_acc_full['f1']:.6f}")
lines.append(f"BEST-BACC @ thr={best_bac_full['thr']:.6f} | BACC={best_bac_full['balanced_accuracy']:.6f} | ACC={best_bac_full['accuracy']:.6f} | F1={best_bac_full['f1']:.6f}")
lines.append(f"BEST-MCC  @ thr={best_mcc_full['thr']:.6f} | MCC={best_mcc_full['mcc']:.6f} | F1={best_mcc_full['f1']:.6f} | BACC={best_mcc_full['balanced_accuracy']:.6f}")
lines.append("")
lines.append("Top 10 by F1:")
for i in range(min(10, len(top_f1))):
    r = top_f1.iloc[i]
    lines.append(f"{i+1:02d}. thr={r['thr']:.6f} | f1={r['f1']:.6f} | P={r['precision']:.6f} | R={r['recall']:.6f} | ACC={r['accuracy']:.6f} | BACC={r['balanced_accuracy']:.6f} | MCC={r['mcc']:.6f} | pos_pred={int(r['pos_pred'])}")

with open(out_txt, "w", encoding="utf-8") as f:
    f.write("\n".join(lines) + "\n")

print("[Stage 9] DONE")
print(f"- Saved: {out_json}")
print(f"- Saved: {out_txt}")
print(f"- Saved: {out_csv}")
print(f"- BEST_THR_F1  ={BEST_THR_F1:.6f} | F1={best_f1_full['f1']:.6f} (P={best_f1_full['precision']:.6f} R={best_f1_full['recall']:.6f})")
print(f"- BEST_THR_ACC ={BEST_THR_ACC:.6f} | ACC={best_acc_full['accuracy']:.6f} BACC={best_acc_full['balanced_accuracy']:.6f} F1={best_acc_full['f1']:.6f}")
print(f"- BEST_THR_BACC={BEST_THR_BACC:.6f} | BACC={best_bac_full['balanced_accuracy']:.6f} ACC={best_bac_full['accuracy']:.6f} F1={best_bac_full['f1']:.6f}")
print(f"- BEST_THR_MCC ={BEST_THR_MCC:.6f} | MCC={best_mcc_full['mcc']:.6f} F1={best_mcc_full['f1']:.6f} BACC={best_mcc_full['balanced_accuracy']:.6f}")

globals().update({
    "train_ids_oof": train_ids,
    "oof_prob": oof_prob,
    "BEST_THR": BEST_THR_F1,          # default tetap F1
    "BEST_THR_F1": BEST_THR_F1,
    "BEST_THR_ACC": BEST_THR_ACC,
    "BEST_THR_BACC": BEST_THR_BACC,
    "BEST_THR_MCC": BEST_THR_MCC,
    "thr_table": thr_table,
    "THR_JSON_PATH": out_json,
    "THR_REPORT_PATH": out_txt,
    "THR_TABLE_CSV_PATH": out_csv,
})

gc.collect()


[Stage 9] Loaded OOF from: csv
[Stage 9] N=3,043 | pos=148 | neg=2,895 | pos%=4.863621% | target_col=target
[Stage 9] DONE
- Saved: /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/oof/threshold_tuning.json
- Saved: /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/oof/threshold_report.txt
- Saved: /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/oof/threshold_table_top500.csv
- BEST_THR_F1  =0.970070 | F1=0.332117 (P=0.227500 R=0.614865)
- BEST_THR_ACC =0.999111 | ACC=0.952350 BACC=0.516546 F1=0.064516
- BEST_THR_BACC=0.840000 | BACC=0.786457 ACC=0.685179 F1=0.217320
- BEST_THR_MCC =0.970070 | MCC=0.323482 F1=0.332117 BACC=0.754065


33

# Test Inference (Fold Ensemble)

In [12]:
# ============================================================
# STAGE 10 — Test Inference (Fold Ensemble) (ONE CELL, Kaggle CPU-SAFE)
# REVISI FULL v3.1 (AUTO-ARCH FROM CKPT + LOGIT ENSEMBLE + ID ALIGN HARD)
#
# Fix untuk error mismatch:
# - Auto-detect x_proj as Linear vs Sequential(Linear)
# - Auto-detect optional pool_ln
# - Infer d_model / max_len / dim_feedforward / n_layers / g_dim langsung dari state_dict
# - If ckpt is pure state_dict (no metadata), still runs (best-effort G features)
#
# Output:
# - ART_DIR/preds/test_logit_folds.npy
# - ART_DIR/preds/test_logit_ens.npy
# - ART_DIR/preds/test_prob_folds.npy
# - ART_DIR/preds/test_prob_ens.npy
# - ART_DIR/preds/test_prob_ens.csv
# - ART_DIR/preds/test_infer_config.json
# ============================================================

import os, gc, json, re, warnings
from pathlib import Path

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)
# mute nested tensor warning (harmless)
warnings.filterwarnings("ignore", message="enable_nested_tensor is True.*")

# ----------------------------
# 0) Require previous stages
# ----------------------------
need = ["ART_DIR","FIX_DIR","MAX_LEN","SEQ_FEATURE_NAMES","df_test_meta","CKPT_DIR","n_splits"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing `{k}`. Jalankan STAGE 0..9 dulu.")

# Torch
try:
    import torch
    import torch.nn as nn
except Exception as e:
    raise RuntimeError("PyTorch tidak tersedia di environment ini.") from e

device = torch.device("cpu")
SEED = int(globals().get("SEED", 2025))
torch.manual_seed(SEED)
np.random.seed(SEED)

# Thread guard (CPU)
try:
    torch.set_num_threads(int(os.environ.get("OMP_NUM_THREADS", "2")))
    torch.set_num_interop_threads(1)
except Exception:
    pass

FIX_DIR = Path(FIX_DIR)
ART_DIR = Path(ART_DIR); ART_DIR.mkdir(parents=True, exist_ok=True)
CKPT_DIR = Path(CKPT_DIR)

OUT_DIR = ART_DIR / "preds"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------
# helper: normalize id robustly
# ----------------------------
def _norm_id(x):
    if isinstance(x, (bytes, np.bytes_)):
        try:
            x = x.decode("utf-8", errors="ignore")
        except Exception:
            x = str(x)
    s = str(x).strip()
    if (s.startswith("b'") and s.endswith("'")) or (s.startswith('b"') and s.endswith('"')):
        s = s[2:-1]
    return s.strip()

def _load_ids_npy(path: Path):
    arr = np.load(path, allow_pickle=False)
    xs = arr.tolist() if hasattr(arr, "tolist") else list(arr)
    return [_norm_id(z) for z in xs]

# ----------------------------
# 1) Load TEST ordering (must match STAGE 6)
# ----------------------------
test_ids_path = FIX_DIR / "test_ids.npy"
if not test_ids_path.exists():
    raise FileNotFoundError(f"Missing: {test_ids_path}. Pastikan STAGE 6 sukses.")

test_ids = _load_ids_npy(test_ids_path)
NTE = len(test_ids)
if NTE <= 0:
    raise RuntimeError("test_ids kosong (NTE=0). Pastikan STAGE 6 sukses membuat test_ids.npy.")

# Normalize df_test_meta.index (always)
df_test_meta = df_test_meta.copy(deep=False)
df_test_meta.index = pd.Index([_norm_id(z) for z in df_test_meta.index], name=df_test_meta.index.name)

missing_ids = [oid for oid in test_ids if oid not in df_test_meta.index]
if missing_ids:
    raise KeyError(f"Some test_ids not found in df_test_meta.index (examples): {missing_ids[:10]} | missing_n={len(missing_ids)}")

if len(set(test_ids)) != len(test_ids):
    s = pd.Series(test_ids)
    dup = s[s.duplicated()].head(10).tolist()
    raise ValueError(f"Duplicate object_id in test_ids ordering (examples): {dup}")

# ----------------------------
# 2) Open fixed-length TEST memmaps
# ----------------------------
SEQ_FEATURE_NAMES = list(SEQ_FEATURE_NAMES)
Fdim = len(SEQ_FEATURE_NAMES)
L = int(MAX_LEN)

test_X_path = FIX_DIR / "test_X.dat"
test_B_path = FIX_DIR / "test_B.dat"
test_M_path = FIX_DIR / "test_M.dat"
for p in [test_X_path, test_B_path, test_M_path]:
    if not p.exists():
        raise FileNotFoundError(f"Missing fixed cache file: {p}. Pastikan STAGE 6 sukses.")

Xte = np.memmap(test_X_path, dtype=np.float32, mode="r", shape=(NTE, L, Fdim))
Bte = np.memmap(test_B_path, dtype=np.int8,   mode="r", shape=(NTE, L))
Mte = np.memmap(test_M_path, dtype=np.int8,   mode="r", shape=(NTE, L))

# ----------------------------
# 3) Dataset/Loader for inference
# ----------------------------
class TestMemmapDataset(torch.utils.data.Dataset):
    def __init__(self, Xmm, Bmm, Mmm, G_np_z):
        self.Xmm = Xmm
        self.Bmm = Bmm
        self.Mmm = Mmm
        self.G = G_np_z

    def __len__(self):
        return int(self.Xmm.shape[0])

    def __getitem__(self, i):
        X = self.Xmm[i]
        B = self.Bmm[i].astype(np.int64, copy=False)
        M = self.Mmm[i].astype(np.int64, copy=False)
        G = self.G[i]
        return (
            torch.from_numpy(X),
            torch.from_numpy(B),
            torch.from_numpy(M),
            torch.from_numpy(G),
        )

def make_loader(ds, batch_size=64):
    return torch.utils.data.DataLoader(
        ds, batch_size=int(batch_size), shuffle=False,
        num_workers=0, pin_memory=False, drop_last=False
    )

# ----------------------------
# 4) Safe/compat checkpoint loader
# ----------------------------
def torch_load_compat(path: Path):
    try:
        obj = torch.load(path, map_location="cpu", weights_only=True)
        # if dict with metadata, keep
        if isinstance(obj, dict) and ("model_state" in obj or "cfg" in obj or "global_scaler" in obj or "global_cols" in obj):
            return obj
        # likely pure state_dict -> reload full
        return torch.load(path, map_location="cpu", weights_only=False)
    except TypeError:
        return torch.load(path, map_location="cpu")
    except Exception:
        return torch.load(path, map_location="cpu", weights_only=False)

def extract_state_and_meta(ckpt_obj):
    """
    Returns: state_dict, meta_dict
    - if ckpt is full dict -> state_dict=ckpt["model_state"], meta=ckpt
    - if ckpt is pure state_dict -> state_dict=ckpt, meta={}
    """
    if isinstance(ckpt_obj, dict) and "model_state" in ckpt_obj and isinstance(ckpt_obj["model_state"], dict):
        return ckpt_obj["model_state"], ckpt_obj
    if isinstance(ckpt_obj, dict):
        # could be pure state_dict OR a metadata dict without model_state
        # assume pure state_dict if it contains tensor-ish values
        any_tensor = any(hasattr(v, "shape") for v in ckpt_obj.values())
        if any_tensor:
            return ckpt_obj, {}
        return ckpt_obj, ckpt_obj
    raise RuntimeError(f"Unsupported ckpt object type: {type(ckpt_obj)}")

# checkpoints
ckpts = []
for f in range(int(n_splits)):
    p = CKPT_DIR / f"fold_{f}.pt"
    if not p.exists():
        raise FileNotFoundError(f"Missing checkpoint: {p}. Pastikan STAGE 8 menyimpan ckpt per fold.")
    ckpts.append(p)

# ----------------------------
# 5) Infer architecture from state_dict (IMPORTANT)
# ----------------------------
def infer_from_state(sd: dict):
    keys = set(sd.keys())

    # d_model & n_bands from band_emb
    if "band_emb.weight" not in sd:
        raise RuntimeError("state_dict missing band_emb.weight (ckpt tidak cocok dengan model ini).")
    n_bands = int(sd["band_emb.weight"].shape[0])
    d_model = int(sd["band_emb.weight"].shape[1])

    # max_len from pos_emb
    if "pos_emb" not in sd:
        raise RuntimeError("state_dict missing pos_emb.")
    max_len_ckpt = int(sd["pos_emb"].shape[1])

    # feat_dim from x_proj
    xproj_is_seq = ("x_proj.0.weight" in keys)
    if xproj_is_seq:
        feat_dim = int(sd["x_proj.0.weight"].shape[1])
    else:
        if "x_proj.weight" not in sd:
            raise RuntimeError("state_dict missing x_proj.weight or x_proj.0.weight.")
        feat_dim = int(sd["x_proj.weight"].shape[1])

    # g_dim from g_proj
    g_dim = None
    if "g_proj.0.weight" in sd:
        g_dim = int(sd["g_proj.0.weight"].shape[1])
        g_hidden = int(sd["g_proj.0.weight"].shape[0])
    else:
        # fallback: infer from head input
        g_dim = 0
        g_hidden = d_model // 2

    # encoder layers count + dim_ff
    layer_ids = set()
    dim_ff = None
    for k in keys:
        m = re.match(r"encoder\.layers\.(\d+)\.", k)
        if m:
            layer_ids.add(int(m.group(1)))
    n_layers = (max(layer_ids) + 1) if layer_ids else 0
    if n_layers <= 0:
        raise RuntimeError("Cannot infer n_layers from state_dict (encoder.layers.* not found).")

    # dim_feedforward from linear1.weight
    k_lin1 = f"encoder.layers.0.linear1.weight"
    if k_lin1 in sd:
        dim_ff = int(sd[k_lin1].shape[0])
    else:
        # older/newer naming unlikely, but try search
        lin1_keys = [k for k in keys if k.endswith("linear1.weight")]
        if not lin1_keys:
            raise RuntimeError("Cannot infer dim_feedforward (linear1.weight not found).")
        dim_ff = int(sd[sorted(lin1_keys)[0]].shape[0])

    # optional pool_ln
    has_pool_ln = ("pool_ln.weight" in keys and "pool_ln.bias" in keys)

    # head layout (detect weight indices)
    head_w_idx = []
    for k in keys:
        m = re.match(r"head\.(\d+)\.weight", k)
        if m:
            head_w_idx.append(int(m.group(1)))
    head_w_idx = sorted(set(head_w_idx))
    if not head_w_idx:
        raise RuntimeError("Cannot infer head structure (head.*.weight not found).")

    # determine final linear index
    # common patterns: weights at {0,3} or {0,2} or {0,1}
    final_idx = max(head_w_idx)

    return {
        "n_bands": n_bands,
        "d_model": d_model,
        "max_len_ckpt": max_len_ckpt,
        "feat_dim": feat_dim,
        "g_dim": g_dim,
        "g_hidden": g_hidden,
        "n_layers": n_layers,
        "dim_ff": dim_ff,
        "has_pool_ln": has_pool_ln,
        "xproj_is_seq": xproj_is_seq,
        "head_final_idx": final_idx,
    }

# ----------------------------
# 6) Build model matching ckpt
# ----------------------------
class FlexMultibandEventTransformer(nn.Module):
    def __init__(self, feat_dim, max_len, n_bands, d_model, n_heads, n_layers, dim_ff, dropout, g_dim, g_hidden,
                 xproj_is_seq=False, has_pool_ln=False, head_final_idx=3):
        super().__init__()
        self.n_bands = n_bands
        self.max_len = max_len
        self.d_model = d_model

        if xproj_is_seq:
            self.x_proj = nn.Sequential(nn.Linear(feat_dim, d_model))
        else:
            self.x_proj = nn.Linear(feat_dim, d_model)

        self.band_emb = nn.Embedding(n_bands, d_model)

        self.pos_emb = nn.Parameter(torch.zeros(1, max_len, d_model))
        nn.init.normal_(self.pos_emb, mean=0.0, std=0.02)

        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=int(dim_ff),
            dropout=float(dropout),
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=int(n_layers))

        self.attn = nn.Linear(d_model, 1)

        # keep same param naming for g_proj: g_proj.0.weight/bias
        self.g_proj = nn.Sequential(
            nn.Linear(int(g_dim), int(g_hidden)),
            nn.GELU(),
            nn.Dropout(float(dropout)),
        )

        self.has_pool_ln = bool(has_pool_ln)
        if self.has_pool_ln:
            self.pool_ln = nn.LayerNorm(d_model)

        # head structure based on detected final idx
        # final_idx=3 => Linear, GELU, Dropout, Linear
        # final_idx=2 => Linear, GELU, Linear
        # final_idx=1 => Linear, Linear
        in_head = int(d_model + g_hidden)
        if head_final_idx == 3:
            self.head = nn.Sequential(
                nn.Linear(in_head, d_model),
                nn.GELU(),
                nn.Dropout(float(dropout)),
                nn.Linear(d_model, 1),
            )
        elif head_final_idx == 2:
            self.head = nn.Sequential(
                nn.Linear(in_head, d_model),
                nn.GELU(),
                nn.Linear(d_model, 1),
            )
        else:
            self.head = nn.Sequential(
                nn.Linear(in_head, d_model),
                nn.Linear(d_model, 1),
            )

    def forward(self, X, band_id, mask, G):
        X = X.to(torch.float32)
        band_id = band_id.clamp(0, self.n_bands - 1).to(torch.long)
        mask = mask.to(torch.long)

        pad_mask = (mask == 0)
        all_pad = pad_mask.all(dim=1)
        if all_pad.any():
            pad_mask = pad_mask.clone()
            pad_mask[all_pad, 0] = False

        h = self.x_proj(X) + self.band_emb(band_id) + self.pos_emb[:, :X.shape[1], :]
        h = self.encoder(h, src_key_padding_mask=pad_mask)

        a = self.attn(h).squeeze(-1)
        a = a.masked_fill(pad_mask, -1e9)
        w = torch.softmax(a, dim=1)
        pooled = torch.sum(h * w.unsqueeze(-1), dim=1)

        if self.has_pool_ln:
            pooled = self.pool_ln(pooled)

        g = self.g_proj(G.to(torch.float32))
        z = torch.cat([pooled, g], dim=1)
        return self.head(z).squeeze(-1)  # logit

def sigmoid_np(x):
    x = np.clip(x, -50, 50)
    return 1.0 / (1.0 + np.exp(-x))

@torch.no_grad()
def predict_logits(model, loader):
    model.eval()
    outs = []
    with torch.inference_mode():
        for Xb, Bb, Mb, Gb in loader:
            logit = model(Xb.to(device), Bb.to(device), Mb.to(device), Gb.to(device))
            outs.append(logit.detach().cpu().numpy())
    return (np.concatenate(outs, axis=0) if outs else np.zeros((0,), dtype=np.float32)).astype(np.float32)

# ----------------------------
# 7) Build G features for test (best-effort)
# ----------------------------
def load_global_cols_fallback():
    # try explicit GLOBAL_COLS_PATH (if you saved it)
    p = globals().get("GLOBAL_COLS_PATH", None)
    if p is not None:
        p = Path(p)
        if p.exists():
            try:
                j = json.load(open(p, "r", encoding="utf-8"))
                if isinstance(j, dict) and "cols" in j and isinstance(j["cols"], list):
                    return [str(x) for x in j["cols"]]
            except Exception:
                pass

    # try LOG_DIR/global_feature_cols.json
    log_dir = Path(globals().get("LOG_DIR", ART_DIR.parent / "logs"))
    p2 = log_dir / "global_feature_cols.json"
    if p2.exists():
        try:
            j = json.load(open(p2, "r", encoding="utf-8"))
            if isinstance(j, dict) and "cols" in j and isinstance(j["cols"], list):
                return [str(x) for x in j["cols"]]
        except Exception:
            pass
    return None

# ----------------------------
# 8) Inference per fold -> LOGIT ensemble
# ----------------------------
BATCH_SIZE = 64
test_logit_folds = np.zeros((NTE, int(n_splits)), dtype=np.float32)

print(f"[Stage 10] Test inference (AUTO-ARCH + LOGIT ensemble): N_test={NTE:,} | folds={n_splits} | batch={BATCH_SIZE} | CPU")

arch_used = None
gcols_used = None

for fold, ckpt_path in enumerate(ckpts):
    ckpt_obj = torch_load_compat(ckpt_path)
    sd, meta = extract_state_and_meta(ckpt_obj)

    arch = infer_from_state(sd)
    if arch_used is None:
        arch_used = dict(arch)

    # choose n_heads (must divide d_model; if cfg exists use it)
    cfg = meta.get("cfg", {}) if isinstance(meta, dict) else {}
    n_heads = int(cfg.get("n_heads", 0)) if isinstance(cfg, dict) else 0
    if n_heads <= 0 or (arch["d_model"] % n_heads != 0):
        # fallback: pick a divisor
        for h in [8, 4, 2, 1, 16, 32]:
            if arch["d_model"] % h == 0:
                n_heads = h
                break
        if n_heads <= 0:
            n_heads = 4

    # HARD SCHEMA CHECKS to avoid silent mismatch
    if arch["feat_dim"] != Fdim:
        raise RuntimeError(
            f"Fold {fold}: feature_dim mismatch.\n"
            f"- ckpt expects feat_dim={arch['feat_dim']}\n"
            f"- memmap has Fdim={Fdim}\n"
            "Solusi: pastikan STAGE 6 feature list sama saat training ckpt dibuat."
        )
    if arch["max_len_ckpt"] != L:
        raise RuntimeError(
            f"Fold {fold}: max_len mismatch.\n"
            f"- ckpt max_len={arch['max_len_ckpt']}\n"
            f"- memmap MAX_LEN={L}\n"
        )

    # Determine global cols
    G_COLS = meta.get("global_cols", None) if isinstance(meta, dict) else None
    if G_COLS is None:
        G_COLS = load_global_cols_fallback()

    # Build G_np with correct g_dim
    g_dim = int(arch["g_dim"])
    if G_COLS is not None and isinstance(G_COLS, (list, tuple)) and len(G_COLS) > 0:
        G_COLS = [str(x) for x in G_COLS]
        # adjust length to g_dim
        if len(G_COLS) > g_dim:
            G_COLS = G_COLS[:g_dim]
        for c in G_COLS:
            if c not in df_test_meta.columns:
                df_test_meta[c] = 0.0
        G_raw = df_test_meta.loc[test_ids, G_COLS].copy()
        for c in G_COLS:
            G_raw[c] = pd.to_numeric(G_raw[c], errors="coerce").fillna(0.0).astype(np.float32)
        G_np = G_raw.to_numpy(dtype=np.float32, copy=False)
        if G_np.shape[1] < g_dim:
            pad = np.zeros((NTE, g_dim - G_np.shape[1]), dtype=np.float32)
            G_np = np.concatenate([G_np, pad], axis=1)
    else:
        # last resort: take first g_dim numeric columns from df_test_meta
        cand = []
        for c in df_test_meta.columns:
            if c in ("target", "split", "object_id"):
                continue
            if pd.api.types.is_numeric_dtype(df_test_meta[c]):
                cand.append(c)
        use = cand[:g_dim]
        if len(use) > 0:
            G_raw = df_test_meta.loc[test_ids, use].copy()
            for c in use:
                G_raw[c] = pd.to_numeric(G_raw[c], errors="coerce").fillna(0.0).astype(np.float32)
            G_np = G_raw.to_numpy(dtype=np.float32, copy=False)
        else:
            G_np = np.zeros((NTE, 0), dtype=np.float32)

        if G_np.shape[1] < g_dim:
            pad = np.zeros((NTE, g_dim - G_np.shape[1]), dtype=np.float32)
            G_np = np.concatenate([G_np, pad], axis=1)

        G_COLS = use if len(use) else [f"_auto_{i}" for i in range(g_dim)]

    # Apply scaler if available (meta dict)
    scaler = meta.get("global_scaler", None) if isinstance(meta, dict) else None
    if scaler is not None and isinstance(scaler, dict) and ("mean" in scaler) and ("std" in scaler):
        g_mean = np.asarray(scaler["mean"], dtype=np.float32).reshape(-1)
        g_std  = np.asarray(scaler["std"],  dtype=np.float32).reshape(-1)
        g_std  = np.where(g_std < 1e-6, 1.0, g_std).astype(np.float32)
        if g_mean.shape[0] == g_dim and g_std.shape[0] == g_dim:
            G_np_z = ((G_np - g_mean) / g_std).astype(np.float32)
        else:
            G_np_z = G_np.astype(np.float32, copy=False)
    else:
        G_np_z = G_np.astype(np.float32, copy=False)

    if gcols_used is None:
        gcols_used = list(G_COLS)

    # Build model to match ckpt
    model = FlexMultibandEventTransformer(
        feat_dim=arch["feat_dim"],
        max_len=arch["max_len_ckpt"],
        n_bands=arch["n_bands"],
        d_model=arch["d_model"],
        n_heads=n_heads,
        n_layers=arch["n_layers"],
        dim_ff=arch["dim_ff"],
        dropout=float(cfg.get("dropout", 0.0)) if isinstance(cfg, dict) else 0.0,
        g_dim=g_dim,
        g_hidden=arch["g_hidden"],
        xproj_is_seq=arch["xproj_is_seq"],
        has_pool_ln=arch["has_pool_ln"],
        head_final_idx=arch["head_final_idx"],
    ).to(device)

    # load weights
    model.load_state_dict(sd, strict=True)

    # predict logits
    ds_test = TestMemmapDataset(Xte, Bte, Mte, G_np_z)
    dl_test = make_loader(ds_test, batch_size=BATCH_SIZE)

    logits = predict_logits(model, dl_test)
    if len(logits) != NTE:
        raise RuntimeError(f"Fold {fold}: logits length mismatch {len(logits)} vs {NTE}")

    test_logit_folds[:, fold] = logits
    probs_tmp = sigmoid_np(logits)
    print(f"  fold {fold}: d_model={arch['d_model']} g_dim={g_dim} | logit_mean={float(logits.mean()):.6f} | prob_mean={float(probs_tmp.mean()):.6f} | prob_std={float(probs_tmp.std()):.6f}")

    del model, ds_test, dl_test, logits, probs_tmp, G_np, G_np_z
    gc.collect()

# ensemble on logits
test_logit_ens = test_logit_folds.mean(axis=1).astype(np.float32)
test_prob_folds = sigmoid_np(test_logit_folds).astype(np.float32)
test_prob_ens   = sigmoid_np(test_logit_ens).astype(np.float32)

# ----------------------------
# 9) Save artifacts
# ----------------------------
logit_fold_path = OUT_DIR / "test_logit_folds.npy"
logit_ens_path  = OUT_DIR / "test_logit_ens.npy"
prob_fold_path  = OUT_DIR / "test_prob_folds.npy"
prob_ens_path   = OUT_DIR / "test_prob_ens.npy"
csv_path        = OUT_DIR / "test_prob_ens.csv"
cfg_path        = OUT_DIR / "test_infer_config.json"

np.save(logit_fold_path, test_logit_folds)
np.save(logit_ens_path,  test_logit_ens)
np.save(prob_fold_path,  test_prob_folds)
np.save(prob_ens_path,   test_prob_ens)

pd.DataFrame({"object_id": test_ids, "prob": test_prob_ens}).to_csv(csv_path, index=False)

infer_cfg = {
    "seed": int(SEED),
    "n_splits": int(n_splits),
    "ensemble": "mean_logits_then_sigmoid",
    "batch_size": int(BATCH_SIZE),
    "max_len": int(L),
    "feature_dim": int(Fdim),
    "feature_names": list(SEQ_FEATURE_NAMES),
    "ckpt_dir": str(CKPT_DIR),
    "ckpts": [str(p) for p in ckpts],
    "arch_inferred_from_first_fold": arch_used,
    "global_cols_used_first_fold": gcols_used,
    "outputs": {
        "test_logit_folds": str(logit_fold_path),
        "test_logit_ens": str(logit_ens_path),
        "test_prob_folds": str(prob_fold_path),
        "test_prob_ens": str(prob_ens_path),
        "test_prob_ens_csv": str(csv_path),
    }
}
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump(infer_cfg, f, indent=2)

print("\n[Stage 10] DONE")
print(f"- Saved logits folds: {logit_fold_path}")
print(f"- Saved logits ens  : {logit_ens_path}")
print(f"- Saved probs folds : {prob_fold_path}")
print(f"- Saved probs ens   : {prob_ens_path}")
print(f"- Saved csv         : {csv_path}")
print(f"- Saved config      : {cfg_path}")
print(f"- ens prob mean={float(test_prob_ens.mean()):.6f} | std={float(test_prob_ens.std()):.6f} | min={float(test_prob_ens.min()):.6f} | max={float(test_prob_ens.max()):.6f}")

# Export globals for submission stage
globals().update({
    "test_ids": test_ids,
    "test_logit_folds": test_logit_folds,
    "test_logit_ens": test_logit_ens,
    "test_prob_folds": test_prob_folds,
    "test_prob_ens": test_prob_ens,
    "TEST_LOGIT_FOLDS_PATH": logit_fold_path,
    "TEST_LOGIT_ENS_PATH": logit_ens_path,
    "TEST_PROB_FOLDS_PATH": prob_fold_path,
    "TEST_PROB_ENS_PATH": prob_ens_path,
    "TEST_PROB_CSV_PATH": csv_path,
    "TEST_INFER_CFG_PATH": cfg_path,
})

gc.collect()


[Stage 10] Test inference (AUTO-ARCH + LOGIT ensemble): N_test=7,135 | folds=5 | batch=64 | CPU
  fold 0: d_model=160 g_dim=38 | logit_mean=6.670060 | prob_mean=0.973709 | prob_std=0.050629
  fold 1: d_model=160 g_dim=38 | logit_mean=13.037735 | prob_mean=0.999956 | prob_std=0.000081
  fold 2: d_model=160 g_dim=38 | logit_mean=2.777563 | prob_mean=0.729435 | prob_std=0.301560
  fold 3: d_model=160 g_dim=38 | logit_mean=8.799269 | prob_mean=0.999768 | prob_std=0.000222
  fold 4: d_model=160 g_dim=38 | logit_mean=-3.924105 | prob_mean=0.020096 | prob_std=0.005309

[Stage 10] DONE
- Saved logits folds: /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/preds/test_logit_folds.npy
- Saved logits ens  : /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/preds/test_logit_ens.npy
- Saved probs folds : /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/artifacts/preds/test_prob_folds.npy
- Saved probs ens   : /kaggle/working/mallorn_run/run_20260102_1

33

# Evalution 

In [13]:
# ============================================================
# ONE CELL — EVALUATION (Precision / Recall / F1) + Threshold Sweep (OOF)
# (REVISI FULL v2 — Robust Align + Sanitize + Fbeta option + AUC optional)
#
# Input minimal:
# - df_train_meta (kolom: target; index: object_id)
# - oof_prob (globals) ATAU file OOF_DIR/oof_prob.npy ATAU OOF_DIR/oof_prob.csv
#
# Output:
# - Print ringkasan metrik
# - Save: eval_report.txt + eval_threshold_table.csv + eval_summary.json
# - Export globals: BEST_THR_F1, BEST_THR_F05, BEST_THR_F2, thr_table_eval
# ============================================================

import gc, json, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# ----------------------------
# 0) Require minimal
# ----------------------------
if "df_train_meta" not in globals():
    raise RuntimeError("Missing df_train_meta. Jalankan STAGE 2 dulu (meta).")

ART_DIR = Path(globals().get("ART_DIR", "/kaggle/working"))
OOF_DIR = Path(globals().get("OOF_DIR", ART_DIR / "oof"))
OOF_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------
# Utils: id normalize + robust 1D float32
# ----------------------------
def _norm_id(x):
    if isinstance(x, (bytes, np.bytes_)):
        try:
            x = x.decode("utf-8", errors="ignore")
        except Exception:
            x = str(x)
    s = str(x).strip()
    if (s.startswith("b'") and s.endswith("'")) or (s.startswith('b"') and s.endswith('"')):
        s = s[2:-1]
    return s.strip()

def _as_1d_float32(arr):
    a = np.asarray(arr)
    if a.dtype == object and a.ndim == 0:
        try:
            a = np.asarray(a.item())
        except Exception:
            pass
    a = np.asarray(a, dtype=np.float32)
    if a.ndim == 0:
        return a
    if a.ndim > 1:
        a = a.reshape(-1)
    return a

def _sanitize_prob(p):
    p = np.asarray(p, dtype=np.float32)
    p = np.nan_to_num(p, nan=0.0, posinf=1.0, neginf=0.0)
    p = np.clip(p, 0.0, 1.0)
    return p.astype(np.float32)

# ensure meta index normalized (no data copy)
df_train_meta = df_train_meta.copy(deep=False)
df_train_meta.index = pd.Index([_norm_id(z) for z in df_train_meta.index], name=df_train_meta.index.name)

# ----------------------------
# 1) Load oof_prob (prefer csv if exists for alignment)
# ----------------------------
def load_oof():
    # If oof_prob.csv exists -> most robust (has object_id)
    pcsv = OOF_DIR / "oof_prob.csv"
    if pcsv.exists():
        df = pd.read_csv(pcsv)
        if ("object_id" in df.columns) and ("oof_prob" in df.columns):
            df["object_id"] = df["object_id"].apply(_norm_id)
            p = _sanitize_prob(_as_1d_float32(df["oof_prob"].to_numpy()))
            # ensure same length
            if len(p) != len(df):
                raise RuntimeError("oof_prob.csv: length mismatch after parsing.")
            return p, df

    # globals
    if "oof_prob" in globals():
        p = _as_1d_float32(globals()["oof_prob"])
        if isinstance(p, np.ndarray) and p.ndim != 0:
            return _sanitize_prob(p), None

    # npy
    pnpy = OOF_DIR / "oof_prob.npy"
    if pnpy.exists():
        p = _sanitize_prob(_as_1d_float32(np.load(pnpy, allow_pickle=False)))
        return p, None

    raise FileNotFoundError("OOF prob tidak ditemukan (oof_prob.csv / globals oof_prob / oof_prob.npy). Jalankan STAGE 8 dulu.")

oof_prob, df_oof_csv = load_oof()
if not isinstance(oof_prob, np.ndarray) or oof_prob.ndim == 0:
    raise TypeError(f"Invalid oof_prob (scalar/unsized). type={type(oof_prob)} ndim={getattr(oof_prob,'ndim',None)}")

# ----------------------------
# 2) Align y (target) ke urutan oof_prob
# ----------------------------
train_ids = None
y = None

if df_oof_csv is not None:
    train_ids = df_oof_csv["object_id"].tolist()

    # drop rows whose id not in meta (avoid KeyError)
    mask_ok = [oid in df_train_meta.index for oid in train_ids]
    if not all(mask_ok):
        bad = [train_ids[i] for i, ok in enumerate(mask_ok) if not ok][:10]
        n_bad = int((~np.asarray(mask_ok, dtype=bool)).sum())
        print(f"[WARN] oof_prob.csv contains ids not in df_train_meta: missing_n={n_bad} examples={bad}")
        df_oof_csv = df_oof_csv.loc[mask_ok].reset_index(drop=True)
        train_ids = df_oof_csv["object_id"].tolist()
        oof_prob = _sanitize_prob(_as_1d_float32(df_oof_csv["oof_prob"].to_numpy()))

    y = df_train_meta.loc[train_ids, "target"].to_numpy(dtype=np.int8, copy=False)

# fallback: train_ids_ordered
if y is None and ("train_ids_ordered" in globals()):
    ids = [_norm_id(z) for z in list(globals()["train_ids_ordered"])]
    if len(ids) == len(oof_prob):
        missing = [oid for oid in ids if oid not in df_train_meta.index]
        if missing:
            raise KeyError(f"train_ids_ordered contains ids not in df_train_meta. examples={missing[:10]} missing_n={len(missing)}")
        train_ids = ids
        y = df_train_meta.loc[train_ids, "target"].to_numpy(dtype=np.int8, copy=False)

# last fallback: meta order must match length
if y is None:
    if len(oof_prob) != len(df_train_meta):
        raise RuntimeError(
            f"Tidak bisa align y. len(oof_prob)={len(oof_prob)} != len(df_train_meta)={len(df_train_meta)} "
            "dan tidak ada oof_prob.csv (object_id)."
        )
    train_ids = df_train_meta.index.astype(str).tolist()
    y = df_train_meta["target"].to_numpy(dtype=np.int8, copy=False)

if len(y) != len(oof_prob):
    raise RuntimeError(f"Length mismatch: y={len(y)} vs oof_prob={len(oof_prob)}")

# y sanity
uy = set(np.unique(y).tolist())
if not uy.issubset({0, 1}):
    raise ValueError(f"y must be binary 0/1. Found: {sorted(list(uy))}")

# ----------------------------
# 3) Metrics sesuai materi (TP/FP/FN -> P/R/F1)
# ----------------------------
def prf_from_pred(y_true, y_pred01):
    y_true = np.asarray(y_true, dtype=np.int32)
    y_pred01 = np.asarray(y_pred01, dtype=np.int32)

    tp = int(((y_true == 1) & (y_pred01 == 1)).sum())
    fp = int(((y_true == 0) & (y_pred01 == 1)).sum())
    fn = int(((y_true == 1) & (y_pred01 == 0)).sum())
    tn = int(((y_true == 0) & (y_pred01 == 0)).sum())

    precision = tp / max(tp + fp, 1)   # TP/(TP+FP)
    recall    = tp / max(tp + fn, 1)   # TP/(TP+FN)
    f1 = 0.0 if (precision + recall) == 0 else (2.0 * precision * recall / (precision + recall))

    return {
        "tp": tp, "fp": fp, "fn": fn, "tn": tn,
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "pos_pred": int(y_pred01.sum()),
        "acc": float((tp + tn) / max(len(y_true), 1)),
    }

def fbeta_from_pr(precision, recall, beta=1.0):
    b2 = beta * beta
    denom = (b2 * precision + recall)
    if denom <= 0:
        return 0.0
    return float((1 + b2) * precision * recall / denom)

def eval_at_threshold(prob, y_true, thr):
    pred = (prob >= float(thr)).astype(np.int8)
    met = prf_from_pred(y_true, pred)
    met["thr"] = float(thr)
    met["f0.5"] = fbeta_from_pr(met["precision"], met["recall"], beta=0.5)
    met["f2"]   = fbeta_from_pr(met["precision"], met["recall"], beta=2.0)
    return met

# optional AUC (if sklearn exists)
roc_auc = None
pr_auc  = None
try:
    from sklearn.metrics import roc_auc_score, average_precision_score
    # only valid if both classes exist
    if (y.max() == 1) and (y.min() == 0):
        roc_auc = float(roc_auc_score(y, oof_prob))
        pr_auc  = float(average_precision_score(y, oof_prob))
except Exception:
    roc_auc = None
    pr_auc  = None

# Baseline thr=0.5
base = eval_at_threshold(oof_prob, y, 0.5)

# ----------------------------
# 4) Threshold sweep (lebih stabil: grid + quantile)
# ----------------------------
grid = np.concatenate([
    np.linspace(0.01, 0.10, 19),
    np.linspace(0.10, 0.90, 81),
    np.linspace(0.90, 0.99, 19),
]).astype(np.float32)

qs = np.linspace(0.01, 0.99, 99, dtype=np.float32)
quant_thr = np.quantile(oof_prob, qs).astype(np.float32)

thr_candidates = np.unique(np.clip(np.concatenate([grid, quant_thr]), 0.0, 1.0)).astype(np.float32)

rows = []
best_f1  = base.copy()
best_f05 = base.copy()
best_f2  = base.copy()

for thr in thr_candidates:
    met = eval_at_threshold(oof_prob, y, float(thr))
    rows.append([
        met["thr"], met["f1"], met["f0.5"], met["f2"],
        met["precision"], met["recall"], met["acc"],
        met["tp"], met["fp"], met["fn"], met["tn"], met["pos_pred"]
    ])

    # best F1 (tie-break: recall higher, then fp lower)
    if (met["f1"] > best_f1["f1"] + 1e-12) or (
        abs(met["f1"] - best_f1["f1"]) <= 1e-12 and (met["recall"] > best_f1["recall"] + 1e-12)
    ) or (
        abs(met["f1"] - best_f1["f1"]) <= 1e-12 and abs(met["recall"] - best_f1["recall"]) <= 1e-12 and (met["fp"] < best_f1["fp"])
    ):
        best_f1 = met.copy()

    # best F0.5 (precision-leaning)
    if met["f0.5"] > best_f05.get("f0.5", -1) + 1e-12:
        best_f05 = met.copy()

    # best F2 (recall-leaning)
    if met["f2"] > best_f2.get("f2", -1) + 1e-12:
        best_f2 = met.copy()

thr_table = pd.DataFrame(
    rows,
    columns=["thr","f1","f0.5","f2","precision","recall","acc","tp","fp","fn","tn","pos_pred"]
).sort_values(["f1","recall","precision"], ascending=[False, False, False]).reset_index(drop=True)

BEST_THR_F1  = float(best_f1["thr"])
BEST_THR_F05 = float(best_f05["thr"])
BEST_THR_F2  = float(best_f2["thr"])

# ----------------------------
# 5) Print report
# ----------------------------
pos = int((y == 1).sum())
neg = int((y == 0).sum())
N = int(len(y))

print("EVALUATION (OOF) — Precision/Recall/F1")
print(f"- N={N:,} | pos={pos:,} | neg={neg:,} | pos%={pos/max(N,1)*100:.4f}%")
if roc_auc is not None:
    print(f"- ROC-AUC={roc_auc:.6f} | PR-AUC={pr_auc:.6f}")
print("")

print("Baseline @ thr=0.5")
print(f"- F1={base['f1']:.6f} | P={base['precision']:.6f} | R={base['recall']:.6f} | ACC={base['acc']:.6f}")
print(f"  tp={base['tp']} fp={base['fp']} fn={base['fn']} tn={base['tn']} | pos_pred={base['pos_pred']}\n")

print(f"BEST-F1  @ thr={BEST_THR_F1:.6f}")
print(f"- F1={best_f1['f1']:.6f} | P={best_f1['precision']:.6f} | R={best_f1['recall']:.6f} | ACC={best_f1['acc']:.6f}")
print(f"  tp={best_f1['tp']} fp={best_f1['fp']} fn={best_f1['fn']} tn={best_f1['tn']} | pos_pred={best_f1['pos_pred']}\n")

print(f"BEST-F0.5 @ thr={BEST_THR_F05:.6f} (lebih condong precision)")
print(f"- F0.5={best_f05['f0.5']:.6f} | P={best_f05['precision']:.6f} | R={best_f05['recall']:.6f} | F1={best_f05['f1']:.6f}\n")

print(f"BEST-F2   @ thr={BEST_THR_F2:.6f} (lebih condong recall)")
print(f"- F2={best_f2['f2']:.6f} | P={best_f2['precision']:.6f} | R={best_f2['recall']:.6f} | F1={best_f2['f1']:.6f}\n")

print("Top 10 thresholds by F1:")
for i in range(min(10, len(thr_table))):
    r = thr_table.iloc[i]
    print(f"{i+1:02d}. thr={r['thr']:.6f} | f1={r['f1']:.6f} | f0.5={r['f0.5']:.6f} | f2={r['f2']:.6f} | "
          f"P={r['precision']:.6f} R={r['recall']:.6f} | tp={int(r['tp'])} fp={int(r['fp'])} fn={int(r['fn'])} | pos_pred={int(r['pos_pred'])}")

# ----------------------------
# 6) Save artifacts
# ----------------------------
out_txt  = OOF_DIR / "eval_report.txt"
out_csv  = OOF_DIR / "eval_threshold_table.csv"
out_json = OOF_DIR / "eval_summary.json"

lines = []
lines.append("OOF Evaluation Report (Precision/Recall/F1)")
lines.append(f"N={N} | pos={pos} | neg={neg} | pos%={pos/max(N,1)*100:.6f}%")
if roc_auc is not None:
    lines.append(f"ROC-AUC={roc_auc:.8f} | PR-AUC={pr_auc:.8f}")
lines.append("")
lines.append("Baseline @ thr=0.5")
lines.append(f"F1={base['f1']:.8f} | P={base['precision']:.8f} | R={base['recall']:.8f} | ACC={base['acc']:.8f}")
lines.append(f"tp={base['tp']} fp={base['fp']} fn={base['fn']} tn={base['tn']} | pos_pred={base['pos_pred']}")
lines.append("")
lines.append(f"BEST-F1 @ thr={BEST_THR_F1:.8f}")
lines.append(f"F1={best_f1['f1']:.8f} | P={best_f1['precision']:.8f} | R={best_f1['recall']:.8f} | ACC={best_f1['acc']:.8f}")
lines.append(f"tp={best_f1['tp']} fp={best_f1['fp']} fn={best_f1['fn']} tn={best_f1['tn']} | pos_pred={best_f1['pos_pred']}")
lines.append("")
lines.append(f"BEST-F0.5 @ thr={BEST_THR_F05:.8f}")
lines.append(f"F0.5={best_f05['f0.5']:.8f} | P={best_f05['precision']:.8f} | R={best_f05['recall']:.8f} | F1={best_f05['f1']:.8f}")
lines.append("")
lines.append(f"BEST-F2 @ thr={BEST_THR_F2:.8f}")
lines.append(f"F2={best_f2['f2']:.8f} | P={best_f2['precision']:.8f} | R={best_f2['recall']:.8f} | F1={best_f2['f1']:.8f}")
lines.append("")
lines.append("Top 10 thresholds by F1:")
for i in range(min(10, len(thr_table))):
    r = thr_table.iloc[i]
    lines.append(f"{i+1:02d}. thr={r['thr']:.8f} | f1={r['f1']:.8f} | f0.5={r['f0.5']:.8f} | f2={r['f2']:.8f} | "
                 f"P={r['precision']:.8f} R={r['recall']:.8f} | tp={int(r['tp'])} fp={int(r['fp'])} fn={int(r['fn'])} | pos_pred={int(r['pos_pred'])}")

with open(out_txt, "w", encoding="utf-8") as f:
    f.write("\n".join(lines) + "\n")

thr_table.to_csv(out_csv, index=False)

payload = {
    "N": N, "pos": pos, "neg": neg,
    "roc_auc": roc_auc, "pr_auc": pr_auc,
    "baseline_thr_0p5": base,
    "best_f1": best_f1,
    "best_f0.5": best_f05,
    "best_f2": best_f2,
    "paths": {"report": str(out_txt), "table": str(out_csv)}
}
with open(out_json, "w", encoding="utf-8") as f:
    json.dump(payload, f, indent=2)

print("\nSaved:")
print(f"- {out_txt}")
print(f"- {out_csv}")
print(f"- {out_json}")

# Export for next stages
globals().update({
    "BEST_THR_F1": BEST_THR_F1,
    "BEST_THR_F05": BEST_THR_F05,
    "BEST_THR_F2": BEST_THR_F2,
    "thr_table_eval": thr_table,
})

gc.collect()


EVALUATION (OOF) — Precision/Recall/F1
- N=3,043 | pos=148 | neg=2,895 | pos%=4.8636%
- ROC-AUC=0.863769 | PR-AUC=0.243643

Baseline @ thr=0.5
- F1=0.156352 | P=0.085006 | R=0.972973 | ACC=0.489320
  tp=144 fp=1550 fn=4 tn=1345 | pos_pred=1694

BEST-F1  @ thr=0.970000
- F1=0.331512 | P=0.226933 | R=0.614865 | ACC=0.879395
  tp=91 fp=310 fn=57 tn=2585 | pos_pred=401

BEST-F0.5 @ thr=0.991352 (lebih condong precision)
- F0.5=0.283019 | P=0.295082 | R=0.243243 | F1=0.266667

BEST-F2   @ thr=0.961601 (lebih condong recall)
- F2=0.458758 | P=0.203285 | R=0.668919 | F1=0.311811

Top 10 thresholds by F1:
01. thr=0.970000 | f1=0.331512 | f0.5=0.259703 | f2=0.458207 | P=0.226933 R=0.614865 | tp=91 fp=310 fn=57 | pos_pred=401
02. thr=0.970762 | f1=0.330882 | f0.5=0.259815 | f2=0.455466 | P=0.227273 R=0.608108 | tp=90 fp=306 fn=58 | pos_pred=396
03. thr=0.975000 | f1=0.329317 | f0.5=0.264858 | f2=0.435244 | P=0.234286 R=0.554054 | tp=82 fp=268 fn=66 | pos_pred=350
04. thr=0.973192 | f1=0.326848 |

33

# Submission Build

In [14]:
# ============================================================
# STAGE 11 — Submission Build (ONE CELL, Kaggle CPU-SAFE) — REVISI FULL v2
#
# Output wajib kompetisi:
# - header: object_id,prediction
# - prediction HARUS 0/1
# - file utama: /kaggle/working/submission.csv
# ============================================================

import gc, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# ----------------------------
# 0) Require STAGE 0 globals
# ----------------------------
for need in ["PATHS", "SUB_DIR"]:
    if need not in globals():
        raise RuntimeError(f"Missing `{need}`. Jalankan STAGE 0 dulu (setup).")

sample_path = Path(PATHS["SAMPLE_SUB"])
if not sample_path.exists():
    raise FileNotFoundError(f"Missing sample_submission.csv: {sample_path}")

df_sub = pd.read_csv(sample_path)
if not {"object_id", "prediction"}.issubset(df_sub.columns):
    raise ValueError(f"sample_submission must have columns object_id,prediction. Found: {list(df_sub.columns)}")

# ----------------------------
# Helpers
# ----------------------------
def _norm_id(x):
    if isinstance(x, (bytes, np.bytes_)):
        try:
            x = x.decode("utf-8", errors="ignore")
        except Exception:
            x = str(x)
    s = str(x).strip()
    if (s.startswith("b'") and s.endswith("'")) or (s.startswith('b"') and s.endswith('"')):
        s = s[2:-1]
    return s.strip()

def _as_1d_float32(arr):
    a = np.asarray(arr)
    if a.dtype == object and a.ndim == 0:
        try:
            a = np.asarray(a.item())
        except Exception:
            pass
    a = np.asarray(a, dtype=np.float32)
    if a.ndim == 0:
        return a
    if a.ndim > 1:
        a = a.reshape(-1)
    return a

def _sanitize_prob(p):
    p = np.asarray(p, dtype=np.float32)
    p = np.nan_to_num(p, nan=0.0, posinf=1.0, neginf=0.0)
    p = np.clip(p, 0.0, 1.0)
    return p.astype(np.float32)

def _load_ids_npy(path: Path):
    arr = np.load(path, allow_pickle=False)
    ids = arr.tolist() if hasattr(arr, "tolist") else list(arr)
    return [_norm_id(x) for x in ids]

def _load_pred_df():
    """
    Return df_pred with columns: object_id, prob
    Priority:
      A) globals: test_ids + test_prob_ens
      B) csv: TEST_PROB_CSV_PATH / ART_DIR/test_prob_ens.csv
      C) npy: FIX_DIR/test_ids.npy + ART_DIR/test_prob_ens.npy
    """
    # ---- A) globals ----
    if ("test_prob_ens" in globals()) and (globals()["test_prob_ens"] is not None) and \
       ("test_ids" in globals()) and (globals()["test_ids"] is not None):
        try:
            ids = [_norm_id(x) for x in list(globals()["test_ids"])]
            prob = _sanitize_prob(_as_1d_float32(globals()["test_prob_ens"]))
            if isinstance(prob, np.ndarray) and prob.ndim != 0 and len(ids) == len(prob) and len(ids) > 0:
                return pd.DataFrame({"object_id": ids, "prob": prob})
        except Exception:
            pass

    # ---- B) csv fallback (best if already aligned with object_id) ----
    art_dir = Path(globals().get("ART_DIR", "/kaggle/working"))
    cand_csv = []

    if "TEST_PROB_CSV_PATH" in globals() and globals()["TEST_PROB_CSV_PATH"] is not None:
        cand_csv.append(Path(globals()["TEST_PROB_CSV_PATH"]))
    cand_csv.append(art_dir / "test_prob_ens.csv")

    for p in cand_csv:
        if p.exists():
            df = pd.read_csv(p)
            # support either (object_id, prob) or (object_id, prediction) naming
            if "object_id" in df.columns and ("prob" in df.columns or "prediction" in df.columns):
                df = df.copy()
                df["object_id"] = df["object_id"].apply(_norm_id)
                colp = "prob" if "prob" in df.columns else "prediction"
                prob = _sanitize_prob(_as_1d_float32(df[colp].to_numpy()))
                if len(prob) != len(df):
                    raise RuntimeError(f"CSV prob length mismatch: {p}")
                return pd.DataFrame({"object_id": df["object_id"].tolist(), "prob": prob})

    # ---- C) npy fallback ----
    fix_dir = Path(globals().get("FIX_DIR", "/kaggle/working/mallorn_run/artifacts/fixed_seq"))
    p_ids = fix_dir / "test_ids.npy"
    if not p_ids.exists():
        raise RuntimeError("Missing test_ids. Pastikan STAGE 6 membuat fixed_seq/test_ids.npy atau STAGE 10 export test_ids.")

    ids = _load_ids_npy(p_ids)
    if len(ids) == 0:
        raise RuntimeError("test_ids.npy kosong.")

    # test prob npy from globals path or default
    cand_npy = []
    if "TEST_PROB_ENS_PATH" in globals() and globals()["TEST_PROB_ENS_PATH"] is not None:
        cand_npy.append(Path(globals()["TEST_PROB_ENS_PATH"]))
    cand_npy.append(art_dir / "test_prob_ens.npy")

    prob = None
    for p in cand_npy:
        if p.exists():
            prob = _sanitize_prob(_as_1d_float32(np.load(p, allow_pickle=False)))
            break
    if prob is None:
        raise RuntimeError("Missing test_prob_ens. Jalankan STAGE 10 dulu (Test Inference).")

    if not isinstance(prob, np.ndarray) or prob.ndim == 0:
        raise TypeError(f"Invalid test_prob (scalar/unsized). type={type(prob)} ndim={getattr(prob,'ndim',None)}")

    if len(prob) != len(ids):
        raise RuntimeError(f"Length mismatch (NPY): test_prob={len(prob)} vs test_ids={len(ids)}")

    return pd.DataFrame({"object_id": ids, "prob": prob})

# ----------------------------
# 1) Load prediction df
# ----------------------------
df_pred = _load_pred_df()
if df_pred.empty:
    raise RuntimeError("df_pred empty (unexpected).")

df_pred["object_id"] = df_pred["object_id"].apply(_norm_id)
if df_pred["object_id"].duplicated().any():
    dup = df_pred.loc[df_pred["object_id"].duplicated(), "object_id"].iloc[:10].tolist()
    raise ValueError(f"Duplicated object_id in predictions (examples): {dup}")

p = df_pred["prob"].to_numpy(dtype=np.float32, copy=False)
if not np.isfinite(p).all():
    bad = int((~np.isfinite(p)).sum())
    raise ValueError(f"Found non-finite probabilities in test predictions: {bad} rows")
df_pred["prob"] = _sanitize_prob(p)

# ----------------------------
# 2) Threshold selection (priority)
# ----------------------------
FORCE_THR = None  # set manual if you want, e.g. 0.37
thr = None
if FORCE_THR is not None:
    thr = float(FORCE_THR)
elif "BEST_THR_F1" in globals() and globals()["BEST_THR_F1"] is not None:
    thr = float(globals()["BEST_THR_F1"])
elif "BEST_THR" in globals() and globals()["BEST_THR"] is not None:
    thr = float(globals()["BEST_THR"])
else:
    thr = 0.5

thr = float(np.clip(thr, 0.0, 1.0))

# ----------------------------
# 3) Align to sample_submission order + build BINARY prediction (0/1)
# ----------------------------
df_sub = df_sub.copy()
df_sub["object_id"] = df_sub["object_id"].apply(_norm_id)

if df_sub["object_id"].duplicated().any():
    dup = df_sub.loc[df_sub["object_id"].duplicated(), "object_id"].iloc[:10].tolist()
    raise ValueError(f"sample_submission has duplicate object_id (unexpected). examples={dup}")

df_out = df_sub[["object_id"]].merge(df_pred, on="object_id", how="left")

if df_out["prob"].isna().any():
    missing_n = int(df_out["prob"].isna().sum())
    miss_ids = df_out.loc[df_out["prob"].isna(), "object_id"].iloc[:10].tolist()
    raise ValueError(
        f"Some sample_submission object_id have no prediction: missing_n={missing_n}. Examples: {miss_ids}\n"
        "Biasanya karena mismatch id normalization atau pred df tidak lengkap."
    )

# REQUIRED: binary 0/1
df_out["prediction"] = (df_out["prob"].to_numpy(dtype=np.float32) >= np.float32(thr)).astype(np.int8)
df_out = df_out[["object_id", "prediction"]]

# strict format checks
u = set(np.unique(df_out["prediction"].to_numpy()).tolist())
if not u.issubset({0, 1}):
    raise RuntimeError(f"submission prediction contains values outside {{0,1}}: {sorted(list(u))}")

if len(df_out) != len(df_sub):
    raise RuntimeError("submission row count mismatch with sample_submission.")

# quick stats
pos_pred = int(df_out["prediction"].sum())
print("[Stage 11] SUBMISSION READY (BINARY 0/1)")
print(f"- threshold_used={thr:.6f}")
print(f"- rows={len(df_out):,} | pos_pred={pos_pred:,} ({pos_pred/max(len(df_out),1)*100:.4f}%)")

# ----------------------------
# 4) Write files
# ----------------------------
SUB_DIR = Path(SUB_DIR)
SUB_DIR.mkdir(parents=True, exist_ok=True)

out_main  = Path("/kaggle/working/submission.csv")
out_copy  = SUB_DIR / "submission.csv"

df_out.to_csv(out_main, index=False)
df_out.to_csv(out_copy, index=False)

print(f"- wrote: {out_main}")
print(f"- copy : {out_copy}")

print("\nPreview:")
print(df_out.head(8).to_string(index=False))

globals().update({
    "SUBMISSION_PATH": out_main,
    "SUBMISSION_COPY_PATH": out_copy,
    "SUBMISSION_MODE": "binary",
    "SUBMISSION_THRESHOLD": thr,
})

gc.collect()


[Stage 11] SUBMISSION READY (BINARY 0/1)
- threshold_used=0.970000
- rows=7,135 | pos_pred=5,381 (75.4170%)
- wrote: /kaggle/working/submission.csv
- copy : /kaggle/working/mallorn_run/run_20260102_184148_9f34156418/submissions/submission.csv

Preview:
                   object_id  prediction
    Eluwaith_Mithrim_nothrim           1
          Eru_heledir_archam           1
           Gonhir_anann_fuin           1
Gwathuirim_haradrim_tegilbor           1
            achas_minai_maen           1
               adab_fae_gath           1
             adel_draug_gaur           1
     aderthad_cuil_galadhrim           1


0