In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mallorn-dataset/sample_submission.csv
/kaggle/input/mallorn-dataset/test_log.csv
/kaggle/input/mallorn-dataset/train_log.csv
/kaggle/input/mallorn-dataset/split_17/train_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_17/test_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_01/train_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_01/test_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_02/train_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_02/test_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_08/train_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_08/test_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_04/train_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_04/test_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_07/train_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_07/test_full_lightcurves.csv
/kaggle/input/mallorn-dataset/split_15/train_full_lightcurves.csv
/kaggle/i

 # Kaggle CPU Environment Setup

In [2]:
# ============================================================
# STAGE 0 — Kaggle CPU Environment Setup (ONE CELL, SAFE + COHESIVE) — REVISI FULL
# Fokus:
# - Path sesuai dataset kamu: /kaggle/input/mallorn-dataset
# - Hard guards (file + split + konsistensi id)
# - Thread limits anti-freeze
# - Siap untuk pipeline ASTROMER (tanpa install / tanpa load lightcurve besar)
# ============================================================

import os, sys, gc, json, random, warnings
from pathlib import Path

import numpy as np
import pandas as pd

# ----------------------------
# 0) Quiet + deterministic
# ----------------------------
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

SEED = 2025
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

# ----------------------------
# 1) CPU thread limits (anti-freeze on Kaggle CPU)
# ----------------------------
# BLAS/OMP oversubscription sering bikin notebook lambat/hang
os.environ.setdefault("OMP_NUM_THREADS", "2")
os.environ.setdefault("OPENBLAS_NUM_THREADS", "2")
os.environ.setdefault("MKL_NUM_THREADS", "2")
os.environ.setdefault("VECLIB_MAXIMUM_THREADS", "2")
os.environ.setdefault("NUMEXPR_NUM_THREADS", "2")
os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")

# Siap ASTROMER (umumnya pakai TensorFlow/Keras di belakang)
# (Tidak meng-import TF di sini; hanya set env agar aman saat stage ASTROMER nanti)
os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")
os.environ.setdefault("TF_NUM_INTRAOP_THREADS", "2")
os.environ.setdefault("TF_NUM_INTEROP_THREADS", "1")

# Torch optional (boleh tetap ada; pipeline ASTROMER tidak wajib torch)
try:
    import torch
    torch.manual_seed(SEED)
    torch.set_num_threads(2)
    torch.set_num_interop_threads(1)
except Exception:
    torch = None

# ----------------------------
# 2) Paths (sesuai yang kamu tulis)
# ----------------------------
DATA_ROOT = Path("/kaggle/input/mallorn-dataset")

PATHS = {
    "DATA_ROOT": DATA_ROOT,
    "SAMPLE_SUB": DATA_ROOT / "sample_submission.csv",
    "TRAIN_LOG":  DATA_ROOT / "train_log.csv",
    "TEST_LOG":   DATA_ROOT / "test_log.csv",
    "SPLITS":     [DATA_ROOT / f"split_{i:02d}" for i in range(1, 21)],
}

# ----------------------------
# 3) Working directories (writeable on Kaggle)
# ----------------------------
WORKDIR = Path("/kaggle/working")
RUN_DIR = WORKDIR / "mallorn_run"
ART_DIR = RUN_DIR / "artifacts"
CACHE_DIR = RUN_DIR / "cache"
EMB_DIR = CACHE_DIR / "embeddings"     # untuk ASTROMER embedding (nanti)
FEAT_DIR = CACHE_DIR / "features"      # untuk tabular final (nanti)
OOF_DIR = RUN_DIR / "oof"
SUB_DIR = RUN_DIR / "submissions"
LOG_DIR = RUN_DIR / "logs"

for d in [RUN_DIR, ART_DIR, CACHE_DIR, EMB_DIR, FEAT_DIR, OOF_DIR, SUB_DIR, LOG_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Flag pipeline (dipakai stage berikutnya)
USE_ASTROMER = True

# ----------------------------
# 4) Hard guards: files must exist
# ----------------------------
def _must_exist(p: Path, what: str):
    if not p.exists():
        raise FileNotFoundError(f"[MISSING] {what}: {p}")

_must_exist(PATHS["SAMPLE_SUB"], "sample_submission.csv")
_must_exist(PATHS["TRAIN_LOG"], "train_log.csv")
_must_exist(PATHS["TEST_LOG"],  "test_log.csv")

missing_splits = [s for s in PATHS["SPLITS"] if not s.exists()]
if missing_splits:
    sample = "\n".join(str(x) for x in missing_splits[:5])
    raise FileNotFoundError(f"Some split folders are missing (showing up to 5):\n{sample}")

bad = []
for sd in PATHS["SPLITS"]:
    tr = sd / "train_full_lightcurves.csv"
    te = sd / "test_full_lightcurves.csv"
    if (not tr.exists()) or (not te.exists()):
        bad.append((sd.name, tr.exists(), te.exists()))
if bad:
    msg = "\n".join([f"- {name}: train={tr_ok}, test={te_ok}" for name, tr_ok, te_ok in bad[:10]])
    raise FileNotFoundError(
        "Some split lightcurve files are missing (showing up to 10):\n"
        f"{msg}"
    )

# ----------------------------
# 5) Load small metadata only (safe on CPU)
# ----------------------------
# dtype ringan untuk kolom string supaya parsing stabil
dtype_sub = {"object_id": "string"}
df_sub = pd.read_csv(PATHS["SAMPLE_SUB"], dtype=dtype_sub)

# sample_submission harus punya header object_id,prediction
df_sub.columns = [c.strip() for c in df_sub.columns]
if not {"object_id", "prediction"}.issubset(df_sub.columns):
    raise ValueError(f"sample_submission columns must include object_id,prediction. Found: {list(df_sub.columns)}")

dtype_log = {"object_id": "string", "split": "string"}
df_train_log = pd.read_csv(PATHS["TRAIN_LOG"], dtype=dtype_log)
df_test_log  = pd.read_csv(PATHS["TEST_LOG"],  dtype=dtype_log)

# rapikan nama kolom
def _norm_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [c.strip() for c in df.columns]
    return df

df_train_log = _norm_cols(df_train_log)
df_test_log  = _norm_cols(df_test_log)

# Validasi minimal kolom wajib (sesuai deskripsi dataset kamu)
need_train = {"object_id", "EBV", "Z", "split", "target"}
need_test  = {"object_id", "EBV", "Z", "split"}  # Z_err opsional (bisa ada di test)
missing_train = sorted(list(need_train - set(df_train_log.columns)))
missing_test  = sorted(list(need_test - set(df_test_log.columns)))

if missing_train:
    raise ValueError(f"train_log.csv missing required columns: {missing_train}")
if missing_test:
    raise ValueError(f"test_log.csv missing required columns: {missing_test}")

# ----------------------------
# 6) Normalize split name -> "split_XX"
# ----------------------------
valid_split_names = {f"split_{i:02d}" for i in range(1, 21)}

def _normalize_split(x):
    if pd.isna(x):
        return ""
    s = str(x).strip()
    if not s:
        return ""
    # kasus: "01" / "1"
    if s.isdigit():
        i = int(s)
        return f"split_{i:02d}"
    # kasus: "split_01" / "split-01" / "Split 01"
    s2 = s.lower().replace("-", "_").replace(" ", "_")
    if s2.startswith("split_"):
        tail = s2.split("split_", 1)[1]
        tail = tail.strip("_")
        if tail.isdigit():
            i = int(tail)
            return f"split_{i:02d}"
        return s2
    return s

df_train_log["split"] = df_train_log["split"].map(_normalize_split)
df_test_log["split"]  = df_test_log["split"].map(_normalize_split)

bad_train_split = sorted(set(df_train_log["split"]) - valid_split_names)
bad_test_split  = sorted(set(df_test_log["split"]) - valid_split_names)
if bad_train_split:
    raise ValueError(f"train_log has invalid split values (examples): {bad_train_split[:10]}")
if bad_test_split:
    raise ValueError(f"test_log has invalid split values (examples): {bad_test_split[:10]}")

# ----------------------------
# 7) Basic sanity: uniqueness + target binary
# ----------------------------
if df_train_log["object_id"].duplicated().any():
    dup_n = int(df_train_log["object_id"].duplicated().sum())
    raise ValueError(f"train_log.csv has duplicated object_id rows: {dup_n} duplicates found.")
if df_test_log["object_id"].duplicated().any():
    dup_n = int(df_test_log["object_id"].duplicated().sum())
    raise ValueError(f"test_log.csv has duplicated object_id rows: {dup_n} duplicates found.")

# target harus 0/1
df_train_log["target"] = pd.to_numeric(df_train_log["target"], errors="coerce")
if df_train_log["target"].isna().any():
    n_na = int(df_train_log["target"].isna().sum())
    raise ValueError(f"train_log target has NaN after numeric coercion: {n_na} rows.")
uniq_t = set(pd.unique(df_train_log["target"]).tolist())
if not uniq_t.issubset({0, 1}):
    raise ValueError(f"train_log target must be binary 0/1. Found: {sorted(list(uniq_t))}")

# ----------------------------
# 8) Submission ↔ test_log consistency (HARUS match)
# ----------------------------
sub_ids = set(df_sub["object_id"].astype("string"))
test_ids = set(df_test_log["object_id"].astype("string"))

missing_in_test = sub_ids - test_ids
missing_in_sub  = test_ids - sub_ids

if missing_in_test:
    sample = list(missing_in_test)[:5]
    raise ValueError(f"sample_submission has object_id not found in test_log (showing up to 5): {sample}")
if missing_in_sub:
    sample = list(missing_in_sub)[:5]
    raise ValueError(f"test_log has object_id not present in sample_submission (showing up to 5): {sample}")

# ----------------------------
# 9) Quick dataset summary (ringkas)
# ----------------------------
pos = int((df_train_log["target"] == 1).sum())
neg = int((df_train_log["target"] == 0).sum())
tot = int(len(df_train_log))

print("ENV OK (Kaggle CPU)")
print(f"- Python: {sys.version.split()[0]}")
print(f"- Numpy:  {np.__version__}")
print(f"- Pandas: {pd.__version__}")
if torch is not None:
    print(f"- Torch:  {torch.__version__} | CUDA available: {torch.cuda.is_available()}")
else:
    print("- Torch:  not available")
print(f"- USE_ASTROMER: {USE_ASTROMER}")

print("\nDATA OK")
print(f"- train_log: {len(df_train_log):,} objects | pos(TDE)={pos:,} | neg={neg:,} | pos%={(pos/max(tot,1))*100:.2f}%")
print(f"- test_log:  {len(df_test_log):,} objects")
print(f"- submission template rows: {len(df_sub):,}")
print(f"- splits detected: {len(PATHS['SPLITS'])} folders (split_01..split_20)")

# ----------------------------
# 10) Save config snapshot for reproducibility
# ----------------------------
cfg = {
    "SEED": SEED,
    "DATA_ROOT": str(DATA_ROOT),
    "WORKDIR": str(WORKDIR),
    "USE_ASTROMER": bool(USE_ASTROMER),
    "THREADS": {k: os.environ.get(k, "") for k in [
        "OMP_NUM_THREADS","OPENBLAS_NUM_THREADS","MKL_NUM_THREADS","NUMEXPR_NUM_THREADS",
        "TF_NUM_INTRAOP_THREADS","TF_NUM_INTEROP_THREADS"
    ]},
}
cfg_path_txt = RUN_DIR / "env_config.txt"
cfg_path_json = RUN_DIR / "env_config.json"

with open(cfg_path_txt, "w", encoding="utf-8") as f:
    for k, v in cfg.items():
        if k != "THREADS":
            f.write(f"{k}={v}\n")
    f.write("THREADS:\n")
    for k, v in cfg["THREADS"].items():
        f.write(f"  {k}={v}\n")

with open(cfg_path_json, "w", encoding="utf-8") as f:
    json.dump(cfg, f, indent=2)

print(f"\nSaved env snapshot: {cfg_path_txt}")
print(f"Saved env snapshot: {cfg_path_json}")

# ----------------------------
# 11) Export to globals (dipakai stage berikutnya)
# ----------------------------
globals().update({
    "SEED": SEED,
    "USE_ASTROMER": USE_ASTROMER,
    "PATHS": PATHS,
    "RUN_DIR": RUN_DIR,
    "ART_DIR": ART_DIR,
    "CACHE_DIR": CACHE_DIR,
    "EMB_DIR": EMB_DIR,
    "FEAT_DIR": FEAT_DIR,
    "OOF_DIR": OOF_DIR,
    "SUB_DIR": SUB_DIR,
    "LOG_DIR": LOG_DIR,
    "df_sub": df_sub,
    "df_train_log": df_train_log,
    "df_test_log": df_test_log,
})

gc.collect()


ENV OK (Kaggle CPU)
- Python: 3.12.12
- Numpy:  2.0.2
- Pandas: 2.2.2
- Torch:  2.8.0+cu126 | CUDA available: False
- USE_ASTROMER: True

DATA OK
- train_log: 3,043 objects | pos(TDE)=148 | neg=2,895 | pos%=4.86%
- test_log:  7,135 objects
- submission template rows: 7,135
- splits detected: 20 folders (split_01..split_20)

Saved env snapshot: /kaggle/working/mallorn_run/env_config.txt
Saved env snapshot: /kaggle/working/mallorn_run/env_config.json


63

# Verify Dataset Paths & Split Discovery

In [3]:
# ============================================================
# STAGE 1 — Verify Dataset Paths & Split Discovery (ONE CELL, CPU-SAFE) — REVISI FULL
# - Uses globals from STAGE 0: PATHS, df_train_log, df_test_log
# - DOES NOT load full lightcurves (header-only checks + tiny samples)
# - Confirms:
#   * split folders exist (split_01..split_20)
#   * required lightcurve files exist per split
#   * lightcurve schema matches expected columns
#   * Filter values look sane (u,g,r,i,z,y) on small samples
# - Summarizes:
#   * object counts per split (from logs)
#   * file sizes
# - Exports: DATA_ROOT, SPLIT_DIRS, SPLIT_LIST
# ============================================================

import re, gc
from pathlib import Path
import numpy as np
import pandas as pd

# ----------------------------
# 0) Require STAGE 0 globals
# ----------------------------
for need in ["PATHS", "df_train_log", "df_test_log"]:
    if need not in globals():
        raise RuntimeError(f"Missing `{need}`. Jalankan STAGE 0 dulu.")

DATA_ROOT = PATHS["DATA_ROOT"]
SPLIT_DIRS = {p.name: p for p in PATHS["SPLITS"]}  # split_01..split_20 -> Path

# ----------------------------
# 1) Canonical split names (keep consistent with STAGE 0)
# ----------------------------
VALID_SPLITS = {f"split_{i:02d}" for i in range(1, 21)}

def normalize_split_name(x) -> str:
    """
    Normalize split formats to canonical 'split_XX'.
    Accepts: 'split_01', '01', '1', 'split1', 'Split 01', 'split-01', etc.
    """
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return ""
    s = str(x).strip()
    if not s:
        return ""
    s2 = s.lower().replace("-", "_").replace(" ", "_")
    # digits only
    if s2.isdigit():
        k = int(s2)
        return f"split_{k:02d}"
    # split_XX
    m = re.fullmatch(r"split_(\d{1,2})", s2)
    if m:
        k = int(m.group(1))
        return f"split_{k:02d}"
    # splitXX
    m = re.fullmatch(r"split(\d{1,2})", s2)
    if m:
        k = int(m.group(1))
        return f"split_{k:02d}"
    return s2

def sizeof_mb(p: Path) -> float:
    try:
        return p.stat().st_size / (1024**2)
    except Exception:
        return float("nan")

# Expected lightcurve schema (as per dataset description)
REQ_LC_COLS = {"object_id", "Time (MJD)", "Flux", "Flux_err", "Filter"}
ALLOWED_FILTERS = {"u", "g", "r", "i", "z", "y"}

def read_header_cols(p: Path):
    """Read header only (nrows=0) to validate schema with minimal IO."""
    df0 = pd.read_csv(p, nrows=0)
    return [c.strip() for c in df0.columns]

def sample_filter_values(p: Path, nrows: int = 200):
    """Read tiny sample of Filter only to sanity-check values."""
    df = pd.read_csv(p, usecols=["Filter"], nrows=nrows)
    vals = (
        df["Filter"]
        .astype(str)
        .str.strip()
        .str.lower()
        .unique()
        .tolist()
    )
    return sorted(set(vals))

# ----------------------------
# 2) Ensure split column in logs is canonical (idempotent)
# ----------------------------
for df, name in [(df_train_log, "train_log"), (df_test_log, "test_log")]:
    if "split" not in df.columns:
        raise ValueError(f"{name} missing 'split' column.")
    df["split"] = df["split"].map(normalize_split_name)

bad_train_split = sorted(set(df_train_log["split"].unique()) - VALID_SPLITS)
bad_test_split  = sorted(set(df_test_log["split"].unique())  - VALID_SPLITS)
if bad_train_split:
    raise ValueError(f"train_log has invalid split values (examples): {bad_train_split[:10]}")
if bad_test_split:
    raise ValueError(f"test_log has invalid split values (examples): {bad_test_split[:10]}")

# ----------------------------
# 3) Verify disk splits are complete and match expected 20 folders
# ----------------------------
disk_splits = set(SPLIT_DIRS.keys())
if disk_splits != VALID_SPLITS:
    missing = sorted(list(VALID_SPLITS - disk_splits))
    extra   = sorted(list(disk_splits - VALID_SPLITS))
    msg = []
    if missing:
        msg.append(f"Missing split folders: {missing[:10]}")
    if extra:
        msg.append(f"Unexpected split folders: {extra[:10]}")
    raise RuntimeError("Split folder set mismatch.\n" + "\n".join(msg))

# Verify logs reference only splits on disk (should pass if above checks pass)
train_splits = set(df_train_log["split"].unique())
test_splits  = set(df_test_log["split"].unique())
bad_train = sorted([s for s in train_splits if s not in disk_splits])
bad_test  = sorted([s for s in test_splits  if s not in disk_splits])
if bad_train:
    raise FileNotFoundError(f"train_log references split(s) not found on disk: {bad_train[:10]}")
if bad_test:
    raise FileNotFoundError(f"test_log references split(s) not found on disk: {bad_test[:10]}")

# ----------------------------
# 4) Verify required files per split exist
# ----------------------------
missing_files = []
split_file_info = []  # (split, train_mb, test_mb)

for split_name in sorted(disk_splits):
    sd = SPLIT_DIRS[split_name]
    tr = sd / "train_full_lightcurves.csv"
    te = sd / "test_full_lightcurves.csv"
    if not tr.exists():
        missing_files.append(str(tr))
    if not te.exists():
        missing_files.append(str(te))
    split_file_info.append((split_name, sizeof_mb(tr), sizeof_mb(te)))

if missing_files:
    sample = "\n".join(missing_files[:10])
    raise FileNotFoundError(f"Some lightcurve files missing (showing up to 10):\n{sample}")

# ----------------------------
# 5) Header-only schema check (very light)
# ----------------------------
col_issues = []
for split_name in sorted(disk_splits):
    sd = SPLIT_DIRS[split_name]
    tr = sd / "train_full_lightcurves.csv"
    te = sd / "test_full_lightcurves.csv"

    cols_tr = read_header_cols(tr)
    cols_te = read_header_cols(te)

    miss_tr = sorted(list(REQ_LC_COLS - set(cols_tr)))
    miss_te = sorted(list(REQ_LC_COLS - set(cols_te)))

    if miss_tr or miss_te:
        col_issues.append((split_name, miss_tr, miss_te, cols_tr, cols_te))

if col_issues:
    s, miss_tr, miss_te, cols_tr, cols_te = col_issues[0]
    raise ValueError(
        "Lightcurve column mismatch detected.\n"
        f"Example split: {s}\n"
        f"Missing in train_full_lightcurves.csv: {miss_tr}\n"
        f"Missing in test_full_lightcurves.csv : {miss_te}\n"
        f"Train columns: {cols_tr}\n"
        f"Test columns : {cols_te}\n"
    )

# ----------------------------
# 6) Tiny filter-value sanity check (small IO)
# ----------------------------
filter_issues = []
for split_name in sorted(disk_splits):
    sd = SPLIT_DIRS[split_name]
    tr = sd / "train_full_lightcurves.csv"
    te = sd / "test_full_lightcurves.csv"

    vals_tr = sample_filter_values(tr, nrows=200)
    vals_te = sample_filter_values(te, nrows=200)

    bad_tr = sorted([v for v in vals_tr if v not in ALLOWED_FILTERS and v != "nan"])
    bad_te = sorted([v for v in vals_te if v not in ALLOWED_FILTERS and v != "nan"])

    if bad_tr:
        filter_issues.append((split_name, "train", bad_tr, vals_tr))
    if bad_te:
        filter_issues.append((split_name, "test", bad_te, vals_te))

if filter_issues:
    ex = filter_issues[0]
    raise ValueError(
        "Unexpected Filter values detected (example):\n"
        f"split={ex[0]} file={ex[1]} bad={ex[2]} all_sampled={ex[3]}\n"
        "Fix by stripping/lowercasing Filter during preprocessing."
    )

# ----------------------------
# 7) Summaries (counts per split, file sizes)
# ----------------------------
train_counts = df_train_log["split"].value_counts().to_dict()
test_counts  = df_test_log["split"].value_counts().to_dict()

# Sanity: sums must match log lengths
if int(sum(train_counts.values())) != int(len(df_train_log)):
    raise RuntimeError("Train split counts do not sum to train_log length (unexpected).")
if int(sum(test_counts.values())) != int(len(df_test_log)):
    raise RuntimeError("Test split counts do not sum to test_log length (unexpected).")

print("SPLIT DISCOVERY OK")
print(f"- DATA_ROOT: {DATA_ROOT}")
print(f"- Splits on disk: {len(disk_splits)} (split_01..split_20)")

print("\nOBJECT COUNTS PER SPLIT (from logs)")
for s in sorted(disk_splits):
    print(f"- {s}: train_objects={train_counts.get(s,0):,} | test_objects={test_counts.get(s,0):,}")

print("\nLIGHTCURVE FILE SIZES (MB)")
for s, mb_tr, mb_te in split_file_info:
    print(f"- {s}: train_full={mb_tr:8.1f} MB | test_full={mb_te:8.1f} MB")

# ----------------------------
# 8) Export split index for later stages (routing + loops)
# ----------------------------
SPLIT_LIST = [f"split_{i:02d}" for i in range(1, 21)]
globals().update({
    "DATA_ROOT": DATA_ROOT,
    "SPLIT_DIRS": SPLIT_DIRS,
    "SPLIT_LIST": SPLIT_LIST,
})

gc.collect()
print("\nStage 1 complete: splits ready for split-wise preprocessing.")


SPLIT DISCOVERY OK
- DATA_ROOT: /kaggle/input/mallorn-dataset
- Splits on disk: 20 (split_01..split_20)

OBJECT COUNTS PER SPLIT (from logs)
- split_01: train_objects=155 | test_objects=364
- split_02: train_objects=170 | test_objects=414
- split_03: train_objects=138 | test_objects=338
- split_04: train_objects=145 | test_objects=332
- split_05: train_objects=165 | test_objects=375
- split_06: train_objects=155 | test_objects=374
- split_07: train_objects=165 | test_objects=398
- split_08: train_objects=162 | test_objects=387
- split_09: train_objects=128 | test_objects=289
- split_10: train_objects=144 | test_objects=331
- split_11: train_objects=146 | test_objects=325
- split_12: train_objects=155 | test_objects=353
- split_13: train_objects=143 | test_objects=379
- split_14: train_objects=154 | test_objects=351
- split_15: train_objects=158 | test_objects=342
- split_16: train_objects=155 | test_objects=354
- split_17: train_objects=153 | test_objects=351
- split_18: train_objects=

# Load and Validate Train/Test Logs

In [4]:
# ============================================================
# STAGE 2 — Load and Validate Train/Test Logs (ONE CELL, CPU-SAFE) — REVISI FULL
# - Ringan (tanpa load full lightcurves)
# - Output:
#   * df_train_meta, df_test_meta  (index=object_id, bersih & siap dipakai)
#   * id2split_train, id2split_test (routing cepat ke split folder)
#   * artifacts/train_log_clean.parquet (atau .csv fallback)
#   * artifacts/test_log_clean.parquet  (atau .csv fallback)
#   * artifacts/split_stats.csv
# ============================================================

import re, gc, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# ----------------------------
# 0) Require STAGE 0/1 globals
# ----------------------------
for need in ["PATHS", "ART_DIR", "SPLIT_DIRS"]:
    if need not in globals():
        raise RuntimeError(f"Missing `{need}`. Jalankan STAGE 0 & STAGE 1 dulu.")

TRAIN_LOG_PATH = Path(PATHS["TRAIN_LOG"])
TEST_LOG_PATH  = Path(PATHS["TEST_LOG"])
disk_splits = set(SPLIT_DIRS.keys())
VALID_SPLITS = {f"split_{i:02d}" for i in range(1, 21)}

# ----------------------------
# 1) Helpers (selaras dengan STAGE 0/1)
# ----------------------------
def normalize_split_name(x) -> str:
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return ""
    s = str(x).strip()
    if not s:
        return ""
    s2 = s.lower().replace("-", "_").replace(" ", "_")
    if s2.isdigit():
        k = int(s2)
        return f"split_{k:02d}"
    m = re.fullmatch(r"split_(\d{1,2})", s2)
    if m:
        k = int(m.group(1))
        return f"split_{k:02d}"
    m = re.fullmatch(r"split(\d{1,2})", s2)
    if m:
        k = int(m.group(1))
        return f"split_{k:02d}"
    return s2

def _norm_cols(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [c.strip() for c in df.columns]
    return df

def _coerce_num(df: pd.DataFrame, col: str):
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# ----------------------------
# 2) Load logs (fresh read for consistency)
# ----------------------------
dtype_log = {"object_id": "string", "split": "string"}
df_train = pd.read_csv(TRAIN_LOG_PATH, dtype=dtype_log)
df_test  = pd.read_csv(TEST_LOG_PATH,  dtype=dtype_log)

df_train = _norm_cols(df_train)
df_test  = _norm_cols(df_test)

# ----------------------------
# 3) Required columns check
# ----------------------------
req_common = {"object_id", "split", "EBV", "Z"}
req_train  = req_common | {"target"}
req_test   = req_common

miss_train = sorted(list(req_train - set(df_train.columns)))
miss_test  = sorted(list(req_test  - set(df_test.columns)))

if miss_train:
    raise ValueError(f"train_log.csv missing required columns: {miss_train} | found={list(df_train.columns)}")
if miss_test:
    raise ValueError(f"test_log.csv missing required columns: {miss_test} | found={list(df_test.columns)}")

# ----------------------------
# 4) Basic cleaning (types + canonical split)
# ----------------------------
df_train["object_id"] = df_train["object_id"].astype("string").str.strip()
df_test["object_id"]  = df_test["object_id"].astype("string").str.strip()

df_train["split"] = df_train["split"].astype("string").map(normalize_split_name)
df_test["split"]  = df_test["split"].astype("string").map(normalize_split_name)

# Split validity
bad_train_split = sorted(set(df_train["split"].unique()) - VALID_SPLITS)
bad_test_split  = sorted(set(df_test["split"].unique())  - VALID_SPLITS)
if bad_train_split:
    raise ValueError(f"train_log has invalid split values (examples): {bad_train_split[:10]}")
if bad_test_split:
    raise ValueError(f"test_log has invalid split values (examples): {bad_test_split[:10]}")

bad_train_disk = sorted([s for s in set(df_train["split"].unique()) if s not in disk_splits])
bad_test_disk  = sorted([s for s in set(df_test["split"].unique())  if s not in disk_splits])
if bad_train_disk:
    raise FileNotFoundError(f"train_log references unknown split(s) not on disk: {bad_train_disk[:10]}")
if bad_test_disk:
    raise FileNotFoundError(f"test_log references unknown split(s) not on disk: {bad_test_disk[:10]}")

# Numeric coercion
for c in ["EBV", "Z", "Z_err"]:
    _coerce_num(df_train, c)
    _coerce_num(df_test, c)

# Ensure Z_err exists in both (schema consistency)
if "Z_err" not in df_train.columns:
    df_train["Z_err"] = np.nan
if "Z_err" not in df_test.columns:
    df_test["Z_err"] = np.nan

# ----------------------------
# 5) Duplicates / overlap checks (hard fail)
# ----------------------------
if df_train["object_id"].duplicated().any():
    ex = df_train.loc[df_train["object_id"].duplicated(), "object_id"].head(5).tolist()
    raise ValueError(f"Duplicated object_id in train_log (examples): {ex}")
if df_test["object_id"].duplicated().any():
    ex = df_test.loc[df_test["object_id"].duplicated(), "object_id"].head(5).tolist()
    raise ValueError(f"Duplicated object_id in test_log (examples): {ex}")

overlap = set(df_train["object_id"].tolist()) & set(df_test["object_id"].tolist())
if overlap:
    ex = list(overlap)[:5]
    raise ValueError(f"object_id overlap between train_log and test_log (examples): {ex}")

# ----------------------------
# 6) Target validation (train)
# ----------------------------
df_train["target"] = pd.to_numeric(df_train["target"], errors="coerce")
if df_train["target"].isna().any():
    n_na = int(df_train["target"].isna().sum())
    raise ValueError(f"train_log target has NaN after coercion: {n_na} rows.")
uniq_t = set(pd.unique(df_train["target"]).tolist())
if not uniq_t.issubset({0, 1}):
    raise ValueError(f"train_log target must be binary 0/1. Found: {sorted(list(uniq_t))}")
df_train["target"] = df_train["target"].astype(np.int8)

# ----------------------------
# 7) Missing flags + fills (vectorized, CPU-safe)
# ----------------------------
for df in [df_train, df_test]:
    df["EBV_missing"] = df["EBV"].isna().astype(np.int8)
    df["Z_missing"]   = df["Z"].isna().astype(np.int8)
    df["Z_err_missing"] = df["Z_err"].isna().astype(np.int8)

# EBV: fill NaN -> 0.0
df_train["EBV"] = df_train["EBV"].fillna(0.0)
df_test["EBV"]  = df_test["EBV"].fillna(0.0)

# Z: fill NaN -> median per split (fallback global median)
def fill_z_by_split(df: pd.DataFrame) -> pd.Series:
    z = df["Z"]
    if not z.isna().any():
        return z
    z_fill = z.fillna(df.groupby("split")["Z"].transform("median"))
    global_med = float(z.median()) if z.notna().any() else 0.0
    z_fill = z_fill.fillna(global_med)
    return z_fill

df_train["Z"] = fill_z_by_split(df_train)
df_test["Z"]  = fill_z_by_split(df_test)

# Z_err: fill NaN -> 0.0 (schema stability)
df_train["Z_err"] = df_train["Z_err"].fillna(0.0)
df_test["Z_err"]  = df_test["Z_err"].fillna(0.0)

# Domain flag (analysis-only; jangan dipakai sebagai fitur model)
df_train["is_photoz"] = np.int8(0)
df_test["is_photoz"]  = np.int8(1)

# ----------------------------
# 8) Build meta tables (index=object_id) + routing dicts
# ----------------------------
keep_train = [
    "object_id","split","EBV","Z","Z_err",
    "EBV_missing","Z_missing","Z_err_missing",
    "is_photoz","target"
]
keep_test = [
    "object_id","split","EBV","Z","Z_err",
    "EBV_missing","Z_missing","Z_err_missing",
    "is_photoz"
]

# Optional: keep SpecType for analysis
if "SpecType" in df_train.columns:
    keep_train.append("SpecType")

df_train_meta = df_train[keep_train].copy()
df_test_meta  = df_test[keep_test].copy()

df_train_meta = df_train_meta.set_index("object_id", drop=True).sort_index()
df_test_meta  = df_test_meta.set_index("object_id", drop=True).sort_index()

id2split_train = df_train_meta["split"].to_dict()
id2split_test  = df_test_meta["split"].to_dict()

# ----------------------------
# 9) Save cleaned logs (parquet preferred, csv fallback)
# ----------------------------
train_out_pq = Path(ART_DIR) / "train_log_clean.parquet"
test_out_pq  = Path(ART_DIR) / "test_log_clean.parquet"
train_out_csv = Path(ART_DIR) / "train_log_clean.csv"
test_out_csv  = Path(ART_DIR) / "test_log_clean.csv"

saved_train = None
saved_test  = None
try:
    df_train_meta.to_parquet(train_out_pq, index=True)
    df_test_meta.to_parquet(test_out_pq, index=True)
    saved_train = str(train_out_pq)
    saved_test  = str(test_out_pq)
except Exception:
    df_train_meta.to_csv(train_out_csv, index=True)
    df_test_meta.to_csv(test_out_csv, index=True)
    saved_train = str(train_out_csv)
    saved_test  = str(test_out_csv)

# Split stats (debug)
split_stats = pd.DataFrame({
    "train_objects": df_train_meta["split"].value_counts().reindex(sorted(disk_splits)).fillna(0).astype(int),
    "test_objects":  df_test_meta["split"].value_counts().reindex(sorted(disk_splits)).fillna(0).astype(int),
})
split_stats.index.name = "split"
split_stats_path = Path(ART_DIR) / "split_stats.csv"
split_stats.to_csv(split_stats_path)

# ----------------------------
# 10) Print summary
# ----------------------------
pos = int((df_train_meta["target"] == 1).sum())
neg = int((df_train_meta["target"] == 0).sum())
tot = int(len(df_train_meta))

print("LOGS OK (clean + validated)")
print(f"- train objects: {tot:,} | pos(TDE)={pos:,} | neg={neg:,} | pos%={(pos/max(tot,1))*100:.3f}%")
print(f"- test objects : {len(df_test_meta):,}")
print(f"- saved train  : {saved_train}")
print(f"- saved test   : {saved_test}")
print(f"- saved stats  : {split_stats_path}")

# ----------------------------
# 11) Export globals for next stages
# ----------------------------
globals().update({
    "df_train_meta": df_train_meta,
    "df_test_meta": df_test_meta,
    "id2split_train": id2split_train,
    "id2split_test": id2split_test,
    "split_stats": split_stats,
})

gc.collect()


LOGS OK (clean + validated)
- train objects: 3,043 | pos(TDE)=148 | neg=2,895 | pos%=4.864%
- test objects : 7,135
- saved train  : /kaggle/working/mallorn_run/artifacts/train_log_clean.parquet
- saved test   : /kaggle/working/mallorn_run/artifacts/test_log_clean.parquet
- saved stats  : /kaggle/working/mallorn_run/artifacts/split_stats.csv


0

# Lightcurve Loading Strategy

In [5]:
# ============================================================
# STAGE 3 — Lightcurve Loading Strategy (ONE CELL, Kaggle CPU-SAFE) — REVISI FULL
# - Split-wise file mapping + chunked reader utilities (no full concat)
# - Builds:
#   * SPLIT_FILES: {split_XX: {"train": Path, "test": Path}}
#   * train_ids_by_split / test_ids_by_split: routing object_ids per split
#   * iter_lightcurve_chunks(): generator read_csv(chunksize=...)
#   * load_object_lightcurve(): debug-safe per-object extraction (streaming, optional guard)
# - Saves:
#   * artifacts/split_file_manifest.csv
#   * artifacts/object_counts_by_split.csv
# ============================================================

import gc
from pathlib import Path
import numpy as np
import pandas as pd

# ----------------------------
# 0) Require previous stages
# ----------------------------
for need in ["SPLIT_DIRS", "SPLIT_LIST", "df_train_meta", "df_test_meta", "ART_DIR"]:
    if need not in globals():
        raise RuntimeError(f"Missing `{need}`. Jalankan STAGE 0 -> STAGE 1 -> STAGE 2 dulu.")

# ----------------------------
# 1) Build split file mapping (train/test lightcurves)
# ----------------------------
SPLIT_FILES = {}
for s in SPLIT_LIST:
    sd = SPLIT_DIRS[s]
    tr = sd / "train_full_lightcurves.csv"
    te = sd / "test_full_lightcurves.csv"
    if (not tr.exists()) or (not te.exists()):
        raise FileNotFoundError(f"Missing lightcurve file(s) in {sd}: train={tr.exists()} test={te.exists()}")
    SPLIT_FILES[s] = {"train": tr, "test": te}

# Save split file manifest
manifest = []
for s in SPLIT_LIST:
    p_tr = SPLIT_FILES[s]["train"]
    p_te = SPLIT_FILES[s]["test"]
    manifest.append({
        "split": s,
        "train_path": str(p_tr),
        "test_path": str(p_te),
        "train_mb": float(p_tr.stat().st_size) / (1024**2),
        "test_mb":  float(p_te.stat().st_size) / (1024**2),
    })
df_manifest = pd.DataFrame(manifest).sort_values("split")
manifest_path = Path(ART_DIR) / "split_file_manifest.csv"
df_manifest.to_csv(manifest_path, index=False)

# ----------------------------
# 2) Build object routing by split (FAST + correct)
# ----------------------------
train_ids_by_split = {s: [] for s in SPLIT_LIST}
test_ids_by_split  = {s: [] for s in SPLIT_LIST}

# df_train_meta / df_test_meta index = object_id, and column "split" exists
for oid, split_name in df_train_meta["split"].items():
    train_ids_by_split[str(split_name)].append(str(oid))
for oid, split_name in df_test_meta["split"].items():
    test_ids_by_split[str(split_name)].append(str(oid))

df_counts = pd.DataFrame({
    "split": SPLIT_LIST,
    "train_objects": [len(train_ids_by_split[s]) for s in SPLIT_LIST],
    "test_objects":  [len(test_ids_by_split[s]) for s in SPLIT_LIST],
})
counts_path = Path(ART_DIR) / "object_counts_by_split.csv"
df_counts.to_csv(counts_path, index=False)

# ----------------------------
# 3) Column normalization (canonical: object_id, mjd, flux, flux_err, filter)
# ----------------------------
REQ_LC_KEYS = ["object_id", "mjd", "flux", "flux_err", "filter"]
ALLOWED_FILTERS = {"u", "g", "r", "i", "z", "y"}

# Header config cache to avoid re-reading headers repeatedly
_LC_CFG_CACHE = {}  # (split_name, which) -> dict(usecols=[...], dtype={...}, rename={...})

def _build_lc_read_cfg(p: Path):
    """
    Build a robust read config by:
    - reading header only once
    - matching required columns by stripped name (tolerant to whitespace)
    - selecting best time column among known options
    """
    header = pd.read_csv(p, nrows=0)
    orig_cols = list(header.columns)

    # map stripped -> original (first occurrence)
    strip2orig = {}
    for c in orig_cols:
        cs = str(c).strip()
        if cs not in strip2orig:
            strip2orig[cs] = c

    # choose time column
    time_candidates = ["Time (MJD)", "Time(MJD)", "Time"]
    time_col = None
    for tc in time_candidates:
        if tc in strip2orig:
            time_col = strip2orig[tc]
            break
    if time_col is None:
        raise ValueError(f"Cannot find time column in {p}. Expected one of {time_candidates}. Found: {orig_cols[:20]}")

    # required columns (match by stripped name)
    required = {
        "object_id": strip2orig.get("object_id", None),
        "mjd": time_col,
        "flux": strip2orig.get("Flux", None),
        "flux_err": strip2orig.get("Flux_err", None),
        "filter": strip2orig.get("Filter", None),
    }
    missing = [k for k, v in required.items() if v is None]
    if missing:
        raise ValueError(f"Missing required lightcurve columns in {p}: {missing}. Found: {orig_cols[:30]}")

    usecols = [required["object_id"], required["mjd"], required["flux"], required["flux_err"], required["filter"]]
    rename = {
        required["object_id"]: "object_id",
        required["mjd"]: "mjd",
        required["flux"]: "flux",
        required["flux_err"]: "flux_err",
        required["filter"]: "filter",
    }

    # dtypes: keep safe and small
    dtypes = {
        required["object_id"]: "string",
        required["filter"]: "string",
        required["flux"]: "float32",
        required["flux_err"]: "float32",
        required["mjd"]: "float32",
    }
    return {"usecols": usecols, "dtype": dtypes, "rename": rename}

def _normalize_lc_chunk(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize a raw chunk to canonical columns and clean string fields.
    Output columns: object_id, mjd, flux, flux_err, filter
    """
    df = df.rename(columns={c: str(c).strip() for c in df.columns})
    # sometimes dtype coercion still yields float64; cast down
    df = df.rename(columns={
        "Time (MJD)": "mjd", "Time(MJD)": "mjd", "Time": "mjd",
        "Flux": "flux", "Flux_err": "flux_err", "Filter": "filter",
        "object_id": "object_id"
    })
    need = {"object_id", "mjd", "flux", "flux_err", "filter"}
    miss = sorted(list(need - set(df.columns)))
    if miss:
        raise ValueError(f"Lightcurve chunk missing columns after rename: {miss}. Found: {list(df.columns)}")

    df["object_id"] = df["object_id"].astype("string").str.strip()
    df["filter"] = df["filter"].astype("string").str.strip().str.lower()

    # numeric cast
    df["mjd"] = pd.to_numeric(df["mjd"], errors="coerce").astype("float32")
    df["flux"] = pd.to_numeric(df["flux"], errors="coerce").astype("float32")
    df["flux_err"] = pd.to_numeric(df["flux_err"], errors="coerce").astype("float32")

    return df[REQ_LC_KEYS]

# ----------------------------
# 4) Chunked readers (core strategy)
# ----------------------------
def iter_lightcurve_chunks(split_name: str, which: str, chunksize: int = 400_000):
    """
    Stream read a split lightcurve CSV in chunks.
    Yields normalized chunks with columns:
      object_id, mjd, flux, flux_err, filter
    """
    if split_name not in SPLIT_FILES:
        raise KeyError(f"Unknown split_name={split_name}.")
    if which not in ("train", "test"):
        raise ValueError("which must be 'train' or 'test'")

    p = SPLIT_FILES[split_name][which]
    key = (split_name, which)
    if key not in _LC_CFG_CACHE:
        _LC_CFG_CACHE[key] = _build_lc_read_cfg(p)

    cfg = _LC_CFG_CACHE[key]
    reader = pd.read_csv(
        p,
        usecols=cfg["usecols"],
        dtype=cfg["dtype"],
        chunksize=int(chunksize),
    )
    for chunk in reader:
        chunk = chunk.rename(columns=cfg["rename"])
        yield _normalize_lc_chunk(chunk)

def load_split_lightcurves(split_name: str, which: str, chunksize: int = 400_000):
    """
    Convenience: load entire split file (NOT recommended for large files).
    Use only for quick debugging.
    """
    parts = []
    for ch in iter_lightcurve_chunks(split_name, which, chunksize=chunksize):
        parts.append(ch)
    if not parts:
        return pd.DataFrame(columns=REQ_LC_KEYS)
    return pd.concat(parts, ignore_index=True)

def load_object_lightcurve(object_id: str, which: str, chunksize: int = 400_000, sort_time: bool = True, max_chunks: int = None):
    """
    Debug-safe per-object extraction by streaming the relevant split file.
    WARNING: This scans the split CSV in chunks. Use only occasionally.
    max_chunks: optional guard to stop after N chunks (useful to avoid accidental long scans).
    """
    object_id = str(object_id).strip()
    if which == "train":
        if object_id not in df_train_meta.index:
            raise KeyError(f"object_id not found in df_train_meta: {object_id}")
        split_name = str(df_train_meta.loc[object_id, "split"])
    elif which == "test":
        if object_id not in df_test_meta.index:
            raise KeyError(f"object_id not found in df_test_meta: {object_id}")
        split_name = str(df_test_meta.loc[object_id, "split"])
    else:
        raise ValueError("which must be 'train' or 'test'")

    pieces = []
    n_seen = 0
    for ch in iter_lightcurve_chunks(split_name, which, chunksize=chunksize):
        n_seen += 1
        sub = ch[ch["object_id"] == object_id]
        if not sub.empty:
            pieces.append(sub)
        if max_chunks is not None and n_seen >= int(max_chunks):
            break

    if not pieces:
        out = pd.DataFrame(columns=REQ_LC_KEYS)
    else:
        out = pd.concat(pieces, ignore_index=True)
        if sort_time and len(out) > 1:
            out = out.sort_values(["mjd", "filter"], kind="mergesort").reset_index(drop=True)
    return out

# ----------------------------
# 5) Quick smoke test (VERY light, no full scan)
# ----------------------------
# Tujuan: memastikan chunk reader bekerja + kolom benar + filter values wajar
_smoke_splits = ["split_01", "split_08", "split_17"]
for s in _smoke_splits:
    if len(train_ids_by_split.get(s, [])) == 0 or len(test_ids_by_split.get(s, [])) == 0:
        raise RuntimeError(f"Split {s} has 0 objects in train/test log (unexpected).")

    # read one small chunk from train and test
    ch_tr = next(iter_lightcurve_chunks(s, "train", chunksize=50_000))
    ch_te = next(iter_lightcurve_chunks(s, "test",  chunksize=50_000))

    # schema
    if list(ch_tr.columns) != REQ_LC_KEYS:
        raise RuntimeError(f"Train chunk schema mismatch in {s}: {list(ch_tr.columns)}")
    if list(ch_te.columns) != REQ_LC_KEYS:
        raise RuntimeError(f"Test chunk schema mismatch in {s}: {list(ch_te.columns)}")

    # filter sanity on sample chunk
    badf_tr = sorted(set(ch_tr["filter"].dropna().unique()) - ALLOWED_FILTERS)
    badf_te = sorted(set(ch_te["filter"].dropna().unique()) - ALLOWED_FILTERS)
    if badf_tr or badf_te:
        raise ValueError(f"Unexpected filter values in smoke chunk split={s}: train_bad={badf_tr} test_bad={badf_te}")

print("LIGHTCURVE LOADING STRATEGY OK (split-wise + chunked)")
print(f"- Saved: {manifest_path}")
print(f"- Saved: {counts_path}")
print("- Ready for next stage: photometric preprocessing + ASTROMER input building (split-wise loop).")

# ----------------------------
# 6) Export globals for next stages
# ----------------------------
globals().update({
    "SPLIT_FILES": SPLIT_FILES,
    "train_ids_by_split": train_ids_by_split,
    "test_ids_by_split": test_ids_by_split,
    "iter_lightcurve_chunks": iter_lightcurve_chunks,
    "load_object_lightcurve": load_object_lightcurve,
})

gc.collect()


LIGHTCURVE LOADING STRATEGY OK (split-wise + chunked)
- Saved: /kaggle/working/mallorn_run/artifacts/split_file_manifest.csv
- Saved: /kaggle/working/mallorn_run/artifacts/object_counts_by_split.csv
- Ready for next stage: photometric preprocessing + ASTROMER input building (split-wise loop).


301

# Photometric Cleaning (De-extinction + Negative Flux Safe Transform)

In [6]:
# ============================================================
# STAGE 4 — Photometric Cleaning (De-extinction + NEGATIVE FLUX -> MAG SAFE) — REVISI FULL (ASTROMER-READY)
# ONE CELL, Kaggle CPU-SAFE, split-wise + chunked
#
# Prasyarat:
# - iter_lightcurve_chunks (STAGE 3)  -> yields: object_id,mjd,flux,flux_err,filter
# - df_train_meta, df_test_meta (STAGE 2) with EBV
# - ART_DIR, SPLIT_LIST (STAGE 0/1)
#
# Output (per observation, siap untuk ASTROMER input builder):
# - lc_clean_mag/split_XX/{train|test}/part_*.parquet (atau .csv.gz fallback)
#   columns:
#     object_id (string), mjd(float32), band_id(int8), mag(float32), mag_err(float32),
#     snr(float32), detected(int8)
# - manifest + summary + config.json
# ============================================================

import gc, json, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# ----------------------------
# 0) Require previous stages
# ----------------------------
for need in ["iter_lightcurve_chunks", "df_train_meta", "df_test_meta", "ART_DIR", "SPLIT_LIST"]:
    if need not in globals():
        raise RuntimeError(f"Missing `{need}`. Jalankan STAGE 0 -> 1 -> 2 -> 3 dulu.")

# ----------------------------
# 1) Settings (CPU-safe defaults)
# ----------------------------
CHUNKSIZE   = 350_000   # bigger=faster but more RAM
ERR_EPS     = 1e-6      # avoid div-by-zero
SNR_DET     = 3.0       # detection threshold (proxy)
DET_SIGMA   = 3.0       # for non-detection "limit" flux = DET_SIGMA * err

# magnitude stability
MIN_FLUX_POS_UJY   = 1e-6   # clamp minimum positive flux to avoid log10(0)
MAG_MIN, MAG_MAX   = -10.0, 50.0
MAGERR_FLOOR_DET   = 1e-3
MAGERR_FLOOR_ND    = 0.75   # inflate uncertainty for non-detections (helps encoder + head)
MAGERR_CAP         = 10.0

WRITE_FORMAT = "parquet"     # parquet recommended; auto-fallback to csv.gz if parquet fails
ONLY_SPLITS  = None          # None = process all 20 splits; else e.g. ["split_01","split_02"]

# Debug: keep flux_deext? (biasanya tidak perlu untuk ASTROMER, bikin file lebih besar)
KEEP_FLUX_DEBUG = False

# ----------------------------
# 2) Extinction coefficients (R_lambda)
# NOTE: kalau kamu punya koefisien resmi dari notebook "Using_the_Data", ganti nilai di sini.
# A_lambda = R_lambda * EBV
# Flux de-extinction: flux_deext = flux * 10^(0.4 * A_lambda)
# ----------------------------
EXT_RLAMBDA = {
    "u": 4.8,
    "g": 3.6,
    "r": 2.7,
    "i": 2.1,
    "z": 1.6,
    "y": 1.3,
}

BAND2ID = {"u": 0, "g": 1, "r": 2, "i": 3, "z": 4, "y": 5}
ID2BAND = {v: k for k, v in BAND2ID.items()}
ALLOWED_BANDS = set(BAND2ID.keys())

# EBV mapping Series (index=object_id)
EBV_TRAIN_SER = df_train_meta["EBV"]
EBV_TEST_SER  = df_test_meta["EBV"]

# AB magnitude zero-point for flux in microJansky (uJy):
# mag_AB = 23.9 - 2.5*log10(flux_uJy)  (because 3631 Jy = 3.631e9 uJy)
MAG_ZP = float(2.5 * np.log10(3631e6))  # ~23.899999...

# ----------------------------
# 3) Core cleaning: chunk -> (de-extinct flux) -> mag/mag_err (negative-safe)
# ----------------------------
def clean_chunk_to_mag(ch: pd.DataFrame, ebv_ser: pd.Series) -> pd.DataFrame:
    """
    Input chunk columns: object_id,mjd,flux,flux_err,filter
    Output columns:
      object_id, mjd, band_id, mag, mag_err, snr, detected  (+optional flux_deext, err_deext)
    """
    # base arrays
    oid = ch["object_id"].astype("string").to_numpy()
    mjd = ch["mjd"].to_numpy(dtype=np.float32, copy=False)

    flux = ch["flux"].to_numpy(dtype=np.float32, copy=False)
    err  = ch["flux_err"].to_numpy(dtype=np.float32, copy=False)
    err  = np.maximum(err, np.float32(ERR_EPS))

    filt = ch["filter"].astype("string").to_numpy()
    # normalize to lowercase single char
    # (stage 3 already lowercased, but keep safe)
    filt = np.char.lower(np.char.strip(filt.astype(str)))

    # band_id (vectorized)
    band_id = np.full(len(ch), -1, dtype=np.int8)
    for b, bid in BAND2ID.items():
        band_id[filt == b] = np.int8(bid)
    if np.any(band_id < 0):
        bad = sorted(set(filt[band_id < 0].tolist()))
        raise ValueError(f"Unknown filter values encountered (example up to 10): {bad[:10]}")

    # EBV lookup
    ebv = ch["object_id"].map(ebv_ser).fillna(0.0).to_numpy(dtype=np.float32)

    # R_lambda lookup (vectorized)
    rlam = np.zeros(len(ch), dtype=np.float32)
    for b, rv in EXT_RLAMBDA.items():
        rlam[filt == b] = np.float32(rv)

    A = (rlam * ebv).astype(np.float32)  # A_lambda
    mul = np.power(np.float32(10.0), (np.float32(0.4) * A)).astype(np.float32)

    # de-extinct in flux domain
    flux_deext = (flux * mul).astype(np.float32)
    err_deext  = (err  * mul).astype(np.float32)

    # SNR + detected
    snr = (flux_deext / np.maximum(err_deext, np.float32(ERR_EPS))).astype(np.float32)
    detected = (snr > np.float32(SNR_DET)).astype(np.int8)

    # Negative-safe magnitude:
    # - if detected: use measured de-extinct flux (clamped positive)
    # - else: use detection-limit flux = DET_SIGMA * err_deext (clamped positive)
    flux_detlim = (np.float32(DET_SIGMA) * err_deext).astype(np.float32)

    flux_for_mag = np.where(
        detected == 1,
        np.maximum(flux_deext, np.float32(MIN_FLUX_POS_UJY)),
        np.maximum(flux_detlim, np.float32(MIN_FLUX_POS_UJY)),
    ).astype(np.float32)

    # mag
    mag = (np.float32(MAG_ZP) - np.float32(2.5) * np.log10(flux_for_mag)).astype(np.float32)
    mag = np.clip(mag, np.float32(MAG_MIN), np.float32(MAG_MAX)).astype(np.float32)

    # mag_err ~ (2.5/ln 10) * (err/flux_for_mag)
    mag_err = (np.float32(1.0857362) * (err_deext / flux_for_mag)).astype(np.float32)
    mag_err = np.clip(mag_err, np.float32(MAGERR_FLOOR_DET), np.float32(MAGERR_CAP)).astype(np.float32)

    # inflate uncertainty for non-detections (important)
    if MAGERR_FLOOR_ND is not None and float(MAGERR_FLOOR_ND) > 0:
        mag_err = np.where(
            detected == 1,
            mag_err,
            np.maximum(mag_err, np.float32(MAGERR_FLOOR_ND))
        ).astype(np.float32)

    out = pd.DataFrame({
        "object_id": oid,
        "mjd": mjd,
        "band_id": band_id,
        "mag": mag,
        "mag_err": mag_err,
        "snr": snr,
        "detected": detected,
    })

    if KEEP_FLUX_DEBUG:
        out["flux_deext"] = flux_deext
        out["err_deext"]  = err_deext

    return out

# ----------------------------
# 4) Writer (parquet preferred; fallback csv.gz)
# ----------------------------
def write_part(df: pd.DataFrame, out_path: Path, fmt: str):
    out_path.parent.mkdir(parents=True, exist_ok=True)
    if fmt == "parquet":
        try:
            df.to_parquet(out_path, index=False)
            return "parquet", out_path
        except Exception as e:
            alt = out_path.with_suffix(".csv.gz")
            df.to_csv(alt, index=False, compression="gzip")
            return f"csv.gz (fallback from parquet: {type(e).__name__})", alt
    elif fmt == "csv.gz":
        alt = out_path.with_suffix(".csv.gz")
        df.to_csv(alt, index=False, compression="gzip")
        return "csv.gz", alt
    else:
        raise ValueError("fmt must be 'parquet' or 'csv.gz'")

# ----------------------------
# 5) Process all splits split-wise (stream -> clean -> write parts)
# ----------------------------
splits_to_use = ONLY_SPLITS if (ONLY_SPLITS is not None) else SPLIT_LIST

LC_CLEAN_DIR = Path(ART_DIR) / "lc_clean_mag"
LC_CLEAN_DIR.mkdir(parents=True, exist_ok=True)

summary_rows = []
manifest_rows = []

def process_split(split_name: str, which: str):
    ebv_ser = EBV_TRAIN_SER if which == "train" else EBV_TEST_SER
    out_dir = LC_CLEAN_DIR / split_name / which
    out_dir.mkdir(parents=True, exist_ok=True)

    part_idx = 0
    n_rows_total = 0
    n_flux_neg = 0
    n_det = 0
    n_finite_mag = 0
    mag_min = np.inf
    mag_max = -np.inf

    for ch in iter_lightcurve_chunks(split_name, which, chunksize=CHUNKSIZE):
        cleaned = clean_chunk_to_mag(ch, ebv_ser)

        n_rows = int(len(cleaned))
        n_rows_total += n_rows

        # stats
        if KEEP_FLUX_DEBUG and "flux_deext" in cleaned.columns:
            n_flux_neg += int((cleaned["flux_deext"].to_numpy(dtype=np.float32) < 0).sum())
        else:
            # if not keeping flux, approximate neg fraction using snr sign:
            # (snr negative implies flux negative)
            n_flux_neg += int((cleaned["snr"].to_numpy(dtype=np.float32) < 0).sum())

        det_arr = cleaned["detected"].to_numpy(dtype=np.int8)
        n_det += int(det_arr.sum())

        mag_arr = cleaned["mag"].to_numpy(dtype=np.float32)
        fin = np.isfinite(mag_arr)
        n_finite_mag += int(fin.sum())
        if fin.any():
            mag_min = float(min(mag_min, float(np.min(mag_arr[fin]))))
            mag_max = float(max(mag_max, float(np.max(mag_arr[fin]))))

        # write
        out_path = out_dir / f"part_{part_idx:04d}.parquet"
        used_fmt, final_path = write_part(cleaned, out_path, WRITE_FORMAT)

        manifest_rows.append({
            "split": split_name,
            "which": which,
            "part": part_idx,
            "path": str(final_path),
            "rows": n_rows,
            "format": used_fmt,
        })

        part_idx += 1
        del cleaned, ch
        if part_idx % 10 == 0:
            gc.collect()

    summary_rows.append({
        "split": split_name,
        "which": which,
        "parts": part_idx,
        "rows": n_rows_total,
        "neg_flux_frac_proxy": (n_flux_neg / max(n_rows_total, 1)),
        "det_frac_snr_gt_thr": (n_det / max(n_rows_total, 1)),
        "finite_mag_frac": (n_finite_mag / max(n_rows_total, 1)),
        "mag_min": (mag_min if np.isfinite(mag_min) else np.nan),
        "mag_max": (mag_max if np.isfinite(mag_max) else np.nan),
    })

    print(
        f"[Stage 4] {split_name}/{which}: parts={part_idx} | rows={n_rows_total:,} | "
        f"det%={100*(n_det/max(n_rows_total,1)):.2f}% | "
        f"neg_proxy%={100*(n_flux_neg/max(n_rows_total,1)):.2f}% | "
        f"mag_range=[{(mag_min if np.isfinite(mag_min) else np.nan):.2f}, {(mag_max if np.isfinite(mag_max) else np.nan):.2f}]"
    )

print("[Stage 4] Building cleaned MAG lightcurve cache (split-wise) ...")
for s in splits_to_use:
    process_split(s, "train")
    process_split(s, "test")

# ----------------------------
# 6) Save manifests + summary + config
# ----------------------------
df_manifest = pd.DataFrame(manifest_rows)
df_summary  = pd.DataFrame(summary_rows)

manifest_path = LC_CLEAN_DIR / "lc_clean_mag_manifest.csv"
summary_path  = LC_CLEAN_DIR / "lc_clean_mag_summary.csv"

df_manifest.to_csv(manifest_path, index=False)
df_summary.to_csv(summary_path, index=False)

cfg_path = LC_CLEAN_DIR / "photometric_config_mag.json"
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump({
        "EXT_RLAMBDA": EXT_RLAMBDA,
        "SNR_DET": SNR_DET,
        "DET_SIGMA": DET_SIGMA,
        "ERR_EPS": ERR_EPS,
        "MIN_FLUX_POS_UJY": MIN_FLUX_POS_UJY,
        "MAG_ZP": MAG_ZP,
        "MAG_MIN": MAG_MIN,
        "MAG_MAX": MAG_MAX,
        "MAGERR_FLOOR_DET": MAGERR_FLOOR_DET,
        "MAGERR_FLOOR_ND": MAGERR_FLOOR_ND,
        "MAGERR_CAP": MAGERR_CAP,
        "CHUNKSIZE": CHUNKSIZE,
        "WRITE_FORMAT": WRITE_FORMAT,
        "ONLY_SPLITS": splits_to_use,
        "KEEP_FLUX_DEBUG": KEEP_FLUX_DEBUG,
    }, f, indent=2)

print("\n[Stage 4] Done.")
print(f"- Saved manifest: {manifest_path}")
print(f"- Saved summary : {summary_path}")
print(f"- Saved config  : {cfg_path}")

# ----------------------------
# 7) Helper for next stages
# ----------------------------
def get_clean_parts(split_name: str, which: str):
    m = df_manifest[(df_manifest["split"] == split_name) & (df_manifest["which"] == which)].sort_values("part")
    return m["path"].tolist()

globals().update({
    "EXT_RLAMBDA": EXT_RLAMBDA,
    "BAND2ID": BAND2ID,
    "ID2BAND": ID2BAND,
    "MAG_ZP": MAG_ZP,
    "LC_CLEAN_DIR": LC_CLEAN_DIR,
    "lc_clean_mag_manifest": df_manifest,
    "lc_clean_mag_summary": df_summary,
    "get_clean_parts": get_clean_parts,
})

gc.collect()


[Stage 4] Building cleaned MAG lightcurve cache (split-wise) ...
[Stage 4] split_01/train: parts=1 | rows=26,324 | det%=19.34% | neg_proxy%=38.95% | mag_range=[19.72, 25.96]
[Stage 4] split_01/test: parts=1 | rows=59,235 | det%=23.02% | neg_proxy%=37.74% | mag_range=[19.61, 26.20]
[Stage 4] split_02/train: parts=1 | rows=25,609 | det%=24.45% | neg_proxy%=34.02% | mag_range=[20.10, 26.04]
[Stage 4] split_02/test: parts=1 | rows=71,229 | det%=21.69% | neg_proxy%=36.48% | mag_range=[18.77, 26.32]
[Stage 4] split_03/train: parts=1 | rows=21,676 | det%=21.65% | neg_proxy%=36.82% | mag_range=[20.17, 26.23]
[Stage 4] split_03/test: parts=1 | rows=53,751 | det%=21.90% | neg_proxy%=36.70% | mag_range=[19.61, 26.37]
[Stage 4] split_04/train: parts=1 | rows=22,898 | det%=21.11% | neg_proxy%=38.36% | mag_range=[20.38, 26.16]
[Stage 4] split_04/test: parts=1 | rows=51,408 | det%=21.70% | neg_proxy%=38.16% | mag_range=[19.66, 26.25]
[Stage 4] split_05/train: parts=1 | rows=25,934 | det%=18.33% | neg

273

# Sequence Tokenization (Event-based Tokens)

In [7]:
# ============================================================
# STAGE 5 — Sequence Tokenization (Event-based Tokens) (ONE CELL, Kaggle CPU-SAFE) — REVISI FULL
#
# Tujuan:
# - Mengubah cleaned lightcurve (STAGE 4) -> token sequence per object_id
# - 1 observasi = 1 token
# - Simpan shard .npz per split & (train/test) + manifest slice
#
# Kompatibilitas:
# - Auto-detect schema STAGE 4:
#   (A) MAG schema (revisi ASTROMER-ready):
#       object_id,mjd,band_id,mag,mag_err,snr,detected
#   (B) ASINH schema (versi lama):
#       object_id,mjd,band_id,flux_asinh,err_log1p,snr,detected
#
# Input:
# - LC_CLEAN_DIR, get_clean_parts (STAGE 4)
# - df_train_meta, df_test_meta (STAGE 2)
# - train_ids_by_split, test_ids_by_split (STAGE 3)
# - SPLIT_LIST, ART_DIR
#
# Output:
# - artifacts/seq_tokens/split_XX/{train|test}/shard_*.npz
# - artifacts/seq_tokens/seq_manifest_{train|test}.csv
# - artifacts/seq_tokens/seq_config.json
# ============================================================

import gc, json, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# ----------------------------
# 0) Require previous stages
# ----------------------------
for need in ["LC_CLEAN_DIR", "get_clean_parts",
             "df_train_meta", "df_test_meta",
             "train_ids_by_split", "test_ids_by_split",
             "SPLIT_LIST", "ART_DIR"]:
    if need not in globals():
        raise RuntimeError(f"Missing `{need}`. Jalankan STAGE 0 -> 1 -> 2 -> 3 -> 4 dulu.")

# ----------------------------
# 1) Settings (CPU-safe)
# ----------------------------
ONLY_SPLITS = None                 # None = proses semua; atau ["split_01","split_02"] untuk debug
COMPRESS_NPZ = False               # True lebih kecil disk tapi jauh lebih lambat di CPU
SHARD_MAX_OBJECTS = 1500           # jumlah object per shard file
SNR_TANH_SCALE = 10.0              # snr_tanh = tanh(snr / scale)
TIME_CLIP_MAX_DAYS = None          # None = no clip; atau mis. 2000.0
DROP_BAD_TIME_ROWS = True          # drop rows with NaN/inf mjd
FALLBACK_NUM_BUCKETS = 64          # dipakai jika fallback (hash bucket) diperlukan

SEQ_DIR = Path(ART_DIR) / "seq_tokens"
SEQ_DIR.mkdir(parents=True, exist_ok=True)

# Token mode auto-detect (ditentukan saat baca part pertama)
TOKEN_MODE = None  # "mag" atau "asinh"

# Feature spec akan di-set setelah mode terdeteksi
FEATURE_NAMES = None
FEATURE_DIM = None

# ----------------------------
# 2) Robust readers for cleaned parts (parquet or csv.gz)
# ----------------------------
BASE_COLS = {"object_id","mjd","band_id","snr","detected"}
MODE_COLS = {
    "mag": {"mag","mag_err"},
    "asinh": {"flux_asinh","err_log1p"},
}

def _read_clean_part(path: str) -> pd.DataFrame:
    p = Path(path)
    if not p.exists():
        raise FileNotFoundError(f"Clean part missing: {p}")

    if p.suffix == ".parquet":
        df = pd.read_parquet(p)
    elif p.name.endswith(".csv.gz"):
        df = pd.read_csv(p, compression="gzip")
    else:
        df = pd.read_csv(p)

    df.columns = [c.strip() for c in df.columns]

    # detect mode once (global)
    global TOKEN_MODE, FEATURE_NAMES, FEATURE_DIM

    if TOKEN_MODE is None:
        cols = set(df.columns)
        if BASE_COLS.issubset(cols) and MODE_COLS["mag"].issubset(cols):
            TOKEN_MODE = "mag"
            FEATURE_NAMES = ["t_rel_log", "dt_log", "mag", "mag_err_log", "snr_tanh", "detected"]
        elif BASE_COLS.issubset(cols) and MODE_COLS["asinh"].issubset(cols):
            TOKEN_MODE = "asinh"
            FEATURE_NAMES = ["t_rel_log", "dt_log", "flux_asinh", "err_log1p", "snr_tanh", "detected"]
        else:
            raise ValueError(
                "Cannot detect cleaned schema.\n"
                f"Found columns: {list(df.columns)}\n"
                "Expected either:\n"
                "- MAG: object_id,mjd,band_id,mag,mag_err,snr,detected\n"
                "- ASINH: object_id,mjd,band_id,flux_asinh,err_log1p,snr,detected"
            )
        FEATURE_DIM = len(FEATURE_NAMES)

    # validate required cols for detected mode
    req = set(BASE_COLS) | set(MODE_COLS[TOKEN_MODE])
    missing = sorted(list(req - set(df.columns)))
    if missing:
        raise ValueError(f"Clean part missing columns {missing}. Found: {list(df.columns)} | file={p}")

    # enforce dtypes lightly
    df["object_id"] = df["object_id"].astype("string").str.strip()
    df["mjd"] = pd.to_numeric(df["mjd"], errors="coerce").astype(np.float32)
    df["band_id"] = pd.to_numeric(df["band_id"], errors="coerce").astype(np.int16)

    df["snr"] = pd.to_numeric(df["snr"], errors="coerce").astype(np.float32)
    df["detected"] = pd.to_numeric(df["detected"], errors="coerce").fillna(0).astype(np.int8)

    if TOKEN_MODE == "mag":
        df["mag"] = pd.to_numeric(df["mag"], errors="coerce").astype(np.float32)
        df["mag_err"] = pd.to_numeric(df["mag_err"], errors="coerce").astype(np.float32)
    else:
        df["flux_asinh"] = pd.to_numeric(df["flux_asinh"], errors="coerce").astype(np.float32)
        df["err_log1p"]  = pd.to_numeric(df["err_log1p"], errors="coerce").astype(np.float32)

    if DROP_BAD_TIME_ROWS:
        df = df[np.isfinite(df["mjd"].to_numpy())]

    return df

# ----------------------------
# 3) Build tokens for one object (sort by time inside object)
# ----------------------------
def build_object_tokens(df_obj: pd.DataFrame):
    """
    Returns:
      X: (L, FEATURE_DIM) float32
      B: (L,) int8 band_id
    """
    if df_obj.empty:
        return None, None

    mjd = df_obj["mjd"].to_numpy(dtype=np.float32, copy=False)
    band = df_obj["band_id"].to_numpy(dtype=np.int16, copy=False)
    snr  = df_obj["snr"].to_numpy(dtype=np.float32, copy=False)
    det  = df_obj["detected"].to_numpy(dtype=np.int8, copy=False)

    # sort by mjd, tie-break by band
    order = np.lexsort((band, mjd))
    mjd = mjd[order]
    band = band[order]
    snr  = snr[order]
    det  = det[order]

    # time features
    t0 = mjd[0]
    t_rel = mjd - t0
    dt = np.empty_like(t_rel)
    dt[0] = 0.0
    if len(t_rel) > 1:
        dt[1:] = np.maximum(mjd[1:] - mjd[:-1], 0.0)

    if TIME_CLIP_MAX_DAYS is not None:
        t_rel = np.clip(t_rel, 0.0, float(TIME_CLIP_MAX_DAYS))
        dt    = np.clip(dt,    0.0, float(TIME_CLIP_MAX_DAYS))

    t_rel_log = np.log1p(t_rel).astype(np.float32)
    dt_log    = np.log1p(dt).astype(np.float32)

    # snr -> tanh
    snr = np.nan_to_num(snr, nan=0.0, posinf=0.0, neginf=0.0)
    snr_tanh = np.tanh(snr / np.float32(SNR_TANH_SCALE)).astype(np.float32)

    det_f = det.astype(np.float32)

    # value channels (mode-specific)
    if TOKEN_MODE == "mag":
        mag = df_obj["mag"].to_numpy(dtype=np.float32, copy=False)[order]
        mag_err = df_obj["mag_err"].to_numpy(dtype=np.float32, copy=False)[order]

        mag = np.nan_to_num(mag, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
        mag_err = np.nan_to_num(mag_err, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
        mag_err = np.maximum(mag_err, np.float32(0.0))
        mag_err_log = np.log1p(mag_err).astype(np.float32)

        X = np.stack([t_rel_log, dt_log, mag, mag_err_log, snr_tanh, det_f], axis=1).astype(np.float32)
    else:
        flux = df_obj["flux_asinh"].to_numpy(dtype=np.float32, copy=False)[order]
        elog = df_obj["err_log1p"].to_numpy(dtype=np.float32, copy=False)[order]
        flux = np.nan_to_num(flux, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)
        elog = np.nan_to_num(elog, nan=0.0, posinf=0.0, neginf=0.0).astype(np.float32)

        X = np.stack([t_rel_log, dt_log, flux, elog, snr_tanh, det_f], axis=1).astype(np.float32)

    B = band.astype(np.int8)
    return X, B

# ----------------------------
# 4) Shard writer
# ----------------------------
def save_shard(out_path: Path, object_ids, X_concat, B_concat, offsets):
    out_path.parent.mkdir(parents=True, exist_ok=True)
    obj_arr = np.asarray(object_ids, dtype="S")  # bytes in npz
    if COMPRESS_NPZ:
        np.savez_compressed(out_path, object_id=obj_arr, x=X_concat, band=B_concat, offsets=offsets)
    else:
        np.savez(out_path, object_id=obj_arr, x=X_concat, band=B_concat, offsets=offsets)

# ----------------------------
# 5) Streaming builder (assumes contiguous object blocks in file order; auto-detect fallback)
# ----------------------------
def build_sequences_streaming(split_name: str, which: str, expected_ids: set, out_dir: Path):
    parts = get_clean_parts(split_name, which)
    if not parts:
        raise RuntimeError(f"No cleaned parts for {split_name}/{which}. Pastikan STAGE 4 sukses.")

    manifest_rows = []
    shard_idx = 0
    batch_obj_ids, batch_X_list, batch_B_list, batch_lengths = [], [], [], []

    cur_oid = None
    cur_buf = []
    seen_done = set()
    fallback_needed = False

    def flush_object(oid, buf_blocks):
        nonlocal batch_obj_ids, batch_X_list, batch_B_list, batch_lengths
        if oid is None or not buf_blocks:
            return
        if oid not in expected_ids:
            return
        df_obj = pd.concat(buf_blocks, ignore_index=True)
        X, B = build_object_tokens(df_obj)
        if X is None:
            return
        batch_obj_ids.append(oid)
        batch_X_list.append(X)
        batch_B_list.append(B)
        batch_lengths.append(X.shape[0])

    def flush_shard():
        nonlocal shard_idx, batch_obj_ids, batch_X_list, batch_B_list, batch_lengths, manifest_rows
        if not batch_obj_ids:
            return
        lengths = np.asarray(batch_lengths, dtype=np.int64)
        offsets = np.zeros(len(lengths) + 1, dtype=np.int64)
        offsets[1:] = np.cumsum(lengths)
        X_concat = np.concatenate(batch_X_list, axis=0).astype(np.float32)
        B_concat = np.concatenate(batch_B_list, axis=0).astype(np.int8)

        shard_path = out_dir / f"shard_{shard_idx:04d}.npz"
        save_shard(shard_path, batch_obj_ids, X_concat, B_concat, offsets)

        for i, oid in enumerate(batch_obj_ids):
            manifest_rows.append({
                "object_id": oid,
                "split": split_name,
                "which": which,
                "shard": str(shard_path),
                "start": int(offsets[i]),
                "length": int(lengths[i]),
            })

        shard_idx += 1
        batch_obj_ids, batch_X_list, batch_B_list, batch_lengths = [], [], [], []
        gc.collect()

    for pi, p in enumerate(parts):
        df = _read_clean_part(p)
        if df.empty:
            continue

        oids = df["object_id"].to_numpy(dtype=object, copy=False)

        # segment boundaries where object_id changes
        change = np.empty(len(oids), dtype=bool)
        change[0] = True
        change[1:] = oids[1:] != oids[:-1]
        seg_starts = np.flatnonzero(change)
        seg_ends = np.append(seg_starts[1:], len(oids))

        for s_idx, e_idx in zip(seg_starts, seg_ends):
            oid = str(oids[s_idx])
            block = df.iloc[s_idx:e_idx]

            if (oid in seen_done) and (oid != cur_oid):
                fallback_needed = True
                break

            if cur_oid is None:
                cur_oid = oid
                cur_buf = [block]
            elif oid == cur_oid:
                cur_buf.append(block)
            else:
                flush_object(cur_oid, cur_buf)
                seen_done.add(cur_oid)

                if len(batch_obj_ids) >= SHARD_MAX_OBJECTS:
                    flush_shard()

                cur_oid = oid
                cur_buf = [block]

        del df
        if fallback_needed:
            break
        if (pi + 1) % 10 == 0:
            gc.collect()

    if not fallback_needed:
        flush_object(cur_oid, cur_buf)
        if cur_oid is not None:
            seen_done.add(cur_oid)
        flush_shard()

    built = len(seen_done.intersection(expected_ids))
    return manifest_rows, built, fallback_needed

# ----------------------------
# 6) Fallback: Hash-bucket builder (robust if not contiguous)
# - Uses pyarrow if available (fast). If not, hard fail with clear message.
# ----------------------------
def build_sequences_fallback_bucket(split_name: str, which: str, expected_ids: set, out_dir: Path, num_buckets: int = 64):
    try:
        import pyarrow as pa
        import pyarrow.parquet as pq
    except Exception as e:
        raise RuntimeError(
            "Fallback bucketization membutuhkan pyarrow (biasanya ada di Kaggle). "
            "Jika environment kamu tidak punya pyarrow, jalankan lagi di Kaggle atau pastikan parquet engine tersedia."
        ) from e

    parts = get_clean_parts(split_name, which)
    if not parts:
        raise RuntimeError(f"No cleaned parts for {split_name}/{which}.")

    tmp_dir = Path(ART_DIR) / "tmp_buckets" / split_name / which
    tmp_dir.mkdir(parents=True, exist_ok=True)

    writers = {}

    def bucket_idx(series_objid: pd.Series) -> np.ndarray:
        h = pd.util.hash_pandas_object(series_objid, index=False).to_numpy(dtype=np.uint64, copy=False)
        return (h % np.uint64(num_buckets)).astype(np.int16)

    # write bucket parquet incrementally
    for p in parts:
        df = _read_clean_part(p)
        if df.empty:
            continue
        df = df[df["object_id"].isin(expected_ids)]
        if df.empty:
            continue

        bidx = bucket_idx(df["object_id"])
        df["_b"] = bidx

        for b in np.unique(bidx):
            sub = df[df["_b"] == b].drop(columns=["_b"])
            if sub.empty:
                continue
            file_path = tmp_dir / f"bucket_{int(b):03d}.parquet"
            table = pa.Table.from_pandas(sub, preserve_index=False)
            if int(b) not in writers:
                writers[int(b)] = pq.ParquetWriter(file_path, table.schema, compression="snappy")
            writers[int(b)].write_table(table)

        del df
        gc.collect()

    for w in writers.values():
        w.close()

    # process bucket files
    manifest_rows = []
    shard_idx = 0
    batch_obj_ids, batch_X_list, batch_B_list, batch_lengths = [], [], [], []
    built_ids = set()

    def flush_shard_local():
        nonlocal shard_idx, batch_obj_ids, batch_X_list, batch_B_list, batch_lengths, manifest_rows
        if not batch_obj_ids:
            return
        lengths = np.asarray(batch_lengths, dtype=np.int64)
        offsets = np.zeros(len(lengths) + 1, dtype=np.int64)
        offsets[1:] = np.cumsum(lengths)
        X_concat = np.concatenate(batch_X_list, axis=0).astype(np.float32)
        B_concat = np.concatenate(batch_B_list, axis=0).astype(np.int8)

        shard_path = out_dir / f"shard_{shard_idx:04d}.npz"
        save_shard(shard_path, batch_obj_ids, X_concat, B_concat, offsets)

        for i, oid in enumerate(batch_obj_ids):
            manifest_rows.append({
                "object_id": oid,
                "split": split_name,
                "which": which,
                "shard": str(shard_path),
                "start": int(offsets[i]),
                "length": int(lengths[i]),
            })

        shard_idx += 1
        batch_obj_ids, batch_X_list, batch_B_list, batch_lengths = [], [], [], []
        gc.collect()

    for bf in sorted(tmp_dir.glob("bucket_*.parquet")):
        dfb = pd.read_parquet(bf)
        dfb.columns = [c.strip() for c in dfb.columns]
        if dfb.empty:
            bf.unlink(missing_ok=True)
            continue

        for oid, g in dfb.groupby("object_id", sort=False):
            oid = str(oid)
            if oid in built_ids:
                continue
            X, B = build_object_tokens(g)
            if X is None:
                continue

            batch_obj_ids.append(oid)
            batch_X_list.append(X)
            batch_B_list.append(B)
            batch_lengths.append(X.shape[0])
            built_ids.add(oid)

            if len(batch_obj_ids) >= SHARD_MAX_OBJECTS:
                flush_shard_local()

        bf.unlink(missing_ok=True)
        del dfb
        gc.collect()

    flush_shard_local()

    # cleanup tmp dir
    try:
        tmp_dir.rmdir()
    except Exception:
        pass

    return manifest_rows, len(built_ids)

# ----------------------------
# 7) Run tokenization for all splits (train & test)
# ----------------------------
splits_to_run = ONLY_SPLITS if (ONLY_SPLITS is not None) else SPLIT_LIST

all_manifest_train = []
all_manifest_test  = []

def expected_set_for(split_name: str, which: str) -> set:
    return set(train_ids_by_split[split_name]) if which == "train" else set(test_ids_by_split[split_name])

for split_name in splits_to_run:
    for which in ["train", "test"]:
        out_dir = SEQ_DIR / split_name / which
        out_dir.mkdir(parents=True, exist_ok=True)

        expected_ids = expected_set_for(split_name, which)
        if len(expected_ids) == 0:
            raise RuntimeError(f"Expected ids empty for {split_name}/{which}. Cek log/split mapping.")

        print(f"\n[Stage 5] Building sequences: {split_name}/{which} | expected_objects={len(expected_ids):,}")

        manifest_rows, built, fallback_needed = build_sequences_streaming(
            split_name=split_name,
            which=which,
            expected_ids=expected_ids,
            out_dir=out_dir
        )

        if fallback_needed or built != len(expected_ids):
            print(f"[Stage 5] Streaming not safe for {split_name}/{which} "
                  f"(built={built:,} vs expected={len(expected_ids):,}, fallback_needed={fallback_needed}).")
            print("[Stage 5] Switching to robust bucket fallback (temporary buckets, then cleaned).")

            # clear partial outputs
            for f in out_dir.glob("shard_*.npz"):
                try:
                    f.unlink()
                except Exception:
                    pass

            manifest_rows, built2 = build_sequences_fallback_bucket(
                split_name=split_name,
                which=which,
                expected_ids=expected_ids,
                out_dir=out_dir,
                num_buckets=FALLBACK_NUM_BUCKETS
            )
            if built2 != len(expected_ids):
                raise RuntimeError(f"Fallback mismatch for {split_name}/{which}: built={built2:,} expected={len(expected_ids):,}")
            built = built2

        print(f"[Stage 5] OK: {split_name}/{which} built_objects={built:,} | shards={len(list(out_dir.glob('shard_*.npz'))):,}")

        if which == "train":
            all_manifest_train.extend(manifest_rows)
        else:
            all_manifest_test.extend(manifest_rows)

        gc.collect()

# ----------------------------
# 8) Save manifests + config
# ----------------------------
df_m_train = pd.DataFrame(all_manifest_train).sort_values(["split","shard","start"]).reset_index(drop=True)
df_m_test  = pd.DataFrame(all_manifest_test).sort_values(["split","shard","start"]).reset_index(drop=True)

mtrain_path = SEQ_DIR / "seq_manifest_train.csv"
mtest_path  = SEQ_DIR / "seq_manifest_test.csv"
df_m_train.to_csv(mtrain_path, index=False)
df_m_test.to_csv(mtest_path, index=False)

cfg = {
    "token_mode": TOKEN_MODE,
    "feature_names": FEATURE_NAMES,
    "feature_dim": int(FEATURE_DIM),
    "snr_tanh_scale": float(SNR_TANH_SCALE),
    "time_clip_max_days": None if TIME_CLIP_MAX_DAYS is None else float(TIME_CLIP_MAX_DAYS),
    "compress_npz": bool(COMPRESS_NPZ),
    "shard_max_objects": int(SHARD_MAX_OBJECTS),
    "fallback_num_buckets": int(FALLBACK_NUM_BUCKETS),
}
cfg_path = SEQ_DIR / "seq_config.json"
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump(cfg, f, indent=2)

print("\n[Stage 5] DONE")
print(f"- token_mode : {TOKEN_MODE}")
print(f"- features   : {FEATURE_NAMES}")
print(f"- Saved: {mtrain_path} (rows={len(df_m_train):,})")
print(f"- Saved: {mtest_path}  (rows={len(df_m_test):,})")
print(f"- Saved: {cfg_path}")

# ----------------------------
# 9) Smoke test: load one object sequence
# ----------------------------
def load_sequence(object_id: str, which: str):
    object_id = str(object_id).strip()
    m = df_m_train if which == "train" else df_m_test
    row = m[m["object_id"] == object_id]
    if row.empty:
        raise KeyError(f"object_id not found in seq manifest ({which}): {object_id}")
    r = row.iloc[0]
    data = np.load(r["shard"], allow_pickle=False)
    start = int(r["start"])
    length = int(r["length"])
    X = data["x"][start:start+length]
    B = data["band"][start:start+length]
    return X, B

_smoke_oid = str(df_train_meta.index[0])
X_sm, B_sm = load_sequence(_smoke_oid, "train")
print(f"\n[Stage 5] Smoke test object_id={_smoke_oid}")
print(f"- seq_len={len(X_sm)} | X_shape={X_sm.shape} | bands_unique={sorted(set(B_sm.tolist()))}")

# Export globals for next stages
globals().update({
    "SEQ_DIR": SEQ_DIR,
    "seq_manifest_train": df_m_train,
    "seq_manifest_test": df_m_test,
    "SEQ_FEATURE_NAMES": FEATURE_NAMES,
    "SEQ_FEATURE_DIM": int(FEATURE_DIM),
    "SEQ_TOKEN_MODE": TOKEN_MODE,
    "load_sequence": load_sequence,
})

gc.collect()



[Stage 5] Building sequences: split_01/train | expected_objects=155
[Stage 5] OK: split_01/train built_objects=155 | shards=1

[Stage 5] Building sequences: split_01/test | expected_objects=364
[Stage 5] OK: split_01/test built_objects=364 | shards=1

[Stage 5] Building sequences: split_02/train | expected_objects=170
[Stage 5] OK: split_02/train built_objects=170 | shards=1

[Stage 5] Building sequences: split_02/test | expected_objects=414
[Stage 5] OK: split_02/test built_objects=414 | shards=1

[Stage 5] Building sequences: split_03/train | expected_objects=138
[Stage 5] OK: split_03/train built_objects=138 | shards=1

[Stage 5] Building sequences: split_03/test | expected_objects=338
[Stage 5] OK: split_03/test built_objects=338 | shards=1

[Stage 5] Building sequences: split_04/train | expected_objects=145
[Stage 5] OK: split_04/train built_objects=145 | shards=1

[Stage 5] Building sequences: split_04/test | expected_objects=332
[Stage 5] OK: split_04/test built_objects=332 | s

55

# Sequence Length Policy (Padding, Truncation, Windowing)

In [8]:
# ============================================================
# STAGE 6 — Sequence Length Policy (Padding, Truncation, Windowing)
# ONE CELL, Kaggle CPU-SAFE, nyambung dengan STAGE 0..5 — REVISI FULL (MAG/ASINH COMPAT)
#
# Upgrade utama vs versi kamu:
# - Kompatibel dengan STAGE 5 (AUTO token_mode):
#     * mode="asinh":  feature ada flux_asinh, err_log1p
#     * mode="mag"  :  feature ada mag, mag_err_log
# - Windowing score adaptif:
#     * asinh: pakai |snr_tanh| + |flux_asinh| + detected
#     * mag  : pakai |snr_tanh| + brightness_proxy(median_mag - mag) + detected
# - Strict sanity: deteksi missing fill + duplikasi object_id saat isi memmap
#
# Output:
# - artifacts/fixed_seq/{train|test}_{X|B|M}.dat  (memmap)
# - artifacts/fixed_seq/{train|test}_ids.npy
# - artifacts/fixed_seq/train_y.npy
# - artifacts/fixed_seq/{train|test}_origlen.npy, {train|test}_winstart.npy
# - artifacts/fixed_seq/length_policy_config.json
# ============================================================

import gc, json, math, warnings
from pathlib import Path

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=pd.errors.DtypeWarning)

# ----------------------------
# 0) Require previous stages
# ----------------------------
for need in ["seq_manifest_train", "seq_manifest_test", "SEQ_FEATURE_NAMES",
             "df_train_meta", "df_test_meta", "ART_DIR"]:
    if need not in globals():
        raise RuntimeError(f"Missing `{need}`. Jalankan STAGE 0 -> 1 -> 2 -> 3 -> 4 -> 5 dulu.")

m_train = seq_manifest_train.copy()
m_test  = seq_manifest_test.copy()

SEQ_FEATURE_NAMES = list(SEQ_FEATURE_NAMES)
feat = {name: i for i, name in enumerate(SEQ_FEATURE_NAMES)}

# ----------------------------
# 0b) Detect token_mode (MAG vs ASINH)
# ----------------------------
SEQ_TOKEN_MODE = globals().get("SEQ_TOKEN_MODE", None)
if SEQ_TOKEN_MODE is None:
    if ("flux_asinh" in feat) and ("err_log1p" in feat):
        SEQ_TOKEN_MODE = "asinh"
    elif ("mag" in feat) and ("mag_err_log" in feat):
        SEQ_TOKEN_MODE = "mag"
    else:
        raise ValueError(
            "Cannot infer SEQ_TOKEN_MODE from SEQ_FEATURE_NAMES.\n"
            f"SEQ_FEATURE_NAMES={SEQ_FEATURE_NAMES}\n"
            "Expected either (flux_asinh, err_log1p) or (mag, mag_err_log)."
        )

# Required common features
REQ_COMMON = ["t_rel_log", "dt_log", "snr_tanh", "detected"]
for k in REQ_COMMON:
    if k not in feat:
        raise ValueError(f"SEQ_FEATURE_NAMES must include '{k}'. Found: {SEQ_FEATURE_NAMES}")

# Required mode-specific features (for scoring + sanity)
if SEQ_TOKEN_MODE == "asinh":
    if "flux_asinh" not in feat:
        raise ValueError(f"token_mode=asinh requires 'flux_asinh'. Found: {SEQ_FEATURE_NAMES}")
    SCORE_VALUE_FEAT = "flux_asinh"
elif SEQ_TOKEN_MODE == "mag":
    if "mag" not in feat:
        raise ValueError(f"token_mode=mag requires 'mag'. Found: {SEQ_FEATURE_NAMES}")
    SCORE_VALUE_FEAT = "mag"
else:
    raise ValueError(f"Unknown SEQ_TOKEN_MODE={SEQ_TOKEN_MODE}")

print(f"[Stage 6] Detected token_mode = {SEQ_TOKEN_MODE} | score_value_feat = {SCORE_VALUE_FEAT}")

# ----------------------------
# 1) Inspect length distribution -> choose MAX_LEN (CPU-friendly)
# ----------------------------
def describe_lengths(m: pd.DataFrame, name: str):
    L = m["length"].to_numpy(dtype=np.int32, copy=False)
    q = np.percentile(L, [0, 1, 5, 10, 25, 50, 75, 90, 95, 98, 99, 100])
    print(f"\n{name} length stats")
    print(f"- n_objects={len(L):,} | min={int(q[0])} | p50={int(q[5])} | p90={int(q[7])} | p95={int(q[8])} | p99={int(q[10])} | max={int(q[-1])}")
    return q

q_tr = describe_lengths(m_train, "TRAIN")
q_te = describe_lengths(m_test,  "TEST")

p95 = int(max(q_tr[8], q_te[8]))

# CPU-safe caps:
# - p95 <= 256 => 256
# - <= 384 => 384
# - else => 512
if p95 <= 256:
    MAX_LEN = 256
elif p95 <= 384:
    MAX_LEN = 384
else:
    MAX_LEN = 512

FORCE_MAX_LEN = None  # e.g. 256
if FORCE_MAX_LEN is not None:
    MAX_LEN = int(FORCE_MAX_LEN)

print(f"\n[Stage 6] Chosen MAX_LEN = {MAX_LEN} (based on p95={p95})")

# ----------------------------
# 2) Windowing / truncation policy (adaptive)
# ----------------------------
# Score components:
# - always: |snr_tanh| + detected
# - mode-specific:
#   asinh: + |flux_asinh|
#   mag  : + brightness_proxy = relu(median_mag - mag)
W_SNR = 1.0
W_VAL = 0.35
W_DET = 0.25

def _brightness_proxy_from_mag(mag: np.ndarray) -> np.ndarray:
    """Higher = brighter peak. Use median-mag, clipped at 0."""
    mag = np.nan_to_num(mag, nan=np.float32(0.0), posinf=np.float32(0.0), neginf=np.float32(0.0)).astype(np.float32)
    med = np.float32(np.median(mag)) if mag.size > 0 else np.float32(0.0)
    br = (med - mag).astype(np.float32)
    br = np.maximum(br, np.float32(0.0))
    # compress dynamic range a bit (optional but stabilizes)
    br = np.log1p(br).astype(np.float32)
    return br

def select_window(X: np.ndarray, max_len: int) -> tuple[int, int, int]:
    """
    Returns (start, end, center) for window selection.
    X shape (L, F).
    """
    L = int(X.shape[0])
    if L <= max_len:
        return 0, L, 0

    snr = np.abs(X[:, feat["snr_tanh"]]).astype(np.float32, copy=False)
    det = X[:, feat["detected"]].astype(np.float32, copy=False)

    if SEQ_TOKEN_MODE == "asinh":
        val = np.abs(X[:, feat["flux_asinh"]]).astype(np.float32, copy=False)
    else:
        mag = X[:, feat["mag"]].astype(np.float32, copy=False)
        val = _brightness_proxy_from_mag(mag)

    score = (np.float32(W_SNR) * snr) + (np.float32(W_VAL) * val) + (np.float32(W_DET) * det)

    if (not np.isfinite(score).any()) or (float(np.nanmax(score)) <= 0.0):
        center = L // 2
    else:
        center = int(np.nanargmax(score))

    half = max_len // 2
    start = max(0, center - half)
    start = min(start, L - max_len)
    end = start + max_len
    return start, end, center

def pad_to_fixed(X: np.ndarray, B: np.ndarray, max_len: int):
    """
    Returns:
      Xp: (max_len, F) float32
      Bp: (max_len,) int8
      Mp: (max_len,) int8  (1=real token)
      orig_len, win_start, win_end
    """
    L = int(X.shape[0])
    if L <= 0:
        Xp = np.zeros((max_len, X.shape[1]), dtype=np.float32)
        Bp = np.zeros((max_len,), dtype=np.int8)
        Mp = np.zeros((max_len,), dtype=np.int8)
        return Xp, Bp, Mp, 0, 0, 0

    s, e, _ = select_window(X, max_len=max_len)
    Xw = X[s:e]
    Bw = B[s:e]
    lw = int(Xw.shape[0])

    Xp = np.zeros((max_len, X.shape[1]), dtype=np.float32)
    Bp = np.zeros((max_len,), dtype=np.int8)
    Mp = np.zeros((max_len,), dtype=np.int8)

    Xp[:lw] = Xw.astype(np.float32, copy=False)
    Bp[:lw] = Bw.astype(np.int8, copy=False)
    Mp[:lw] = 1
    return Xp, Bp, Mp, L, s, e

# ----------------------------
# 3) Fixed cache builder (efficient: process per shard)
# ----------------------------
FIX_DIR = Path(ART_DIR) / "fixed_seq"
FIX_DIR.mkdir(parents=True, exist_ok=True)

# ordering
train_ids = df_train_meta.index.to_list()
y_train = df_train_meta["target"].to_numpy(dtype=np.int8, copy=False)

if "df_sub" in globals() and isinstance(globals()["df_sub"], pd.DataFrame) and "object_id" in df_sub.columns:
    test_ids = df_sub["object_id"].astype(str).str.strip().to_list()
else:
    test_ids = df_test_meta.index.to_list()

train_row = {oid: i for i, oid in enumerate(train_ids)}
test_row  = {oid: i for i, oid in enumerate(test_ids)}

NTR = len(train_ids)
NTE = len(test_ids)
F = len(SEQ_FEATURE_NAMES)

# memmap paths
train_X_path = FIX_DIR / "train_X.dat"
train_B_path = FIX_DIR / "train_B.dat"
train_M_path = FIX_DIR / "train_M.dat"
test_X_path  = FIX_DIR / "test_X.dat"
test_B_path  = FIX_DIR / "test_B.dat"
test_M_path  = FIX_DIR / "test_M.dat"

# metadata arrays
train_len_path = FIX_DIR / "train_origlen.npy"
train_win_path = FIX_DIR / "train_winstart.npy"
test_len_path  = FIX_DIR / "test_origlen.npy"
test_win_path  = FIX_DIR / "test_winstart.npy"

# create memmaps
Xtr = np.memmap(train_X_path, dtype=np.float32, mode="w+", shape=(NTR, MAX_LEN, F))
Btr = np.memmap(train_B_path, dtype=np.int8,   mode="w+", shape=(NTR, MAX_LEN))
Mtr = np.memmap(train_M_path, dtype=np.int8,   mode="w+", shape=(NTR, MAX_LEN))

Xte = np.memmap(test_X_path, dtype=np.float32, mode="w+", shape=(NTE, MAX_LEN, F))
Bte = np.memmap(test_B_path, dtype=np.int8,   mode="w+", shape=(NTE, MAX_LEN))
Mte = np.memmap(test_M_path, dtype=np.int8,   mode="w+", shape=(NTE, MAX_LEN))

origlen_tr = np.zeros((NTR,), dtype=np.int32)
winstart_tr = np.zeros((NTR,), dtype=np.int32)
origlen_te = np.zeros((NTE,), dtype=np.int32)
winstart_te = np.zeros((NTE,), dtype=np.int32)

filled_tr_mask = np.zeros((NTR,), dtype=np.uint8)
filled_te_mask = np.zeros((NTE,), dtype=np.uint8)

def process_manifest_into_memmap(m: pd.DataFrame, which: str):
    """
    Read each shard once, then fill memmaps following the chosen ordering.
    Adds strict checks for duplicates/missing.
    """
    if which == "train":
        row_map = train_row
        Xmm, Bmm, Mmm = Xtr, Btr, Mtr
        origlen, winstart = origlen_tr, winstart_tr
        filled_mask = filled_tr_mask
        expected_n = NTR
    else:
        row_map = test_row
        Xmm, Bmm, Mmm = Xte, Bte, Mte
        origlen, winstart = origlen_te, winstart_te
        filled_mask = filled_te_mask
        expected_n = NTE

    filled = 0
    dup = 0

    for shard_path, g in m.groupby("shard", sort=False):
        shard_path = str(shard_path)
        data = np.load(shard_path, allow_pickle=False)
        x_all = data["x"]
        b_all = data["band"]

        for _, r in g.iterrows():
            oid = str(r["object_id"])
            idx = row_map.get(oid, None)
            if idx is None:
                continue

            if filled_mask[idx]:
                dup += 1
                continue

            start = int(r["start"])
            length = int(r["length"])
            if length <= 0:
                continue

            X = x_all[start:start+length]
            B = b_all[start:start+length]

            Xp, Bp, Mp, L0, ws, _ = pad_to_fixed(X, B, max_len=MAX_LEN)

            Xmm[idx, :, :] = Xp
            Bmm[idx, :] = Bp
            Mmm[idx, :] = Mp
            origlen[idx] = int(L0)
            winstart[idx] = int(ws)
            filled_mask[idx] = 1
            filled += 1

        del data
        if filled % 2000 == 0:
            gc.collect()

    return filled, dup, expected_n

print("\n[Stage 6] Building fixed-length cache (TRAIN)...")
filled_tr, dup_tr, exp_tr = process_manifest_into_memmap(m_train, "train")
print(f"[Stage 6] TRAIN filled: {filled_tr:,}/{exp_tr:,} | duplicates_skipped={dup_tr:,}")

print("\n[Stage 6] Building fixed-length cache (TEST)...")
filled_te, dup_te, exp_te = process_manifest_into_memmap(m_test, "test")
print(f"[Stage 6] TEST  filled: {filled_te:,}/{exp_te:,} | duplicates_skipped={dup_te:,}")

# flush memmaps
Xtr.flush(); Btr.flush(); Mtr.flush()
Xte.flush(); Bte.flush(); Mte.flush()

# save ids + y + meta
np.save(FIX_DIR / "train_ids.npy", np.asarray(train_ids, dtype="S"))
np.save(FIX_DIR / "test_ids.npy",  np.asarray(test_ids, dtype="S"))
np.save(FIX_DIR / "train_y.npy",   y_train)

np.save(train_len_path, origlen_tr)
np.save(train_win_path, winstart_tr)
np.save(test_len_path,  origlen_te)
np.save(test_win_path,  winstart_te)

# ----------------------------
# 4) Sanity checks (hard)
# ----------------------------
def _check_missing(which: str):
    if which == "train":
        miss = np.where(filled_tr_mask == 0)[0]
        ids = train_ids
    else:
        miss = np.where(filled_te_mask == 0)[0]
        ids = test_ids

    if len(miss) > 0:
        sample = [ids[i] for i in miss[:10]]
        raise RuntimeError(
            f"[Stage 6] Missing filled rows for {which}: {len(miss):,} / {len(ids):,}. "
            f"Examples: {sample}"
        )

_check_missing("train")
_check_missing("test")

def sanity_samples(which: str, n_show: int = 3):
    rng = np.random.default_rng(2025)
    if which == "train":
        Xmm, Bmm, Mmm = Xtr, Btr, Mtr
        ids = train_ids
        ol = origlen_tr
    else:
        Xmm, Bmm, Mmm = Xte, Bte, Mte
        ids = test_ids
        ol = origlen_te

    idxs = rng.choice(len(ids), size=min(n_show, len(ids)), replace=False)
    print(f"\n[Stage 6] Sanity samples ({which}):")
    for i in idxs:
        kept = int(Mmm[i].sum())
        bands = sorted(set(Bmm[i, :kept].tolist())) if kept > 0 else []
        print(f"- idx={i} oid={ids[i]} orig_len={int(ol[i])} kept={kept} | bands_unique={bands}")

sanity_samples("train", 3)
sanity_samples("test", 3)

# ----------------------------
# 5) Save config
# ----------------------------
policy_cfg = {
    "token_mode": SEQ_TOKEN_MODE,
    "max_len": int(MAX_LEN),
    "feature_names": list(SEQ_FEATURE_NAMES),
    "window_score": {
        "snr_abs": float(W_SNR),
        "value": float(W_VAL),
        "detected": float(W_DET),
        "value_feat": SCORE_VALUE_FEAT,
        "value_policy": ("abs(flux_asinh)" if SEQ_TOKEN_MODE == "asinh" else "relu(median_mag - mag) -> log1p"),
    },
    "train_order": "df_train_meta.index",
    "test_order": "df_sub.object_id" if ("df_sub" in globals() and isinstance(df_sub, pd.DataFrame) and "object_id" in df_sub.columns) else "df_test_meta.index",
    "files": {
        "train_X": str(train_X_path),
        "train_B": str(train_B_path),
        "train_M": str(train_M_path),
        "train_y": str(FIX_DIR / "train_y.npy"),
        "train_ids": str(FIX_DIR / "train_ids.npy"),
        "train_origlen": str(train_len_path),
        "train_winstart": str(train_win_path),
        "test_X": str(test_X_path),
        "test_B": str(test_B_path),
        "test_M": str(test_M_path),
        "test_ids": str(FIX_DIR / "test_ids.npy"),
        "test_origlen": str(test_len_path),
        "test_winstart": str(test_win_path),
    }
}
cfg_path = FIX_DIR / "length_policy_config.json"
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump(policy_cfg, f, indent=2)

print("\n[Stage 6] DONE")
print(f"- Saved fixed cache dir: {FIX_DIR}")
print(f"- Saved config: {cfg_path}")

# Export globals for training stage
globals().update({
    "FIX_DIR": FIX_DIR,
    "MAX_LEN": MAX_LEN,
    "FIX_TRAIN_X_PATH": train_X_path,
    "FIX_TRAIN_B_PATH": train_B_path,
    "FIX_TRAIN_M_PATH": train_M_path,
    "FIX_TEST_X_PATH": test_X_path,
    "FIX_TEST_B_PATH": test_B_path,
    "FIX_TEST_M_PATH": test_M_path,
    "FIX_TRAIN_Y_PATH": FIX_DIR / "train_y.npy",
    "FIX_TRAIN_IDS_PATH": FIX_DIR / "train_ids.npy",
    "FIX_TEST_IDS_PATH": FIX_DIR / "test_ids.npy",
    "FIX_POLICY_CFG_PATH": cfg_path,
    "SEQ_TOKEN_MODE": SEQ_TOKEN_MODE,
})

gc.collect()


[Stage 6] Detected token_mode = mag | score_value_feat = mag

TRAIN length stats
- n_objects=3,043 | min=17 | p50=150 | p90=183 | p95=194 | p99=908 | max=1164

TEST length stats
- n_objects=7,135 | min=18 | p50=152 | p90=183 | p95=193 | p99=990 | max=1186

[Stage 6] Chosen MAX_LEN = 256 (based on p95=194)

[Stage 6] Building fixed-length cache (TRAIN)...
[Stage 6] TRAIN filled: 3,043/3,043 | duplicates_skipped=0

[Stage 6] Building fixed-length cache (TEST)...
[Stage 6] TEST  filled: 7,135/7,135 | duplicates_skipped=0

[Stage 6] Sanity samples (train):
- idx=1360 oid=gwilwileth_adel_amloth orig_len=157 kept=157 | bands_unique=[0, 1, 2, 3, 4, 5]
- idx=3020 oid=vin_araf_gwador orig_len=151 kept=151 | bands_unique=[0, 1, 2, 3, 4, 5]
- idx=3025 oid=ylf_alph_mindon orig_len=167 kept=167 | bands_unique=[0, 1, 2, 3, 4, 5]

[Stage 6] Sanity samples (test):
- idx=3191 oid=rom_bellas_lebdas orig_len=142 kept=142 | bands_unique=[0, 1, 2, 3, 4, 5]
- idx=7082 oid=nim_nestad_thor orig_len=161 kept=1

55

# CV Split (Object-Level, Stratified)

In [9]:
# ============================================================
# STAGE 7 — CV Split (Object-Level, Stratified) (ONE CELL, Kaggle CPU-SAFE) — REVISI FULL
#
# Tujuan:
# - Buat split CV di level object_id (bukan per baris lightcurve)
# - Konsisten dengan urutan TRAIN yang dipakai di STAGE 6 (fixed_seq/train_ids.npy)
#
# Upgrade:
# - Prefer urutan dari FIX_DIR/train_ids.npy (kalau ada), robust decode bytes->str
# - Pilihan CV:
#     * StratifiedKFold (default)
#     * StratifiedGroupKFold dengan group="split" (opsional, lebih jujur jika ada perbedaan cadence antar split)
# - n_splits adaptif + MIN_POS_PER_FOLD supaya tiap fold punya cukup TDE (lebih stabil untuk threshold/F1)
# - Save train_idx + val_idx per fold (npz) untuk training cepat
# - Validasi keras: missing ids, duplikasi, fold tanpa kelas
#
# Output:
# - artifacts/cv/cv_folds.csv
# - artifacts/cv/cv_folds.npz   (train_idx_f + val_idx_f)
# - artifacts/cv/cv_report.txt
# - artifacts/cv/cv_config.json
# - globals: fold_assign, folds, n_splits, CV_DIR
# ============================================================

import gc, json
from pathlib import Path

import numpy as np
import pandas as pd

# ----------------------------
# 0) Require previous stages
# ----------------------------
for need in ["df_train_meta", "ART_DIR"]:
    if need not in globals():
        raise RuntimeError(f"Missing `{need}`. Jalankan STAGE 2 dulu (Load and Validate Train/Test Logs).")

SEED = int(globals().get("SEED", 2025))

# ----------------------------
# 1) CV Settings (safe defaults for extreme imbalance)
# ----------------------------
DEFAULT_SPLITS = 5
FORCE_N_SPLITS = None          # set int to force (misal 3), else None
MIN_POS_PER_FOLD = 3           # rekomendasi: 3–10 (semakin besar semakin stabil, tapi butuh pos cukup)
USE_GROUP_BY_SPLIT = False     # True => StratifiedGroupKFold (group = df_train_meta["split"])

print(f"[Stage 7] seed={SEED} | default_splits={DEFAULT_SPLITS} | min_pos_per_fold={MIN_POS_PER_FOLD} | group_by_split={USE_GROUP_BY_SPLIT}")

# ----------------------------
# 2) Determine train_ids ordering (prefer fixed cache from STAGE 6)
# ----------------------------
train_ids = None
order_source = "df_train_meta.index"

if "FIX_DIR" in globals():
    p = Path(globals()["FIX_DIR"]) / "train_ids.npy"
    if p.exists():
        raw = np.load(p, allow_pickle=False)
        # robust decode (bytes -> str)
        if raw.dtype.kind in ("S", "O"):
            train_ids = [x.decode("utf-8") if isinstance(x, (bytes, bytearray)) else str(x) for x in raw.tolist()]
        else:
            train_ids = raw.astype(str).tolist()
        order_source = "fixed_seq/train_ids.npy"

if train_ids is None:
    train_ids = df_train_meta.index.astype(str).tolist()

# uniqueness check
if len(train_ids) != len(set(train_ids)):
    s = pd.Series(train_ids)
    dup = s[s.duplicated()].iloc[:10].tolist()
    raise RuntimeError(f"[Stage 7] train_ids has duplicates (examples): {dup}")

# ensure all ids exist in df_train_meta
missing_in_meta = [oid for oid in train_ids if oid not in df_train_meta.index]
if missing_in_meta:
    raise RuntimeError(f"[Stage 7] Some train_ids not found in df_train_meta (examples): {missing_in_meta[:10]}")

# y aligned to train_ids
y = df_train_meta.loc[train_ids, "target"].to_numpy(dtype=np.int8, copy=False)

N = len(train_ids)
pos = int((y == 1).sum())
neg = int((y == 0).sum())
if pos == 0 or neg == 0:
    raise RuntimeError(f"[Stage 7] Invalid class distribution: pos={pos}, neg={neg}. Cannot do stratified CV.")

# ----------------------------
# 3) Choose n_splits safely
# ----------------------------
# Hard limits: each fold needs >=1 pos and >=1 neg
max_splits_by_pos = pos
max_splits_by_neg = neg

# Stability limit: each fold ideally has >= MIN_POS_PER_FOLD positives
max_splits_by_minpos = max(1, pos // max(int(MIN_POS_PER_FOLD), 1))

n_splits = min(DEFAULT_SPLITS, max_splits_by_pos, max_splits_by_neg, max_splits_by_minpos)

if FORCE_N_SPLITS is not None:
    n_splits = int(FORCE_N_SPLITS)

if n_splits < 2:
    raise RuntimeError(
        f"[Stage 7] Too few samples for stratified CV. "
        f"pos={pos}, neg={neg}, MIN_POS_PER_FOLD={MIN_POS_PER_FOLD} => n_splits={n_splits}. "
        "Turunkan MIN_POS_PER_FOLD atau pakai holdout."
    )

print(f"[Stage 7] n_splits={n_splits} | N={N:,} | pos={pos:,} | neg={neg:,} | pos%={pos/max(N,1)*100:.4f}% | order_source={order_source}")

# ----------------------------
# 4) Build folds (SKLearn)
# ----------------------------
try:
    from sklearn.model_selection import StratifiedKFold
    try:
        from sklearn.model_selection import StratifiedGroupKFold
    except Exception:
        StratifiedGroupKFold = None
except Exception as e:
    raise RuntimeError("scikit-learn is not available in this environment.") from e

groups = None
if USE_GROUP_BY_SPLIT:
    if StratifiedGroupKFold is None:
        raise RuntimeError("StratifiedGroupKFold not available in this sklearn version/environment.")
    groups = df_train_meta.loc[train_ids, "split"].astype(str).to_numpy()

fold_assign = np.full(N, -1, dtype=np.int16)
folds = []

if USE_GROUP_BY_SPLIT:
    splitter = StratifiedGroupKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    split_iter = splitter.split(np.zeros(N), y, groups=groups)
else:
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    split_iter = splitter.split(np.zeros(N), y)

for fold, (tr_idx, val_idx) in enumerate(split_iter):
    fold_assign[val_idx] = fold
    folds.append({
        "fold": int(fold),
        "train_idx": tr_idx.astype(np.int32),
        "val_idx": val_idx.astype(np.int32),
    })

if (fold_assign < 0).any():
    raise RuntimeError("[Stage 7] Fold assignment still has -1 (some objects not assigned).")

# ----------------------------
# 5) Validate per-fold distribution (hard checks)
# ----------------------------
lines = []
lines.append(f"CV={('StratifiedGroupKFold(split)' if USE_GROUP_BY_SPLIT else 'StratifiedKFold')} n_splits={n_splits} seed={SEED}")
lines.append(f"Order source: {order_source}")
lines.append(f"Total: N={N} | pos={pos} | neg={neg} | pos%={pos/max(N,1)*100:.6f}%")
lines.append("Per-fold distribution:")

ok = True
min_pos_seen = 10**9
for f in range(n_splits):
    idx = np.where(fold_assign == f)[0]
    yf = y[idx]
    pf = int((yf == 1).sum())
    nf = int((yf == 0).sum())
    min_pos_seen = min(min_pos_seen, pf)
    lines.append(f"- fold {f}: n={len(idx):6d} | pos={pf:5d} | neg={nf:6d} | pos%={(pf/max(len(idx),1))*100:9.6f}%")
    if pf == 0 or nf == 0:
        ok = False

if not ok:
    raise RuntimeError(
        "[Stage 7] A fold has pos=0 or neg=0. "
        "Reduce n_splits or disable group_by_split, then rebuild."
    )

if min_pos_seen < MIN_POS_PER_FOLD:
    lines.append(f"NOTE: min positives in a fold = {min_pos_seen} (< MIN_POS_PER_FOLD={MIN_POS_PER_FOLD}). "
                 "Training/threshold tuning may be noisy; consider smaller n_splits or lower MIN_POS_PER_FOLD.")

# ----------------------------
# 6) Save artifacts (csv + npz + report + config)
# ----------------------------
ART_DIR = Path(ART_DIR)
CV_DIR = ART_DIR / "cv"
CV_DIR.mkdir(parents=True, exist_ok=True)

df_folds = pd.DataFrame({"object_id": train_ids, "fold": fold_assign.astype(int)})
folds_csv = CV_DIR / "cv_folds.csv"
df_folds.to_csv(folds_csv, index=False)

# Save idx arrays per fold
npz_path = CV_DIR / "cv_folds.npz"
npz_kwargs = {}
for f in range(n_splits):
    npz_kwargs[f"train_idx_{f}"] = folds[f]["train_idx"]
    npz_kwargs[f"val_idx_{f}"]   = folds[f]["val_idx"]
np.savez(npz_path, **npz_kwargs)

report_path = CV_DIR / "cv_report.txt"
with open(report_path, "w", encoding="utf-8") as f:
    f.write("\n".join(lines) + "\n")

cfg_path = CV_DIR / "cv_config.json"
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump(
        {
            "seed": SEED,
            "n_splits": int(n_splits),
            "cv_type": "StratifiedGroupKFold(split)" if USE_GROUP_BY_SPLIT else "StratifiedKFold",
            "min_pos_per_fold": int(MIN_POS_PER_FOLD),
            "order_source": order_source,
            "artifacts": {
                "folds_csv": str(folds_csv),
                "folds_npz": str(npz_path),
                "report_txt": str(report_path),
            },
        },
        f,
        indent=2,
    )

print("\n[Stage 7] CV split OK")
print(f"- Saved: {folds_csv}")
print(f"- Saved: {npz_path}")
print(f"- Saved: {report_path}")
print(f"- Saved: {cfg_path}")
print("\n".join(lines[-(n_splits + 3):]))

# ----------------------------
# 7) Export globals for next stage
# ----------------------------
globals().update({
    "CV_DIR": CV_DIR,
    "n_splits": n_splits,
    "train_ids_ordered": train_ids,
    "y_ordered": y,
    "fold_assign": fold_assign,
    "folds": folds,
    "CV_FOLDS_CSV": folds_csv,
    "CV_FOLDS_NPZ": npz_path,
    "CV_CFG_PATH": cfg_path,
})

gc.collect()


[Stage 7] seed=2025 | default_splits=5 | min_pos_per_fold=3 | group_by_split=False
[Stage 7] n_splits=5 | N=3,043 | pos=148 | neg=2,895 | pos%=4.8636% | order_source=fixed_seq/train_ids.npy

[Stage 7] CV split OK
- Saved: /kaggle/working/mallorn_run/artifacts/cv/cv_folds.csv
- Saved: /kaggle/working/mallorn_run/artifacts/cv/cv_folds.npz
- Saved: /kaggle/working/mallorn_run/artifacts/cv/cv_report.txt
- Saved: /kaggle/working/mallorn_run/artifacts/cv/cv_config.json
Order source: fixed_seq/train_ids.npy
Total: N=3043 | pos=148 | neg=2895 | pos%=4.863621%
Per-fold distribution:
- fold 0: n=   609 | pos=   30 | neg=   579 | pos%= 4.926108%
- fold 1: n=   609 | pos=   30 | neg=   579 | pos%= 4.926108%
- fold 2: n=   609 | pos=   30 | neg=   579 | pos%= 4.926108%
- fold 3: n=   608 | pos=   29 | neg=   579 | pos%= 4.769737%
- fold 4: n=   608 | pos=   29 | neg=   579 | pos%= 4.769737%


33

# Train Model (CPU-Safe Configuration)

In [10]:
# ============================================================
# STAGE 8 — Train Multiband Event Transformer (CPU-Safe) — REVISI FULL v2
# Fixes (tetap):
# - NO leakage: global feature scaler fit per fold (train_idx only) + disimpan di ckpt
# - pos_weight dihitung per fold (train_idx only)
# - grad_accum remainder di-handle (last step tidak dibuang)
# - all-pad guard di attention pooling (hindari NaN)
#
# Fix baru (untuk error kamu):
# - CKPT_DIR / OOF_DIR / LOG_DIR TIDAK wajib sudah ada di globals:
#   otomatis dibuat dari RUN_DIR/ART_DIR atau default /kaggle/working/mallorn_run/*
# ============================================================

import os, gc, json, math, time, warnings
from pathlib import Path

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)

# ----------------------------
# 0) Require minimum previous stages (lebih fleksibel)
# ----------------------------
need_min = ["FIX_DIR","MAX_LEN","SEQ_FEATURE_NAMES","df_train_meta","n_splits","folds"]
for k in need_min:
    if k not in globals():
        raise RuntimeError(f"Missing `{k}`. Jalankan STAGE 0..7 dulu dengan urutan benar.")

# Ordering & labels (fallback aman)
if "train_ids_ordered" in globals() and globals()["train_ids_ordered"] is not None:
    train_ids = list(globals()["train_ids_ordered"])
else:
    p = Path(globals()["FIX_DIR"]) / "train_ids.npy"
    if p.exists():
        train_ids = np.load(p, allow_pickle=False).astype("S").astype(str).tolist()
    else:
        train_ids = df_train_meta.index.astype(str).tolist()

if "y_ordered" in globals() and globals()["y_ordered"] is not None and len(globals()["y_ordered"]) == len(train_ids):
    y = np.asarray(globals()["y_ordered"], dtype=np.int8)
else:
    y = df_train_meta.loc[train_ids, "target"].to_numpy(dtype=np.int8, copy=False)

# ----------------------------
# 0b) Ensure output dirs exist (FIX untuk CKPT_DIR missing)
# ----------------------------
# Base run dir
if "RUN_DIR" in globals() and globals()["RUN_DIR"] is not None:
    RUN_DIR = Path(globals()["RUN_DIR"])
else:
    # infer from ART_DIR if ada; else default
    if "ART_DIR" in globals() and globals()["ART_DIR"] is not None:
        RUN_DIR = Path(globals()["ART_DIR"]).parent
    else:
        RUN_DIR = Path("/kaggle/working/mallorn_run")

# Artifacts dir (kalau belum ada)
ART_DIR = Path(globals().get("ART_DIR", RUN_DIR / "artifacts"))
ART_DIR.mkdir(parents=True, exist_ok=True)

# Create dirs if missing from globals
CKPT_DIR = Path(globals().get("CKPT_DIR", RUN_DIR / "checkpoints"))
OOF_DIR  = Path(globals().get("OOF_DIR",  RUN_DIR / "oof"))
LOG_DIR  = Path(globals().get("LOG_DIR",  RUN_DIR / "logs"))

CKPT_DIR.mkdir(parents=True, exist_ok=True)
OOF_DIR.mkdir(parents=True, exist_ok=True)
LOG_DIR.mkdir(parents=True, exist_ok=True)

# Export back so next stages see them
globals().update({
    "RUN_DIR": RUN_DIR,
    "ART_DIR": ART_DIR,
    "CKPT_DIR": CKPT_DIR,
    "OOF_DIR": OOF_DIR,
    "LOG_DIR": LOG_DIR,
})

# ----------------------------
# 1) Imports (torch) + CPU safety
# ----------------------------
try:
    import torch
    import torch.nn as nn
except Exception as e:
    raise RuntimeError("PyTorch tidak tersedia di environment ini.") from e

SEED = int(globals().get("SEED", 2025))
torch.manual_seed(SEED)
np.random.seed(SEED)

device = torch.device("cpu")

# CPU thread guard (avoid oversubscription)
try:
    torch.set_num_threads(int(os.environ.get("OMP_NUM_THREADS", "2")))
    torch.set_num_interop_threads(1)
except Exception:
    pass

# ----------------------------
# 2) Open fixed-length memmaps (do NOT load into RAM)
# ----------------------------
FIX_DIR = Path(globals()["FIX_DIR"])
N = len(train_ids)
L = int(globals()["MAX_LEN"])
Fdim = len(globals()["SEQ_FEATURE_NAMES"])

train_X_path = FIX_DIR / "train_X.dat"
train_B_path = FIX_DIR / "train_B.dat"
train_M_path = FIX_DIR / "train_M.dat"

for p in [train_X_path, train_B_path, train_M_path]:
    if not p.exists():
        raise FileNotFoundError(f"Missing fixed cache file: {p}. Pastikan STAGE 6 sukses.")

X_mm = np.memmap(train_X_path, dtype=np.float32, mode="r", shape=(N, L, Fdim))
B_mm = np.memmap(train_B_path, dtype=np.int8,   mode="r", shape=(N, L))
M_mm = np.memmap(train_M_path, dtype=np.int8,   mode="r", shape=(N, L))

# ----------------------------
# 3) Build RAW global features aligned to train_ids (NO scaling here)
# ----------------------------
G_COLS = ["Z", "Z_err", "EBV", "Z_missing", "Z_err_missing", "EBV_missing", "is_photoz"]
for c in G_COLS:
    if c not in df_train_meta.columns:
        df_train_meta[c] = 0.0

G_raw = df_train_meta.loc[train_ids, G_COLS].copy()
for c in G_COLS:
    G_raw[c] = pd.to_numeric(G_raw[c], errors="coerce").fillna(0.0).astype(np.float32)

G_raw_np = G_raw.to_numpy(dtype=np.float32, copy=False)  # (N, g_dim)
g_dim = int(G_raw_np.shape[1])

# save global columns list (optional)
global_cols_path = Path(LOG_DIR) / "global_feature_cols.json"
with open(global_cols_path, "w", encoding="utf-8") as f:
    json.dump({"cols": G_COLS}, f, indent=2)

# ----------------------------
# 4) Dataset / DataLoader (num_workers=0)
# ----------------------------
class MemmapSeqDataset(torch.utils.data.Dataset):
    def __init__(self, idx, X_mm, B_mm, M_mm, G_scaled_np, y=None):
        self.idx = np.asarray(idx, dtype=np.int32)
        self.X_mm = X_mm
        self.B_mm = B_mm
        self.M_mm = M_mm
        self.G = G_scaled_np  # (N,g_dim)
        self.y = None if y is None else np.asarray(y, dtype=np.int8)

    def __len__(self):
        return len(self.idx)

    def __getitem__(self, i):
        j = int(self.idx[i])
        X = self.X_mm[j]  # (L,F) float32
        B = self.B_mm[j]  # (L,) int8
        M = self.M_mm[j]  # (L,) int8
        G = self.G[j]     # (g_dim,) float32
        if self.y is None:
            return (
                torch.from_numpy(X),
                torch.from_numpy(B.astype(np.int64, copy=False)),
                torch.from_numpy(M.astype(np.int64, copy=False)),
                torch.from_numpy(G),
            )
        yy = self.y[j]
        return (
            torch.from_numpy(X),
            torch.from_numpy(B.astype(np.int64, copy=False)),
            torch.from_numpy(M.astype(np.int64, copy=False)),
            torch.from_numpy(G),
            torch.tensor(float(yy), dtype=torch.float32),
        )

def make_loader(ds, batch_size, shuffle):
    return torch.utils.data.DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=0,
        pin_memory=False,
        drop_last=False,
    )

# ----------------------------
# 5) Model (match inference stage) + all-pad guard
# ----------------------------
class MultibandEventTransformer(nn.Module):
    def __init__(self, feat_dim, max_len, n_bands=6, d_model=128, n_heads=4, n_layers=2, ff_mult=2, dropout=0.10, g_dim=7):
        super().__init__()
        self.n_bands = n_bands
        self.d_model = d_model
        self.max_len = max_len

        self.x_proj = nn.Linear(feat_dim, d_model)
        self.band_emb = nn.Embedding(n_bands, d_model)

        self.pos_emb = nn.Parameter(torch.zeros(1, max_len, d_model))
        nn.init.normal_(self.pos_emb, mean=0.0, std=0.02)

        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=int(d_model * ff_mult),
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)

        self.attn = nn.Linear(d_model, 1)

        self.g_proj = nn.Sequential(
            nn.Linear(g_dim, d_model // 2),
            nn.GELU(),
            nn.Dropout(dropout),
        )

        self.head = nn.Sequential(
            nn.Linear(d_model + (d_model // 2), d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, 1),
        )

    def forward(self, X, band_id, mask, G):
        X = X.to(torch.float32)
        band_id = band_id.clamp(0, self.n_bands - 1).to(torch.long)
        mask = mask.to(torch.long)

        pad_mask = (mask == 0)  # True = pad
        # ALL-PAD GUARD: kalau 1 sequence full pad, buka token 0
        all_pad = pad_mask.all(dim=1)
        if all_pad.any():
            pad_mask = pad_mask.clone()
            pad_mask[all_pad, 0] = False

        h = self.x_proj(X) + self.band_emb(band_id) + self.pos_emb[:, :X.shape[1], :]
        h = self.encoder(h, src_key_padding_mask=pad_mask)

        a = self.attn(h).squeeze(-1)
        a = a.masked_fill(pad_mask, -1e9)
        w = torch.softmax(a, dim=1)
        pooled = torch.sum(h * w.unsqueeze(-1), dim=1)

        g = self.g_proj(G.to(torch.float32))
        z = torch.cat([pooled, g], dim=1)
        return self.head(z).squeeze(-1)

# ----------------------------
# 6) Training hyperparams (CPU-safe)
# ----------------------------
CFG = {
    "d_model": 128,
    "n_heads": 4,
    "n_layers": 2,
    "ff_mult": 2,
    "dropout": 0.10,
    "batch_size": 16,
    "grad_accum": 2,
    "epochs": 10,
    "lr": 3e-4,
    "weight_decay": 0.01,
    "patience": 3,
    "max_grad_norm": 1.0,
}

# auto CPU soften if sequence long
if L >= 512:
    CFG["d_model"] = 96
    CFG["n_heads"] = 4
    CFG["n_layers"] = 2
    CFG["batch_size"] = 12
    CFG["grad_accum"] = 2

cfg_path = Path(LOG_DIR) / "train_cfg.json"
with open(cfg_path, "w", encoding="utf-8") as f:
    json.dump(CFG, f, indent=2)

pos_all = int((y == 1).sum())
neg_all = int((y == 0).sum())
print("[Stage 8] TRAIN CONFIG (CPU)")
print(f"- N={N:,} | pos={pos_all:,} | neg={neg_all:,} | pos%={pos_all/max(N,1)*100:.4f}%")
print(f"- Model: d_model={CFG['d_model']} heads={CFG['n_heads']} layers={CFG['n_layers']} dropout={CFG['dropout']}")
print(f"- Batch={CFG['batch_size']} grad_accum={CFG['grad_accum']} epochs={CFG['epochs']} lr={CFG['lr']}")
print(f"- CKPT_DIR={CKPT_DIR}")
print(f"- OOF_DIR ={OOF_DIR}")
print(f"- LOG_DIR ={LOG_DIR}")

# ----------------------------
# 7) Metrics helpers
# ----------------------------
def sigmoid_np(x):
    x = np.clip(x, -50, 50)
    return 1.0 / (1.0 + np.exp(-x))

def f1_binary(y_true, y_pred01):
    y_true = y_true.astype(np.int32)
    y_pred01 = y_pred01.astype(np.int32)
    tp = int(((y_true == 1) & (y_pred01 == 1)).sum())
    fp = int(((y_true == 0) & (y_pred01 == 1)).sum())
    fn = int(((y_true == 1) & (y_pred01 == 0)).sum())
    if tp == 0:
        return 0.0
    prec = tp / max(tp + fp, 1)
    rec  = tp / max(tp + fn, 1)
    if prec + rec == 0:
        return 0.0
    return float(2 * prec * rec / (prec + rec))

@torch.no_grad()
def eval_model(model, loader, criterion):
    model.eval()
    losses, logits_all, y_all = [], [], []
    for batch in loader:
        Xb, Bb, Mb, Gb, yb = batch
        Xb = Xb.to(device); Bb = Bb.to(device); Mb = Mb.to(device); Gb = Gb.to(device); yb = yb.to(device)
        logit = model(Xb, Bb, Mb, Gb)
        loss = criterion(logit, yb)
        losses.append(float(loss.item()))
        logits_all.append(logit.detach().cpu().numpy())
        y_all.append(yb.detach().cpu().numpy())
    logits_all = np.concatenate(logits_all, axis=0) if logits_all else np.zeros((0,), dtype=np.float32)
    y_all = np.concatenate(y_all, axis=0).astype(np.int8) if y_all else np.zeros((0,), dtype=np.int8)
    probs = sigmoid_np(logits_all)
    pred01 = (probs >= 0.5).astype(np.int8)
    f1 = f1_binary(y_all, pred01)
    return float(np.mean(losses) if losses else np.nan), probs, y_all, f1

# ----------------------------
# 8) CV Train (fold-wise scaler + pos_weight)
# ----------------------------
oof_prob = np.zeros((N,), dtype=np.float32)
fold_metrics = []

all_idx = np.arange(N, dtype=np.int32)
n_splits = int(globals()["n_splits"])

def fit_scaler_fold(G_raw_np, tr_idx):
    X = G_raw_np[tr_idx]
    mean = X.mean(axis=0).astype(np.float32)
    std  = X.std(axis=0).astype(np.float32)
    std  = np.where(std < 1e-6, 1.0, std).astype(np.float32)
    return mean, std

def apply_scaler(G_raw_np, mean, std):
    return ((G_raw_np - mean) / std).astype(np.float32)

start_time = time.time()

for fold_info in globals()["folds"]:
    fold = int(fold_info["fold"])
    val_idx = np.asarray(fold_info["val_idx"], dtype=np.int32)

    val_mask = np.zeros(N, dtype=bool)
    val_mask[val_idx] = True
    tr_idx = all_idx[~val_mask]

    # fold-wise pos_weight
    y_tr = y[tr_idx]
    pos = int((y_tr == 1).sum())
    neg = int((y_tr == 0).sum())
    if pos == 0:
        raise RuntimeError(f"[Stage 8] Fold {fold}: no positives in training split.")
    pos_weight = float(neg / max(pos, 1))
    pos_weight_t = torch.tensor([pos_weight], dtype=torch.float32, device=device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight_t)

    print(f"\n[Stage 8] FOLD {fold}/{n_splits-1} | train={len(tr_idx):,} val={len(val_idx):,} | pos_weight={pos_weight:.4f}")

    # fold-wise scaler (NO leakage)
    g_mean, g_std = fit_scaler_fold(G_raw_np, tr_idx)
    G_fold_z = apply_scaler(G_raw_np, g_mean, g_std)  # (N,g_dim) kecil

    ds_tr = MemmapSeqDataset(tr_idx, X_mm, B_mm, M_mm, G_fold_z, y=y)
    ds_va = MemmapSeqDataset(val_idx, X_mm, B_mm, M_mm, G_fold_z, y=y)
    dl_tr = make_loader(ds_tr, batch_size=CFG["batch_size"], shuffle=True)
    dl_va = make_loader(ds_va, batch_size=CFG["batch_size"], shuffle=False)

    model = MultibandEventTransformer(
        feat_dim=Fdim,
        max_len=L,
        n_bands=6,
        d_model=CFG["d_model"],
        n_heads=CFG["n_heads"],
        n_layers=CFG["n_layers"],
        ff_mult=CFG["ff_mult"],
        dropout=CFG["dropout"],
        g_dim=g_dim,
    ).to(device)

    opt = torch.optim.AdamW(model.parameters(), lr=CFG["lr"], weight_decay=CFG["weight_decay"])

    best_val_loss = float("inf")
    best_epoch = -1
    best_probs = None
    patience_left = int(CFG["patience"])

    grad_accum = int(CFG["grad_accum"])

    for epoch in range(1, int(CFG["epochs"]) + 1):
        model.train()
        opt.zero_grad(set_to_none=True)

        total_loss_true = 0.0
        n_batches = 0
        accum = 0

        for batch in dl_tr:
            Xb, Bb, Mb, Gb, yb = batch
            Xb = Xb.to(device); Bb = Bb.to(device); Mb = Mb.to(device); Gb = Gb.to(device); yb = yb.to(device)

            logit = model(Xb, Bb, Mb, Gb)
            loss = criterion(logit, yb)

            # accumulate true loss for logging
            total_loss_true += float(loss.item())
            n_batches += 1

            # scale for grad accumulation
            (loss / float(grad_accum)).backward()
            accum += 1

            if accum == grad_accum:
                if CFG["max_grad_norm"] is not None:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), float(CFG["max_grad_norm"]))
                opt.step()
                opt.zero_grad(set_to_none=True)
                accum = 0

        # remainder step (IMPORTANT)
        if accum > 0:
            if CFG["max_grad_norm"] is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), float(CFG["max_grad_norm"]))
            opt.step()
            opt.zero_grad(set_to_none=True)

        train_loss = total_loss_true / max(n_batches, 1)

        # validate
        val_loss, probs, y_val, f1_05 = eval_model(model, dl_va, criterion)

        improved = val_loss < (best_val_loss - 1e-6)
        if improved:
            best_val_loss = float(val_loss)
            best_epoch = int(epoch)
            best_probs = probs.copy()

            ckpt_path = CKPT_DIR / f"fold_{fold}.pt"
            torch.save(
                {
                    "fold": fold,
                    "epoch": epoch,
                    "model_state": model.state_dict(),
                    "cfg": CFG,
                    "seq_feature_names": list(globals()["SEQ_FEATURE_NAMES"]),
                    "max_len": L,
                    "global_cols": G_COLS,
                    "global_scaler": {"mean": g_mean, "std": g_std},  # fold-wise scaler
                    "pos_weight": pos_weight,
                },
                ckpt_path,
            )
            patience_left = int(CFG["patience"])
        else:
            patience_left -= 1

        print(f"  epoch {epoch:02d} | train_loss={train_loss:.5f} | val_loss={val_loss:.5f} | f1@0.5={f1_05:.4f} | best_epoch={best_epoch} | patience_left={patience_left}")

        if patience_left <= 0:
            break

    if best_probs is None:
        raise RuntimeError(f"Fold {fold}: best_probs is None (unexpected).")

    # fill OOF
    oof_prob[val_idx] = best_probs.astype(np.float32)

    pred01 = (best_probs >= 0.5).astype(np.int8)
    best_f1_05 = f1_binary(y[val_idx], pred01)

    fold_metrics.append({
        "fold": fold,
        "val_size": int(len(val_idx)),
        "best_epoch": int(best_epoch),
        "best_val_loss": float(best_val_loss),
        "f1_at_0p5": float(best_f1_05),
        "pos_weight": float(pos_weight),
    })

    del model, opt, ds_tr, ds_va, dl_tr, dl_va, G_fold_z
    gc.collect()

elapsed = time.time() - start_time

# ----------------------------
# 9) Save OOF artifacts
# ----------------------------
oof_path_npy = OOF_DIR / "oof_prob.npy"
np.save(oof_path_npy, oof_prob)

df_oof = pd.DataFrame({"object_id": train_ids, "target": y.astype(int), "oof_prob": oof_prob.astype(np.float32)})
oof_path_csv = OOF_DIR / "oof_prob.csv"
df_oof.to_csv(oof_path_csv, index=False)

metrics_path = OOF_DIR / "fold_metrics.json"
with open(metrics_path, "w", encoding="utf-8") as f:
    json.dump({"fold_metrics": fold_metrics, "elapsed_sec": float(elapsed)}, f, indent=2)

oof_pred01 = (oof_prob >= 0.5).astype(np.int8)
oof_f1_05 = f1_binary(y, oof_pred01)

print("\n[Stage 8] CV TRAIN DONE")
print(f"- elapsed: {elapsed/60:.2f} min")
print(f"- OOF saved: {oof_path_npy}")
print(f"- OOF saved: {oof_path_csv}")
print(f"- fold metrics: {metrics_path}")
print(f"- OOF f1@0.5 (rough): {oof_f1_05:.4f}")

globals().update({
    "oof_prob": oof_prob,
    "OOF_PROB_PATH": oof_path_npy,
    "OOF_CSV_PATH": oof_path_csv,
    "FOLD_METRICS_PATH": metrics_path,
    "GLOBAL_COLS_PATH": global_cols_path,
    "TRAIN_CFG_PATH": cfg_path,
})

gc.collect()


[Stage 8] TRAIN CONFIG (CPU)
- N=3,043 | pos=148 | neg=2,895 | pos%=4.8636%
- Model: d_model=128 heads=4 layers=2 dropout=0.1
- Batch=16 grad_accum=2 epochs=10 lr=0.0003
- CKPT_DIR=/kaggle/working/mallorn_run/checkpoints
- OOF_DIR =/kaggle/working/mallorn_run/oof
- LOG_DIR =/kaggle/working/mallorn_run/logs

[Stage 8] FOLD 0/4 | train=2,434 val=609 | pos_weight=19.6271


  torch.from_numpy(X),


  epoch 01 | train_loss=1.43790 | val_loss=1.35755 | f1@0.5=0.0000 | best_epoch=1 | patience_left=3
  epoch 02 | train_loss=1.46963 | val_loss=1.31184 | f1@0.5=0.0972 | best_epoch=2 | patience_left=3
  epoch 03 | train_loss=1.40327 | val_loss=1.34925 | f1@0.5=0.0963 | best_epoch=2 | patience_left=2
  epoch 04 | train_loss=1.33664 | val_loss=1.31863 | f1@0.5=0.0000 | best_epoch=2 | patience_left=1
  epoch 05 | train_loss=1.35236 | val_loss=1.40819 | f1@0.5=0.0000 | best_epoch=2 | patience_left=0

[Stage 8] FOLD 1/4 | train=2,434 val=609 | pos_weight=19.6271




  epoch 01 | train_loss=1.46357 | val_loss=1.61596 | f1@0.5=0.0990 | best_epoch=1 | patience_left=3
  epoch 02 | train_loss=1.43054 | val_loss=1.90663 | f1@0.5=0.0000 | best_epoch=1 | patience_left=2
  epoch 03 | train_loss=1.41220 | val_loss=1.77561 | f1@0.5=0.0000 | best_epoch=1 | patience_left=1
  epoch 04 | train_loss=1.37819 | val_loss=1.67626 | f1@0.5=0.1148 | best_epoch=1 | patience_left=0

[Stage 8] FOLD 2/4 | train=2,434 val=609 | pos_weight=19.6271
  epoch 01 | train_loss=1.75239 | val_loss=1.50390 | f1@0.5=0.0000 | best_epoch=1 | patience_left=3
  epoch 02 | train_loss=1.40910 | val_loss=1.34395 | f1@0.5=0.0000 | best_epoch=2 | patience_left=3
  epoch 03 | train_loss=1.42046 | val_loss=1.42483 | f1@0.5=0.0000 | best_epoch=2 | patience_left=2
  epoch 04 | train_loss=1.33406 | val_loss=1.35694 | f1@0.5=0.0968 | best_epoch=2 | patience_left=1
  epoch 05 | train_loss=1.41585 | val_loss=1.29759 | f1@0.5=0.1020 | best_epoch=5 | patience_left=3
  epoch 06 | train_loss=1.35510 | val

33

# OOF Prediction + Threshold Tuning

In [11]:
# ============================================================
# STAGE 9 — OOF Prediction + Threshold Tuning (ONE CELL, Kaggle CPU-SAFE) — REVISI FULL v2.1
#
# Fix:
# - Robust oof_prob 1D (anti "len() of unsized object")
# - Robust train_ids_ordered bytes->str
# - Clean oof_prob NaN/inf + clip to [0,1] before quantile/sweep
# - Clear error if object_id alignment missing in df_train_meta
# ============================================================

import gc, json, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)

# ----------------------------
# 0) Require previous stages
# ----------------------------
need = ["OOF_DIR", "df_train_meta"]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing `{k}`. Jalankan STAGE 0..8 dulu.")

OOF_DIR = Path(OOF_DIR)
OOF_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------
# Helper: robust stringify id
# ----------------------------
def _to_str_list(ids):
    out = []
    for x in ids:
        if isinstance(x, (bytes, np.bytes_)):
            out.append(x.decode("utf-8", errors="ignore").strip())
        else:
            out.append(str(x).strip())
    return out

# ----------------------------
# Helper: robust load oof_prob as 1D float32
# ----------------------------
def _as_1d_float32(arr):
    a = np.asarray(arr)
    # handle object dtype that may hold an array inside (0-d object)
    if a.dtype == object and a.ndim == 0:
        try:
            a = np.asarray(a.item())
        except Exception:
            pass
    a = np.asarray(a, dtype=np.float32)
    if a.ndim == 0:
        return a  # caller decides
    if a.ndim > 1:
        a = a.reshape(-1)
    return a

def _load_oof_prob():
    # 1) try from globals
    if "oof_prob" in globals():
        op = _as_1d_float32(globals()["oof_prob"])
        if isinstance(op, np.ndarray) and op.ndim != 0:
            return op

    # 2) try from npy
    p = OOF_DIR / "oof_prob.npy"
    if p.exists():
        op = _as_1d_float32(np.load(p, allow_pickle=False))
        if isinstance(op, np.ndarray) and op.ndim != 0:
            return op

    # 3) try from csv (as last resort)
    pcsv = OOF_DIR / "oof_prob.csv"
    if pcsv.exists():
        df = pd.read_csv(pcsv)
        if "oof_prob" in df.columns:
            op = _as_1d_float32(df["oof_prob"].to_numpy())
            if isinstance(op, np.ndarray) and op.ndim != 0:
                return op

    raise FileNotFoundError("OOF prob not found (globals/npy/csv). Jalankan STAGE 8 (training) dulu.")

# Load OOF probabilities (guaranteed 1D or raise)
oof_prob = _load_oof_prob()

# Final guard: if still scalar, fail clearly
if not isinstance(oof_prob, np.ndarray) or oof_prob.ndim == 0:
    raise TypeError(
        f"Invalid oof_prob (unsized/scalar). Type={type(oof_prob)} ndim={getattr(oof_prob,'ndim',None)}. "
        "Pastikan STAGE 8 menyimpan oof_prob sebagai array 1D."
    )

# sanitize oof_prob (avoid NaN/inf breaking quantile/sweep)
oof_prob = np.nan_to_num(oof_prob, nan=0.0, posinf=1.0, neginf=0.0).astype(np.float32)
oof_prob = np.clip(oof_prob, 0.0, 1.0).astype(np.float32)

# ----------------------------
# Align y ordering:
# A) train_ids_ordered if valid length
# B) fallback via oof_prob.csv (most robust)
# C) last fallback df_train_meta order (requires same length)
# ----------------------------
train_ids = None
y = None

# Case A
if "train_ids_ordered" in globals():
    _ids = _to_str_list(list(globals()["train_ids_ordered"]))
    if len(_ids) == len(oof_prob):
        # validate existence in df_train_meta index
        missing = [oid for oid in _ids if oid not in df_train_meta.index]
        if missing:
            raise KeyError(
                f"train_ids_ordered contains ids not in df_train_meta (examples): {missing[:5]} | missing_n={len(missing)}"
            )
        train_ids = _ids
        y = df_train_meta.loc[train_ids, "target"].to_numpy(dtype=np.int8, copy=False)

# Case B
if y is None:
    pcsv = OOF_DIR / "oof_prob.csv"
    if pcsv.exists():
        df_oof = pd.read_csv(pcsv)
        if ("object_id" in df_oof.columns) and ("oof_prob" in df_oof.columns):
            df_oof["object_id"] = df_oof["object_id"].astype(str).str.strip()
            ids_csv = df_oof["object_id"].tolist()

            # validate existence
            missing = [oid for oid in ids_csv if oid not in df_train_meta.index]
            if missing:
                raise KeyError(
                    f"oof_prob.csv contains object_id not in df_train_meta (examples): {missing[:5]} | missing_n={len(missing)}"
                )

            # override oof_prob with csv order to guarantee alignment
            oof_prob = _as_1d_float32(df_oof["oof_prob"].to_numpy())
            if not isinstance(oof_prob, np.ndarray) or oof_prob.ndim == 0:
                raise TypeError("oof_prob from csv became scalar (unexpected).")
            oof_prob = np.nan_to_num(oof_prob.astype(np.float32), nan=0.0, posinf=1.0, neginf=0.0)
            oof_prob = np.clip(oof_prob, 0.0, 1.0).astype(np.float32)

            train_ids = ids_csv
            y = df_train_meta.loc[train_ids, "target"].to_numpy(dtype=np.int8, copy=False)

# Case C
if y is None:
    if len(oof_prob) != len(df_train_meta):
        raise RuntimeError(
            f"Cannot align y: len(oof_prob)={len(oof_prob)} != len(df_train_meta)={len(df_train_meta)} "
            "and train_ids_ordered not usable and oof_prob.csv not available."
        )
    train_ids = df_train_meta.index.astype(str).tolist()
    y = df_train_meta["target"].to_numpy(dtype=np.int8, copy=False)

# Final length check
if len(oof_prob) != len(y):
    raise RuntimeError(f"Length mismatch: oof_prob={len(oof_prob)} vs y={len(y)}")

# y sanity
uy = set(np.unique(y).tolist())
if not uy.issubset({0, 1}):
    raise ValueError(f"y must be binary 0/1. Found: {sorted(list(uy))}")

# ----------------------------
# 1) Metrics
# ----------------------------
def f1_binary(y_true, y_pred01):
    y_true = y_true.astype(np.int32)
    y_pred01 = y_pred01.astype(np.int32)
    tp = int(((y_true == 1) & (y_pred01 == 1)).sum())
    fp = int(((y_true == 0) & (y_pred01 == 1)).sum())
    fn = int(((y_true == 1) & (y_pred01 == 0)).sum())
    if tp == 0:
        return 0.0
    prec = tp / max(tp + fp, 1)
    rec  = tp / max(tp + fn, 1)
    if prec + rec == 0:
        return 0.0
    return float(2 * prec * rec / (prec + rec))

def precision_recall(y_true, y_pred01):
    y_true = y_true.astype(np.int32)
    y_pred01 = y_pred01.astype(np.int32)
    tp = int(((y_true == 1) & (y_pred01 == 1)).sum())
    fp = int(((y_true == 0) & (y_pred01 == 1)).sum())
    fn = int(((y_true == 1) & (y_pred01 == 0)).sum())
    prec = tp / max(tp + fp, 1)
    rec  = tp / max(tp + fn, 1)
    return float(prec), float(rec), tp, fp, fn

# ----------------------------
# 2) Threshold sweep
# ----------------------------
grid = np.concatenate([
    np.linspace(0.01, 0.10, 19),
    np.linspace(0.10, 0.90, 81),
    np.linspace(0.90, 0.99, 19),
]).astype(np.float32)

qs = np.linspace(0.01, 0.99, 99, dtype=np.float32)
# quantile on cleaned probs
quant_thr = np.quantile(oof_prob, qs).astype(np.float32)

thr_candidates = np.unique(np.clip(np.concatenate([grid, quant_thr]), 0.0, 1.0))

best = {"thr": 0.5, "f1": -1.0, "prec": 0.0, "rec": 0.0, "tp": 0, "fp": 0, "fn": 0, "pos_pred": 0}
rows = []

for thr in thr_candidates:
    pred = (oof_prob >= thr).astype(np.int8)
    f1 = f1_binary(y, pred)
    prec, rec, tp, fp, fn = precision_recall(y, pred)
    pos_pred = int(pred.sum())
    rows.append((float(thr), float(f1), float(prec), float(rec), int(tp), int(fp), int(fn), pos_pred))

    if (f1 > best["f1"] + 1e-12) or (
        abs(f1 - best["f1"]) <= 1e-12 and (rec > best["rec"] + 1e-12)
    ) or (
        abs(f1 - best["f1"]) <= 1e-12 and abs(rec - best["rec"]) <= 1e-12 and (fp < best["fp"])
    ):
        best.update({"thr": float(thr), "f1": float(f1), "prec": float(prec), "rec": float(rec),
                     "tp": int(tp), "fp": int(fp), "fn": int(fn), "pos_pred": int(pos_pred)})

thr_table = pd.DataFrame(rows, columns=["thr","f1","precision","recall","tp","fp","fn","pos_pred"])
thr_table = thr_table.sort_values(["f1","recall","precision"], ascending=[False, False, False]).reset_index(drop=True)

BEST_THR = float(best["thr"])

# ----------------------------
# 3) Baseline 0.5
# ----------------------------
pred05 = (oof_prob >= 0.5).astype(np.int8)
f1_05 = f1_binary(y, pred05)
prec_05, rec_05, tp_05, fp_05, fn_05 = precision_recall(y, pred05)

# ----------------------------
# 4) Save report
# ----------------------------
out_json = OOF_DIR / "threshold_tuning.json"
out_txt  = OOF_DIR / "threshold_report.txt"
out_csv  = OOF_DIR / "threshold_table_top200.csv"

payload = {
    "best_threshold": BEST_THR,
    "best_f1": best["f1"],
    "best_precision": best["prec"],
    "best_recall": best["rec"],
    "best_counts": {"tp": best["tp"], "fp": best["fp"], "fn": best["fn"], "pos_pred": best["pos_pred"]},
    "baseline_thr_0p5": {"f1": f1_05, "precision": prec_05, "recall": rec_05, "tp": tp_05, "fp": fp_05, "fn": fn_05, "pos_pred": int(pred05.sum())},
    "n_objects": int(len(y)),
    "pos": int((y == 1).sum()),
    "neg": int((y == 0).sum()),
}

with open(out_json, "w", encoding="utf-8") as f:
    json.dump(payload, f, indent=2)

lines = []
lines.append("OOF Threshold Tuning Report")
lines.append(f"- N={payload['n_objects']} | pos={payload['pos']} | neg={payload['neg']} | pos%={payload['pos']/max(payload['n_objects'],1)*100:.4f}%")
lines.append("")
lines.append("Baseline @ thr=0.5")
lines.append(f"- F1={f1_05:.6f} | P={prec_05:.6f} | R={rec_05:.6f} | tp={tp_05} fp={fp_05} fn={fn_05} | pos_pred={int(pred05.sum())}")
lines.append("")
lines.append(f"BEST @ thr={BEST_THR:.6f}")
lines.append(f"- F1={best['f1']:.6f} | P={best['prec']:.6f} | R={best['rec']:.6f} | tp={best['tp']} fp={best['fp']} fn={best['fn']} | pos_pred={best['pos_pred']}")
lines.append("")
lines.append("Top 10 thresholds by (F1, recall, precision):")
for i in range(min(10, len(thr_table))):
    r = thr_table.iloc[i]
    lines.append(
        f"{i+1:02d}. thr={r['thr']:.6f} | f1={r['f1']:.6f} | P={r['precision']:.6f} | R={r['recall']:.6f} | tp={int(r['tp'])} fp={int(r['fp'])} fn={int(r['fn'])} | pos_pred={int(r['pos_pred'])}"
    )

with open(out_txt, "w", encoding="utf-8") as f:
    f.write("\n".join(lines) + "\n")

thr_table.head(200).to_csv(out_csv, index=False)

print("[Stage 9] Threshold tuning DONE")
print(f"- Best threshold: {BEST_THR:.6f}")
print(f"- Best F1:        {best['f1']:.6f}  (P={best['prec']:.6f} R={best['rec']:.6f})")
print(f"- Baseline F1@0.5:{f1_05:.6f}  (P={prec_05:.6f} R={rec_05:.6f})")
print(f"- Saved: {out_json}")
print(f"- Saved: {out_txt}")
print(f"- Saved: {out_csv}")

globals().update({
    "BEST_THR": BEST_THR,
    "thr_table": thr_table,
    "THR_JSON_PATH": out_json,
    "THR_REPORT_PATH": out_txt,
})

gc.collect()


[Stage 9] Threshold tuning DONE
- Best threshold: 0.510000
- Best F1:        0.101343  (P=0.055705 R=0.560811)
- Baseline F1@0.5:0.099182  (P=0.053650 R=0.655405)
- Saved: /kaggle/working/mallorn_run/oof/threshold_tuning.json
- Saved: /kaggle/working/mallorn_run/oof/threshold_report.txt
- Saved: /kaggle/working/mallorn_run/oof/threshold_table_top200.csv


33

# Test Inference (Fold Ensemble)

In [12]:
# ============================================================
# STAGE 10 — Test Inference (Fold Ensemble) (ONE CELL, Kaggle CPU-SAFE)
#
# REVISI FULL v2.3:
# - NO dependency on global_scaler.json (supports STAGE 8 NO-leakage fold-wise scaler in ckpt)
# - Per-fold standardization for test globals using ckpt["global_scaler"]
# - Robust decode test_ids.npy (anti "b'...'" ids) + auto-normalize df_test_meta.index if needed
# - torch.load kompatibel PyTorch 2.6 weights_only default
# ============================================================

import os, gc, json, warnings
from pathlib import Path

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)

# ----------------------------
# 0) Require previous stages
# ----------------------------
need = [
    "ART_DIR",
    "FIX_DIR","MAX_LEN","SEQ_FEATURE_NAMES",
    "df_test_meta",
    "CKPT_DIR",
    "n_splits",
]
for k in need:
    if k not in globals():
        raise RuntimeError(f"Missing `{k}`. Jalankan STAGE 0..9 dulu.")

# Torch
try:
    import torch
    import torch.nn as nn
except Exception as e:
    raise RuntimeError("PyTorch tidak tersedia di environment ini.") from e

device = torch.device("cpu")
SEED = int(globals().get("SEED", 2025))
torch.manual_seed(SEED)
np.random.seed(SEED)

# Thread guard
try:
    torch.set_num_threads(int(os.environ.get("OMP_NUM_THREADS", "2")))
    torch.set_num_interop_threads(1)
except Exception:
    pass

FIX_DIR = Path(FIX_DIR)
ART_DIR = Path(ART_DIR)
ART_DIR.mkdir(parents=True, exist_ok=True)
CKPT_DIR = Path(CKPT_DIR)

# ----------------------------
# helper: normalize id
# ----------------------------
def _norm_id(x):
    # decode bytes early
    if isinstance(x, (bytes, np.bytes_)):
        try:
            x = x.decode("utf-8", errors="ignore")
        except Exception:
            x = str(x)
    s = str(x).strip()
    # fix "b'...'" / 'b"..."'
    if (s.startswith("b'") and s.endswith("'")) or (s.startswith('b"') and s.endswith('"')):
        s = s[2:-1]
    return s.strip()

def _load_ids_npy(path: Path):
    arr = np.load(path, allow_pickle=False)
    # arr could be bytes dtype, unicode, or object (bytes inside)
    ids = arr.tolist() if hasattr(arr, "tolist") else list(arr)
    return [_norm_id(x) for x in ids]

# ----------------------------
# 1) Load TEST ordering (must match STAGE 6)
# ----------------------------
test_ids_path = FIX_DIR / "test_ids.npy"
if not test_ids_path.exists():
    raise FileNotFoundError(f"Missing: {test_ids_path}. Pastikan STAGE 6 sukses.")

test_ids = _load_ids_npy(test_ids_path)
NTE = len(test_ids)
if NTE <= 0:
    raise RuntimeError("test_ids kosong (NTE=0). Pastikan STAGE 6 sukses membuat test_ids.npy.")

# Optional: normalize df_test_meta.index if mismatch
missing_ids = [oid for oid in test_ids[:2000] if oid not in df_test_meta.index]  # quick probe
if missing_ids:
    # normalize full index (shallow copy, data not duplicated)
    df_test_meta = df_test_meta.copy(deep=False)
    df_test_meta.index = pd.Index([_norm_id(x) for x in df_test_meta.index], name=df_test_meta.index.name)

# Final guard: ids must exist
missing_ids = [oid for oid in test_ids if oid not in df_test_meta.index]
if missing_ids:
    raise KeyError(
        f"Some test_ids not found in df_test_meta.index (examples): {missing_ids[:10]} | missing_n={len(missing_ids)}.\n"
        f"- Cek apakah df_test_meta.index benar-benar object_id.\n"
        f"- Cek apakah test_ids.npy masih kebaca sebagai b'...'."
    )

# Duplicates guard
if len(set(test_ids)) != len(test_ids):
    # duplicates in ordering is dangerous
    s = pd.Series(test_ids)
    dup = s[s.duplicated()].head(10).tolist()
    raise ValueError(f"Duplicate object_id in test_ids ordering (examples): {dup}")

# ----------------------------
# 2) Open fixed-length TEST memmaps
# ----------------------------
Fdim = len(SEQ_FEATURE_NAMES)
L = int(MAX_LEN)

test_X_path = FIX_DIR / "test_X.dat"
test_B_path = FIX_DIR / "test_B.dat"
test_M_path = FIX_DIR / "test_M.dat"

for p in [test_X_path, test_B_path, test_M_path]:
    if not p.exists():
        raise FileNotFoundError(f"Missing fixed cache file: {p}. Pastikan STAGE 6 sukses.")

Xte = np.memmap(test_X_path, dtype=np.float32, mode="r", shape=(NTE, L, Fdim))
Bte = np.memmap(test_B_path, dtype=np.int8,   mode="r", shape=(NTE, L))
Mte = np.memmap(test_M_path, dtype=np.int8,   mode="r", shape=(NTE, L))

# ----------------------------
# 3) Dataset/Loader for inference
# ----------------------------
class TestMemmapDataset(torch.utils.data.Dataset):
    def __init__(self, Xmm, Bmm, Mmm, G_np_z):
        self.Xmm = Xmm
        self.Bmm = Bmm
        self.Mmm = Mmm
        self.G = G_np_z

    def __len__(self):
        return int(self.Xmm.shape[0])

    def __getitem__(self, i):
        # Note: read-only memmap is fine; no in-place writes are done.
        X = self.Xmm[i]
        B = self.Bmm[i].astype(np.int64, copy=False)
        M = self.Mmm[i].astype(np.int64, copy=False)
        G = self.G[i]
        return (
            torch.from_numpy(X),
            torch.from_numpy(B),
            torch.from_numpy(M),
            torch.from_numpy(G),
        )

def make_loader(ds, batch_size=64):
    return torch.utils.data.DataLoader(
        ds,
        batch_size=int(batch_size),
        shuffle=False,
        num_workers=0,
        pin_memory=False,
        drop_last=False,
    )

# ----------------------------
# 4) Model definition (must match STAGE 8)
# ----------------------------
class MultibandEventTransformer(nn.Module):
    def __init__(self, feat_dim, n_bands=6, d_model=128, n_heads=4, n_layers=2, ff_mult=2, dropout=0.10, g_dim=7, max_len=512):
        super().__init__()
        self.n_bands = n_bands
        self.max_len = max_len

        self.x_proj = nn.Linear(feat_dim, d_model)
        self.band_emb = nn.Embedding(n_bands, d_model)

        self.pos_emb = nn.Parameter(torch.zeros(1, max_len, d_model))
        nn.init.normal_(self.pos_emb, mean=0.0, std=0.02)

        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=int(d_model * ff_mult),
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(enc_layer, num_layers=n_layers)

        self.attn = nn.Linear(d_model, 1)

        self.g_proj = nn.Sequential(
            nn.Linear(g_dim, d_model // 2),
            nn.GELU(),
            nn.Dropout(dropout),
        )

        self.head = nn.Sequential(
            nn.Linear(d_model + (d_model // 2), d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, 1),
        )

    def forward(self, X, band_id, mask, G):
        X = X.to(torch.float32)
        band_id = band_id.clamp(0, self.n_bands - 1).to(torch.long)
        mask = mask.to(torch.long)

        pad_mask = (mask == 0)  # True for pad

        # ALL-PAD GUARD (match your STAGE 8 revised safety)
        all_pad = pad_mask.all(dim=1)
        if all_pad.any():
            pad_mask = pad_mask.clone()
            pad_mask[all_pad, 0] = False

        h = self.x_proj(X) + self.band_emb(band_id) + self.pos_emb[:, :X.shape[1], :]
        h = self.encoder(h, src_key_padding_mask=pad_mask)

        a = self.attn(h).squeeze(-1)
        a = a.masked_fill(pad_mask, -1e9)
        w = torch.softmax(a, dim=1)
        pooled = torch.sum(h * w.unsqueeze(-1), dim=1)

        g = self.g_proj(G.to(torch.float32))
        z = torch.cat([pooled, g], dim=1)
        logit = self.head(z).squeeze(-1)
        return logit

def sigmoid_np(x):
    x = np.clip(x, -50, 50)
    return 1.0 / (1.0 + np.exp(-x))

@torch.no_grad()
def predict_probs(model, loader):
    model.eval()
    outs = []
    for Xb, Bb, Mb, Gb in loader:
        Xb = Xb.to(device)
        Bb = Bb.to(device)
        Mb = Mb.to(device)
        Gb = Gb.to(device)
        logit = model(Xb, Bb, Mb, Gb)
        outs.append(logit.detach().cpu().numpy())
    logits = np.concatenate(outs, axis=0) if outs else np.zeros((0,), dtype=np.float32)
    return sigmoid_np(logits).astype(np.float32)

# ----------------------------
# 5) Safe/compat checkpoint loader (PyTorch 2.6 weights_only default)
# ----------------------------
def torch_load_compat(path: Path):
    """
    Prefer loading full dict (needed for fold-wise scaler).
    - Try weights_only=True; if result not a full ckpt dict, fallback.
    - Fallback to weights_only=False (trusted ckpt from your own run).
    """
    try:
        obj = torch.load(path, map_location="cpu", weights_only=True)
        # If it's just a state_dict (no metadata), fallback
        if isinstance(obj, dict) and ("model_state" in obj or "cfg" in obj or "global_scaler" in obj):
            return obj
        # Often weights_only gives OrderedDict state_dict; fallback
        return torch.load(path, map_location="cpu", weights_only=False)
    except TypeError:
        # older torch without weights_only arg
        return torch.load(path, map_location="cpu")
    except Exception:
        return torch.load(path, map_location="cpu", weights_only=False)

# checkpoints list
ckpts = []
for f in range(int(n_splits)):
    p = CKPT_DIR / f"fold_{f}.pt"
    if not p.exists():
        raise FileNotFoundError(f"Missing checkpoint: {p}. Pastikan STAGE 8 menyimpan ckpt per fold.")
    ckpts.append(p)

# ----------------------------
# 6) Optional fallback scaler file (only used if ckpt missing scaler)
# ----------------------------
fallback_scaler = None
fallback_scaler_path = Path(globals().get("GLOBAL_SCALER_PATH", "")) if "GLOBAL_SCALER_PATH" in globals() else None
if fallback_scaler_path and str(fallback_scaler_path) and fallback_scaler_path.exists():
    try:
        with open(fallback_scaler_path, "r", encoding="utf-8") as f:
            fallback_scaler = json.load(f)
    except Exception:
        fallback_scaler = None
else:
    # try default logs path if user still has it
    log_dir = Path(globals().get("LOG_DIR", "/kaggle/working/mallorn_run/logs"))
    p2 = log_dir / "global_scaler.json"
    if p2.exists():
        try:
            with open(p2, "r", encoding="utf-8") as f:
                fallback_scaler = json.load(f)
        except Exception:
            fallback_scaler = None

# ----------------------------
# 7) Run inference per fold (fold-wise scaler) -> ensemble mean
# ----------------------------
BATCH_SIZE = 64
test_prob_folds = np.zeros((NTE, int(n_splits)), dtype=np.float32)

print(f"[Stage 10] Test inference: N_test={NTE:,} | folds={n_splits} | batch={BATCH_SIZE} (CPU)")

for fold, ckpt_path in enumerate(ckpts):
    ckpt = torch_load_compat(ckpt_path)

    # Extract model_state robustly
    if isinstance(ckpt, dict) and "model_state" in ckpt:
        model_state = ckpt["model_state"]
    elif isinstance(ckpt, dict):
        # might be state_dict directly
        model_state = ckpt
        ckpt = {}  # no metadata
    else:
        raise RuntimeError(f"Unexpected checkpoint content type: {type(ckpt)}")

    cfg = ckpt.get("cfg", {}) if isinstance(ckpt, dict) else {}
    d_model  = int(cfg.get("d_model", 128))
    n_heads  = int(cfg.get("n_heads", 4))
    n_layers = int(cfg.get("n_layers", 2))
    ff_mult  = int(cfg.get("ff_mult", 2))
    dropout  = float(cfg.get("dropout", 0.10))

    # ----- fold-wise global scaler + cols -----
    G_COLS = ckpt.get("global_cols", None) if isinstance(ckpt, dict) else None
    if G_COLS is None:
        # fallback default cols (should match your STAGE 8)
        G_COLS = ["Z", "Z_err", "EBV", "Z_missing", "Z_err_missing", "EBV_missing", "is_photoz"]
    G_COLS = list(G_COLS)

    # Ensure columns exist in df_test_meta
    for c in G_COLS:
        if c not in df_test_meta.columns:
            df_test_meta[c] = 0.0

    G_raw = df_test_meta.loc[test_ids, G_COLS].copy()
    for c in G_COLS:
        G_raw[c] = pd.to_numeric(G_raw[c], errors="coerce").fillna(0.0).astype(np.float32)
    G_np = G_raw.to_numpy(dtype=np.float32, copy=False)

    scaler = ckpt.get("global_scaler", None) if isinstance(ckpt, dict) else None
    if scaler is None and fallback_scaler is not None:
        # only if ckpt does not store fold scaler (older pipeline)
        # must match columns count
        if ("mean" in fallback_scaler) and ("std" in fallback_scaler) and ("cols" in fallback_scaler):
            if list(fallback_scaler["cols"]) == G_COLS:
                scaler = {"mean": fallback_scaler["mean"], "std": fallback_scaler["std"]}

    if scaler is not None:
        g_mean = np.asarray(scaler["mean"], dtype=np.float32)
        g_std  = np.asarray(scaler["std"], dtype=np.float32)
        g_std  = np.where(g_std < 1e-6, 1.0, g_std).astype(np.float32)
        if g_mean.shape[0] != G_np.shape[1] or g_std.shape[0] != G_np.shape[1]:
            # mismatch -> no scaling
            G_np_z = G_np.astype(np.float32, copy=False)
        else:
            G_np_z = ((G_np - g_mean) / g_std).astype(np.float32)
    else:
        # last resort: no scaling
        G_np_z = G_np.astype(np.float32, copy=False)

    # Dataset/Loader (fold-specific G_np_z)
    ds_test = TestMemmapDataset(Xte, Bte, Mte, G_np_z)
    dl_test = make_loader(ds_test, batch_size=BATCH_SIZE)

    # Build model + load state
    model = MultibandEventTransformer(
        feat_dim=Fdim,
        n_bands=6,
        d_model=d_model,
        n_heads=n_heads,
        n_layers=n_layers,
        ff_mult=ff_mult,
        dropout=dropout,
        g_dim=G_np_z.shape[1],
        max_len=L,
    ).to(device)

    model.load_state_dict(model_state, strict=True)

    probs = predict_probs(model, dl_test)
    if len(probs) != NTE:
        raise RuntimeError(f"Fold {fold}: probs length mismatch {len(probs)} vs {NTE}")

    test_prob_folds[:, fold] = probs
    print(f"  fold {fold}: prob_mean={float(probs.mean()):.6f} | prob_std={float(probs.std()):.6f}")

    del model, probs, ds_test, dl_test, G_raw, G_np
    gc.collect()

# Ensemble mean
test_prob_ens = test_prob_folds.mean(axis=1).astype(np.float32)

# ----------------------------
# 8) Save artifacts
# ----------------------------
fold_path = ART_DIR / "test_prob_fold.npy"
ens_path  = ART_DIR / "test_prob_ens.npy"
csv_path  = ART_DIR / "test_prob_ens.csv"

np.save(fold_path, test_prob_folds)
np.save(ens_path, test_prob_ens)

df_pred = pd.DataFrame({"object_id": test_ids, "prob": test_prob_ens})
df_pred.to_csv(csv_path, index=False)

print("\n[Stage 10] DONE")
print(f"- Saved fold probs: {fold_path}")
print(f"- Saved ens probs : {ens_path}")
print(f"- Saved csv       : {csv_path}")
print(f"- ens mean={float(test_prob_ens.mean()):.6f} | std={float(test_prob_ens.std()):.6f} | min={float(test_prob_ens.min()):.6f} | max={float(test_prob_ens.max()):.6f}")

# Export globals for submission
globals().update({
    "test_ids": test_ids,
    "test_prob_folds": test_prob_folds,
    "test_prob_ens": test_prob_ens,
    "TEST_PROB_FOLD_PATH": fold_path,
    "TEST_PROB_ENS_PATH": ens_path,
    "TEST_PROB_CSV_PATH": csv_path,
})

gc.collect()


[Stage 10] Test inference: N_test=7,135 | folds=5 | batch=64 (CPU)




  fold 0: prob_mean=0.533540 | prob_std=0.033738




  fold 1: prob_mean=0.536049 | prob_std=0.021311




  fold 2: prob_mean=0.502968 | prob_std=0.050674




  fold 3: prob_mean=0.480641 | prob_std=0.026030




  fold 4: prob_mean=0.437383 | prob_std=0.049572

[Stage 10] DONE
- Saved fold probs: /kaggle/working/mallorn_run/artifacts/test_prob_fold.npy
- Saved ens probs : /kaggle/working/mallorn_run/artifacts/test_prob_ens.npy
- Saved csv       : /kaggle/working/mallorn_run/artifacts/test_prob_ens.csv
- ens mean=0.498116 | std=0.034568 | min=0.197329 | max=0.536540


0

# Evalution 

In [13]:
# ============================================================
# ONE CELL — EVALUATION (Precision / Recall / F1) + Threshold Sweep (OOF)
# Sesuai materi:
#   Precision = TP / (TP + FP)
#   Recall    = TP / (TP + FN)
#   F1        = 2 * (Precision * Recall) / (Precision + Recall)
#
# Input (minimal):
# - df_train_meta (kolom: target)
# - oof_prob (globals) ATAU file OOF_DIR/oof_prob.npy atau OOF_DIR/oof_prob.csv
#
# Output:
# - Print ringkasan metrik
# - (opsional) simpan: eval_report.txt + eval_threshold_table.csv
# ============================================================

import gc, json
from pathlib import Path
import numpy as np
import pandas as pd

# ----------------------------
# 0) Require minimal globals
# ----------------------------
if "df_train_meta" not in globals():
    raise RuntimeError("Missing df_train_meta. Jalankan STAGE 2 dulu.")

ART_DIR = Path(globals().get("ART_DIR", "/kaggle/working"))
OOF_DIR = Path(globals().get("OOF_DIR", ART_DIR / "oof"))
OOF_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------
# 1) Robust loader for oof_prob (1D float32)
# ----------------------------
def _as_1d_float32(arr):
    a = np.asarray(arr)
    if a.dtype == object and a.ndim == 0:
        try:
            a = np.asarray(a.item())
        except Exception:
            pass
    a = np.asarray(a, dtype=np.float32)
    if a.ndim == 0:
        return a
    if a.ndim > 1:
        a = a.reshape(-1)
    return a

def load_oof_prob():
    # globals
    if "oof_prob" in globals():
        op = _as_1d_float32(globals()["oof_prob"])
        if isinstance(op, np.ndarray) and op.ndim != 0:
            return op, None  # no csv df
    # npy
    p = OOF_DIR / "oof_prob.npy"
    if p.exists():
        op = _as_1d_float32(np.load(p, allow_pickle=False))
        if isinstance(op, np.ndarray) and op.ndim != 0:
            return op, None
    # csv
    pcsv = OOF_DIR / "oof_prob.csv"
    if pcsv.exists():
        df = pd.read_csv(pcsv)
        if "oof_prob" in df.columns:
            op = _as_1d_float32(df["oof_prob"].to_numpy())
            if isinstance(op, np.ndarray) and op.ndim != 0:
                return op, df
    raise FileNotFoundError("OOF probability tidak ditemukan (globals / oof_prob.npy / oof_prob.csv). Jalankan STAGE 8 dulu.")

oof_prob, df_oof_csv = load_oof_prob()

if not isinstance(oof_prob, np.ndarray) or oof_prob.ndim == 0:
    raise TypeError(f"Invalid oof_prob (scalar/unsized). type={type(oof_prob)} ndim={getattr(oof_prob,'ndim',None)}")

# ----------------------------
# 2) Align y (target) ke urutan oof_prob
# ----------------------------
train_ids = None
y = None

# Paling robust: pakai oof_prob.csv kalau ada object_id
if df_oof_csv is not None and ("object_id" in df_oof_csv.columns):
    df_oof_csv["object_id"] = df_oof_csv["object_id"].astype(str).str.strip()
    train_ids = df_oof_csv["object_id"].tolist()
    # override oof_prob agar pasti align dengan object_id pada csv
    oof_prob = _as_1d_float32(df_oof_csv["oof_prob"].to_numpy())
    y = df_train_meta.loc[train_ids, "target"].to_numpy(dtype=np.int8, copy=False)

# Kalau tidak ada CSV, coba train_ids_ordered (STAGE 7/8)
if y is None and ("train_ids_ordered" in globals()):
    _ids = list(globals()["train_ids_ordered"])
    if len(_ids) == len(oof_prob):
        train_ids = _ids
        y = df_train_meta.loc[train_ids, "target"].to_numpy(dtype=np.int8, copy=False)

# Last fallback: df_train_meta order harus match length
if y is None:
    if len(oof_prob) != len(df_train_meta):
        raise RuntimeError(
            f"Tidak bisa align y. len(oof_prob)={len(oof_prob)} != len(df_train_meta)={len(df_train_meta)} "
            "dan tidak ada oof_prob.csv yang menyertakan object_id."
        )
    train_ids = df_train_meta.index.astype(str).tolist()
    y = df_train_meta["target"].to_numpy(dtype=np.int8, copy=False)

if len(y) != len(oof_prob):
    raise RuntimeError(f"Length mismatch: y={len(y)} vs oof_prob={len(oof_prob)}")

# ----------------------------
# 3) Metrics sesuai materi (TP/FP/FN -> P/R/F1)
# ----------------------------
def prf_from_pred(y_true, y_pred01):
    y_true = np.asarray(y_true, dtype=np.int32)
    y_pred01 = np.asarray(y_pred01, dtype=np.int32)

    tp = int(((y_true == 1) & (y_pred01 == 1)).sum())
    fp = int(((y_true == 0) & (y_pred01 == 1)).sum())
    fn = int(((y_true == 1) & (y_pred01 == 0)).sum())

    precision = tp / max(tp + fp, 1)
    recall    = tp / max(tp + fn, 1)
    f1 = 0.0 if (precision + recall) == 0 else (2.0 * precision * recall / (precision + recall))

    return {
        "tp": tp, "fp": fp, "fn": fn,
        "precision": float(precision),
        "recall": float(recall),
        "f1": float(f1),
        "pos_pred": int(y_pred01.sum()),
    }

def eval_at_threshold(prob, y_true, thr):
    pred = (prob >= float(thr)).astype(np.int8)
    return prf_from_pred(y_true, pred)

# Baseline thr=0.5
base = eval_at_threshold(oof_prob, y, 0.5)

# ----------------------------
# 4) Threshold sweep (opsional tapi sangat berguna untuk F1)
# ----------------------------
grid = np.concatenate([
    np.linspace(0.01, 0.10, 19),
    np.linspace(0.10, 0.90, 81),
    np.linspace(0.90, 0.99, 19),
]).astype(np.float32)

qs = np.linspace(0.01, 0.99, 99, dtype=np.float32)
quant_thr = np.quantile(oof_prob, qs).astype(np.float32)
thr_candidates = np.unique(np.clip(np.concatenate([grid, quant_thr]), 0.0, 1.0))

best = {"thr": 0.5, **base}
rows = []

for thr in thr_candidates:
    met = eval_at_threshold(oof_prob, y, thr)
    rows.append([float(thr), met["f1"], met["precision"], met["recall"], met["tp"], met["fp"], met["fn"], met["pos_pred"]])

    # pilih terbaik: F1 tertinggi, tie-break: recall lebih besar, lalu fp lebih kecil
    if (met["f1"] > best["f1"] + 1e-12) or (
        abs(met["f1"] - best["f1"]) <= 1e-12 and (met["recall"] > best["recall"] + 1e-12)
    ) or (
        abs(met["f1"] - best["f1"]) <= 1e-12 and abs(met["recall"] - best["recall"]) <= 1e-12 and (met["fp"] < best["fp"])
    ):
        best = {"thr": float(thr), **met}

thr_table = pd.DataFrame(rows, columns=["thr","f1","precision","recall","tp","fp","fn","pos_pred"])
thr_table = thr_table.sort_values(["f1","recall","precision"], ascending=[False, False, False]).reset_index(drop=True)

BEST_THR = float(best["thr"])

# ----------------------------
# 5) Print report
# ----------------------------
pos = int((y == 1).sum())
neg = int((y == 0).sum())
N = int(len(y))

print("EVALUATION (OOF) — F1 score metric")
print(f"- N={N:,} | pos={pos:,} | neg={neg:,} | pos%={pos/max(N,1)*100:.4f}%\n")

print("Baseline @ thr=0.5")
print(f"- F1={base['f1']:.6f} | P={base['precision']:.6f} | R={base['recall']:.6f} | tp={base['tp']} fp={base['fp']} fn={base['fn']} | pos_pred={base['pos_pred']}\n")

print(f"BEST @ thr={BEST_THR:.6f}")
print(f"- F1={best['f1']:.6f} | P={best['precision']:.6f} | R={best['recall']:.6f} | tp={best['tp']} fp={best['fp']} fn={best['fn']} | pos_pred={best['pos_pred']}\n")

print("Top 10 thresholds:")
for i in range(min(10, len(thr_table))):
    r = thr_table.iloc[i]
    print(f"{i+1:02d}. thr={r['thr']:.6f} | f1={r['f1']:.6f} | P={r['precision']:.6f} | R={r['recall']:.6f} | tp={int(r['tp'])} fp={int(r['fp'])} fn={int(r['fn'])} | pos_pred={int(r['pos_pred'])}")

# ----------------------------
# 6) Save artifacts (report + table)
# ----------------------------
out_txt = OOF_DIR / "eval_report.txt"
out_csv = OOF_DIR / "eval_threshold_table.csv"
out_json = OOF_DIR / "eval_summary.json"

lines = []
lines.append("OOF Evaluation Report (Precision/Recall/F1)")
lines.append(f"N={N} | pos={pos} | neg={neg} | pos%={pos/max(N,1)*100:.6f}%")
lines.append("")
lines.append("Baseline @ thr=0.5")
lines.append(f"F1={base['f1']:.8f} | P={base['precision']:.8f} | R={base['recall']:.8f} | tp={base['tp']} fp={base['fp']} fn={base['fn']} | pos_pred={base['pos_pred']}")
lines.append("")
lines.append(f"BEST @ thr={BEST_THR:.8f}")
lines.append(f"F1={best['f1']:.8f} | P={best['precision']:.8f} | R={best['recall']:.8f} | tp={best['tp']} fp={best['fp']} fn={best['fn']} | pos_pred={best['pos_pred']}")
lines.append("")
lines.append("Top 10 thresholds:")
for i in range(min(10, len(thr_table))):
    r = thr_table.iloc[i]
    lines.append(f"{i+1:02d}. thr={r['thr']:.8f} | f1={r['f1']:.8f} | P={r['precision']:.8f} | R={r['recall']:.8f} | tp={int(r['tp'])} fp={int(r['fp'])} fn={int(r['fn'])} | pos_pred={int(r['pos_pred'])}")

with open(out_txt, "w", encoding="utf-8") as f:
    f.write("\n".join(lines) + "\n")

thr_table.to_csv(out_csv, index=False)

payload = {
    "N": N, "pos": pos, "neg": neg,
    "baseline_thr_0p5": base,
    "best": {"thr": BEST_THR, **best},
}
with open(out_json, "w", encoding="utf-8") as f:
    json.dump(payload, f, indent=2)

print(f"\nSaved:")
print(f"- {out_txt}")
print(f"- {out_csv}")
print(f"- {out_json}")

# Export for next stages
globals().update({"BEST_THR": BEST_THR, "thr_table": thr_table})

gc.collect()


EVALUATION (OOF) — F1 score metric
- N=3,043 | pos=148 | neg=2,895 | pos%=4.8636%

Baseline @ thr=0.5
- F1=0.099182 | P=0.053650 | R=0.655405 | tp=97 fp=1711 fn=51 | pos_pred=1808

BEST @ thr=0.510000
- F1=0.101343 | P=0.055705 | R=0.560811 | tp=83 fp=1407 fn=65 | pos_pred=1490

Top 10 thresholds:
01. thr=0.510000 | f1=0.101343 | P=0.055705 | R=0.560811 | tp=83 fp=1407 fn=65 | pos_pred=1490
02. thr=0.509971 | f1=0.101281 | P=0.055667 | R=0.560811 | tp=83 fp=1408 fn=65 | pos_pred=1491
03. thr=0.496765 | f1=0.101229 | P=0.054584 | R=0.695946 | tp=103 fp=1784 fn=45 | pos_pred=1887
04. thr=0.512106 | f1=0.100775 | P=0.055714 | R=0.527027 | tp=78 fp=1322 fn=70 | pos_pred=1400
05. thr=0.510584 | f1=0.100684 | P=0.055441 | R=0.547297 | tp=81 fp=1380 fn=67 | pos_pred=1461
06. thr=0.511362 | f1=0.100127 | P=0.055245 | R=0.533784 | tp=79 fp=1351 fn=69 | pos_pred=1430
07. thr=0.539344 | f1=0.100106 | P=0.059418 | R=0.317568 | tp=47 fp=744 fn=101 | pos_pred=791
08. thr=0.498222 | f1=0.099800 | P=0

33

# Submission Build

In [14]:
# ============================================================
# STAGE 11 — Submission Build (ONE CELL, Kaggle CPU-SAFE)
#
# REVISI: output SESUAI instruksi kompetisi -> prediction HARUS 0/1 (BINARY)
# - header: object_id,prediction
# - file utama: /kaggle/working/submission.csv
# - tetap robust loader untuk test_ids + test_prob_ens
# ============================================================

import gc, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore", category=FutureWarning)

# ----------------------------
# 0) Require STAGE 0 globals
# ----------------------------
for need in ["PATHS", "SUB_DIR"]:
    if need not in globals():
        raise RuntimeError(f"Missing `{need}`. Jalankan STAGE 0 dulu (setup).")

sample_path = Path(PATHS["SAMPLE_SUB"])
if not sample_path.exists():
    raise FileNotFoundError(f"Missing sample_submission.csv: {sample_path}")

df_sub = pd.read_csv(sample_path)
if not {"object_id", "prediction"}.issubset(df_sub.columns):
    raise ValueError(f"sample_submission must have columns object_id,prediction. Found: {list(df_sub.columns)}")

# ----------------------------
# Helpers: robust loaders
# ----------------------------
def _as_1d_float32(arr):
    a = np.asarray(arr)
    if a.dtype == object and a.ndim == 0:
        try:
            a = np.asarray(a.item())
        except Exception:
            pass
    a = np.asarray(a, dtype=np.float32)
    if a.ndim == 0:
        return a
    if a.ndim > 1:
        a = a.reshape(-1)
    return a

def _load_test_prob():
    # 1) globals
    if "test_prob_ens" in globals() and globals()["test_prob_ens"] is not None:
        tp = _as_1d_float32(globals()["test_prob_ens"])
        if isinstance(tp, np.ndarray) and tp.ndim != 0:
            return tp

    # 2) artifact path var
    if "TEST_PROB_ENS_PATH" in globals() and globals()["TEST_PROB_ENS_PATH"] is not None:
        p = Path(globals()["TEST_PROB_ENS_PATH"])
        if p.exists():
            tp = _as_1d_float32(np.load(p, allow_pickle=False))
            if tp.ndim != 0:
                return tp

    # 3) default artifact location
    p = Path(globals().get("ART_DIR", "/kaggle/working")) / "test_prob_ens.npy"
    if p.exists():
        tp = _as_1d_float32(np.load(p, allow_pickle=False))
        if tp.ndim != 0:
            return tp

    # 4) csv fallback
    p = Path(globals().get("ART_DIR", "/kaggle/working")) / "test_prob_ens.csv"
    if p.exists():
        df = pd.read_csv(p)
        if "prob" in df.columns:
            tp = _as_1d_float32(df["prob"].to_numpy())
            if tp.ndim != 0:
                return tp

    raise RuntimeError("Missing test_prob_ens. Jalankan STAGE 10 dulu (Test Inference).")

def _load_test_ids():
    # 1) globals (must be non-None and iterable)
    if "test_ids" in globals() and globals()["test_ids"] is not None:
        try:
            ids = list(globals()["test_ids"])
            if len(ids) > 0:
                return [str(x).strip() for x in ids]
        except TypeError:
            pass

    # 2) fixed_seq cache
    fix_dir = Path(globals().get("FIX_DIR", "/kaggle/working/mallorn_run/artifacts/fixed_seq"))
    pids = fix_dir / "test_ids.npy"
    if pids.exists():
        ids = np.load(pids, allow_pickle=False).astype("S").astype(str).tolist()
        return [str(x).strip() for x in ids]

    # 3) from test_prob_ens.csv (if exists)
    p = Path(globals().get("ART_DIR", "/kaggle/working")) / "test_prob_ens.csv"
    if p.exists():
        df = pd.read_csv(p)
        if "object_id" in df.columns:
            return df["object_id"].astype(str).str.strip().tolist()

    raise RuntimeError("Missing test_ids. Pastikan STAGE 6 membuat test_ids.npy atau STAGE 10 export test_ids.")

# ----------------------------
# 1) Load predictions + ids
# ----------------------------
test_prob = _load_test_prob()
test_ids  = _load_test_ids()

if not isinstance(test_prob, np.ndarray) or test_prob.ndim == 0:
    raise TypeError(f"Invalid test_prob (scalar/unsized). type={type(test_prob)} ndim={getattr(test_prob,'ndim',None)}")

if len(test_prob) != len(test_ids):
    raise RuntimeError(f"Length mismatch: test_prob={len(test_prob)} vs test_ids={len(test_ids)}")

# Threshold (use BEST_THR if available; else 0.5)
thr = float(globals().get("BEST_THR", 0.5))

# ----------------------------
# 2) Build mapping object_id -> prob + strict checks
# ----------------------------
df_pred = pd.DataFrame({"object_id": test_ids, "prob": test_prob.astype(np.float32, copy=False)})

df_pred["object_id"] = df_pred["object_id"].astype(str).str.strip()
if df_pred["object_id"].duplicated().any():
    dup = df_pred.loc[df_pred["object_id"].duplicated(), "object_id"].iloc[:5].tolist()
    raise ValueError(f"Duplicated object_id in predictions (examples): {dup}")

p = df_pred["prob"].to_numpy(dtype=np.float32, copy=False)
if not np.isfinite(p).all():
    bad = int((~np.isfinite(p)).sum())
    raise ValueError(f"Found non-finite probabilities in test predictions: {bad} rows")
df_pred["prob"] = np.clip(p, 0.0, 1.0)

# ----------------------------
# 3) Align to sample_submission order + BUILD BINARY PREDICTION (0/1)
# ----------------------------
df_sub["object_id"] = df_sub["object_id"].astype(str).str.strip()

df_out = df_sub[["object_id"]].merge(df_pred, on="object_id", how="left")

if df_out["prob"].isna().any():
    missing_n = int(df_out["prob"].isna().sum())
    miss_ids = df_out.loc[df_out["prob"].isna(), "object_id"].iloc[:5].tolist()
    raise ValueError(
        f"Some sample_submission object_id have no prediction: {missing_n} missing. "
        f"Examples: {miss_ids}"
    )

# REQUIRED BY COMPETITION: binary 0/1
df_out["prediction"] = (df_out["prob"].to_numpy(dtype=np.float32) >= np.float32(thr)).astype(np.int8)
df_out = df_out[["object_id", "prediction"]]

# strict format checks
if df_out["object_id"].duplicated().any():
    raise ValueError("submission has duplicate object_id (unexpected).")
if len(df_out) != len(df_sub):
    raise RuntimeError("submission row count mismatch with sample_submission.")
if not set(np.unique(df_out["prediction"].to_numpy())).issubset({0, 1}):
    raise RuntimeError("submission prediction contains values outside {0,1} (unexpected).")

# ----------------------------
# 4) Write submission files
# ----------------------------
SUB_DIR = Path(SUB_DIR)
SUB_DIR.mkdir(parents=True, exist_ok=True)

out_main  = Path("/kaggle/working/submission.csv")
out_copy  = SUB_DIR / "submission.csv"
out_proba = SUB_DIR / "submission_proba.csv"  # debug only

df_out.to_csv(out_main, index=False)
df_out.to_csv(out_copy, index=False)

# optional debug proba (NOT for Kaggle submit)
df_dbg = df_sub[["object_id"]].merge(df_pred, on="object_id", how="left")
df_dbg = df_dbg.rename(columns={"prob": "prediction"})
df_dbg.to_csv(out_proba, index=False)

print("[Stage 11] SUBMISSION READY (BINARY 0/1)")
print(f"- threshold_used={thr:.6f}")
print(f"- wrote: {out_main}")
print(f"- copy : {out_copy}")
print(f"- debug proba (optional): {out_proba}")
print(f"- rows: {len(df_out):,}")

print("\nPreview:")
print(df_out.head(8).to_string(index=False))

globals().update({
    "SUBMISSION_PATH": out_main,
    "SUBMISSION_COPY_PATH": out_copy,
    "SUBMISSION_MODE": "binary",
    "SUBMISSION_THRESHOLD": thr,
})

gc.collect()


[Stage 11] SUBMISSION READY (BINARY 0/1)
- threshold_used=0.510000
- wrote: /kaggle/working/submission.csv
- copy : /kaggle/working/mallorn_run/submissions/submission.csv
- debug proba (optional): /kaggle/working/mallorn_run/submissions/submission_proba.csv
- rows: 7,135

Preview:
                   object_id  prediction
    Eluwaith_Mithrim_nothrim           0
          Eru_heledir_archam           0
           Gonhir_anann_fuin           0
Gwathuirim_haradrim_tegilbor           1
            achas_minai_maen           1
               adab_fae_gath           0
             adel_draug_gaur           1
     aderthad_cuil_galadhrim           1


32