# Set Paths & Select Config (CFG)

In [17]:
# ============================================================
# STAGE 0 — Set Paths & Select Config (CFG) (Kaggle-ready, offline)
# - Competition root: /kaggle/input/recodai-luc-scientific-image-forgery-detection
# - Output dataset root: /kaggle/input/recod-ailuc-dinov2-base  (or another name variant)
# - Auto-detects the folders if their names differ
# - Auto-selects the best CFG for MATCH + PRED based on train-feature coverage
#
# Output globals:
# - COMP_ROOT, OUT_DS_ROOT, OUT_ROOT
# - PATHS (dict of important paths)
# - MATCH_CFG_DIR, PRED_CFG_DIR, DINO_CFG_DIR (optional)
# ============================================================

import os, re, json
from pathlib import Path
import pandas as pd

# ----------------------------
# Helper: find competition root
# ----------------------------
def find_comp_root(preferred: str = "/kaggle/input/recodai-luc-scientific-image-forgery-detection") -> Path:
    """Locate the competition data directory.

    Args:
        preferred: Path tried first; returned as-is when it exists.

    Returns:
        ``Path(preferred)`` if it exists, otherwise the best-ranked folder
        under ``/kaggle/input`` that looks like the competition layout.

    Raises:
        FileNotFoundError: If ``/kaggle/input`` is absent or no folder
            matches the layout heuristic.
    """
    preferred_dir = Path(preferred)
    if preferred_dir.exists():
        return preferred_dir

    # Fallback: scan /kaggle/input for a directory that looks like the competition.
    input_root = Path("/kaggle/input")
    if not input_root.exists():
        raise FileNotFoundError("/kaggle/input not found (are you in Kaggle notebook?)")

    def _looks_like_competition(folder: Path) -> bool:
        # Heuristic: sample_submission.csv plus at least one image folder.
        if not folder.is_dir():
            return False
        if not (folder / "sample_submission.csv").exists():
            return False
        return any((folder / sub).exists() for sub in ("train_images", "test_images"))

    def _rank(folder: Path):
        # False sorts before True, so names containing the keywords win.
        lowered = folder.name.lower()
        return ("recodai" not in lowered, "forgery" not in lowered, folder.name)

    matches = [folder for folder in input_root.iterdir() if _looks_like_competition(folder)]
    if not matches:
        raise FileNotFoundError(
            "Competition root not found. Expected folder with sample_submission.csv and train_images/test_images."
        )
    return min(matches, key=_rank)

# ----------------------------
# Helper: find output dataset root
# ----------------------------
def find_output_dataset_root(preferred_names=(
    "recod-ailuc-dinov2-base",
    "recod-ai-luc-dinov2-base",
    "recodai-luc-dinov2-base",
    "recodai-luc-dinov2",
), base_dir="/kaggle/input") -> Path:
    """Locate the dataset folder that holds the pre-computed ``recodai_luc`` outputs.

    Args:
        preferred_names: Folder names tried first, in order, directly under
            ``base_dir``.
        base_dir: Directory that contains the mounted datasets. Defaults to
            the Kaggle input root, so existing zero-argument calls behave as
            before.

    Returns:
        The first existing preferred folder; otherwise the best-ranked folder
        containing ``recodai_luc/artifacts`` directly or one level down.

    Raises:
        FileNotFoundError: If ``base_dir`` does not exist or no folder matches.
    """
    base = Path(base_dir)
    # Fail with a clear message (consistent with find_comp_root) instead of
    # letting iterdir() raise on a missing directory.
    if not base.is_dir():
        raise FileNotFoundError(f"{base} not found (are you in Kaggle notebook?)")

    # Try preferred direct hits first.
    for nm in preferred_names:
        p = base / nm
        if p.exists():
            return p

    def _has_artifacts(d: Path) -> bool:
        # Either directly has recodai_luc/artifacts, or wrapped in one extra folder.
        if (d / "recodai_luc" / "artifacts").exists():
            return True
        return any(d.glob("*/recodai_luc/artifacts"))

    # Fallback: scan for a dataset that contains recodai_luc/artifacts.
    cands = [d for d in base.iterdir() if d.is_dir() and _has_artifacts(d)]
    if not cands:
        raise FileNotFoundError(
            "Output dataset root not found. Expected something like /kaggle/input/<...>/recodai_luc/artifacts/"
        )
    # Prefer folders with 'dinov2' in the name; tie-break alphabetically.
    return min(cands, key=lambda x: (("dinov2" not in x.name.lower()), x.name))

# ----------------------------
# Helper: resolve OUT_ROOT = <dataset>/recodai_luc
# ----------------------------
def resolve_out_root(out_ds_root: Path) -> Path:
    """Return the ``recodai_luc`` folder inside an output dataset.

    Args:
        out_ds_root: Dataset root to search.

    Returns:
        ``out_ds_root / "recodai_luc"`` when present, otherwise the
        alphabetically first ``*/recodai_luc`` one level down.

    Raises:
        FileNotFoundError: If neither location exists.
    """
    direct = out_ds_root / "recodai_luc"
    if direct.exists():
        return direct
    # glob() yields entries in arbitrary filesystem order; sort so the choice
    # is reproducible when more than one wrapper folder matches.
    hits = sorted(out_ds_root.glob("*/recodai_luc"))
    if hits:
        return hits[0]
    raise FileNotFoundError(f"Could not locate recodai_luc folder under {out_ds_root}")

# ----------------------------
# Helper: pick best cfg directory by coverage of train feature csv
# ----------------------------
def pick_best_cfg(cache_root: Path, prefix: str, feat_train_pattern: str) -> Path:
    """Pick the cfg directory whose train-feature CSV has the most rows.

    Args:
        cache_root: Directory containing the cfg sub-directories.
        prefix: Required directory-name prefix, e.g. 'match_base_cfg_' or
            'pred_base'.
        feat_train_pattern: Glob pattern for the feature CSV inside a cfg dir.

    Returns:
        The candidate directory with the highest row count; ties are broken
        by directory name (ascending).

    Raises:
        FileNotFoundError: If no directory matches the prefix and pattern.
    """
    def _count_rows(csv_path: Path) -> int:
        # Line count minus the header. The context manager closes the handle
        # deterministically instead of leaving it to the garbage collector.
        try:
            with open(csv_path, "r", encoding="utf-8", errors="ignore") as fh:
                return sum(1 for _ in fh) - 1
        except OSError:
            # Unreadable file: keep it as a candidate but rank it last.
            return -1

    cands = []
    for d in cache_root.iterdir():
        if not d.is_dir() or not d.name.startswith(prefix):
            continue
        # Sort so the chosen file is deterministic when the pattern matches several.
        feat_files = sorted(d.glob(feat_train_pattern))
        if not feat_files:
            continue
        cands.append((_count_rows(feat_files[0]), d))

    if not cands:
        raise FileNotFoundError(f"No cfg folders found under {cache_root} with prefix={prefix} and {feat_train_pattern}")

    # Most rows first; tie-break by name.
    _, best_dir = min(cands, key=lambda x: (-x[0], x[1].name))
    return best_dir

# ----------------------------
# 0) Locate roots
# ----------------------------
COMP_ROOT = find_comp_root("/kaggle/input/recodai-luc-scientific-image-forgery-detection")
OUT_DS_ROOT = find_output_dataset_root()

OUT_ROOT = resolve_out_root(OUT_DS_ROOT)  # .../recodai_luc
ART_DIR, CACHE_DIR = OUT_ROOT / "artifacts", OUT_ROOT / "cache"

# Single registry of every path later cells read; values are plain strings.
PATHS = {
    # ---- 1) Competition paths (raw images/masks) ----
    "COMP_ROOT": str(COMP_ROOT),
    "SAMPLE_SUB": str(COMP_ROOT / "sample_submission.csv"),
    # common competition layout
    "TRAIN_IMAGES": str(COMP_ROOT / "train_images"),
    "TEST_IMAGES": str(COMP_ROOT / "test_images"),
    "TRAIN_MASKS": str(COMP_ROOT / "train_masks"),
    "SUPP_IMAGES": str(COMP_ROOT / "supplemental_images"),
    "SUPP_MASKS": str(COMP_ROOT / "supplemental_masks"),
    # optional subfolders inside train_images
    "TRAIN_AUTH_DIR": str(COMP_ROOT / "train_images" / "authentic"),
    "TRAIN_FORG_DIR": str(COMP_ROOT / "train_images" / "forged"),
    # ---- 2) Output dataset paths (clean artifacts + cache) ----
    "OUT_DS_ROOT": str(OUT_DS_ROOT),
    "OUT_ROOT": str(OUT_ROOT),
    "ART_DIR": str(ART_DIR),
    "CACHE_DIR": str(CACHE_DIR),
    # artifacts (train tables / folds / profiles)
    "DF_TRAIN_ALL": str(ART_DIR / "df_train_all.parquet"),
    "DF_TRAIN_CLS": str(ART_DIR / "df_train_cls.parquet"),
    "DF_TRAIN_SEG": str(ART_DIR / "df_train_seg.parquet"),
    "DF_TEST": str(ART_DIR / "df_test.parquet"),
    "CV_CASE_FOLDS": str(ART_DIR / "cv_case_folds.csv"),
    "CV_SAMPLE_FOLDS": str(ART_DIR / "cv_sample_folds.csv"),
    "IMG_PROFILE_TRAIN": str(ART_DIR / "image_profile_train.parquet"),
    "IMG_PROFILE_TEST": str(ART_DIR / "image_profile_test.parquet"),
    "MASK_PROFILE": str(ART_DIR / "mask_profile.parquet"),
    "CASE_SUMMARY": str(ART_DIR / "case_summary.parquet"),
}

# ----------------------------
# 3) Select best MATCH/PRED CFG dirs automatically
# ----------------------------
if not CACHE_DIR.exists():
    raise FileNotFoundError(f"CACHE_DIR not found: {CACHE_DIR}")

# Match cfg dirs look like: match_base_cfg_<hash>
MATCH_CFG_DIR = pick_best_cfg(
    CACHE_DIR, prefix="match_base_cfg_", feat_train_pattern="match_features_train_all.csv"
)
# Pred cfg dirs look like: pred_base_v3_v7_cfg_<hash>; the name may vary, so
# match on the 'pred_base' prefix and require pred_features_train_all.csv.
PRED_CFG_DIR = pick_best_cfg(
    CACHE_DIR, prefix="pred_base", feat_train_pattern="pred_features_train_all.csv"
)

# DINO cache cfg (optional): alphabetically first cfg_* dir that has a train manifest.
dino_root = CACHE_DIR / "dino_v2_large"
DINO_CFG_DIR = None
if dino_root.exists():
    dino_cands = sorted(
        (
            d
            for d in dino_root.iterdir()
            if d.is_dir() and d.name.startswith("cfg_") and (d / "manifest_train_all.csv").exists()
        ),
        key=lambda x: x.name,
    )
    if dino_cands:
        DINO_CFG_DIR = dino_cands[0]

# Attach cfg dirs and feature file paths.
PATHS.update({
    "MATCH_CFG_DIR": str(MATCH_CFG_DIR),
    "PRED_CFG_DIR": str(PRED_CFG_DIR),
    "DINO_CFG_DIR": str(DINO_CFG_DIR) if DINO_CFG_DIR else "",
    "MATCH_FEAT_TRAIN": str(MATCH_CFG_DIR / "match_features_train_all.csv"),
    "MATCH_FEAT_TEST": str(MATCH_CFG_DIR / "match_features_test.csv"),
    "PRED_FEAT_TRAIN": str(PRED_CFG_DIR / "pred_features_train_all.csv"),
    "PRED_FEAT_TEST": str(PRED_CFG_DIR / "pred_features_test.csv"),
})

# ----------------------------
# 4) Sanity checks (no hard fail for optional files)
# ----------------------------
# Files the later cells cannot work without: (display name, PATHS key).
must_exist = [
    (display_name, PATHS[path_key])
    for display_name, path_key in (
        ("sample_submission.csv", "SAMPLE_SUB"),
        ("df_train_all.parquet", "DF_TRAIN_ALL"),
        ("cv_case_folds.csv", "CV_CASE_FOLDS"),
        ("match_features_train_all.csv", "MATCH_FEAT_TRAIN"),
        ("pred_features_train_all.csv", "PRED_FEAT_TRAIN"),
    )
]
missing = [name for name, p in must_exist if not Path(p).exists()]
if missing:
    raise FileNotFoundError("Missing required files: " + ", ".join(missing))

print("OK — Roots")
for root_label, root_value in (
    ("COMP_ROOT   ", COMP_ROOT),
    ("OUT_DS_ROOT ", OUT_DS_ROOT),
    ("OUT_ROOT    ", OUT_ROOT),
):
    print(f"  {root_label}:", root_value)

print("\nOK — Selected CFG")
dino_label = DINO_CFG_DIR.name if DINO_CFG_DIR else "(not found / optional)"
print("  MATCH_CFG_DIR:", MATCH_CFG_DIR.name)
print("  PRED_CFG_DIR :", PRED_CFG_DIR.name)
print("  DINO_CFG_DIR :", dino_label)

print("\nOK — Key files")
for k in ["DF_TRAIN_ALL","CV_CASE_FOLDS","MATCH_FEAT_TRAIN","PRED_FEAT_TRAIN","IMG_PROFILE_TRAIN"]:
    p = Path(PATHS[k])
    status = "(exists)" if p.exists() else "(missing/optional)"
    print(f"  {k:16s}: {p}  {status}")


OK — Roots
  COMP_ROOT   : /kaggle/input/recodai-luc-scientific-image-forgery-detection
  OUT_DS_ROOT : /kaggle/input/recod-ailuc-dinov2-base
  OUT_ROOT    : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc

OK — Selected CFG
  MATCH_CFG_DIR: match_base_cfg_f9f7ea3a65c5
  PRED_CFG_DIR : pred_base_v3_v7_cfg_5dbf0aa165
  DINO_CFG_DIR : cfg_3246fd54aab0

OK — Key files
  DF_TRAIN_ALL    : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/artifacts/df_train_all.parquet  (exists)
  CV_CASE_FOLDS   : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/artifacts/cv_case_folds.csv  (exists)
  MATCH_FEAT_TRAIN: /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cache/match_base_cfg_f9f7ea3a65c5/match_features_train_all.csv  (exists)
  PRED_FEAT_TRAIN : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cache/pred_base_v3_v7_cfg_5dbf0aa165/pred_features_train_all.csv  (exists)
  IMG_PROFILE_TRAIN: /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/artifacts/image_profile_train.parquet  (exists)


# Build Training Table (X, y, folds)

In [18]:
# ============================================================
# STEP 2 — Build Training Table (X, y, folds) — FULL REVISION (more "powerful" for the Transformer)
# - Focus: prepare df_train_tabular + (X_train, y_train, folds, FEATURE_COLS)
# - Main sources: pred_features + (optional) match_features + (optional) image_profile
# - Split: uses cv_case_folds.csv (anti-leakage, by case_id)
# - No submission is produced here
#
# Main upgrades (compared with the previous version):
# - Richer feature engineering (log/clip for many heavy-tailed columns)
# - Interaction features (carry composition information: sim*count, area*sim, density, etc.)
# - Optional quantile winsorization (reduces extreme outliers, more stable for the Transformer)
# - Saves the feature schema + clip caps so the run is reproducible
#
# Notes:
# - DINOv2 Large model path (offline): /kaggle/input/dinov2/pytorch/large/1
#   (This step only checks that the path exists; no DINO extraction is done here.)
# ============================================================

import os, json, math, gc, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

# ----------------------------
# 0) Require PATHS
# ----------------------------
if not isinstance(globals().get("PATHS"), dict):
    raise RuntimeError("Missing PATHS. Jalankan dulu STAGE 0 — Set Paths & Select Config (CFG).")

# ----------------------------
# 1) Check DINOv2 Large local path (offline)
# ----------------------------
DINO_LARGE_DIR = Path("/kaggle/input/dinov2/pytorch/large/1")
if not DINO_LARGE_DIR.exists():
    raise FileNotFoundError(f"DINOv2-Large path not found: {DINO_LARGE_DIR}")
PATHS["DINO_LARGE_DIR"] = str(DINO_LARGE_DIR)

# ----------------------------
# 2) Feature Engineering Config (flexible)
# ----------------------------
FE_CFG = dict(
    # feature sources
    use_match_features=True,
    use_image_profile=True,
    # engineering
    add_log_features=True,
    add_interactions=True,
    # outlier control
    clip_by_quantile=True,   # quantile winsorization computed on train-all (no labels used)
    clip_q=0.999,            # 0.999 ~ cap at p99.9
    clip_max_fallback=1e9,   # fallback when the quantile is not valid
    # fill
    fillna_value=0.0,
    # dtype
    cast_float32=True,
)

# ----------------------------
# 3) Prefer WORKING features if exist (because you may have re-generated there)
# ----------------------------
def prefer_working(input_path: str, working_candidate: str | None = None) -> Path:
    p_in = Path(input_path)
    if working_candidate is not None:
        p_w = Path(working_candidate)
        if p_w.exists():
            return p_w
    return p_in

match_cfg_name = Path(PATHS["MATCH_CFG_DIR"]).name
pred_cfg_name  = Path(PATHS["PRED_CFG_DIR"]).name

# Features re-generated in this session live under /kaggle/working and take
# precedence over the read-only dataset copies.
WORK_ROOT = Path("/kaggle/working/recodai_luc/cache")
match_feat_work = WORK_ROOT / match_cfg_name / "match_features_train_all.csv"
pred_feat_work  = WORK_ROOT / pred_cfg_name  / "pred_features_train_all.csv"

PRED_FEAT_TRAIN  = prefer_working(PATHS["PRED_FEAT_TRAIN"],  str(pred_feat_work))
MATCH_FEAT_TRAIN = prefer_working(PATHS["MATCH_FEAT_TRAIN"], str(match_feat_work))

DF_TRAIN_ALL     = Path(PATHS["DF_TRAIN_ALL"])
CV_CASE_FOLDS    = Path(PATHS["CV_CASE_FOLDS"])

# Path("") normalizes to ".", which always exists, so a missing or empty
# PATHS entry must be detected on the raw string, not via Path.exists().
_img_profile_raw = PATHS.get("IMG_PROFILE_TRAIN") or ""
IMG_PROFILE_TRAIN = Path(_img_profile_raw)
_has_match_feat = MATCH_FEAT_TRAIN.exists()
_has_img_profile = bool(_img_profile_raw) and IMG_PROFILE_TRAIN.exists()

for need_name, need_path in [
    ("df_train_all.parquet", DF_TRAIN_ALL),
    ("cv_case_folds.csv", CV_CASE_FOLDS),
    ("pred_features_train_all.csv", PRED_FEAT_TRAIN),
]:
    if not need_path.exists():
        raise FileNotFoundError(f"Missing required file: {need_name} -> {need_path}")

print("Using:")
print("  DF_TRAIN_ALL     :", DF_TRAIN_ALL)
print("  CV_CASE_FOLDS    :", CV_CASE_FOLDS)
print("  PRED_FEAT_TRAIN  :", PRED_FEAT_TRAIN)
print("  MATCH_FEAT_TRAIN :", MATCH_FEAT_TRAIN, "(optional)" if _has_match_feat else "(missing/skip)")
print("  IMG_PROFILE_TRAIN:", IMG_PROFILE_TRAIN, "(optional)" if _has_img_profile else "(missing/skip)")
print("  DINO_LARGE_DIR   :", DINO_LARGE_DIR)

# ----------------------------
# 4) Load minimal inputs
# ----------------------------
df_base = pd.read_parquet(DF_TRAIN_ALL)
df_cv   = pd.read_csv(CV_CASE_FOLDS)
df_pred = pd.read_csv(PRED_FEAT_TRAIN)

# Optional inputs stay best-effort (a failed read does not abort the run),
# but the failure is reported so a silently smaller feature set is noticed.
df_match = None
if FE_CFG["use_match_features"] and _has_match_feat:
    try:
        df_match = pd.read_csv(MATCH_FEAT_TRAIN)
    except Exception as exc:
        df_match = None
        print(f"WARNING: could not read {MATCH_FEAT_TRAIN} ({type(exc).__name__}: {exc}); skipping match features")

df_prof = None
if FE_CFG["use_image_profile"] and _has_img_profile:
    try:
        df_prof = pd.read_parquet(IMG_PROFILE_TRAIN)
    except Exception as exc:
        df_prof = None
        print(f"WARNING: could not read {IMG_PROFILE_TRAIN} ({type(exc).__name__}: {exc}); skipping image profile")

# ----------------------------
# 5) Normalize keys: uid/sample_id, case_id, variant
# ----------------------------
def ensure_uid_case_variant(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with normalized ``uid``, ``case_id`` and ``variant`` columns.

    ``uid`` is taken from the first of ``sample_id`` / ``id`` / ``key`` when
    absent. A missing ``case_id`` is parsed from the leading digits of
    ``uid``; a missing ``variant`` is the text after the last ``__`` (or
    ``_``) separator, falling back to ``"unk"``.

    Args:
        df: Input table; it is not modified.

    Returns:
        A new DataFrame where ``uid`` and ``variant`` are ``str`` and
        ``case_id`` is ``int``.

    Raises:
        ValueError: If no uid-like column exists, or if ``case_id`` cannot be
            determined as an integer for some rows.
    """
    df = df.copy()
    if "uid" not in df.columns:
        alt = next((c for c in ("sample_id", "id", "key") if c in df.columns), None)
        if alt is None:
            raise ValueError("Cannot find uid/sample_id column. Expected 'uid' or 'sample_id'.")
        df = df.rename(columns={alt: "uid"})

    uid = df["uid"].astype(str)
    if "case_id" not in df.columns:
        # to_numeric yields NaN for uids without a numeric prefix; those rows
        # are reported explicitly below instead of failing inside astype(int).
        df["case_id"] = pd.to_numeric(uid.str.extract(r"^(\d+)")[0], errors="coerce")
    if "variant" not in df.columns:
        v = uid.str.extract(r"__(\w+)$")[0]
        v2 = uid.str.extract(r"_(\w+)$")[0]
        df["variant"] = v.fillna(v2).fillna("unk")

    unparsable = pd.to_numeric(df["case_id"], errors="coerce").isna()
    if unparsable.any():
        examples = uid[unparsable].head(5).tolist()
        raise ValueError(
            f"Cannot determine integer case_id for {int(unparsable.sum())} row(s); "
            f"example uid(s): {examples}"
        )

    df["case_id"] = df["case_id"].astype(int)
    df["variant"] = df["variant"].astype(str)
    df["uid"] = uid
    return df

df_pred = ensure_uid_case_variant(df_pred)

# Give the base table a uid column when it can be derived.
df_base2 = df_base.copy()
if "uid" not in df_base2.columns:
    if "sample_id" in df_base2.columns:
        df_base2 = df_base2.rename(columns={"sample_id": "uid"})
    elif {"case_id", "variant"}.issubset(df_base2.columns):
        df_base2["uid"] = df_base2["case_id"].astype(str) + "__" + df_base2["variant"].astype(str)

# label: first known label column in the base table, else y_forged from pred features
label_col = next(
    (cand for cand in ["y_forged", "has_mask", "is_forged", "forged"] if cand in df_base2.columns),
    None,
)
if label_col is None:
    if "y_forged" in df_pred.columns:
        label_col = "y_forged"
    else:
        raise ValueError("Cannot find label column in df_train_all/pred_features (y_forged/has_mask/is_forged/forged).")

# folds
if not {"case_id", "fold"}.issubset(df_cv.columns):
    raise ValueError("cv_case_folds.csv must contain columns: case_id, fold")
for _int_col in ("case_id", "fold"):
    df_cv[_int_col] = df_cv[_int_col].astype(int)

# ----------------------------
# 6) Merge: start from df_pred (1 row per uid)
# ----------------------------
df_train = df_pred.copy()
# The table must stay at one row per uid; remembered to detect merges that
# silently multiply rows because of duplicate keys on the right-hand side.
_n_rows_expected = len(df_train)

if "y_forged" in df_train.columns:
    df_train["y"] = df_train["y_forged"]
elif "uid" in df_base2.columns:
    df_train = df_train.merge(df_base2[["uid", label_col]].rename(columns={label_col: "y"}), on="uid", how="left")
elif {"case_id","variant",label_col}.issubset(df_base2.columns):
    df_train = df_train.merge(
        df_base2[["case_id","variant",label_col]].rename(columns={label_col:"y"}),
        on=["case_id","variant"], how="left"
    )
else:
    raise ValueError("Could not merge label from df_train_all (missing uid or case_id+variant).")

# A left merge leaves NaN for unmatched rows; report that explicitly instead
# of letting astype(int) fail with a generic "cannot convert" error.
_n_missing_y = int(df_train["y"].isna().sum())
if _n_missing_y:
    raise ValueError(f"Missing label after building y: {_n_missing_y} rows")
df_train["y"] = df_train["y"].astype(int)

df_train = df_train.drop(columns=["fold"], errors="ignore").merge(df_cv[["case_id","fold"]], on="case_id", how="left")
if df_train["fold"].isna().any():
    raise ValueError(f"Missing fold after merging cv_case_folds.csv: {int(df_train['fold'].isna().sum())} rows")
df_train["fold"] = df_train["fold"].astype(int)

if len(df_train) != _n_rows_expected:
    raise ValueError(
        f"Row count changed from {_n_rows_expected} to {len(df_train)} while merging labels/folds; "
        "check df_train_all and cv_case_folds.csv for duplicate keys."
    )

# optional: merge match_features (bring only new cols)
if df_match is not None:
    df_match = ensure_uid_case_variant(df_match)
    base_cols = set(df_train.columns)
    new_cols = [c for c in df_match.columns if c not in base_cols]
    keep_cols = ["uid"] + [c for c in new_cols if c not in ["case_id","variant"]]
    if len(keep_cols) > 1:
        df_train = df_train.merge(df_match[keep_cols], on="uid", how="left")

# optional: merge image profile by case_id
if df_prof is not None and "case_id" in df_prof.columns:
    df_prof2 = df_prof.copy()
    df_prof2["case_id"] = df_prof2["case_id"].astype(int)
    df_prof2 = df_prof2.drop_duplicates("case_id")
    # Drop label/split columns before the clash rename. Otherwise a clashing
    # "y" or "fold" would come back as "profile_y" / "profile_fold", a name
    # the exact-name target filter does not recognise, and leak into the
    # numeric feature set.
    _non_feature_cols = {"y", "y_forged", "has_mask", "is_forged", "forged", "fold"}
    df_prof2 = df_prof2.drop(columns=[c for c in df_prof2.columns if c in _non_feature_cols])
    clash = set(df_prof2.columns).intersection(df_train.columns)
    clash -= {"case_id"}
    if clash:
        df_prof2 = df_prof2.rename(columns={c: f"profile_{c}" for c in clash})
    df_train = df_train.merge(df_prof2, on="case_id", how="left")

# ----------------------------
# 7) Feature engineering (lebih kaya + stabil untuk Transformer)
# ----------------------------
def safe_log1p(arr):
    """Compute ``log1p`` after zeroing non-finite and negative entries.

    Args:
        arr: Array-like of numbers.

    Returns:
        float64 result of ``log1p`` where NaN, +/-inf and negative inputs
        are treated as 0.
    """
    values = np.asarray(arr, dtype=np.float64)
    # Anything that is not a finite positive number contributes log1p(0) == 0.
    usable = np.isfinite(values) & (values > 0.0)
    return np.log1p(np.where(usable, values, 0.0))

def get_clip_cap(series: pd.Series, q: float, fallback: float):
    """Return the ``q``-quantile of the finite, non-negative values of ``series``.

    Args:
        series: Values to summarize; non-numeric entries are coerced to NaN.
        q: Quantile in [0, 1].
        fallback: Returned when no usable value exists or the quantile is
            not a finite positive number.

    Returns:
        The quantile as ``float``, or ``fallback``.
    """
    numeric = pd.to_numeric(series, errors="coerce").astype(float)
    finite = numeric[np.isfinite(numeric)]
    non_negative = finite[finite >= 0]
    if non_negative.empty:
        return fallback
    cap = float(non_negative.quantile(q))
    return cap if (np.isfinite(cap) and cap > 0) else fallback

# Heavy-tailed columns that usually appear in the pred/match feature tables.
HEAVY_COLS = [
    "peak_ratio",
    "best_weight",
    "best_count",
    "n_pairs_thr",
    "n_pairs_mnn",
    "n_pairs",            # if present
    "n_comp",
    "largest_comp",
    "grid_area_frac",
    "mask_area_frac",     # if present
    "pred_area_frac",     # if present
]
_heavy_present = [c for c in HEAVY_COLS if c in df_train.columns]


def _non_negative_numeric(column_name):
    # Coerce to float, treat unparsable/missing entries as 0.
    return pd.to_numeric(df_train[column_name], errors="coerce").fillna(0.0).astype(float)


# Per-column winsorization caps (upper quantile of the non-negative values).
clip_caps = {}
if FE_CFG["clip_by_quantile"]:
    clip_caps = {
        c: get_clip_cap(df_train[c], FE_CFG["clip_q"], FE_CFG["clip_max_fallback"])
        for c in _heavy_present
    }

# log + cap features
if FE_CFG["add_log_features"]:
    for c in _heavy_present:
        cap = clip_caps.get(c, FE_CFG["clip_max_fallback"])
        x = np.clip(_non_negative_numeric(c), 0.0, cap)
        df_train[f"{c}_cap"] = x.astype(np.float32)
        df_train[f"log_{c}"] = safe_log1p(x).astype(np.float32)

# Special handling for peak_ratio & best_weight (very large sentinel values):
# make sure their cap/log variants exist even when the loop above did not run.
for c, cap in (("peak_ratio", 1e6), ("best_weight", 1e9)):
    if c in df_train.columns and f"log_{c}" not in df_train.columns:
        x = np.clip(_non_negative_numeric(c), 0.0, cap)
        df_train[f"{c}_cap"] = x.astype(np.float32)
        df_train[f"log_{c}"] = safe_log1p(x).astype(np.float32)

# Interaction features (add extra "information" for the model).
# Column creation order below determines the order of the derived features in
# the final feature list, so statements are kept in their original sequence.
if FE_CFG["add_interactions"]:
    # guard: fetch a column as a clean float array
    def getf(col, default=0.0):
        """Return df_train[col] as a float array, or a constant array if the column is absent.

        Non-numeric and missing entries are replaced by ``default``.
        """
        if col in df_train.columns:
            return pd.to_numeric(df_train[col], errors="coerce").fillna(default).astype(float).values
        return np.full(len(df_train), default, dtype=np.float64)

    best_mean_sim = getf("best_mean_sim", 0.0)
    best_count    = getf("best_count", 0.0)
    grid_area     = getf("grid_area_frac", 0.0)
    has_peak      = getf("has_peak", 0.0)
    n_comp        = getf("n_comp", 0.0)
    largest_comp  = getf("largest_comp", 0.0)

    # common pairwise products
    df_train["sim_x_count"] = (best_mean_sim * best_count).astype(np.float32)
    df_train["area_x_sim"]  = (grid_area * best_mean_sim).astype(np.float32)
    df_train["area_x_count"] = (grid_area * best_count).astype(np.float32)

    # density/fragmentation
    df_train["comp_density"] = (largest_comp / (1.0 + n_comp)).astype(np.float32)   # larger -> one dominant component
    df_train["comp_inv"]     = (1.0 / (1.0 + n_comp)).astype(np.float32)            # fragmentation penalty

    # peak gating: log peak ratio only counts where has_peak is non-zero
    if "log_peak_ratio" in df_train.columns:
        df_train["has_peak_x_logpeak"] = (has_peak * getf("log_peak_ratio", 0.0)).astype(np.float32)
    else:
        # no log_peak_ratio column: emit an all-zero feature so the column exists either way
        df_train["has_peak_x_logpeak"] = (has_peak * 0.0).astype(np.float32)

    # n_pairs_thr / n_pairs_mnn ratio (absent columns are treated as zeros by getf)
    n_pairs_thr = getf("n_pairs_thr", 0.0)
    n_pairs_mnn = getf("n_pairs_mnn", 0.0)
    df_train["mnn_ratio"] = (n_pairs_mnn / (1.0 + n_pairs_thr)).astype(np.float32)

# Replace inf with NaN for numeric
# Turn +/-inf into NaN in every numeric column so the fill step below handles them.
for c in df_train.columns:
    if pd.api.types.is_numeric_dtype(df_train[c]):
        df_train[c] = df_train[c].replace([np.inf, -np.inf], np.nan)

# ----------------------------
# 8) Select feature columns (numeric only, exclude identifiers/labels)
# ----------------------------
# NOTE(review): the exclusion below matches exact column names only; a label
# or fold column that arrives under a different name would be kept as a feature.
TARGET_COLS = {"y", "y_forged", "has_mask", "is_forged", "forged"}
SPLIT_COLS  = {"fold"}

num_cols = [c for c in df_train.columns if pd.api.types.is_numeric_dtype(df_train[c])]
feature_cols = [c for c in num_cols if c not in TARGET_COLS and c not in SPLIT_COLS and c not in ["case_id"]]

# fill NaN
df_train[feature_cols] = df_train[feature_cols].fillna(FE_CFG["fillna_value"])

# cast float32
if FE_CFG["cast_float32"]:
    df_train[feature_cols] = df_train[feature_cols].astype(np.float32)

# ----------------------------
# 9) Final outputs
# ----------------------------
# Identifier/label columns first, then the features in df_train column order.
df_train_tabular = df_train[["uid","case_id","variant","fold","y"] + feature_cols].copy()
X_train = df_train_tabular[feature_cols]
y_train = df_train_tabular["y"].astype(int)
folds   = df_train_tabular["fold"].astype(int)

print("\nOK — Training table built")
print("  df_train_tabular:", df_train_tabular.shape)
print("  X_train:", X_train.shape, "| y pos%:", float(y_train.mean())*100.0)
print("  folds:", folds.nunique(), "unique folds")
print("  feature_cols:", len(feature_cols))

# quick sanity
# NOTE(review): y_train and folds were already cast with astype(int) above, so
# the NaN checks here act as a last-line guard rather than the first detection point.
if X_train.shape[0] != y_train.shape[0]:
    raise RuntimeError("X_train and y_train row mismatch")
if y_train.isna().any():
    raise RuntimeError("y_train contains NaN")
if folds.isna().any():
    raise RuntimeError("folds contains NaN")

# Published name used by the training cell.
FEATURE_COLS = feature_cols
print("\nFeature head:", FEATURE_COLS[:20])
print("Feature tail:", FEATURE_COLS[-10:])

# ----------------------------
# 10) Save reproducible schema (feature list + FE config + clip caps)
# ----------------------------
OUT_ART = Path("/kaggle/working/recodai_luc_gate_artifacts")
OUT_ART.mkdir(parents=True, exist_ok=True)

# Ordered feature list.
with open(OUT_ART / "feature_cols.json", "w") as f:
    json.dump(FEATURE_COLS, f, indent=2)

# Feature-engineering settings and the clip caps computed on this training table.
schema = {
    "fe_cfg": FE_CFG,
    "clip_caps": clip_caps,
    "n_features": int(len(FEATURE_COLS)),
    "example_feature_head": FEATURE_COLS[:25],
}
with open(OUT_ART / "feature_schema.json", "w") as f:
    json.dump(schema, f, indent=2)

print(f"\nSaved -> {OUT_ART/'feature_cols.json'}")
print(f"Saved -> {OUT_ART/'feature_schema.json'}")


Using:
  DF_TRAIN_ALL     : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/artifacts/df_train_all.parquet
  CV_CASE_FOLDS    : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/artifacts/cv_case_folds.csv
  PRED_FEAT_TRAIN  : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cache/pred_base_v3_v7_cfg_5dbf0aa165/pred_features_train_all.csv
  MATCH_FEAT_TRAIN : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cache/match_base_cfg_f9f7ea3a65c5/match_features_train_all.csv (optional)
  IMG_PROFILE_TRAIN: /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/artifacts/image_profile_train.parquet (optional)
  DINO_LARGE_DIR   : /kaggle/input/dinov2/pytorch/large/1

OK — Training table built
  df_train_tabular: (5176, 71)
  X_train: (5176, 66) | y pos%: 54.07650695517774
  folds: 5 unique folds
  feature_cols: 66

Feature head: ['feat_exists', 'match_exists', 'has_peak', 'peak_ratio', 'best_weight', 'best_count', 'best_mean_sim', 'n_pairs_thr', 'n_pairs_mnn', 'best_inlier_ratio', 'best_weight

# Train Baseline Model (Leakage-Safe CV)

In [None]:
# ============================================================
# Step 3 — Train Baseline Model (Leakage-Safe CV) — TRANSFORMER BASELINE (FULL REVISION v2)
# - Baseline: Tabular Transformer (FT-Transformer style for numeric features)
# - CV: uses the `fold` column (by case_id)
# - Output:
#   * OOF probabilities
#   * CV report (AUC, F1, Precision, Recall, LogLoss)
#   * Saves one model per fold + model_full (torch .pt bundle: state_dict + scaler)
#
# MAIN FIX:
# - predict_proba() handles batches given as (xb, yb) / list / tuple -> no longer raises
#
# Upgrade:
# - AMP (autocast + GradScaler) when CUDA is available
# - Early stopping based on val_logloss
# ============================================================

import json, gc, math, time, warnings
from pathlib import Path

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

from IPython.display import display
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, log_loss

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ----------------------------
# 0) Require outputs from Step 2
# ----------------------------
need_vars = ["df_train_tabular", "FEATURE_COLS"]
_absent_vars = [v for v in need_vars if v not in globals()]
if _absent_vars:
    # Report the first missing prerequisite, in need_vars order.
    raise RuntimeError(f"Missing `{_absent_vars[0]}`. Jalankan dulu Step 2 — Build Training Table (X, y, folds).")

# Work on private copies so this cell never mutates the Step 2 outputs.
df_train_tabular = df_train_tabular.copy()
FEATURE_COLS = list(FEATURE_COLS)

required_cols = {"uid", "case_id", "variant", "fold", "y"}
_table_columns = df_train_tabular.columns
missing_cols = [c for c in required_cols if c not in _table_columns]
if missing_cols:
    raise ValueError(f"df_train_tabular missing columns: {missing_cols}.")

# ----------------------------
# 1) Config (model capacity can be raised here)
# ----------------------------
CFG = dict(
    seed=2025,
    # model capacity
    d_model=384,
    n_layers=8,
    n_heads=8,
    ffn_mult=4,
    dropout=0.20,
    attn_dropout=0.10,
    # training
    batch_size=512,
    epochs=80,
    lr=2e-4,
    weight_decay=5e-3,
    warmup_frac=0.10,
    grad_clip=1.0,
    # early stopping
    early_stop_patience=12,
    early_stop_min_delta=1e-4,
    # report threshold (only for the baseline report; tuning happens in a later step)
    report_thr=0.5,
)

def seed_everything(seed: int = 2025):
    """Seed Python, NumPy and PyTorch RNGs and configure cuDNN.

    Args:
        seed: Value applied to every random number generator.

    Note:
        cuDNN is set to ``deterministic=False`` / ``benchmark=True``, so the
        cuDNN settings chosen here favour autotuning over determinism.
    """
    import os
    import random

    os.environ["PYTHONHASHSEED"] = str(seed)
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = False
    cudnn.benchmark = True

seed_everything(CFG["seed"])

_cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if _cuda_available else "cpu")
use_amp = device.type == "cuda"   # mixed precision only makes sense on GPU
print("Device:", device, "| AMP:", use_amp)

# optional (pytorch 2.x): lets float32 matmuls run faster
try:
    torch.set_float32_matmul_precision("high")
except Exception:
    pass

# ----------------------------
# 2) Build arrays + guard
# ----------------------------
X = df_train_tabular[FEATURE_COLS].to_numpy(dtype=np.float32, copy=True)
y = df_train_tabular["y"].to_numpy(dtype=np.int64, copy=True)
folds = df_train_tabular["fold"].to_numpy(dtype=np.int64, copy=True)

# X is a private copy, so NaN/+inf/-inf can be zeroed in place.
_non_finite = ~np.isfinite(X)
if _non_finite.any():
    X[_non_finite] = 0.0

n = X.shape[0]
unique_folds = np.unique(folds).tolist()   # np.unique returns sorted values
n_folds = len(unique_folds)
n_features = X.shape[1]

print("Transformer baseline setup:")
print("  rows      :", n)
print("  folds     :", n_folds, "|", unique_folds)
print("  pos%      :", float(y.mean()) * 100.0)
print("  n_features:", n_features)

# ----------------------------
# 3) Dataset
# ----------------------------
class TabDataset(Dataset):
    """Dataset over a float feature matrix with optional integer labels.

    Indexing yields the feature row alone when no labels were given,
    otherwise a ``(features, label)`` pair.
    """

    def __init__(self, X, y=None):
        features = X.astype(np.float32)
        self.X = torch.from_numpy(features)
        if y is None:
            self.y = None
        else:
            self.y = torch.from_numpy(y.astype(np.int64))

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        row = self.X[idx]
        return row if self.y is None else (row, self.y[idx])

# ----------------------------
# 4) FT-Transformer style model for numeric features
# ----------------------------
class FTTransformer(nn.Module):
    """FT-Transformer-style classifier for purely numeric features.

    Every scalar feature becomes one ``d_model``-wide token through a
    per-feature affine map (``x * w + b``) plus a learned per-feature
    embedding. A learned CLS token is prepended, the sequence runs through a
    pre-norm Transformer encoder, and the layer-normalised CLS output is
    mapped to a single logit by a two-layer MLP head.

    Args:
        n_features: number of numeric input columns (one token each).
        d_model: token / hidden width.
        n_heads: attention heads per encoder layer.
        n_layers: number of stacked encoder layers.
        ffn_mult: feed-forward width as a multiple of ``d_model``.
        dropout: dropout rate inside the encoder layers and the head.
        attn_dropout: despite the name, this rate is applied to the feature
            tokens before the encoder (``token_dropout``); it is not passed
            to the attention modules.
    """

    def __init__(self, n_features, d_model=384, n_heads=8, n_layers=8, ffn_mult=4,
                 dropout=0.2, attn_dropout=0.1):
        super().__init__()
        self.n_features = n_features
        self.d_model = d_model

        # Per-feature tokeniser: scale, shift and a learned feature embedding.
        token_shape = (n_features, d_model)
        self.w = nn.Parameter(torch.randn(*token_shape) * 0.02)
        self.b = nn.Parameter(torch.zeros(*token_shape))
        self.feat_emb = nn.Parameter(torch.randn(*token_shape) * 0.02)

        # Learned summary token that is read out after the encoder.
        self.cls = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)

        layer_kwargs = dict(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=ffn_mult * d_model,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_kwargs), num_layers=n_layers
        )

        self.token_dropout = nn.Dropout(attn_dropout)
        self.norm = nn.LayerNorm(d_model)

        self.head = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, 1),
        )

    def forward(self, x):
        """Return one raw logit per row of ``x`` (rows = samples, columns = features)."""
        tokens = x[..., None] * self.w[None] + self.b[None]
        tokens = self.token_dropout(tokens + self.feat_emb[None])

        # Prepend the CLS token to every sample's token sequence.
        cls_tok = self.cls.expand(tokens.size(0), -1, -1)
        encoded = self.encoder(torch.cat([cls_tok, tokens], dim=1))

        pooled = self.norm(encoded[:, 0])
        return self.head(pooled).squeeze(-1)

# ----------------------------
# 5) Scaler helpers (fit only on train fold)
# ----------------------------
def fit_standardizer(X_tr: np.ndarray):
    """Compute per-column mean and standard deviation for z-scoring.

    Statistics are accumulated in float64. Columns whose std is below 1e-8
    get a std of 1.0 so that scaling them never divides by (almost) zero.

    Returns:
        ``(mu, sig)`` as float32 arrays.
    """
    col_mean = X_tr.mean(axis=0, dtype=np.float64)
    col_std = X_tr.std(axis=0, dtype=np.float64)
    is_flat = col_std < 1e-8
    col_std = np.where(is_flat, 1.0, col_std)
    return col_mean.astype(np.float32), col_std.astype(np.float32)

def apply_standardizer(X_in: np.ndarray, mu: np.ndarray, sig: np.ndarray):
    """Z-score ``X_in`` with the given column means / stds and return float32."""
    centered = X_in - mu
    scaled = centered / sig
    return scaled.astype(np.float32)

# ----------------------------
# 6) Scheduler: warmup + cosine
# ----------------------------
def make_warmup_cosine_scheduler(optimizer, total_steps: int, warmup_steps: int):
    """Build a per-step LR schedule: linear warmup followed by cosine decay.

    The returned ``LambdaLR`` multiplies the optimizer's base LR by
    ``(step + 1) / warmup_steps`` during warmup and afterwards by a half
    cosine that falls from 1 to 0 at ``total_steps`` (and stays at 0 beyond).
    """
    warmup_span = float(max(1, warmup_steps))
    decay_span = float(max(1, total_steps - warmup_steps))

    def lr_lambda(step):
        if step >= warmup_steps:
            frac = (step - warmup_steps) / decay_span
            return 0.5 * (1.0 + math.cos(math.pi * min(1.0, frac)))
        return float(step + 1) / warmup_span

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)

# ----------------------------
# 7) Predict helper (FIX: handle batch=(xb,yb))
# ----------------------------
@torch.no_grad()
def predict_proba(model, loader):
    """Return sigmoid probabilities for every row served by ``loader``.

    The model is switched to eval mode. Batches may be a bare feature tensor
    or a list/tuple whose first element is the feature tensor (labels, if
    present, are ignored). Inference runs under autocast when ``use_amp`` is
    set; the result is a float32 NumPy array.
    """
    model.eval()
    chunks = []
    for batch in loader:
        # Labelled loaders yield (xb, yb); unlabelled ones yield xb alone.
        feats = batch[0] if isinstance(batch, (list, tuple)) else batch
        feats = feats.to(device, non_blocking=True)

        with torch.cuda.amp.autocast(enabled=use_amp):
            prob = torch.sigmoid(model(feats))

        chunks.append(prob.detach().cpu().numpy())
    return np.concatenate(chunks, axis=0).astype(np.float32)

def safe_auc(y_true, p):
    """ROC-AUC as a float, or ``None`` when ``y_true`` has fewer than two distinct labels."""
    distinct = np.unique(y_true)
    if distinct.size >= 2:
        return float(roc_auc_score(y_true, p))
    return None

def safe_logloss(y_true, p):
    """Binary log-loss with probabilities clipped to [1e-6, 1 - 1e-6].

    ``labels=[0,1]`` is passed explicitly so the score is defined even when
    ``y_true`` contains a single class.
    """
    probs = np.asarray(p, dtype=np.float64)
    probs = np.clip(probs, 1e-6, 1-1e-6)
    return float(log_loss(y_true, probs, labels=[0,1]))

# ----------------------------
# 8) Train one fold (AMP + early stopping)
# ----------------------------
def train_one_fold(X_tr, y_tr, X_va, y_va, cfg):
    """Train one FT-Transformer on a CV fold with AMP, warmup-cosine LR and early stopping.

    Features are standardised with statistics from ``X_tr`` only. After every
    epoch the validation log-loss is computed; the weights of the best epoch
    are snapshotted and restored before the final validation predictions.

    Args:
        X_tr, y_tr: training features / integer labels (NumPy arrays).
        X_va, y_va: validation features / labels (NumPy arrays).
        cfg: dict read for ``batch_size``, ``d_model``, ``n_heads``,
            ``n_layers``, ``ffn_mult``, ``dropout``, ``attn_dropout``, ``lr``,
            ``weight_decay``, ``epochs``, ``warmup_frac``, ``grad_clip``,
            ``early_stop_min_delta`` and ``early_stop_patience``.

    Returns:
        ``(pack, p_va, best)`` where ``pack`` holds the CPU ``state_dict``,
        the scaler (``mu``, ``sig``) and ``cfg``; ``p_va`` are the validation
        probabilities of the restored best weights; ``best`` is a dict with
        ``val_logloss`` and the 0-based ``epoch`` of the best epoch.
    """
    # Scaler is fitted on the training split only, then applied to both splits.
    mu, sig = fit_standardizer(X_tr)
    X_trn = apply_standardizer(X_tr, mu, sig)
    X_van = apply_standardizer(X_va, mu, sig)

    ds_tr = TabDataset(X_trn, y_tr)
    ds_va = TabDataset(X_van, y_va)

    dl_tr = DataLoader(
        ds_tr, batch_size=cfg["batch_size"], shuffle=True,
        num_workers=2, pin_memory=(device.type=="cuda"),
        drop_last=False
    )
    dl_va = DataLoader(
        ds_va, batch_size=cfg["batch_size"], shuffle=False,
        num_workers=2, pin_memory=(device.type=="cuda"),
        drop_last=False
    )

    # Width comes from the data actually passed in, not from a module-level global.
    model = FTTransformer(
        n_features=X_trn.shape[1],
        d_model=cfg["d_model"],
        n_heads=cfg["n_heads"],
        n_layers=cfg["n_layers"],
        ffn_mult=cfg["ffn_mult"],
        dropout=cfg["dropout"],
        attn_dropout=cfg["attn_dropout"],
    ).to(device)

    # Class imbalance: weight positives by the negative/positive ratio of this fold.
    pos = int(y_tr.sum())
    neg = int(len(y_tr) - pos)
    pos_weight = float(neg / max(1, pos))
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight], device=device))

    opt = torch.optim.AdamW(model.parameters(), lr=cfg["lr"], weight_decay=cfg["weight_decay"])
    # The scheduler is stepped once per batch, so the horizon is counted in batches.
    total_steps = int(cfg["epochs"]) * max(1, len(dl_tr))
    warmup_steps = int(cfg["warmup_frac"] * total_steps)
    sch = make_warmup_cosine_scheduler(opt, total_steps=total_steps, warmup_steps=warmup_steps)

    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    best = {"val_logloss": 1e9, "epoch": -1}
    best_state = None
    bad = 0  # consecutive epochs without sufficient improvement

    for epoch in range(int(cfg["epochs"])):
        model.train()
        t0 = time.time()
        loss_sum = 0.0
        n_sum = 0

        for xb, yb in dl_tr:
            xb = xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True).float()

            opt.zero_grad(set_to_none=True)
            with torch.cuda.amp.autocast(enabled=use_amp):
                logits = model(xb)
                loss = loss_fn(logits, yb)

            scaler.scale(loss).backward()
            if cfg["grad_clip"] and cfg["grad_clip"] > 0:
                # Gradients must be unscaled before their norm is clipped.
                scaler.unscale_(opt)
                torch.nn.utils.clip_grad_norm_(model.parameters(), float(cfg["grad_clip"]))

            scaler.step(opt)
            scaler.update()
            sch.step()

            loss_sum += float(loss.item()) * xb.size(0)
            n_sum += xb.size(0)

        # Validate on the held-out fold.
        p_va = predict_proba(model, dl_va)
        vll = safe_logloss(y_va, p_va)

        tr_loss = loss_sum / max(1, n_sum)
        dt = time.time() - t0
        print(f"  epoch {epoch+1:03d}/{cfg['epochs']} | train_loss={tr_loss:.5f} | val_logloss={vll:.5f} | dt={dt:.1f}s")

        improved = (best["val_logloss"] - vll) > float(cfg["early_stop_min_delta"])
        if improved:
            best["val_logloss"] = float(vll)
            best["epoch"] = int(epoch)
            # clone() is essential: on a CPU model `.cpu()` returns the live
            # parameter tensor itself, so without a copy the "snapshot" would
            # keep changing with every later optimizer step.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= int(cfg["early_stop_patience"]):
                print(f"  early stop at epoch {epoch+1}, best_epoch={best['epoch']+1}, best_val_logloss={best['val_logloss']:.5f}")
                break

        gc.collect()

    # Roll back to the best epoch before producing the final predictions.
    if best_state is not None:
        model.load_state_dict(best_state, strict=True)

    p_va = predict_proba(model, dl_va)

    pack = {
        "state_dict": {k: v.detach().cpu() for k, v in model.state_dict().items()},
        "mu": mu,
        "sig": sig,
        "cfg": cfg,
    }
    return pack, p_va, best

# ----------------------------
# 9) CV loop
# ----------------------------
# Out-of-fold predictions: each row is filled by the fold model that held it out.
oof_pred = np.zeros(n, dtype=np.float32)
fold_reports = []

# Per-fold checkpoints are written to this directory.
models_dir = Path("/kaggle/working/recodai_luc_gate_artifacts/baseline_transformer_folds")
models_dir.mkdir(parents=True, exist_ok=True)

for f in unique_folds:
    print(f"\n[Fold {f}]")
    # Rows tagged with fold f are held out for validation; all other rows train.
    tr_idx = np.where(folds != f)[0]
    va_idx = np.where(folds == f)[0]

    X_tr, y_tr = X[tr_idx], y[tr_idx]
    X_va, y_va = X[va_idx], y[va_idx]

    pack, p_va, best = train_one_fold(X_tr, y_tr, X_va, y_va, CFG)
    oof_pred[va_idx] = p_va

    # AUC is None when the validation fold contains a single class.
    auc = safe_auc(y_va, p_va)

    # Hard predictions at the fixed reporting threshold (not a tuned threshold).
    thr = float(CFG["report_thr"])
    yhat = (p_va >= thr).astype(np.int32)

    rep = {
        "fold": int(f),
        "n_val": int(len(va_idx)),
        "pos_val": int(y_va.sum()),
        "auc": auc,
        f"f1@{thr}": float(f1_score(y_va, yhat, zero_division=0)),
        f"precision@{thr}": float(precision_score(y_va, yhat, zero_division=0)),
        f"recall@{thr}": float(recall_score(y_va, yhat, zero_division=0)),
        "logloss": safe_logloss(y_va, p_va),
        "best_val_logloss": float(best["val_logloss"]),
        "best_epoch": int(best["epoch"] + 1),  # stored 1-based for reporting
    }
    fold_reports.append(rep)

    # Persist weights + scaler together with the column order they expect.
    torch.save(
        {"pack": pack, "feature_cols": FEATURE_COLS},
        models_dir / f"baseline_transformer_fold_{f}.pt"
    )

    # Release memory before the next fold.
    if device.type == "cuda":
        torch.cuda.empty_cache()
    gc.collect()

# ----------------------------
# 10) Overall OOF metrics
# ----------------------------
# Metrics over the stitched out-of-fold predictions of all folds.
oof_auc = safe_auc(y, oof_pred)
thr = float(CFG["report_thr"])
oof_yhat = (oof_pred >= thr).astype(np.int32)

overall = {
    "rows": int(n),
    "folds": int(n_folds),
    "pos_total": int(y.sum()),
    "pos_rate": float(y.mean()),
    "oof_auc": oof_auc,
    f"oof_f1@{thr}": float(f1_score(y, oof_yhat, zero_division=0)),
    f"oof_precision@{thr}": float(precision_score(y, oof_yhat, zero_division=0)),
    f"oof_recall@{thr}": float(recall_score(y, oof_yhat, zero_division=0)),
    "oof_logloss": safe_logloss(y, oof_pred),
}

# One row per fold, ordered by fold id, for display in the notebook.
df_rep = pd.DataFrame(fold_reports).sort_values("fold").reset_index(drop=True)
print("\nPer-fold report:")
display(df_rep)

print("\nOOF overall:")
print(overall)

# ----------------------------
# 11) Train "full baseline model" (fixed, lebih singkat)
# ----------------------------
def train_full_fixed(X_full_raw, y_full, cfg):
    """Train the baseline FT-Transformer on all rows for a fixed number of epochs.

    There is no validation split and no early stopping: the model trains for
    ``max(10, int(cfg["epochs"] * 0.6))`` epochs with the same optimiser,
    warmup-cosine schedule, AMP and ``pos_weight`` handling as the fold training.

    Args:
        X_full_raw: unscaled feature matrix (NumPy array); the scaler is
            fitted on it.
        y_full: integer labels aligned with ``X_full_raw``.
        cfg: dict read for ``batch_size``, ``d_model``, ``n_heads``,
            ``n_layers``, ``ffn_mult``, ``dropout``, ``attn_dropout``, ``lr``,
            ``weight_decay``, ``epochs``, ``warmup_frac`` and ``grad_clip``.

    Returns:
        dict with the CPU ``state_dict``, the scaler (``mu``, ``sig``) and ``cfg``.
    """
    mu, sig = fit_standardizer(X_full_raw)
    X_full = apply_standardizer(X_full_raw, mu, sig)

    ds_full = TabDataset(X_full, y_full)
    dl_full = DataLoader(
        ds_full, batch_size=cfg["batch_size"], shuffle=True,
        num_workers=2, pin_memory=(device.type=="cuda"),
        drop_last=False
    )

    # Width comes from the matrix actually passed in, so the function does not
    # depend on a module-level ``n_features`` matching its argument.
    model = FTTransformer(
        n_features=X_full.shape[1],
        d_model=cfg["d_model"],
        n_heads=cfg["n_heads"],
        n_layers=cfg["n_layers"],
        ffn_mult=cfg["ffn_mult"],
        dropout=cfg["dropout"],
        attn_dropout=cfg["attn_dropout"],
    ).to(device)

    # Class imbalance: weight positives by the negative/positive ratio.
    pos = int(y_full.sum())
    neg = int(len(y_full) - pos)
    pos_weight = float(neg / max(1, pos))
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight], device=device))

    opt = torch.optim.AdamW(model.parameters(), lr=cfg["lr"], weight_decay=cfg["weight_decay"])
    # Fixed, shorter budget: 60% of the CV epoch count, but never below 10 epochs.
    E_FULL = max(10, int(cfg["epochs"] * 0.6))
    total_steps = E_FULL * max(1, len(dl_full))
    warmup_steps = int(cfg["warmup_frac"] * total_steps)
    sch = make_warmup_cosine_scheduler(opt, total_steps=total_steps, warmup_steps=warmup_steps)

    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    print(f"\nTraining full baseline transformer for {E_FULL} epochs (fixed)...")
    for epoch in range(E_FULL):
        model.train()
        loss_sum = 0.0
        n_sum = 0
        for xb, yb in dl_full:
            xb = xb.to(device, non_blocking=True)
            yb = yb.to(device, non_blocking=True).float()

            opt.zero_grad(set_to_none=True)
            with torch.cuda.amp.autocast(enabled=use_amp):
                logits = model(xb)
                loss = loss_fn(logits, yb)

            scaler.scale(loss).backward()
            if cfg["grad_clip"] and cfg["grad_clip"] > 0:
                # Gradients must be unscaled before their norm is clipped.
                scaler.unscale_(opt)
                torch.nn.utils.clip_grad_norm_(model.parameters(), float(cfg["grad_clip"]))
            scaler.step(opt)
            scaler.update()
            sch.step()

            loss_sum += float(loss.item()) * xb.size(0)
            n_sum += xb.size(0)

        print(f"  full epoch {epoch+1:03d}/{E_FULL} | loss={loss_sum/max(1,n_sum):.5f}")

    full_pack = {
        "state_dict": {k: v.detach().cpu() for k, v in model.state_dict().items()},
        "mu": mu,
        "sig": sig,
        "cfg": cfg,
    }
    return full_pack

# Root directory for every artefact written by this step.
out_dir = Path("/kaggle/working/recodai_luc_gate_artifacts")
out_dir.mkdir(parents=True, exist_ok=True)

# Train on all rows and persist weights + scaler together with the column order.
full_pack = train_full_fixed(X, y, CFG)
torch.save({"pack": full_pack, "feature_cols": FEATURE_COLS}, out_dir / "baseline_transformer_model_full.pt")

# ----------------------------
# 12) Save OOF + report
# ----------------------------
# Identifier columns plus the out-of-fold prediction of every training row.
df_oof = df_train_tabular[["uid", "case_id", "variant", "fold", "y"]].copy()
df_oof["oof_pred_baseline_tf"] = oof_pred
df_oof.to_csv(out_dir / "oof_baseline_transformer.csv", index=False)

# JSON report: config, per-fold metrics and overall OOF metrics.
report = {
    "model": "FT-Transformer (numeric tabular) — baseline",
    "cfg": CFG,
    "feature_count": int(len(FEATURE_COLS)),
    "fold_reports": fold_reports,
    "overall": overall,
}
with open(out_dir / "baseline_transformer_cv_report.json", "w") as f:
    json.dump(report, f, indent=2)

print("\nSaved artifacts:")
print("  fold models  ->", models_dir)
print("  full model   ->", out_dir / "baseline_transformer_model_full.pt")
print("  oof preds    ->", out_dir / "oof_baseline_transformer.csv")
print("  cv report    ->", out_dir / "baseline_transformer_cv_report.json")

# Export globals so later notebook cells can reuse the baseline results.
OOF_PRED_BASELINE_TF = oof_pred
BASELINE_TF_OVERALL = overall
BASELINE_TF_FOLD_REPORTS = fold_reports


Device: cpu | AMP: False
Transformer baseline setup:
  rows      : 5176
  folds     : 5 | [0, 1, 2, 3, 4]
  pos%      : 54.07650695517774
  n_features: 66

[Fold 0]


# Optimize Model & Hyperparameters (Iterative)

In [None]:
# ============================================================
# Step 4 — Optimize Model & Hyperparameters (Iterative) — TRANSFORMER ONLY (REVISI FULL)
# - Fokus: cari konfigurasi Tabular Transformer paling kuat (tanpa LightGBM/CatBoost/sklearn trees)
# - Validasi: Leakage-safe CV pakai kolom `fold` (by case_id)
# - Skor utama: Best F-beta (beta=0.5) dari OOF (lebih anti-FP)
#   + log AUC & LogLoss sebagai sanity metric.
#
# Output:
# - /kaggle/working/recodai_luc_gate_artifacts/opt_search/opt_results.csv
# - /kaggle/working/recodai_luc_gate_artifacts/opt_search/opt_results.json
# - /kaggle/working/recodai_luc_gate_artifacts/opt_search/opt_fold_details.csv
# - /kaggle/working/recodai_luc_gate_artifacts/opt_search/oof_preds_<cfg_name>.csv (top configs)
# - /kaggle/working/recodai_luc_gate_artifacts/best_gate_config.json
# - /kaggle/working/recodai_luc_gate_artifacts/best_gate_model.pt  (fold packs untuk config terbaik)
#
# REQUIRE:
# - Step 2 sudah jalan: df_train_tabular, FEATURE_COLS
# ============================================================

import os, json, gc, math, time, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

from IPython.display import display
from sklearn.metrics import roc_auc_score, log_loss, precision_score, recall_score, fbeta_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

# ----------------------------
# 0) Require data from Step 2
# ----------------------------
# Fail fast when the training table from Step 2 is not in the notebook namespace.
need_vars = ["df_train_tabular", "FEATURE_COLS"]
for v in need_vars:
    if v not in globals():
        raise RuntimeError(f"Missing `{v}`. Jalankan dulu Step 2 — Build Training Table (X, y, folds).")

# Work on private copies so this cell never mutates the Step 2 objects.
df_train_tabular = df_train_tabular.copy()
FEATURE_COLS = list(FEATURE_COLS)

# Features, target, fold ids and row identifiers as NumPy arrays.
X_all = df_train_tabular[FEATURE_COLS].to_numpy(dtype=np.float32, copy=True)
y_all = df_train_tabular["y"].to_numpy(dtype=np.int64, copy=True)
folds_all = df_train_tabular["fold"].to_numpy(dtype=np.int64, copy=True)
uids_all = df_train_tabular["uid"].astype(str).to_numpy()

# Guard: replace NaN / +inf / -inf by 0.0.
if not np.isfinite(X_all).all():
    X_all = np.nan_to_num(X_all, nan=0.0, posinf=0.0, neginf=0.0)

# Sizes read as globals by the helpers defined further down in this cell.
unique_folds = sorted(pd.Series(folds_all).unique().tolist())
n = len(y_all)
pos_rate = float(y_all.mean())
n_features = X_all.shape[1]

print("Optimize setup (Transformer only):")
print(f"  rows={n} | folds={len(unique_folds)} | pos%={pos_rate*100:.2f} | n_features={n_features}")

# ----------------------------
# 1) Global settings
# ----------------------------
SEED = 2025
BETA = 0.5            # beta < 1 => favours precision (fewer false positives)
THR_GRID = 201        # number of thresholds scanned when searching the best F-beta
REPORT_TOPK_OOF = 3   # save OOF predictions for the top-K configs

def seed_everything(seed=2025):
    """Seed the Python, NumPy and PyTorch RNGs and set the cuDNN flags.

    Also exports ``PYTHONHASHSEED``. cuDNN is left in autotuning mode
    (``benchmark=True``) with deterministic mode switched off.
    """
    import os
    import random

    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same seed for every generator used in this notebook.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = False
    cudnn.benchmark = True

# Seed once for the whole search; individual configs/folds are not re-seeded here.
seed_everything(SEED)

# Use the GPU when one is visible; mixed precision (AMP) is enabled only on CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
use_amp = (device.type == "cuda")
print("Device:", device, "| AMP:", use_amp)

# ----------------------------
# 2) Helpers: fast threshold search + safe metrics
# ----------------------------
def best_fbeta_fast(y_true, p, beta=0.5, grid=201):
    """Find the decision threshold that maximises F-beta, fully vectorised.

    ``grid`` evenly spaced thresholds in [0.01, 0.99] are evaluated at once;
    a row is predicted positive when its probability is >= the threshold.
    Ties are resolved in favour of the lowest threshold.

    Returns:
        dict with keys ``fbeta``, ``thr``, ``precision`` and ``recall``.
    """
    def _ratio(num, den):
        # Element-wise num / den that yields 0 wherever den is not positive.
        return np.divide(num, den, out=np.zeros_like(num), where=den > 0)

    is_pos = np.asarray(y_true).astype(np.int32) == 1
    scores = np.clip(np.asarray(p, dtype=np.float64), 1e-8, 1.0 - 1e-8)
    cutoffs = np.linspace(0.01, 0.99, grid, dtype=np.float64)

    # flagged[i, j] is True when row i is predicted positive at cutoff j.
    flagged = scores[:, None] >= cutoffs[None, :]
    truth = is_pos[:, None]

    tp = (flagged & truth).sum(axis=0).astype(np.float64)
    fp = (flagged & ~truth).sum(axis=0).astype(np.float64)
    fn = is_pos.sum().astype(np.float64) - tp

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)

    b2 = beta * beta
    fbeta = _ratio((1.0 + b2) * precision * recall, b2 * precision + recall)

    top = int(np.argmax(fbeta))
    picked = (("fbeta", fbeta), ("thr", cutoffs), ("precision", precision), ("recall", recall))
    return {key: float(values[top]) for key, values in picked}

def safe_auc(y_true, p):
    """ROC-AUC as a float, or ``None`` when ``y_true`` has fewer than two distinct labels."""
    distinct = np.unique(y_true)
    if distinct.size >= 2:
        return float(roc_auc_score(y_true, p))
    return None

def safe_logloss(y_true, p):
    """Binary log-loss with probabilities clipped to [1e-8, 1 - 1e-8].

    ``labels=[0, 1]`` is passed explicitly so the score is defined even when
    ``y_true`` contains a single class.
    """
    probs = np.asarray(p, dtype=np.float64)
    probs = np.clip(probs, 1e-8, 1 - 1e-8)
    return float(log_loss(y_true, probs, labels=[0, 1]))

# ----------------------------
# 3) Dataset + Standardizer (fit only on train fold => no leakage)
# ----------------------------
class TabDataset(Dataset):
    """In-memory tabular dataset backed by torch tensors.

    ``X`` is stored as float32 and ``y`` (when given) as int64. Items are
    ``(features, label)`` pairs, or the feature row alone when no labels
    were supplied.
    """

    def __init__(self, X, y=None):
        self.X = torch.from_numpy(X.astype(np.float32))
        if y is None:
            self.y = None
        else:
            self.y = torch.from_numpy(y.astype(np.int64))

    def __len__(self):
        return self.X.size(0)

    def __getitem__(self, idx):
        row = self.X[idx]
        return row if self.y is None else (row, self.y[idx])

def fit_standardizer(X_tr: np.ndarray):
    """Compute per-column mean and standard deviation for z-scoring.

    Statistics are accumulated in float64. Columns whose std is below 1e-8
    get a std of 1.0 so that scaling them never divides by (almost) zero.

    Returns:
        ``(mu, sig)`` as float32 arrays.
    """
    col_mean = X_tr.mean(axis=0, dtype=np.float64)
    col_std = X_tr.std(axis=0, dtype=np.float64)
    is_flat = col_std < 1e-8
    col_std = np.where(is_flat, 1.0, col_std)
    return col_mean.astype(np.float32), col_std.astype(np.float32)

def apply_standardizer(X_in: np.ndarray, mu: np.ndarray, sig: np.ndarray):
    """Z-score ``X_in`` with the given column means / stds and return float32."""
    centered = X_in - mu
    scaled = centered / sig
    return scaled.astype(np.float32)

# ----------------------------
# 4) Transformer model (FT-Transformer style, numeric-only)
# ----------------------------
class FTTransformer(nn.Module):
    """FT-Transformer-style classifier for purely numeric features.

    Every scalar feature becomes one ``d_model``-wide token through a
    per-feature affine map (``x * w + b``) plus a learned per-feature
    embedding. A learned CLS token is prepended, the sequence runs through a
    pre-norm Transformer encoder, and the layer-normalised CLS output is
    mapped to a single logit by a two-layer MLP head.

    Args:
        n_features: number of numeric input columns (one token each).
        d_model: token / hidden width.
        n_heads: attention heads per encoder layer.
        n_layers: number of stacked encoder layers.
        ffn_mult: feed-forward width as a multiple of ``d_model``.
        dropout: dropout rate inside the encoder layers and the head.
        attn_dropout: despite the name, this rate is applied to the feature
            tokens before the encoder (``token_dropout``); it is not passed
            to the attention modules.
    """

    def __init__(self, n_features, d_model=384, n_heads=8, n_layers=8, ffn_mult=4,
                 dropout=0.2, attn_dropout=0.1):
        super().__init__()
        self.n_features = n_features
        self.d_model = d_model

        # Per-feature linear tokenisation: scale, shift and a learned feature embedding.
        token_shape = (n_features, d_model)
        self.w = nn.Parameter(torch.randn(*token_shape) * 0.02)
        self.b = nn.Parameter(torch.zeros(*token_shape))
        self.feat_emb = nn.Parameter(torch.randn(*token_shape) * 0.02)

        # Learned summary token that is read out after the encoder.
        self.cls = nn.Parameter(torch.randn(1, 1, d_model) * 0.02)

        layer_kwargs = dict(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=ffn_mult * d_model,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(**layer_kwargs), num_layers=n_layers
        )

        self.token_dropout = nn.Dropout(attn_dropout)
        self.norm = nn.LayerNorm(d_model)

        self.head = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, 1),
        )

    def forward(self, x):
        """Return one raw logit per row; ``x`` is (B, F)."""
        tokens = x[..., None] * self.w[None] + self.b[None]
        tokens = self.token_dropout(tokens + self.feat_emb[None])

        # Prepend the CLS token: sequence becomes (B, 1+F, D).
        cls_tok = self.cls.expand(tokens.size(0), -1, -1)
        encoded = self.encoder(torch.cat([cls_tok, tokens], dim=1))

        pooled = self.norm(encoded[:, 0])  # CLS position
        return self.head(pooled).squeeze(-1)

# ----------------------------
# 5) Scheduler: warmup + cosine
# ----------------------------
def make_warmup_cosine_scheduler(optimizer, total_steps: int, warmup_steps: int):
    """Build a per-step LR schedule: linear warmup followed by cosine decay.

    The returned ``LambdaLR`` multiplies the optimizer's base LR by
    ``(step + 1) / warmup_steps`` during warmup and afterwards by a half
    cosine that falls from 1 to 0 at ``total_steps`` (and stays at 0 beyond).
    """
    warmup_span = float(max(1, warmup_steps))
    decay_span = float(max(1, total_steps - warmup_steps))

    def lr_lambda(step):
        if step >= warmup_steps:
            frac = (step - warmup_steps) / decay_span
            return 0.5 * (1.0 + math.cos(math.pi * min(1.0, frac)))
        return float(step + 1) / warmup_span

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)

@torch.no_grad()
def predict_proba(model, loader):
    """Return sigmoid probabilities for every row served by ``loader``.

    Batches may be a bare feature tensor or a list/tuple whose first element
    is the feature tensor, which is what a ``DataLoader`` over a labelled
    dataset produces; any labels are ignored. Inputs are moved to the device
    the model's parameters live on (CPU if the model has no parameters).

    Args:
        model: module mapping a feature batch to one logit per row.
        loader: iterable of batches.

    Returns:
        float32 NumPy array of probabilities in loader order; an empty array
        when the loader yields no batches.
    """
    model.eval()

    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    probs = []
    for batch in loader:
        # A labelled dataset collates to [xb, yb]; calling .to() on that list
        # would fail, so pick out the feature tensor first.
        xb = batch[0] if isinstance(batch, (list, tuple)) else batch
        xb = xb.to(run_device)
        logits = model(xb)
        probs.append(torch.sigmoid(logits).detach().cpu().numpy())

    if not probs:
        return np.zeros((0,), dtype=np.float32)
    return np.concatenate(probs, axis=0).astype(np.float32)

def train_one_fold_transformer(X_tr, y_tr, X_va, y_va, cfg):
    """Train one FT-Transformer on a CV fold with early stopping on validation log-loss.

    Features are standardised with statistics from ``X_tr`` only. After every
    epoch the validation log-loss is computed; the weights of the best epoch
    are snapshotted and restored before the final validation predictions.

    Args:
        X_tr, y_tr: training features / integer labels (NumPy arrays).
        X_va, y_va: validation features / labels (NumPy arrays).
        cfg: dict read for ``batch_size``, ``d_model``, ``n_heads``,
            ``n_layers``, ``ffn_mult``, ``dropout``, ``attn_dropout``, ``lr``,
            ``weight_decay``, ``epochs``, ``warmup_frac``, ``grad_clip``,
            ``min_delta`` and ``patience``.

    Returns:
        ``(pack, p_va)`` where ``pack`` holds the CPU ``state_dict``, the
        scaler (``mu``, ``sig``), ``cfg``, the 1-based ``best_epoch`` and
        ``best_val_logloss``; ``p_va`` are the validation probabilities of
        the restored best weights.
    """
    # Standardise per fold, using training-split statistics only.
    mu, sig = fit_standardizer(X_tr)
    X_trn = apply_standardizer(X_tr, mu, sig)
    X_van = apply_standardizer(X_va, mu, sig)

    ds_tr = TabDataset(X_trn, y_tr)
    # The validation loader serves features only: predictions are scored
    # against y_va in NumPy, so its batches must be plain tensors rather than
    # (features, labels) pairs.
    ds_va = TabDataset(X_van)

    dl_tr = DataLoader(ds_tr, batch_size=cfg["batch_size"], shuffle=True, num_workers=2,
                       pin_memory=(device.type=="cuda"), drop_last=False)
    dl_va = DataLoader(ds_va, batch_size=cfg["batch_size"], shuffle=False, num_workers=2,
                       pin_memory=(device.type=="cuda"), drop_last=False)

    # Width comes from the data actually passed in, not from a module-level global.
    model = FTTransformer(
        n_features=X_trn.shape[1],
        d_model=cfg["d_model"],
        n_heads=cfg["n_heads"],
        n_layers=cfg["n_layers"],
        ffn_mult=cfg["ffn_mult"],
        dropout=cfg["dropout"],
        attn_dropout=cfg["attn_dropout"],
    ).to(device)

    # Class imbalance: weight positives by the negative/positive ratio of this fold.
    pos = int(y_tr.sum())
    neg = int(len(y_tr) - pos)
    pos_weight = float(neg / max(1, pos))
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight], device=device))

    opt = torch.optim.AdamW(model.parameters(), lr=cfg["lr"], weight_decay=cfg["weight_decay"])

    # The scheduler is stepped once per batch, so the horizon is counted in batches.
    total_steps = cfg["epochs"] * max(1, len(dl_tr))
    warmup_steps = int(cfg["warmup_frac"] * total_steps)
    sch = make_warmup_cosine_scheduler(opt, total_steps=total_steps, warmup_steps=warmup_steps)

    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    best_val = 1e9
    best_state = None
    best_epoch = -1
    bad = 0  # consecutive epochs without sufficient improvement

    for epoch in range(cfg["epochs"]):
        model.train()
        loss_sum = 0.0
        n_sum = 0

        for xb, yb in dl_tr:
            xb = xb.to(device)
            yb = yb.to(device).float()

            opt.zero_grad(set_to_none=True)
            with torch.cuda.amp.autocast(enabled=use_amp):
                logits = model(xb)
                loss = loss_fn(logits, yb)

            scaler.scale(loss).backward()
            if cfg["grad_clip"] and cfg["grad_clip"] > 0:
                # Gradients must be unscaled before their norm is clipped.
                scaler.unscale_(opt)
                torch.nn.utils.clip_grad_norm_(model.parameters(), cfg["grad_clip"])
            scaler.step(opt)
            scaler.update()
            sch.step()

            loss_sum += float(loss.item()) * xb.size(0)
            n_sum += xb.size(0)

        # Validate on the held-out fold.
        p_va = predict_proba(model, dl_va)
        vll = safe_logloss(y_va, p_va)

        improved = (best_val - vll) > cfg["min_delta"]
        if improved:
            best_val = vll
            best_epoch = epoch
            # clone() is essential: on a CPU model `.cpu()` returns the live
            # parameter tensor itself, so without a copy the "snapshot" would
            # keep changing with every later optimizer step.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= cfg["patience"]:
                break

        gc.collect()

    # Roll back to the best epoch before producing the final predictions.
    if best_state is not None:
        model.load_state_dict(best_state, strict=True)

    # Final validation predictions from the best weights.
    p_va = predict_proba(model, dl_va)

    pack = {
        "state_dict": {k: v.detach().cpu() for k, v in model.state_dict().items()},
        "mu": mu,
        "sig": sig,
        "cfg": cfg,
        "best_epoch": int(best_epoch + 1),
        "best_val_logloss": float(best_val),
    }
    return pack, p_va

# ----------------------------
# 6) CV evaluator for a config
# ----------------------------
def run_cv_config(cfg, cfg_name, beta=0.5, thr_grid=201, verbose=False):
    """Cross-validate one Transformer config over the predefined folds.

    For every fold id in ``unique_folds`` a model is trained on the other
    folds and scored on the held-out one; the held-out predictions are
    stitched into one out-of-fold vector.

    Args:
        cfg: hyper-parameter dict handed to ``train_one_fold_transformer``.
        cfg_name: label stored in every result row.
        beta: F-beta weight used for the threshold search.
        thr_grid: threshold-grid size for the overall OOF search; each fold
            uses the coarser ``max(51, thr_grid//2)``.
        verbose: print one line per fold.

    Returns:
        ``(summary, fold_rows, oof)``: overall metrics plus selected
        hyper-parameters, a list of per-fold metric dicts, and the float32
        out-of-fold probability vector.
    """
    oof = np.zeros(n, dtype=np.float32)
    fold_rows = []
    fold_grid = max(51, thr_grid//2)

    for f in unique_folds:
        held_out = (folds_all == f)
        tr = np.where(~held_out)[0]
        va = np.where(held_out)[0]

        X_tr, y_tr = X_all[tr], y_all[tr]
        X_va, y_va = X_all[va], y_all[va]

        pack, p_va = train_one_fold_transformer(X_tr, y_tr, X_va, y_va, cfg)
        oof[va] = p_va

        fold_auc = safe_auc(y_va, p_va)
        fold_ll  = safe_logloss(y_va, p_va)
        best_fold = best_fbeta_fast(y_va, p_va, beta=beta, grid=fold_grid)

        row = {
            "cfg": cfg_name,
            "fold": int(f),
            "n_val": int(len(va)),
            "pos_val": int(y_va.sum()),
            "auc": fold_auc,
            "logloss": fold_ll,
        }
        # Threshold-search results, renamed to their report column names.
        for col, key in (("best_fbeta", "fbeta"), ("best_thr", "thr"),
                         ("best_prec", "precision"), ("best_rec", "recall")):
            row[col] = best_fold[key]
        row["best_val_logloss"] = pack["best_val_logloss"]
        row["best_epoch"] = pack["best_epoch"]
        fold_rows.append(row)

        if verbose:
            print(f"    fold {f}: best_fbeta={best_fold['fbeta']:.5f} thr={best_fold['thr']:.3f} ll={fold_ll:.5f}")

        # Drop the fold's weights before training the next one.
        del pack
        gc.collect()

    # Overall metrics on the stitched OOF vector.
    best_oof = best_fbeta_fast(y_all, oof, beta=beta, grid=thr_grid)

    summary = {
        "cfg": cfg_name,
        "oof_auc": safe_auc(y_all, oof),
        "oof_logloss": safe_logloss(y_all, oof),
        "oof_best_fbeta": best_oof["fbeta"],
        "oof_best_thr": best_oof["thr"],
        "oof_best_prec": best_oof["precision"],
        "oof_best_rec": best_oof["recall"],
    }
    # Echo the hyper-parameters so every result row is self-describing.
    echoed = ("d_model", "n_layers", "n_heads", "ffn_mult", "dropout", "attn_dropout",
              "batch_size", "epochs", "lr", "weight_decay", "warmup_frac", "patience")
    summary.update((key, cfg[key]) for key in echoed)
    return summary, fold_rows, oof

# ----------------------------
# 7) Define candidate configs (Transformer MAX OK)
#    Feel free to add to / edit this list if you want more trials.
# ----------------------------
# Shared defaults; each candidate below overrides a subset via dict(BASE, ...).
BASE = dict(
    batch_size=512,
    epochs=70,
    lr=2e-4,
    weight_decay=5e-3,
    warmup_frac=0.10,
    grad_clip=1.0,
    patience=10,
    min_delta=1e-4,
)

# List of (name, config dict) pairs evaluated by the search loop.
candidates = []

# (A) Strong MAX (recommended)
candidates.append(("tf_max_384x8", dict(BASE, d_model=384, n_layers=8, n_heads=8, ffn_mult=4, dropout=0.20, attn_dropout=0.10)))

# (B) Bigger (more powerful, risk overfit -> more dropout/wd)
candidates.append(("tf_extreme_512x12", dict(BASE, d_model=512, n_layers=12, n_heads=16, ffn_mult=4, dropout=0.28, attn_dropout=0.15, lr=1.2e-4, weight_decay=1e-2, epochs=90, patience=12)))

# (C) Faster but still strong
candidates.append(("tf_256x6_fast", dict(BASE, d_model=256, n_layers=6, n_heads=8, ffn_mult=4, dropout=0.18, attn_dropout=0.08, lr=3e-4, weight_decay=3e-3, epochs=60, patience=9)))

# (D) Alternative regularization (more robust to noise)
candidates.append(("tf_384x10_reg", dict(BASE, d_model=384, n_layers=10, n_heads=8, ffn_mult=4, dropout=0.25, attn_dropout=0.12, lr=1.6e-4, weight_decay=8e-3, epochs=85, patience=12)))

# (E) Slightly different ffn width
candidates.append(("tf_384x8_ffn2", dict(BASE, d_model=384, n_layers=8, n_heads=8, ffn_mult=2, dropout=0.18, attn_dropout=0.10, lr=2.2e-4, weight_decay=4e-3, epochs=70, patience=10)))

print(f"\nTotal Transformer candidates: {len(candidates)}")
print("Primary score: OOF best F-beta (beta=0.5)")

# ----------------------------
# 8) Run iterative search
# ----------------------------
# Artefact root and the sub-directory that receives the search results.
OUT_DIR = Path("/kaggle/working/recodai_luc_gate_artifacts")
OPT_DIR = OUT_DIR / "opt_search"
OPT_DIR.mkdir(parents=True, exist_ok=True)

all_summaries = []   # one summary dict per config
all_fold_rows = []   # one dict per (config, fold)
oof_store = {}       # config name -> OOF probability vector

t_start = time.time()
for i, (name, cfg) in enumerate(candidates, 1):
    print(f"\n[{i:02d}/{len(candidates)}] CV -> {name}")
    summ, fold_rows, oof = run_cv_config(cfg, name, beta=BETA, thr_grid=THR_GRID, verbose=False)
    all_summaries.append(summ)
    all_fold_rows.extend(fold_rows)
    oof_store[name] = oof

    # AUC may be None (single-class target); print NaN in that case.
    print(f"  oof_best_fbeta: {summ['oof_best_fbeta']:.6f} | thr: {summ['oof_best_thr']:.3f}"
          f" | auc: {(summ['oof_auc'] if summ['oof_auc'] is not None else float('nan')):.6f}"
          f" | logloss: {summ['oof_logloss']:.6f}")

print(f"\nSearch done in {(time.time()-t_start)/60:.1f} min")

df_sum = pd.DataFrame(all_summaries)
df_fold = pd.DataFrame(all_fold_rows)

# Rank: highest OOF F-beta first; ties are broken by the lower log-loss.
df_sum = df_sum.sort_values(["oof_best_fbeta", "oof_logloss"], ascending=[False, True]).reset_index(drop=True)

print("\nTop candidates:")
display(df_sum.head(10))

# Save the search results (CSV + JSON summary, per-fold detail CSV).
df_sum.to_csv(OPT_DIR / "opt_results.csv", index=False)
with open(OPT_DIR / "opt_results.json", "w") as f:
    json.dump(df_sum.to_dict(orient="records"), f, indent=2)
df_fold.to_csv(OPT_DIR / "opt_fold_details.csv", index=False)

# ----------------------------
# 9) Optional: simple OOF ensemble (avg top-K)
# ----------------------------
# Names of the (up to) three best-ranked configs.
TOPK = min(3, len(df_sum))
top_names = df_sum["cfg"].head(TOPK).tolist()

# Averaging needs at least two members; otherwise no ensemble is evaluated.
ens_summary = None
if TOPK >= 2:
    # Plain arithmetic mean of the members' OOF probabilities.
    oof_ens = np.mean([oof_store[nm] for nm in top_names], axis=0).astype(np.float32)
    ens_best = best_fbeta_fast(y_all, oof_ens, beta=BETA, grid=THR_GRID)
    ens_auc  = safe_auc(y_all, oof_ens)
    ens_ll   = safe_logloss(y_all, oof_ens)

    ens_summary = {
        "cfg": f"ensemble_avg_top{TOPK}",
        "members": top_names,
        "oof_auc": ens_auc,
        "oof_logloss": ens_ll,
        "oof_best_fbeta": ens_best["fbeta"],
        "oof_best_thr": ens_best["thr"],
        "oof_best_prec": ens_best["precision"],
        "oof_best_rec": ens_best["recall"],
    }
    print("\nEnsemble OOF (avg) result:")
    print(ens_summary)

# Save OOF predictions of the top configs (for debugging), one CSV per config.
for nm in top_names[:REPORT_TOPK_OOF]:
    df_o = pd.DataFrame({
        "uid": uids_all,
        "y": y_all,
        "fold": folds_all,
        f"oof_pred_{nm}": oof_store[nm]
    })
    df_o.to_csv(OPT_DIR / f"oof_preds_{nm}.csv", index=False)

# ----------------------------
# 10) Choose best (single vs ensemble) — by default pick whichever has the highest F-beta
# ----------------------------
# Row 0 of the ranked table is the best single config.
best_single = df_sum.iloc[0].to_dict()
best_choice = {"type": "single", "name": best_single["cfg"], "summary": best_single}

# The ensemble wins ties (>=). It is evaluated on OOF predictions only.
if ens_summary is not None and ens_summary["oof_best_fbeta"] >= float(best_single["oof_best_fbeta"]):
    best_choice = {"type": "ensemble_oof_only", "name": ens_summary["cfg"], "summary": ens_summary}

print("\nBest choice:")
print(best_choice)

# ----------------------------
# 11) Re-train & SAVE fold packs for BEST SINGLE config
#     (For use in Step 5 Final Training / inference we save one model per fold plus its scaler.)
#     Note: even when best_choice is ensemble_oof_only, the models of the best single
#     config are still saved as a strong baseline.
# ----------------------------
# Look the winning config dict up by name in the candidates list.
best_cfg_name = best_single["cfg"]
best_cfg = None
for nm, cfg in candidates:
    if nm == best_cfg_name:
        best_cfg = cfg
        break
if best_cfg is None:
    raise RuntimeError("Best cfg not found in candidates list (unexpected).")

print(f"\nRe-train folds for best config -> {best_cfg_name}")
best_fold_packs = []
best_oof = np.zeros(n, dtype=np.float32)

# The search loop discards its fold weights, so every fold is trained again here.
for f in unique_folds:
    tr = np.where(folds_all != f)[0]
    va = np.where(folds_all == f)[0]

    X_tr, y_tr = X_all[tr], y_all[tr]
    X_va, y_va = X_all[va], y_all[va]

    pack, p_va = train_one_fold_transformer(X_tr, y_tr, X_va, y_va, best_cfg)
    pack["fold"] = int(f)
    best_fold_packs.append(pack)
    best_oof[va] = p_va

    gc.collect()

# Save all fold packs of the best config in a single checkpoint file.
best_model_path = OUT_DIR / "best_gate_model.pt"
torch.save(
    {
        "type": "transformer_ft",
        "feature_cols": FEATURE_COLS,
        "fold_packs": best_fold_packs,
        "cfg_name": best_cfg_name,
        "cfg": best_cfg,
        "seed": SEED,
    },
    best_model_path
)

# config bundle for next steps
# Threshold and metrics are recomputed on the OOF vector of the re-trained folds,
# i.e. the models that were actually saved above.
best_oof_best = best_fbeta_fast(y_all, best_oof, beta=BETA, grid=THR_GRID)
best_bundle = {
    "type": "transformer_ft",
    "selection": {
        "primary_metric": f"oof_best_fbeta(beta={BETA})",
        "beta": BETA,
        "chosen_cfg": best_cfg_name,
        "chosen_type": "single",  # always the best single config, even if the OOF ensemble scored higher
        "oof_best_thr": best_oof_best["thr"],
        "oof_best_fbeta": best_oof_best["fbeta"],
        "oof_best_prec": best_oof_best["precision"],
        "oof_best_rec": best_oof_best["recall"],
        "oof_auc": safe_auc(y_all, best_oof),
        "oof_logloss": safe_logloss(y_all, best_oof),
    },
    "cfg": best_cfg,
    "feature_cols": FEATURE_COLS,
    "notes": "Ini masih Step 4 (opt). Final Training (Step 5) bisa train full / ensemble lebih matang.",
}

# Written to disk so a later step can load the config without this cell's globals.
with open(OUT_DIR / "best_gate_config.json", "w") as f:
    json.dump(best_bundle, f, indent=2)

print("\nSaved best artifacts:")
print("  best model (fold packs) ->", best_model_path)
print("  best config             ->", OUT_DIR / "best_gate_config.json")
print("  opt results             ->", OPT_DIR / "opt_results.csv")
print("  fold detail             ->", OPT_DIR / "opt_fold_details.csv")

# Export globals for Step 5
BEST_GATE_BUNDLE = best_bundle
BEST_TF_CFG_NAME = best_cfg_name
BEST_TF_CFG = best_cfg
OPT_RESULTS_DF = df_sum


# Final Training (Train on Full Data)

In [None]:
# ============================================================
# Step 5 — Final Training (Train on Full Data) — TRANSFORMER ONLY (REVISI FULL + “DITINGKATKAN”)
# - Tujuan: latih ulang model Transformer terbaik pada seluruh data train (full data)
# - Input: hasil Step 2 (df_train_tabular + FEATURE_COLS)
# - Ambil konfigurasi terbaik dari:
#     /kaggle/working/recodai_luc_gate_artifacts/best_gate_config.json
#   atau dari variabel BEST_GATE_BUNDLE (jika masih ada di memory)
#
# Output:
#   /kaggle/working/recodai_luc_gate_artifacts/final_gate_model.pt
#   /kaggle/working/recodai_luc_gate_artifacts/final_gate_bundle.json
#
# Upgrade utama:
# - Full-data training pakai Transformer (FT-Transformer numeric)
# - Standardizer fit di full train (tanpa leakage issue karena final)
# - Internal val split by case_id (opsional) untuk early stopping (lebih stabil, tidak “buang” data CV)
# - AMP + Warmup Cosine + AdamW + pos_weight (imbalance)
# - Opsi MAX_MODE untuk “naikkan kapasitas” (hati-hati OOM)
# ============================================================

import os, json, gc, math, time, warnings
from pathlib import Path

import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import log_loss, roc_auc_score

# ----------------------------
# 0) Require outputs from Step 2
# ----------------------------
# Both names must already exist in the notebook namespace; fail early with a
# clear message otherwise.
if "df_train_tabular" not in globals():
    raise RuntimeError("Missing `df_train_tabular`. Jalankan Step 2 dulu.")
if "FEATURE_COLS" not in globals():
    raise RuntimeError("Missing `FEATURE_COLS`. Jalankan Step 2 dulu.")

# Work on private copies so this cell never mutates the Step 2 objects.
FEATURE_COLS = list(FEATURE_COLS)
df_train_tabular = df_train_tabular.copy()

# Dense training arrays: float32 feature matrix and int64 label vector.
y_all = df_train_tabular["y"].to_numpy(dtype=np.int64, copy=True)
X_all = df_train_tabular[FEATURE_COLS].to_numpy(dtype=np.float32, copy=True)

# Any NaN / +Inf / -Inf entry is replaced by 0.0 before training.
if not np.all(np.isfinite(X_all)):
    X_all = np.nan_to_num(X_all, nan=0.0, posinf=0.0, neginf=0.0)

print("Final training data:")
print("  rows:", len(y_all), "| pos%:", float(y_all.mean()) * 100.0, "| n_features:", X_all.shape[1])

# ----------------------------
# 1) Load best config (from disk or memory)
# ----------------------------
OUT_DIR = Path("/kaggle/working/recodai_luc_gate_artifacts")
OUT_DIR.mkdir(parents=True, exist_ok=True)

cfg_path = OUT_DIR / "best_gate_config.json"

# Resolution order: in-memory bundle first, JSON file second, otherwise abort.
if isinstance(globals().get("BEST_GATE_BUNDLE"), dict):
    best_bundle, source = BEST_GATE_BUNDLE, "memory(BEST_GATE_BUNDLE)"
elif not cfg_path.exists():
    raise FileNotFoundError("Best config not found. Jalankan Step 4 dulu (Transformer opt).")
else:
    best_bundle, source = json.loads(cfg_path.read_text()), str(cfg_path)

print("\nLoaded best config from:", source)
print("  type:", best_bundle.get("type"))
print("  chosen_cfg:", best_bundle.get("selection", {}).get("chosen_cfg"))

# Only the Step 4 transformer bundle format ("transformer_ft") is accepted.
if best_bundle.get("type") != "transformer_ft":
    raise ValueError("best_gate_config.json bukan transformer_ft. Pastikan Step 4 yang dipakai versi TRANSFORMER ONLY.")

# Shallow copy so later overrides never touch the loaded bundle.
cfg = dict(best_bundle.get("cfg", {}))
if not cfg:
    raise ValueError("Config `cfg` kosong di best_gate_config.json (unexpected).")

# ----------------------------
# 2) Training knobs (enhanced)
# ----------------------------
# Base seed comes from the Step 4 bundle (2025 when the bundle has no "seed").
FINAL_SEED = int(best_bundle.get("seed", 2025))

# Mixed precision is enabled only when a CUDA device is used.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
use_amp = device.type == "cuda"

# Internal validation split by case_id (recommended for early stopping).
USE_INTERNAL_VAL = True
VAL_FRAC_CASE = 0.10     # 10% of the case_ids go to validation (group-safe)

# Early-stopping controls; patience / min_delta fall back to 12 / 1e-4.
EARLY_STOP = True
PATIENCE = int(cfg.get("patience", 12))
MIN_DELTA = float(cfg.get("min_delta", 1e-4))

# "Max mode" raises model capacity (optional).
# - On OOM, switch MAX_MODE off or lower batch_size/d_model/layers.
MAX_MODE = False

# Optional multi-seed ensemble (stronger but slower).
N_SEEDS = 1  # set to 3 for a stronger model (inference then averages the seeds)

def seed_everything(seed: int):
    """Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with ``seed``.

    Also exports ``PYTHONHASHSEED`` and configures cuDNN for speed:
    ``benchmark`` is switched on and ``deterministic`` is switched off.
    """
    import random

    os.environ["PYTHONHASHSEED"] = str(seed)

    # Same seed for every random number generator used by the pipeline.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = True
    cudnn.deterministic = False

seed_everything(FINAL_SEED)

# Echo the effective run settings.
print("\nDevice:", device, "| AMP:", use_amp)
print("USE_INTERNAL_VAL:", USE_INTERNAL_VAL, "| VAL_FRAC_CASE:", VAL_FRAC_CASE)
print("EARLY_STOP:", EARLY_STOP, "| PATIENCE:", PATIENCE)
print("MAX_MODE:", MAX_MODE, "| N_SEEDS:", N_SEEDS)

# Apply MAX_MODE override (more "powerful")
if MAX_MODE:
    # Rebuild cfg as a new dict: capacity and regularisation are raised to at
    # least the floors below, lr and batch_size are capped.
    cfg = {
        **cfg,
        "d_model": int(max(cfg.get("d_model", 384), 512)),
        "n_layers": int(max(cfg.get("n_layers", 8), 12)),
        "n_heads": int(max(cfg.get("n_heads", 8), 16)),
        "ffn_mult": int(max(cfg.get("ffn_mult", 4), 4)),
        "dropout": float(max(cfg.get("dropout", 0.20), 0.28)),
        "attn_dropout": float(max(cfg.get("attn_dropout", 0.10), 0.15)),
        "lr": float(min(cfg.get("lr", 2e-4), 1.2e-4)),
        "weight_decay": float(max(cfg.get("weight_decay", 5e-3), 1e-2)),
        "epochs": int(max(cfg.get("epochs", 70), 90)),
        "batch_size": int(min(cfg.get("batch_size", 512), 512)),  # guard against OOM
        "warmup_frac": float(max(cfg.get("warmup_frac", 0.10), 0.10)),
        "grad_clip": float(cfg.get("grad_clip", 1.0)),
    }
    print("\n[MAX_MODE] Applied override cfg:")
    for k in ("d_model", "n_layers", "n_heads", "ffn_mult", "dropout",
              "attn_dropout", "lr", "weight_decay", "epochs", "batch_size"):
        print(f"  {k}: {cfg[k]}")

# ----------------------------
# 3) Dataset + standardizer
# ----------------------------
class TabDataset(Dataset):
    """In-memory tabular dataset backed by torch tensors.

    ``X`` is converted to a float32 tensor and ``y`` (when given) to an int64
    tensor. Indexing yields ``(features, label)`` when labels were supplied and
    the feature tensor alone otherwise.
    """

    def __init__(self, X, y=None):
        self.X = torch.from_numpy(X.astype(np.float32))
        self.y = torch.from_numpy(y.astype(np.int64)) if y is not None else None

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        features = self.X[idx]
        if self.y is not None:
            return features, self.y[idx]
        return features

def fit_standardizer(X_tr: np.ndarray):
    """Return per-column ``(mean, std)`` of ``X_tr`` as float32 arrays.

    Statistics are accumulated in float64. Columns whose standard deviation is
    below 1e-8 get a scale of 1.0 so the later division stays well defined.
    """
    center = np.mean(X_tr, axis=0, dtype=np.float64)
    spread = np.std(X_tr, axis=0, dtype=np.float64)
    near_constant = spread < 1e-8
    scale = np.where(near_constant, 1.0, spread)
    return center.astype(np.float32), scale.astype(np.float32)

def apply_standardizer(X_in: np.ndarray, mu: np.ndarray, sig: np.ndarray):
    """Standardize ``X_in`` as ``(X_in - mu) / sig`` and return it as float32."""
    centered = X_in - mu
    scaled = centered / sig
    return scaled.astype(np.float32)

# ----------------------------
# 4) Model: FT-Transformer numeric
# ----------------------------
class FTTransformer(nn.Module):
    """Transformer over numeric features that outputs one logit per row.

    Each scalar feature is turned into a ``d_model`` token
    (``x_j * w_j + b_j + feat_emb_j``), a learned CLS token is prepended, the
    sequence goes through a pre-norm GELU ``nn.TransformerEncoder``, and the
    CLS output is layer-normalised and fed to a two-layer MLP head.

    Note: ``attn_dropout`` is the dropout rate applied to the feature tokens
    before the encoder (``token_dropout``); the encoder layers themselves use
    ``dropout``.
    """

    def __init__(self, n_features, d_model=384, n_heads=8, n_layers=8, ffn_mult=4,
                 dropout=0.2, attn_dropout=0.1):
        super().__init__()
        self.n_features = n_features
        self.d_model = d_model

        # Per-feature tokenizer parameters. Creation order is kept fixed so a
        # seeded initialisation and the state_dict layout stay reproducible.
        self.w = nn.Parameter(0.02 * torch.randn(n_features, d_model))
        self.b = nn.Parameter(torch.zeros(n_features, d_model))
        self.feat_emb = nn.Parameter(0.02 * torch.randn(n_features, d_model))

        # Learned CLS token, expanded over the batch in forward().
        self.cls = nn.Parameter(0.02 * torch.randn(1, 1, d_model))

        encoder_block = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=ffn_mult * d_model,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
            norm_first=True,
        )
        self.encoder = nn.TransformerEncoder(encoder_block, num_layers=n_layers)

        self.token_dropout = nn.Dropout(attn_dropout)
        self.norm = nn.LayerNorm(d_model)

        self.head = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_model, 1),
        )

    def forward(self, x):
        # (B, F, 1) * (F, D) broadcasts to (B, F, D): one token per feature.
        tokens = x.unsqueeze(-1) * self.w + self.b
        tokens = self.token_dropout(tokens + self.feat_emb)

        # Prepend the CLS token to every row of the batch.
        cls_tokens = self.cls.expand(tokens.size(0), -1, -1)
        encoded = self.encoder(torch.cat([cls_tokens, tokens], dim=1))

        # Only the CLS position feeds the classification head.
        pooled = self.norm(encoded[:, 0])
        return self.head(pooled).squeeze(-1)

# ----------------------------
# 5) Scheduler: warmup + cosine
# ----------------------------
def make_warmup_cosine_scheduler(optimizer, total_steps: int, warmup_steps: int):
    """Build a ``LambdaLR`` with linear warmup followed by cosine decay.

    The multiplier is ``(step + 1) / warmup_steps`` while ``step`` is below
    ``warmup_steps``; afterwards it follows half a cosine from 1 down to 0 and
    stays at 0 once ``total_steps`` is reached.
    """
    warmup_span = float(max(1, warmup_steps))
    decay_span = float(max(1, total_steps - warmup_steps))

    def lr_lambda(step):
        if step >= warmup_steps:
            progress = (step - warmup_steps) / decay_span
            return 0.5 * (1.0 + math.cos(math.pi * min(1.0, progress)))
        return float(step + 1) / warmup_span

    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)

@torch.no_grad()
def predict_proba(model, loader):
    """Return sigmoid probabilities for every row produced by ``loader``.

    Args:
        model: module mapping a feature batch to logits.
        loader: iterable of batches. A batch is either a feature tensor or a
            ``(features, labels, ...)`` tuple/list, which is what a
            ``DataLoader`` over a labelled dataset yields; only the first
            element is used.

    Returns:
        float32 NumPy array with the per-batch outputs concatenated along
        axis 0 (an empty array when the loader yields no batch).
    """
    model.eval()

    # Run on whatever device the model lives on instead of relying on a
    # module-level `device` variable.
    first_param = next(model.parameters(), None)
    target = first_param.device if first_param is not None else torch.device("cpu")

    chunks = []
    for batch in loader:
        # Labelled loaders collate to [X, y]; calling `.to()` on that list was
        # the crash this guard fixes.
        xb = batch[0] if isinstance(batch, (list, tuple)) else batch
        logits = model(xb.to(target))
        chunks.append(torch.sigmoid(logits).detach().cpu().numpy())

    if not chunks:
        return np.zeros((0,), dtype=np.float32)
    return np.concatenate(chunks, axis=0).astype(np.float32)

def safe_logloss(y_true, p):
    """Binary log-loss of ``p`` against ``y_true`` as a Python float.

    Probabilities are cast to float64 and clipped to ``[1e-8, 1 - 1e-8]``;
    the label set is pinned to ``[0, 1]`` so single-class inputs still score.
    """
    probs = np.asarray(p, dtype=np.float64).clip(1e-8, 1-1e-8)
    loss = log_loss(y_true, probs, labels=[0,1])
    return float(loss)

def safe_auc(y_true, p):
    """ROC AUC as a float, or ``None`` when ``y_true`` holds fewer than two classes."""
    n_classes = len(np.unique(y_true))
    return float(roc_auc_score(y_true, p)) if n_classes >= 2 else None

# ----------------------------
# 6) Internal validation split by case_id (group-safe)
# ----------------------------
def make_case_split(df: pd.DataFrame, val_frac=0.10, seed=2025):
    """Return a boolean row mask selecting a group-safe validation subset.

    Cases are labelled by the maximum ``y`` over their rows, then positive
    (label 1) and negative (label 0) cases are shuffled independently and the
    first ``int(n * val_frac)`` of each go to validation. Every non-empty
    class contributes at least one validation case. All rows of a case land
    on the same side of the split.

    Args:
        df: frame with ``case_id`` and ``y`` columns.
        val_frac: fraction of cases (per class) assigned to validation.
        seed: seed of the ``numpy.random.RandomState`` driving the shuffles.

    Returns:
        Boolean NumPy array aligned with ``df`` rows; ``True`` marks validation.

    Raises:
        ValueError: if ``df`` has no ``case_id`` column.
    """
    if "case_id" not in df.columns:
        raise ValueError("df_train_tabular must contain case_id for group-safe split.")

    # case-level label: any forged row in that case
    g = df.groupby("case_id")["y"].max().reset_index().rename(columns={"y": "case_y"})

    # Explicit copies: the arrays are shuffled in place and must never be
    # read-only views of pandas-owned buffers.
    pos_cases = g.loc[g["case_y"] == 1, "case_id"].to_numpy(copy=True)
    neg_cases = g.loc[g["case_y"] == 0, "case_id"].to_numpy(copy=True)

    # Shuffle order (positives, then negatives) determines the RNG stream.
    rng = np.random.RandomState(seed)
    rng.shuffle(pos_cases)
    rng.shuffle(neg_cases)

    n_val_pos = max(1, int(len(pos_cases) * val_frac)) if len(pos_cases) > 0 else 0
    n_val_neg = max(1, int(len(neg_cases) * val_frac)) if len(neg_cases) > 0 else 0

    val_cases = np.concatenate([pos_cases[:n_val_pos], neg_cases[:n_val_neg]])

    # Vectorised membership test on the raw ids: no int() coercion, so string
    # or other hashable case ids work as well as integers.
    is_val = df["case_id"].isin(val_cases).to_numpy(dtype=bool)
    return is_val

# ----------------------------
# 7) Train full (optionally with internal val early stopping)
# ----------------------------
def train_full_once(seed_offset=0):
    """Train one final FT-Transformer and return a serialisable pack.

    Reads the module-level settings (``cfg``, ``FINAL_SEED``, ``device``,
    ``use_amp``, ``USE_INTERNAL_VAL``, ``VAL_FRAC_CASE``, ``EARLY_STOP``,
    ``PATIENCE``, ``MIN_DELTA``) and data (``df_train_tabular``, ``X_all``,
    ``y_all``).

    When ``USE_INTERNAL_VAL`` is on, a case-level validation split is held out,
    the epoch with the lowest validation log-loss is tracked and its weights
    are restored at the end; training stops early after ``PATIENCE`` epochs
    without an improvement larger than ``MIN_DELTA`` (if ``EARLY_STOP``).
    Otherwise all rows are used and the weights of the last epoch are kept.

    Args:
        seed_offset: added to ``FINAL_SEED`` to seed both the split and the
            training run.

    Returns:
        dict with the CPU ``state_dict``, the standardizer (``mu``/``sig``),
        the config, the seed, ``pos_weight``, row counts, the best epoch and
        validation log-loss (``None`` without validation) and the wall time.
    """
    run_seed = FINAL_SEED + seed_offset
    seed_everything(run_seed)

    # Loop-invariant settings, read once.
    n_epochs = int(cfg["epochs"])
    batch_size = int(cfg["batch_size"])
    grad_clip = float(cfg.get("grad_clip", 1.0))
    pin_memory = (device.type == "cuda")

    # choose train/val
    if USE_INTERNAL_VAL:
        is_val = make_case_split(df_train_tabular, val_frac=VAL_FRAC_CASE, seed=run_seed)
        tr_idx = np.where(~is_val)[0]
        va_idx = np.where(is_val)[0]
        X_tr, y_tr = X_all[tr_idx], y_all[tr_idx]
        X_va, y_va = X_all[va_idx], y_all[va_idx]
        print(f"  internal split: train={len(tr_idx)} | val={len(va_idx)} | val_pos%={float(y_va.mean())*100.0:.2f}")
    else:
        X_tr, y_tr = X_all, y_all
        X_va = y_va = None

    # standardize from the TRAIN split (when a val split exists) / full data (otherwise)
    mu, sig = fit_standardizer(X_tr)
    X_trn = apply_standardizer(X_tr, mu, sig)

    dl_tr = DataLoader(
        TabDataset(X_trn, y_tr),
        batch_size=batch_size,
        shuffle=True,
        num_workers=2,
        pin_memory=pin_memory,
        drop_last=False
    )

    if USE_INTERNAL_VAL:
        X_van = apply_standardizer(X_va, mu, sig)
        # Features only: the loader then yields plain feature tensors, which is
        # the batch shape predict_proba consumes. Labels stay in y_va for the
        # metrics. (A labelled dataset here collates to [X, y] and broke the
        # first validation pass.)
        dl_va = DataLoader(
            TabDataset(X_van),
            batch_size=batch_size,
            shuffle=False,
            num_workers=2,
            pin_memory=pin_memory,
            drop_last=False
        )
    else:
        dl_va = None

    model = FTTransformer(
        n_features=X_all.shape[1],
        d_model=int(cfg["d_model"]),
        n_heads=int(cfg["n_heads"]),
        n_layers=int(cfg["n_layers"]),
        ffn_mult=int(cfg["ffn_mult"]),
        dropout=float(cfg["dropout"]),
        attn_dropout=float(cfg["attn_dropout"]),
    ).to(device)

    # pos_weight from the TRAIN split (neg/pos ratio; max(1, pos) avoids a zero division)
    pos = int(y_tr.sum())
    neg = int(len(y_tr) - pos)
    pos_weight = float(neg / max(1, pos))
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([pos_weight], device=device))

    opt = torch.optim.AdamW(
        model.parameters(),
        lr=float(cfg["lr"]),
        weight_decay=float(cfg["weight_decay"]),
    )

    # The scheduler is stepped once per optimizer step, so it is sized in steps.
    total_steps = n_epochs * max(1, len(dl_tr))
    warmup_steps = int(float(cfg["warmup_frac"]) * total_steps)
    sch = make_warmup_cosine_scheduler(opt, total_steps=total_steps, warmup_steps=warmup_steps)

    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    best_val = 1e9
    best_state = None
    best_epoch = -1
    bad = 0

    t0 = time.time()
    for epoch in range(n_epochs):
        model.train()
        loss_sum = 0.0
        n_sum = 0

        for xb, yb in dl_tr:
            xb = xb.to(device)
            yb = yb.to(device).float()

            opt.zero_grad(set_to_none=True)
            with torch.cuda.amp.autocast(enabled=use_amp):
                logits = model(xb)
                loss = loss_fn(logits, yb)

            scaler.scale(loss).backward()
            if grad_clip > 0:
                # Gradients must be unscaled before clipping by norm.
                scaler.unscale_(opt)
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            scaler.step(opt)
            scaler.update()
            sch.step()

            loss_sum += float(loss.item()) * xb.size(0)
            n_sum += xb.size(0)

        tr_loss = loss_sum / max(1, n_sum)

        if dl_va is not None:
            p_va = predict_proba(model, dl_va)
            vll = safe_logloss(y_va, p_va)
            vauc = safe_auc(y_va, p_va)

            improved = (best_val - vll) > MIN_DELTA
            if improved:
                best_val = vll
                best_epoch = epoch
                # clone(): on a CPU model `.cpu()` returns the live tensor, so
                # without a copy the snapshot would keep changing with training.
                best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
                bad = 0
            else:
                bad += 1

            print(f"  epoch {epoch+1:03d}/{n_epochs} | tr_loss={tr_loss:.5f} | val_ll={vll:.5f} | val_auc={(vauc if vauc is not None else float('nan')):.5f} | bad={bad}")
            if EARLY_STOP and bad >= PATIENCE:
                print(f"  early stop at epoch {epoch+1}, best_epoch={best_epoch+1}, best_val_ll={best_val:.5f}")
                break
        else:
            # no val
            print(f"  epoch {epoch+1:03d}/{n_epochs} | tr_loss={tr_loss:.5f}")

        gc.collect()

    # restore best (if val used)
    if best_state is not None:
        model.load_state_dict(best_state, strict=True)

    # pack
    pack = {
        "type": "transformer_ft_full",
        "state_dict": {k: v.detach().cpu() for k, v in model.state_dict().items()},
        "mu": mu,
        "sig": sig,
        "cfg": cfg,
        "seed": int(run_seed),
        "pos_weight": float(pos_weight),
        "train_rows": int(len(y_tr)),
        "val_rows": int(len(y_va)) if USE_INTERNAL_VAL else 0,
        "best_epoch": int(best_epoch + 1) if best_epoch >= 0 else None,
        "best_val_logloss": float(best_val) if best_state is not None else None,
        "train_time_s": float(time.time() - t0),
    }
    return pack

# ----------------------------
# 8) Train final model(s)
# ----------------------------
# One pack per seed offset; with N_SEEDS == 1 the list has a single entry.
final_packs = []
for s in range(N_SEEDS):
    print(f"\n[Final Train] seed_offset={s}")
    pack = train_full_once(seed_offset=s)
    final_packs.append(pack)
    gc.collect()

# ----------------------------
# 9) Save final artifacts
# ----------------------------
# Model file: the feature order, every trained pack and where the config came from.
final_model_path = OUT_DIR / "final_gate_model.pt"
torch.save(
    dict(
        feature_cols=FEATURE_COLS,
        packs=final_packs,     # list (even if N_SEEDS=1)
        bundle_source=source,
    ),
    final_model_path,
)

# JSON sidecar describing how the final model(s) were trained.
final_bundle = {
    "type": "transformer_ft_full",
    "n_seeds": int(N_SEEDS),
    "seeds": [int(trained["seed"]) for trained in final_packs],
    "feature_cols": FEATURE_COLS,
    "cfg": cfg,
    "use_internal_val": bool(USE_INTERNAL_VAL),
    "val_frac_case": float(VAL_FRAC_CASE) if USE_INTERNAL_VAL else 0.0,
    "early_stop": bool(EARLY_STOP and USE_INTERNAL_VAL),
    "patience": int(PATIENCE),
    "min_delta": float(MIN_DELTA),
    "train_rows": int(len(y_all)),
    "pos_rate": float(y_all.mean()),
    "max_mode": bool(MAX_MODE),
    "notes": "Final full-data training (Transformer only). Inference nanti bisa avg antar-seed (dan/atau fold-pack dari Step 4).",
    "ref_selection": best_bundle.get("selection", {}),
}

final_bundle_path = OUT_DIR / "final_gate_bundle.json"
final_bundle_path.write_text(json.dumps(final_bundle, indent=2))

print("\nSaved final training artifacts:")
print("  model  ->", final_model_path)
print("  bundle ->", final_bundle_path)

# Export globals
FINAL_GATE_BUNDLE = final_bundle
FINAL_GATE_MODEL_PT = str(final_model_path)


# Finalize & Save Model Bundle (Reproducible)

In [None]:
# ============================================================
# Step 6 — Finalize & Save Model Bundle (Reproducible) — FULL REVISION (TRANSFORMER COMPAT)
# - Goal: gather the important artifacts into one bundle that is easy to reload
# - No submission is produced here
#
# The bundle contains (aligned with the Transformer .pt pipeline):
#  1) final model: final_gate_model.pt  (fallback: .joblib if present)
#  2) feature_cols.json
#  3) thresholds.json (placeholder / can be filled in by a later tuning step)
#  4) training reports (baseline/opt/final) when available
#  5) cfg metadata (MATCH_CFG_DIR, PRED_CFG_DIR, DINO_CFG_DIR, roots) from PATHS (when available)
#  6) manifest + bundle pack + a single ZIP file (portable)
#
# REQUIRES:
# - Step 2 (feature_cols.json already written)
# - Step 5 (final_gate_model.pt + final_gate_bundle.json already written)
# ============================================================

import os, json, time, platform, warnings, zipfile
from pathlib import Path
import numpy as np
import pandas as pd
import joblib

warnings.filterwarnings("ignore")

OUT_DIR = Path("/kaggle/working/recodai_luc_gate_artifacts")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------
# 0) Locate required artifacts
# ----------------------------
feature_cols_path = OUT_DIR / "feature_cols.json"
final_bundle_path = OUT_DIR / "final_gate_bundle.json"
final_model_pt = OUT_DIR / "final_gate_model.pt"
final_model_joblib = OUT_DIR / "final_gate_model.joblib"  # legacy fallback

if not feature_cols_path.exists():
    raise FileNotFoundError(f"Missing feature_cols: {feature_cols_path} (jalankan Step 2 dulu)")

# A final model is mandatory: the Transformer .pt is preferred, the legacy
# .joblib is used only when the .pt is absent.
if not (final_model_pt.exists() or final_model_joblib.exists()):
    raise FileNotFoundError(
        f"Missing final model: {final_model_pt} (or legacy {final_model_joblib}). "
        "Jalankan Step 5 dulu."
    )
if final_model_pt.exists():
    final_model_path, model_format = final_model_pt, "torch_pt"
else:
    final_model_path, model_format = final_model_joblib, "joblib"

# Optional artifacts (referenced later only when the files exist)
baseline_report_path = OUT_DIR / "baseline_cv_report.json"
opt_config_path = OUT_DIR / "best_gate_config.json"
opt_results_csv = OUT_DIR / "opt_search" / "opt_results.csv"
opt_fold_csv = OUT_DIR / "opt_search" / "opt_fold_details.csv"

oof_baseline_csv = OUT_DIR / "oof_baseline.csv"
oof_tf_baseline_csv = OUT_DIR / "oof_baseline_transformer.csv"

print("Found artifacts:")
print("  final_model  :", final_model_path, f"(format={model_format})")
print("  final_bundle :", final_bundle_path if final_bundle_path.exists() else "(missing/skip)")
print("  feature_cols :", feature_cols_path)

# ----------------------------
# 1) Load minimal metadata
# ----------------------------
def _read_json_or_default(path, default, label):
    """Best-effort JSON load: return ``default`` if ``path`` is missing or unreadable.

    Read errors (``OSError``) and malformed content (``ValueError``, which
    covers ``json.JSONDecodeError`` and decoding errors) are reported with a
    warning instead of being swallowed silently; the bundle step keeps going.
    """
    if not path.exists():
        return default
    try:
        return json.loads(path.read_text())
    except (OSError, ValueError) as err:
        print(f"[WARN] could not load {label} from {path}: {err!r} (using default)")
        return default


# feature_cols.json is required, so a failure here is allowed to propagate.
feature_cols = json.loads(feature_cols_path.read_text())

# Optional metadata: {} / None when the file is absent or cannot be parsed.
final_bundle = _read_json_or_default(final_bundle_path, {}, "final_bundle")
baseline_report = _read_json_or_default(baseline_report_path, None, "baseline_report")
opt_config = _read_json_or_default(opt_config_path, None, "opt_config")

# ----------------------------
# 2) Threshold placeholders (can be updated later)
# ----------------------------
thresholds_path = OUT_DIR / "thresholds.json"
if not thresholds_path.exists():
    # T_gate is picked from, in order:
    # - final_bundle["oof_best_thr"] (when present)
    # - opt_config["selection"]["oof_best_thr"] (when present)
    # - the default 0.5
    T_gate = final_bundle.get("oof_best_thr", None) if isinstance(final_bundle, dict) else None

    if T_gate is None and isinstance(opt_config, dict):
        sel = opt_config.get("selection", {})
        if not isinstance(sel, dict):
            sel = {}
        T_gate = sel.get("oof_best_thr", None)

    if T_gate is None:
        T_gate = 0.5

    thresholds = {
        "T_gate": float(T_gate),
        "beta_for_tuning": 0.5,
        "guards": {
            "min_area_frac": None,
            "max_area_frac": None,
            "max_components": None
        },
        "notes": "Placeholder. Update after calibration/threshold tuning."
    }
    thresholds_path.write_text(json.dumps(thresholds, indent=2))
else:
    # An existing thresholds.json is reused as-is.
    thresholds = json.loads(thresholds_path.read_text())

# ----------------------------
# 3) Capture dataset/cfg metadata (if available)
# ----------------------------
def _cfg_meta_from_paths(paths):
    """Extract the path/config entries recorded in the bundle from ``paths``.

    Missing keys map to ``None``. Path-like values are converted to ``str`` so
    the result can be embedded in the JSON manifest (``json.dumps`` cannot
    serialise ``pathlib.Path`` objects); other values are passed through.
    """
    wanted = (
        "COMP_ROOT",
        "OUT_DS_ROOT",
        "OUT_ROOT",
        "MATCH_CFG_DIR",
        "PRED_CFG_DIR",
        "DINO_CFG_DIR",
        "DINO_LARGE_DIR",
        "PRED_FEAT_TRAIN",
        "MATCH_FEAT_TRAIN",
        "DF_TRAIN_ALL",
        "CV_CASE_FOLDS",
        "IMG_PROFILE_TRAIN",
    )
    meta = {}
    for key in wanted:
        value = paths.get(key, None)
        meta[key] = str(value) if isinstance(value, os.PathLike) else value
    return meta


# PATHS is optional; cfg_meta stays empty when it is not in the namespace.
cfg_meta = {}
if "PATHS" in globals() and isinstance(PATHS, dict):
    cfg_meta = _cfg_meta_from_paths(PATHS)

# ----------------------------
# 4) Build reproducible manifest
# ----------------------------
task_str = "Recod.ai/LUC — Gate Model (authentic vs forged) — DINOv2 tabular features + Transformer gate"

manifest = {
    "created_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "python": platform.python_version(),
    "platform": platform.platform(),
    "bundle_version": "v2",
    "task": task_str,
    "model_format": model_format,
    # Required artifacts are always listed; optional ones are None when the
    # file does not exist.
    "artifacts": {
        "final_model": str(final_model_path),
        "final_bundle": str(final_bundle_path) if final_bundle_path.exists() else None,
        "feature_cols": str(feature_cols_path),
        "thresholds": str(thresholds_path),
        **{
            label: (str(artifact) if artifact.exists() else None)
            for label, artifact in (
                ("baseline_report", baseline_report_path),
                ("opt_config", opt_config_path),
                ("opt_results_csv", opt_results_csv),
                ("opt_fold_details_csv", opt_fold_csv),
                ("oof_baseline_csv", oof_baseline_csv),
                ("oof_baseline_transformer_csv", oof_tf_baseline_csv),
            )
        },
    },
    "cfg_meta": cfg_meta,
    # Fields copied from final_bundle (None when it is not a dict or lacks the key).
    "model_summary": {
        **{
            field: (final_bundle.get(field, None) if isinstance(final_bundle, dict) else None)
            for field in ("type", "n_seeds", "seeds", "train_rows", "pos_rate")
        },
        "feature_count": int(len(feature_cols)),
        "T_gate": float(thresholds.get("T_gate", 0.5)),
    },
    "baseline_summary": (baseline_report.get("overall") if isinstance(baseline_report, dict) else None),
    "opt_summary": (opt_config.get("selection", None) if isinstance(opt_config, dict) else None),
}

manifest_path = OUT_DIR / "model_bundle_manifest.json"
manifest_path.write_text(json.dumps(manifest, indent=2))

# ----------------------------
# 5) Create a single "bundle pack" (joblib) for easy reload
#    (the pack holds metadata + a path pointer only; the model stays a separate file)
# ----------------------------
bundle_pack_path = OUT_DIR / "model_bundle_pack.joblib"

bundle_pack = dict(
    model_format=model_format,
    final_model_path=str(final_model_path),
    final_bundle=final_bundle,
    feature_cols=feature_cols,
    thresholds=thresholds,
    cfg_meta=cfg_meta,
    manifest=manifest,
)

joblib.dump(bundle_pack, bundle_pack_path)

# ----------------------------
# 6) Create portable ZIP (a single file containing every artifact that exists)
# ----------------------------
zip_path = OUT_DIR / "model_bundle_v2.zip"

def _safe_add(zf: zipfile.ZipFile, p: Path, arcname: str):
    """Write ``p`` into the open archive ``zf`` under ``arcname``.

    Nothing is written when ``p`` is ``None``, does not exist, or is not a
    regular file.
    """
    if p is not None:
        candidate = Path(p)
        if candidate.exists() and candidate.is_file():
            zf.write(candidate, arcname=arcname)

# Archive members in write order: (source path, name inside the ZIP).
# _safe_add skips anything that is missing, so optional artifacts need no
# extra existence check here.
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
    for _member, _arcname in (
        (final_model_path, final_model_path.name),
        (final_bundle_path, final_bundle_path.name),
        (feature_cols_path, feature_cols_path.name),
        (thresholds_path, thresholds_path.name),
        (manifest_path, manifest_path.name),
        (bundle_pack_path, bundle_pack_path.name),
        # optional extras
        (baseline_report_path, baseline_report_path.name),
        (opt_config_path, opt_config_path.name),
        (opt_results_csv, f"opt_search/{opt_results_csv.name}"),
        (opt_fold_csv, f"opt_search/{opt_fold_csv.name}"),
        (oof_baseline_csv, oof_baseline_csv.name),
        (oof_tf_baseline_csv, oof_tf_baseline_csv.name),
    ):
        _safe_add(zf, _member, _arcname)

print("\nOK — Model bundle finalized")
print("  manifest   ->", manifest_path)
print("  pack       ->", bundle_pack_path)
print("  thresholds ->", thresholds_path)
print("  zip        ->", zip_path)

print("\nBundle summary:")
print("  model_format :", model_format)
print("  feature_cnt  :", len(feature_cols))
print("  T_gate       :", thresholds.get("T_gate"))
print("  task         :", task_str)
