# Set Paths & Select Config (CFG)

In [7]:
# ============================================================
# STAGE 0 — Set Paths & Select Config (CFG) (Kaggle-ready, offline)
# - Root kompetisi: /kaggle/input/recodai-luc-scientific-image-forgery-detection
# - Root dataset output: /kaggle/input/recod-ailuc-dinov2-base  (atau variasi nama lain)
# - Auto-detect jika nama folder beda
# - Auto-pilih CFG terbaik untuk MATCH + PRED berdasarkan coverage fitur train
#
# Output globals:
# - COMP_ROOT, OUT_DS_ROOT, OUT_ROOT
# - PATHS (dict jalur penting)
# - MATCH_CFG_DIR, PRED_CFG_DIR, DINO_CFG_DIR (opsional)
# ============================================================

import os, re, json
from pathlib import Path
import pandas as pd

# ----------------------------
# Helper: find competition root
# ----------------------------
def find_comp_root(preferred: str = "/kaggle/input/recodai-luc-scientific-image-forgery-detection") -> Path:
    p = Path(preferred)
    if p.exists():
        return p
    # fallback: scan /kaggle/input for a dir that looks like the competition
    base = Path("/kaggle/input")
    if not base.exists():
        raise FileNotFoundError("/kaggle/input not found (are you in Kaggle notebook?)")
    cands = []
    for d in base.iterdir():
        if not d.is_dir():
            continue
        # heuristic: must have sample_submission.csv and test_images/train_images
        if (d / "sample_submission.csv").exists() and ((d / "train_images").exists() or (d / "test_images").exists()):
            cands.append(d)
    if not cands:
        raise FileNotFoundError(
            "Competition root not found. Expected folder with sample_submission.csv and train_images/test_images."
        )
    # prefer one containing 'recodai' and 'forgery'
    cands.sort(key=lambda x: (("recodai" not in x.name.lower()), ("forgery" not in x.name.lower()), x.name))
    return cands[0]

# ----------------------------
# Helper: find output dataset root
# ----------------------------
def find_output_dataset_root(preferred_names=(
    "recod-ailuc-dinov2-base",
    "recod-ai-luc-dinov2-base",
    "recodai-luc-dinov2-base",
    "recodai-luc-dinov2",
)) -> Path:
    base = Path("/kaggle/input")
    # try preferred direct hits first
    for nm in preferred_names:
        p = base / nm
        if p.exists():
            return p

    # fallback: scan for a dataset that contains recodai_luc/artifacts
    cands = []
    for d in base.iterdir():
        if not d.is_dir():
            continue
        # either directly has recodai_luc, or nested one-level
        if (d / "recodai_luc" / "artifacts").exists():
            cands.append(d)
            continue
        # some datasets wrap inside one folder
        inner = list(d.glob("*/recodai_luc/artifacts"))
        if inner:
            cands.append(d)
    if not cands:
        raise FileNotFoundError(
            "Output dataset root not found. Expected something like /kaggle/input/<...>/recodai_luc/artifacts/"
        )
    # prefer those containing 'dinov2' in name
    cands.sort(key=lambda x: (("dinov2" not in x.name.lower()), x.name))
    return cands[0]

# ----------------------------
# Helper: resolve OUT_ROOT = <dataset>/recodai_luc
# ----------------------------
def resolve_out_root(out_ds_root: Path) -> Path:
    direct = out_ds_root / "recodai_luc"
    if direct.exists():
        return direct
    # else find nested
    hits = list(out_ds_root.glob("*/recodai_luc"))
    if hits:
        return hits[0]
    raise FileNotFoundError(f"Could not locate recodai_luc folder under {out_ds_root}")

# ----------------------------
# Helper: pick best cfg directory by coverage of train feature csv
# ----------------------------
def pick_best_cfg(cache_root: Path, prefix: str, feat_train_pattern: str) -> Path:
    """
    prefix: e.g. 'match_base_cfg_' or 'pred_base'
    feat_train_pattern: glob pattern for feature csv inside cfg dir
    """
    cands = []
    for d in cache_root.iterdir():
        if not d.is_dir():
            continue
        if not d.name.startswith(prefix):
            continue
        # find feature file
        feat_files = list(d.glob(feat_train_pattern))
        if not feat_files:
            continue
        feat_path = feat_files[0]
        # score by row count (coverage)
        try:
            n = sum(1 for _ in open(feat_path, "r", encoding="utf-8", errors="ignore")) - 1
        except Exception:
            n = -1
        cands.append((n, d, feat_path))

    if not cands:
        raise FileNotFoundError(f"No cfg folders found under {cache_root} with prefix={prefix} and {feat_train_pattern}")

    # choose max rows, tie-break by name
    cands.sort(key=lambda x: (-x[0], x[1].name))
    best_n, best_dir, best_feat = cands[0]
    return best_dir

# ----------------------------
# 0) Locate roots
# ----------------------------
COMP_ROOT = find_comp_root("/kaggle/input/recodai-luc-scientific-image-forgery-detection")
OUT_DS_ROOT = find_output_dataset_root()

OUT_ROOT = resolve_out_root(OUT_DS_ROOT)  # .../recodai_luc
ART_DIR = OUT_ROOT / "artifacts"
CACHE_DIR = OUT_ROOT / "cache"

# ----------------------------
# 1) Competition paths (raw images/masks)
# ----------------------------
PATHS = {}
PATHS["COMP_ROOT"] = str(COMP_ROOT)
PATHS["SAMPLE_SUB"] = str(COMP_ROOT / "sample_submission.csv")

# common competition layout (handle if nested)
PATHS["TRAIN_IMAGES"] = str(COMP_ROOT / "train_images")
PATHS["TEST_IMAGES"]  = str(COMP_ROOT / "test_images")
PATHS["TRAIN_MASKS"]  = str(COMP_ROOT / "train_masks")
PATHS["SUPP_IMAGES"]  = str(COMP_ROOT / "supplemental_images")
PATHS["SUPP_MASKS"]   = str(COMP_ROOT / "supplemental_masks")

# optional subfolders inside train_images
PATHS["TRAIN_AUTH_DIR"] = str(COMP_ROOT / "train_images" / "authentic")
PATHS["TRAIN_FORG_DIR"] = str(COMP_ROOT / "train_images" / "forged")

# ----------------------------
# 2) Output dataset paths (clean artifacts + cache)
# ----------------------------
PATHS["OUT_DS_ROOT"] = str(OUT_DS_ROOT)
PATHS["OUT_ROOT"] = str(OUT_ROOT)
PATHS["ART_DIR"] = str(ART_DIR)
PATHS["CACHE_DIR"] = str(CACHE_DIR)

# artifacts (train tables / folds / profiles)
PATHS["DF_TRAIN_ALL"] = str(ART_DIR / "df_train_all.parquet")
PATHS["DF_TRAIN_CLS"] = str(ART_DIR / "df_train_cls.parquet")
PATHS["DF_TRAIN_SEG"] = str(ART_DIR / "df_train_seg.parquet")
PATHS["DF_TEST"]      = str(ART_DIR / "df_test.parquet")
PATHS["CV_CASE_FOLDS"]   = str(ART_DIR / "cv_case_folds.csv")
PATHS["CV_SAMPLE_FOLDS"] = str(ART_DIR / "cv_sample_folds.csv")
PATHS["IMG_PROFILE_TRAIN"] = str(ART_DIR / "image_profile_train.parquet")
PATHS["IMG_PROFILE_TEST"]  = str(ART_DIR / "image_profile_test.parquet")
PATHS["MASK_PROFILE"]      = str(ART_DIR / "mask_profile.parquet")
PATHS["CASE_SUMMARY"]      = str(ART_DIR / "case_summary.parquet")

# ----------------------------
# 3) Select best MATCH/PRED CFG dirs automatically
# ----------------------------
if not CACHE_DIR.exists():
    raise FileNotFoundError(f"CACHE_DIR not found: {CACHE_DIR}")

# Match cfg dirs look like: match_base_cfg_<hash>
MATCH_CFG_DIR = pick_best_cfg(
    CACHE_DIR,
    prefix="match_base_cfg_",
    feat_train_pattern="match_features_train_all.csv"
)
# Pred cfg dirs look like: pred_base_v3_v7_cfg_<hash> (name may vary; use startswith 'pred_base')
# We'll scan by startswith 'pred_base' and require pred_features_train_all.csv
PRED_CFG_DIR = pick_best_cfg(
    CACHE_DIR,
    prefix="pred_base",
    feat_train_pattern="pred_features_train_all.csv"
)

# DINO cache cfg (optional)
DINO_CFG_DIR = None
dino_root = CACHE_DIR / "dino_v2_large"
if dino_root.exists():
    # choose any cfg_* that has manifest_train_all.csv
    dino_cands = []
    for d in dino_root.iterdir():
        if d.is_dir() and d.name.startswith("cfg_") and (d / "manifest_train_all.csv").exists():
            dino_cands.append(d)
    if dino_cands:
        dino_cands.sort(key=lambda x: x.name)
        DINO_CFG_DIR = dino_cands[0]

# attach feature file paths
PATHS["MATCH_CFG_DIR"] = str(MATCH_CFG_DIR)
PATHS["PRED_CFG_DIR"]  = str(PRED_CFG_DIR)
PATHS["DINO_CFG_DIR"]  = str(DINO_CFG_DIR) if DINO_CFG_DIR else ""

PATHS["MATCH_FEAT_TRAIN"] = str(MATCH_CFG_DIR / "match_features_train_all.csv")
PATHS["MATCH_FEAT_TEST"]  = str(MATCH_CFG_DIR / "match_features_test.csv")
PATHS["PRED_FEAT_TRAIN"]  = str(PRED_CFG_DIR / "pred_features_train_all.csv")
PATHS["PRED_FEAT_TEST"]   = str(PRED_CFG_DIR / "pred_features_test.csv")

# ----------------------------
# 4) Sanity checks (no hard fail for optional files)
# ----------------------------
must_exist = [
    ("sample_submission.csv", PATHS["SAMPLE_SUB"]),
    ("df_train_all.parquet",  PATHS["DF_TRAIN_ALL"]),
    ("cv_case_folds.csv",     PATHS["CV_CASE_FOLDS"]),
    ("match_features_train_all.csv", PATHS["MATCH_FEAT_TRAIN"]),
    ("pred_features_train_all.csv",  PATHS["PRED_FEAT_TRAIN"]),
]
missing = [name for name, p in must_exist if not Path(p).exists()]
if missing:
    raise FileNotFoundError("Missing required files: " + ", ".join(missing))

print("OK — Roots")
print("  COMP_ROOT   :", COMP_ROOT)
print("  OUT_DS_ROOT :", OUT_DS_ROOT)
print("  OUT_ROOT    :", OUT_ROOT)
print("\nOK — Selected CFG")
print("  MATCH_CFG_DIR:", MATCH_CFG_DIR.name)
print("  PRED_CFG_DIR :", PRED_CFG_DIR.name)
print("  DINO_CFG_DIR :", (DINO_CFG_DIR.name if DINO_CFG_DIR else "(not found / optional)"))

print("\nOK — Key files")
for k in ["DF_TRAIN_ALL","CV_CASE_FOLDS","MATCH_FEAT_TRAIN","PRED_FEAT_TRAIN","IMG_PROFILE_TRAIN"]:
    p = Path(PATHS[k])
    print(f"  {k:16s}: {p}  {'(exists)' if p.exists() else '(missing/optional)'}")


OK — Roots
  COMP_ROOT   : /kaggle/input/recodai-luc-scientific-image-forgery-detection
  OUT_DS_ROOT : /kaggle/input/recod-ailuc-dinov2-base
  OUT_ROOT    : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc

OK — Selected CFG
  MATCH_CFG_DIR: match_base_cfg_f9f7ea3a65c5
  PRED_CFG_DIR : pred_base_v3_v7_cfg_5dbf0aa165
  DINO_CFG_DIR : cfg_3246fd54aab0

OK — Key files
  DF_TRAIN_ALL    : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/artifacts/df_train_all.parquet  (exists)
  CV_CASE_FOLDS   : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/artifacts/cv_case_folds.csv  (exists)
  MATCH_FEAT_TRAIN: /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cache/match_base_cfg_f9f7ea3a65c5/match_features_train_all.csv  (exists)
  PRED_FEAT_TRAIN : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cache/pred_base_v3_v7_cfg_5dbf0aa165/pred_features_train_all.csv  (exists)
  IMG_PROFILE_TRAIN: /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/artifacts/image_profile_train.parquet  (exists)


# Build Training Table (X, y, folds)

In [8]:
# ============================================================
# STEP 2 — Build Training Table (X, y, folds) (DINOv2-LARGE pipeline)
# - Fokus: siapkan df_train_tabular + (X_train, y_train, folds, feature_cols)
# - Sumber utama: output dataset (pred_features + (opsional) match_features + image_profile)
# - Split: gunakan cv_case_folds.csv (anti leakage, by case_id)
# - Tidak ada submission di sini
#
# Catatan:
# - DINOv2 Large model path (offline): /kaggle/input/dinov2/pytorch/large/1
#   (Di step ini hanya dicek exist; ekstraksi DINO tidak dilakukan di step ini.)
#
# REQUIRE: Jalankan STAGE 0 (Set Paths & Select Config) dulu => PATHS ada.
# ============================================================

import os, json, math, gc, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

# ----------------------------
# 0) Require PATHS
# ----------------------------
if "PATHS" not in globals() or not isinstance(PATHS, dict):
    raise RuntimeError("Missing PATHS. Jalankan dulu STAGE 0 — Set Paths & Select Config (CFG).")

# ----------------------------
# 1) Check DINOv2 Large local path (offline)
# ----------------------------
DINO_LARGE_DIR = Path("/kaggle/input/dinov2/pytorch/large/1")
if not DINO_LARGE_DIR.exists():
    raise FileNotFoundError(f"DINOv2-Large path not found: {DINO_LARGE_DIR}")
PATHS["DINO_LARGE_DIR"] = str(DINO_LARGE_DIR)

# ----------------------------
# 2) Prefer WORKING features if exist (because you may have re-generated there)
# ----------------------------
def prefer_working(input_path: str, working_candidate: str | None = None) -> Path:
    p_in = Path(input_path)
    if working_candidate is not None:
        p_w = Path(working_candidate)
        if p_w.exists():
            return p_w
    return p_in

# Build working candidates from selected cfg dir names
match_cfg_name = Path(PATHS["MATCH_CFG_DIR"]).name
pred_cfg_name  = Path(PATHS["PRED_CFG_DIR"]).name

WORK_ROOT = Path("/kaggle/working/recodai_luc/cache")
match_feat_work = WORK_ROOT / match_cfg_name / "match_features_train_all.csv"
pred_feat_work  = WORK_ROOT / pred_cfg_name  / "pred_features_train_all.csv"

# Resolve final feature paths
PRED_FEAT_TRAIN = prefer_working(PATHS["PRED_FEAT_TRAIN"], str(pred_feat_work))
MATCH_FEAT_TRAIN = prefer_working(PATHS["MATCH_FEAT_TRAIN"], str(match_feat_work))

# Base artifacts
DF_TRAIN_ALL = Path(PATHS["DF_TRAIN_ALL"])
CV_CASE_FOLDS = Path(PATHS["CV_CASE_FOLDS"])
IMG_PROFILE_TRAIN = Path(PATHS.get("IMG_PROFILE_TRAIN", ""))

for need_name, need_path in [
    ("df_train_all.parquet", DF_TRAIN_ALL),
    ("cv_case_folds.csv", CV_CASE_FOLDS),
    ("pred_features_train_all.csv", PRED_FEAT_TRAIN),
]:
    if not need_path.exists():
        raise FileNotFoundError(f"Missing required file: {need_name} -> {need_path}")

print("Using:")
print("  DF_TRAIN_ALL     :", DF_TRAIN_ALL)
print("  CV_CASE_FOLDS    :", CV_CASE_FOLDS)
print("  PRED_FEAT_TRAIN  :", PRED_FEAT_TRAIN)
print("  MATCH_FEAT_TRAIN :", MATCH_FEAT_TRAIN, "(optional)" if MATCH_FEAT_TRAIN.exists() else "(missing/skip)")
print("  IMG_PROFILE_TRAIN:", IMG_PROFILE_TRAIN, "(optional)" if IMG_PROFILE_TRAIN.exists() else "(missing/skip)")
print("  DINO_LARGE_DIR   :", DINO_LARGE_DIR)

# ----------------------------
# 3) Load minimal inputs
# ----------------------------
df_base = pd.read_parquet(DF_TRAIN_ALL)
df_cv   = pd.read_csv(CV_CASE_FOLDS)

df_pred = pd.read_csv(PRED_FEAT_TRAIN)

# Optional match_features: only used if pred_features does NOT already contain match cols you want
df_match = None
if MATCH_FEAT_TRAIN.exists():
    try:
        df_match = pd.read_csv(MATCH_FEAT_TRAIN)
    except Exception:
        df_match = None

df_prof = None
if IMG_PROFILE_TRAIN.exists():
    try:
        df_prof = pd.read_parquet(IMG_PROFILE_TRAIN)
    except Exception:
        df_prof = None

# ----------------------------
# 4) Normalize keys: uid/sample_id, case_id, variant
# ----------------------------
def ensure_uid_case_variant(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    # uid column name variants
    if "uid" not in df.columns:
        for alt in ["sample_id", "id", "key"]:
            if alt in df.columns:
                df = df.rename(columns={alt: "uid"})
                break

    if "uid" not in df.columns:
        raise ValueError("Cannot find uid/sample_id column in pred_features. Expected column 'uid' or 'sample_id'.")

    # case_id + variant may already exist
    if "case_id" not in df.columns or "variant" not in df.columns:
        # parse from uid patterns:
        # - "10015__auth" or "10015__forg"
        # - "10015_auth"  or "10015_forg"
        uid = df["uid"].astype(str)
        if "case_id" not in df.columns:
            # take leading digits
            df["case_id"] = uid.str.extract(r"^(\d+)")[0].astype("Int64")
        if "variant" not in df.columns:
            # try __variant first else _variant
            v = uid.str.extract(r"__(\w+)$")[0]
            v2 = uid.str.extract(r"_(\w+)$")[0]
            df["variant"] = v.fillna(v2).fillna("unk")

    # enforce dtypes
    df["case_id"] = df["case_id"].astype(int)
    df["variant"] = df["variant"].astype(str)
    df["uid"] = df["uid"].astype(str)
    return df

df_pred = ensure_uid_case_variant(df_pred)

# Base label table: ensure has y_forged and uid key if needed
df_base2 = df_base.copy()
if "uid" not in df_base2.columns:
    # common in your pipeline: sample_id
    if "sample_id" in df_base2.columns:
        df_base2 = df_base2.rename(columns={"sample_id": "uid"})
    elif "uid" not in df_base2.columns:
        # some tables use case_id+variant; build uid like pred does if possible
        if "case_id" in df_base2.columns and "variant" in df_base2.columns:
            df_base2["uid"] = df_base2["case_id"].astype(str) + "__" + df_base2["variant"].astype(str)
        else:
            # fallback: no uid; we can still merge by (case_id, variant) if available later
            pass

# Ensure label exists
label_col = None
for cand in ["y_forged", "has_mask", "is_forged", "forged"]:
    if cand in df_base2.columns:
        label_col = cand
        break

if label_col is None and "y_forged" in df_pred.columns:
    label_col = "y_forged"

if label_col is None:
    raise ValueError("Cannot find label column (y_forged/has_mask/etc) in df_train_all or pred_features.")

# CV folds: must have case_id and fold
if "case_id" not in df_cv.columns or "fold" not in df_cv.columns:
    raise ValueError("cv_case_folds.csv must contain columns: case_id, fold")

df_cv["case_id"] = df_cv["case_id"].astype(int)
df_cv["fold"] = df_cv["fold"].astype(int)

# ----------------------------
# 5) Merge: start from df_pred (already 1 row per uid)
# ----------------------------
df_train = df_pred.copy()

# attach label (prefer df_pred if already has y_forged)
if "y_forged" in df_train.columns:
    df_train["y"] = df_train["y_forged"].astype(int)
else:
    # merge from df_base2
    if "uid" in df_base2.columns:
        df_train = df_train.merge(df_base2[["uid", label_col]].rename(columns={label_col: "y"}), on="uid", how="left")
    else:
        # merge by case_id+variant
        if {"case_id","variant",label_col}.issubset(df_base2.columns):
            df_train = df_train.merge(df_base2[["case_id","variant",label_col]].rename(columns={label_col:"y"}),
                                      on=["case_id","variant"], how="left")
        else:
            raise ValueError("Could not merge label from df_train_all (missing uid or case_id+variant).")
    df_train["y"] = df_train["y"].astype(int)

# attach folds (override any fold column that might exist)
df_train = df_train.drop(columns=["fold"], errors="ignore").merge(df_cv[["case_id","fold"]], on="case_id", how="left")
if df_train["fold"].isna().any():
    miss = int(df_train["fold"].isna().sum())
    raise ValueError(f"Some rows missing fold assignment after merging cv_case_folds.csv: {miss} rows")

df_train["fold"] = df_train["fold"].astype(int)

# optional: merge match_features if you want extra columns not already present
if df_match is not None:
    df_match = ensure_uid_case_variant(df_match)
    # only bring new numeric columns
    base_cols = set(df_train.columns)
    new_cols = [c for c in df_match.columns if c not in base_cols]
    # keep id columns for merge
    keep_cols = ["uid"] + [c for c in new_cols if c not in ["case_id","variant"]]
    # merge
    df_train = df_train.merge(df_match[keep_cols], on="uid", how="left")

# optional: merge image profile
if df_prof is not None:
    # prefer merge by case_id
    if "case_id" in df_prof.columns:
        df_prof2 = df_prof.copy()
        df_prof2["case_id"] = df_prof2["case_id"].astype(int)
        # drop duplicates per case_id if any (keep first)
        df_prof2 = df_prof2.drop_duplicates("case_id")
        # avoid column clashes
        clash = set(df_prof2.columns).intersection(df_train.columns)
        clash -= {"case_id"}
        if clash:
            df_prof2 = df_prof2.rename(columns={c: f"profile_{c}" for c in clash})
        df_train = df_train.merge(df_prof2, on="case_id", how="left")

# ----------------------------
# 6) Feature engineering (safe transforms + caps)
# ----------------------------
def safe_log1p(x):
    x = np.asarray(x, dtype=np.float64)
    x = np.where(np.isfinite(x), x, 0.0)
    x = np.clip(x, 0.0, None)
    return np.log1p(x)

# Peak ratio can be 1e9 sentinel; cap + log transform
if "peak_ratio" in df_train.columns:
    df_train["peak_ratio_cap"] = np.clip(df_train["peak_ratio"].astype(float).fillna(0.0), 0.0, 1e6)
    df_train["log_peak_ratio"] = safe_log1p(df_train["peak_ratio_cap"])
else:
    df_train["peak_ratio_cap"] = 0.0
    df_train["log_peak_ratio"] = 0.0

# Best weight also can be huge; cap + log
if "best_weight" in df_train.columns:
    df_train["best_weight_cap"] = np.clip(df_train["best_weight"].astype(float).fillna(0.0), 0.0, 1e9)
    df_train["log_best_weight"] = safe_log1p(df_train["best_weight_cap"])
else:
    df_train["best_weight_cap"] = 0.0
    df_train["log_best_weight"] = 0.0

# Replace inf/nan for all numeric columns (later selection will ensure only numeric)
for c in df_train.columns:
    if pd.api.types.is_numeric_dtype(df_train[c]):
        df_train[c] = df_train[c].replace([np.inf, -np.inf], np.nan)

# ----------------------------
# 7) Select feature columns (numeric only, exclude identifiers/labels)
# ----------------------------
ID_COLS = {"uid","case_id","variant"}
TARGET_COLS = {"y", "y_forged", "has_mask", "is_forged", "forged"}
SPLIT_COLS = {"fold"}

# numeric columns only
num_cols = [c for c in df_train.columns if pd.api.types.is_numeric_dtype(df_train[c])]

# exclude label/split + optionally exclude raw id-like numeric (case_id)
feature_cols = [c for c in num_cols if c not in TARGET_COLS and c not in SPLIT_COLS and c not in ["case_id"]]

# ensure engineered columns are included
for must in ["log_peak_ratio", "log_best_weight"]:
    if must in df_train.columns and must not in feature_cols:
        feature_cols.append(must)

# fill NaN for feature columns
df_train[feature_cols] = df_train[feature_cols].fillna(0.0)

# cast to float32 for memory
df_train[feature_cols] = df_train[feature_cols].astype(np.float32)

# ----------------------------
# 8) Final outputs
# ----------------------------
df_train_tabular = df_train[["uid","case_id","variant","fold","y"] + feature_cols].copy()

X_train = df_train_tabular[feature_cols]
y_train = df_train_tabular["y"].astype(int)
folds   = df_train_tabular["fold"].astype(int)

print("\nOK — Training table built")
print("  df_train_tabular:", df_train_tabular.shape)
print("  X_train:", X_train.shape, "| y pos%:", float(y_train.mean())*100.0)
print("  folds:", folds.nunique(), "unique folds")
print("  feature_cols:", len(feature_cols))

# quick sanity
if X_train.shape[0] != y_train.shape[0]:
    raise RuntimeError("X_train and y_train row mismatch")
if y_train.isna().any():
    raise RuntimeError("y_train contains NaN")
if folds.isna().any():
    raise RuntimeError("folds contains NaN")

# export globals for next steps
FEATURE_COLS = feature_cols
print("\nFeature head:", FEATURE_COLS[:20])
print("Feature tail:", FEATURE_COLS[-10:])

# Optional: save the feature list now (handy for consistency)
OUT_ART = Path("/kaggle/working/recodai_luc_gate_artifacts")
OUT_ART.mkdir(parents=True, exist_ok=True)
with open(OUT_ART / "feature_cols.json", "w") as f:
    json.dump(FEATURE_COLS, f, indent=2)

print(f"\nSaved feature list -> {OUT_ART/'feature_cols.json'}")


Using:
  DF_TRAIN_ALL     : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/artifacts/df_train_all.parquet
  CV_CASE_FOLDS    : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/artifacts/cv_case_folds.csv
  PRED_FEAT_TRAIN  : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cache/pred_base_v3_v7_cfg_5dbf0aa165/pred_features_train_all.csv
  MATCH_FEAT_TRAIN : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cache/match_base_cfg_f9f7ea3a65c5/match_features_train_all.csv (optional)
  IMG_PROFILE_TRAIN: /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/artifacts/image_profile_train.parquet (optional)
  DINO_LARGE_DIR   : /kaggle/input/dinov2/pytorch/large/1

OK — Training table built
  df_train_tabular: (5176, 52)
  X_train: (5176, 47) | y pos%: 54.07650695517774
  folds: 5 unique folds
  feature_cols: 47

Feature head: ['feat_exists', 'match_exists', 'has_peak', 'peak_ratio', 'best_weight', 'best_count', 'best_mean_sim', 'n_pairs_thr', 'n_pairs_mnn', 'best_inlier_ratio', 'best_weight

# Train Baseline Model (Leakage-Safe CV)

In [9]:
# ============================================================
# Step 3 — Train Baseline Model (Leakage-Safe CV) — REVISI FULL
# - Baseline: StandardScaler + LogisticRegression
# - CV: pakai kolom `fold` (dibuat by case_id dari cv_case_folds.csv)
# - Output:
#   * OOF probabilities (untuk calibration/threshold di step berikutnya)
#   * CV report (AUC, F1, Precision, Recall, LogLoss)
#   * Simpan model per fold + model_full
#
# Catatan penting revisi:
# - Pakai sklearn.clone (lebih aman daripada joblib dumps/loads untuk deepcopy estimator)
# - log_loss selalu pakai labels=[0,1] (biar aman kalau suatu fold cuma 1 kelas)
# - AUC aman jika 1 kelas (return None)
# - Semua NaN/inf di X sudah harus ditangani di Step 2; tetap ada guard kecil
# ============================================================

import json, gc, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

from IPython.display import display
from sklearn.base import clone
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, log_loss
import joblib

# ----------------------------
# 0) Require outputs from Step 2
# ----------------------------
need_vars = ["df_train_tabular", "FEATURE_COLS"]
for v in need_vars:
    if v not in globals():
        raise RuntimeError(f"Missing `{v}`. Jalankan dulu Step 2 — Build Training Table (X, y, folds).")

df_train_tabular = df_train_tabular.copy()
FEATURE_COLS = list(FEATURE_COLS)

required_cols = {"uid", "case_id", "variant", "fold", "y"}
missing_cols = [c for c in required_cols if c not in df_train_tabular.columns]
if missing_cols:
    raise ValueError(f"df_train_tabular missing columns: {missing_cols}. Pastikan Step 2 berhasil.")

# ----------------------------
# 1) Build arrays + sanity
# ----------------------------
X = df_train_tabular[FEATURE_COLS].to_numpy(dtype=np.float32, copy=True)
y = df_train_tabular["y"].to_numpy(dtype=np.int32, copy=True)
folds = df_train_tabular["fold"].to_numpy(dtype=np.int32, copy=True)

# extra guard (harusnya sudah bersih di Step 2)
if not np.isfinite(X).all():
    X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)

n = len(df_train_tabular)
unique_folds = sorted(pd.Series(folds).unique().tolist())
n_folds = len(unique_folds)

print("Baseline setup:")
print("  rows      :", n)
print("  folds     :", n_folds, "|", unique_folds)
print("  pos%      :", float(y.mean()) * 100.0)
print("  n_features:", X.shape[1])

# ----------------------------
# 2) Define baseline model
# ----------------------------
# Kamu bisa tweak parameter baseline di sini kalau perlu (tetap baseline sederhana):
BASELINE_PARAMS = dict(
    solver="lbfgs",
    max_iter=4000,
    C=1.0,
    class_weight="balanced",
)

baseline = Pipeline(steps=[
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("clf", LogisticRegression(**BASELINE_PARAMS)),
])

# ----------------------------
# 3) CV training (Leakage-safe: fold sudah by case_id)
# ----------------------------
oof_pred = np.zeros(n, dtype=np.float32)
fold_reports = []

models_dir = Path("/kaggle/working/recodai_luc_gate_artifacts/baseline_folds")
models_dir.mkdir(parents=True, exist_ok=True)

for f in unique_folds:
    tr_idx = np.where(folds != f)[0]
    va_idx = np.where(folds == f)[0]

    X_tr, y_tr = X[tr_idx], y[tr_idx]
    X_va, y_va = X[va_idx], y[va_idx]

    model_f = clone(baseline)
    model_f.fit(X_tr, y_tr)

    p_va = model_f.predict_proba(X_va)[:, 1].astype(np.float32)
    oof_pred[va_idx] = p_va

    # Metrics (fold-safe)
    auc = None
    if len(np.unique(y_va)) > 1:
        auc = float(roc_auc_score(y_va, p_va))

    yhat = (p_va >= 0.5).astype(np.int32)

    rep = {
        "fold": int(f),
        "n_val": int(len(va_idx)),
        "pos_val": int(y_va.sum()),
        "auc": auc,
        "f1@0.5": float(f1_score(y_va, yhat, zero_division=0)),
        "precision@0.5": float(precision_score(y_va, yhat, zero_division=0)),
        "recall@0.5": float(recall_score(y_va, yhat, zero_division=0)),
        "logloss": float(log_loss(y_va, np.clip(p_va, 1e-6, 1 - 1e-6), labels=[0, 1])),
    }
    fold_reports.append(rep)

    # save fold model
    joblib.dump(model_f, models_dir / f"baseline_fold_{f}.joblib")

    del model_f
    gc.collect()

# ----------------------------
# 4) Overall OOF metrics
# ----------------------------
oof_auc = None
if len(np.unique(y)) > 1:
    oof_auc = float(roc_auc_score(y, oof_pred))

oof_yhat = (oof_pred >= 0.5).astype(np.int32)

overall = {
    "rows": int(n),
    "folds": int(n_folds),
    "pos_total": int(y.sum()),
    "pos_rate": float(y.mean()),
    "oof_auc": oof_auc,
    "oof_f1@0.5": float(f1_score(y, oof_yhat, zero_division=0)),
    "oof_precision@0.5": float(precision_score(y, oof_yhat, zero_division=0)),
    "oof_recall@0.5": float(recall_score(y, oof_yhat, zero_division=0)),
    "oof_logloss": float(log_loss(y, np.clip(oof_pred, 1e-6, 1 - 1e-6), labels=[0, 1])),
}

df_rep = pd.DataFrame(fold_reports).sort_values("fold").reset_index(drop=True)
print("\nPer-fold report:")
display(df_rep)

print("\nOOF overall:")
print(overall)

# ----------------------------
# 5) Train baseline on full data & save artifacts
# ----------------------------
out_dir = Path("/kaggle/working/recodai_luc_gate_artifacts")
out_dir.mkdir(parents=True, exist_ok=True)

model_full = clone(baseline)
model_full.fit(X, y)
joblib.dump(model_full, out_dir / "baseline_model_full.joblib")

# Save OOF predictions table
df_oof = df_train_tabular[["uid", "case_id", "variant", "fold", "y"]].copy()
df_oof["oof_pred_baseline"] = oof_pred
df_oof.to_csv(out_dir / "oof_baseline.csv", index=False)

# Save report JSON
report = {
    "model": "LogisticRegression + StandardScaler (class_weight=balanced)",
    "params": BASELINE_PARAMS,
    "feature_count": int(len(FEATURE_COLS)),
    "fold_reports": fold_reports,
    "overall": overall,
}
with open(out_dir / "baseline_cv_report.json", "w") as f:
    json.dump(report, f, indent=2)

print("\nSaved artifacts:")
print("  fold models  ->", models_dir)
print("  full model   ->", out_dir / "baseline_model_full.joblib")
print("  oof preds    ->", out_dir / "oof_baseline.csv")
print("  cv report    ->", out_dir / "baseline_cv_report.json")

# Export globals for next steps
OOF_PRED_BASELINE = oof_pred
BASELINE_OVERALL = overall
BASELINE_FOLD_REPORTS = fold_reports


Baseline setup:
  rows      : 5176
  folds     : 5 | [0, 1, 2, 3, 4]
  pos%      : 54.07650695517774
  n_features: 47

Per-fold report:


Unnamed: 0,fold,n_val,pos_val,auc,f1@0.5,precision@0.5,recall@0.5,logloss
0,0,1034,559,0.533153,0.56043,0.560932,0.559928,0.696173
1,1,1041,561,0.543366,0.581694,0.559211,0.606061,0.688935
2,2,1032,559,0.532439,0.569456,0.55,0.59034,0.688766
3,3,1035,560,0.534414,0.585654,0.5552,0.619643,0.692037
4,4,1034,560,0.541281,0.574188,0.564767,0.583929,0.686037



OOF overall:
{'rows': 5176, 'folds': 5, 'pos_total': 2799, 'pos_rate': 0.5407650695517774, 'oof_auc': 0.5366708736502595, 'oof_f1@0.5': 0.5744496446524527, 'oof_precision@0.5': 0.5579124579124579, 'oof_recall@0.5': 0.5919971418363701, 'oof_logloss': 0.6903884325713943}

Saved artifacts:
  fold models  -> /kaggle/working/recodai_luc_gate_artifacts/baseline_folds
  full model   -> /kaggle/working/recodai_luc_gate_artifacts/baseline_model_full.joblib
  oof preds    -> /kaggle/working/recodai_luc_gate_artifacts/oof_baseline.csv
  cv report    -> /kaggle/working/recodai_luc_gate_artifacts/baseline_cv_report.json


# Optimize Model & Hyperparameters (Iterative)

In [None]:
# ============================================================
# Step 4 — Optimize Model & Hyperparameters (Iterative)
# - Fokus: iterasi model gate tabular (tanpa submission)
# - Model kandidat: LogisticRegression, HistGradientBoosting, ExtraTrees
# - Validasi: Leakage-safe CV pakai kolom `fold` (by case_id)
# - Skor utama untuk compare: Best F-beta (beta=0.5) dari OOF (lebih anti-FP)
#   + tetap log AUC & LogLoss sebagai sanity metric.
#
# Output:
# - /kaggle/working/recodai_luc_gate_artifacts/opt_search/opt_results.csv
# - /kaggle/working/recodai_luc_gate_artifacts/opt_search/opt_results.json
# - /kaggle/working/recodai_luc_gate_artifacts/opt_search/oof_preds_<model>.csv (top models)
# - /kaggle/working/recodai_luc_gate_artifacts/best_gate_model.joblib
# - /kaggle/working/recodai_luc_gate_artifacts/best_gate_config.json
#
# REQUIRE:
# - Step 2 sudah jalan: df_train_tabular, FEATURE_COLS
# ============================================================

import os, json, gc, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score, log_loss, fbeta_score, precision_score, recall_score
import joblib

# ----------------------------
# 0) Require data from Step 2
# ----------------------------
need_vars = ["df_train_tabular", "FEATURE_COLS"]
for v in need_vars:
    if v not in globals():
        raise RuntimeError(f"Missing `{v}`. Jalankan dulu Step 2 — Build Training Table (X, y, folds).")

df_train_tabular = df_train_tabular.copy()
FEATURE_COLS = list(FEATURE_COLS)

X = df_train_tabular[FEATURE_COLS].values.astype(np.float32)
y = df_train_tabular["y"].values.astype(int)
folds = df_train_tabular["fold"].values.astype(int)
uids = df_train_tabular["uid"].astype(str).values

unique_folds = sorted(pd.Series(folds).unique().tolist())
n = len(y)
pos_rate = float(y.mean())

print("Optimize setup:")
print(f"  rows={n} | folds={len(unique_folds)} | pos%={pos_rate*100:.2f} | n_features={X.shape[1]}")

# ----------------------------
# 1) Helpers: metrics & threshold search
# ----------------------------
def compute_best_fbeta(y_true, p, beta=0.5, grid=201):
    """
    Cari threshold terbaik untuk F-beta dari probabilitas p.
    beta<1 menekankan precision (anti false-positive), cocok untuk gate.
    """
    p = np.asarray(p, dtype=np.float64)
    p = np.clip(p, 1e-8, 1-1e-8)

    best = {"fbeta": -1.0, "thr": 0.5, "precision": 0.0, "recall": 0.0}
    for thr in np.linspace(0.01, 0.99, grid):
        yh = (p >= thr).astype(int)
        f = fbeta_score(y_true, yh, beta=beta, zero_division=0)
        if f > best["fbeta"]:
            best["fbeta"] = float(f)
            best["thr"] = float(thr)
            best["precision"] = float(precision_score(y_true, yh, zero_division=0))
            best["recall"] = float(recall_score(y_true, yh, zero_division=0))
    return best

def safe_auc(y_true, p):
    if len(np.unique(y_true)) < 2:
        return None
    return float(roc_auc_score(y_true, p))

def safe_logloss(y_true, p):
    p = np.clip(np.asarray(p, dtype=np.float64), 1e-8, 1-1e-8)
    return float(log_loss(y_true, p))

# ----------------------------
# 2) CV runner (leakage-safe)
# ----------------------------
def run_cv(model_builder, model_name, beta=0.5, save_oof=False, out_dir=None):
    """
    model_builder(fold_idx) -> sklearn estimator (sudah siap fit).
    Untuk HGB: kita pakai sample_weight agar balance.
    """
    oof = np.zeros(n, dtype=np.float32)
    fold_rows = []

    for f in unique_folds:
        tr = np.where(folds != f)[0]
        va = np.where(folds == f)[0]

        X_tr, y_tr = X[tr], y[tr]
        X_va, y_va = X[va], y[va]

        est = model_builder(f)

        # sample_weight untuk model yang tidak punya class_weight (HGB)
        fit_kwargs = {}
        if isinstance(est, HistGradientBoostingClassifier):
            # weight_pos = neg/pos (balanced)
            pos = max(1, int(y_tr.sum()))
            neg = max(1, int(len(y_tr) - y_tr.sum()))
            w_pos = neg / pos
            sw = np.where(y_tr == 1, w_pos, 1.0).astype(np.float32)
            fit_kwargs["sample_weight"] = sw

        est.fit(X_tr, y_tr, **fit_kwargs)

        if hasattr(est, "predict_proba"):
            p_va = est.predict_proba(X_va)[:, 1].astype(np.float32)
        else:
            # fallback (harusnya tidak kejadian)
            p_va = est.predict(X_va).astype(np.float32)

        oof[va] = p_va

        fold_auc = safe_auc(y_va, p_va)
        fold_ll  = safe_logloss(y_va, p_va)
        best_fold = compute_best_fbeta(y_va, p_va, beta=beta, grid=101)

        fold_rows.append({
            "model": model_name,
            "fold": int(f),
            "n_val": int(len(va)),
            "pos_val": int(y_va.sum()),
            "auc": fold_auc,
            "logloss": fold_ll,
            "best_fbeta": best_fold["fbeta"],
            "best_thr": best_fold["thr"],
            "best_prec": best_fold["precision"],
            "best_rec": best_fold["recall"],
        })

        del est
        gc.collect()

    # overall OOF
    oof_auc = safe_auc(y, oof)
    oof_ll  = safe_logloss(y, oof)
    best_oof = compute_best_fbeta(y, oof, beta=beta, grid=201)

    summary = {
        "model": model_name,
        "oof_auc": oof_auc,
        "oof_logloss": oof_ll,
        "oof_best_fbeta": best_oof["fbeta"],
        "oof_best_thr": best_oof["thr"],
        "oof_best_prec": best_oof["precision"],
        "oof_best_rec": best_oof["recall"],
    }

    if save_oof and out_dir is not None:
        out_dir = Path(out_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        df_o = pd.DataFrame({
            "uid": uids,
            "y": y,
            "fold": folds,
            f"oof_pred_{model_name}": oof
        })
        df_o.to_csv(out_dir / f"oof_preds_{model_name}.csv", index=False)

    return summary, fold_rows, oof

# ----------------------------
# 3) Define candidate configs (kamu bisa tambah/ubah parameter di sini)
# ----------------------------
# Penjelasan singkat (ID):
# - Logistic: baseline kuat, probabilitas relatif stabil
# - HGB: non-linear cepat, bagus untuk interaksi fitur
# - ExtraTrees: robust untuk noise, sering bagus di tabular feature hasil match/pred
BETA = 0.5  # beta<1 => fokus precision (anti false-positive)
RANDOM_SEED = 2025

candidates = []

# (A) Logistic Regression grid kecil
for C in [0.25, 0.5, 1.0, 2.0, 4.0]:
    name = f"logreg_C{C:g}"
    def builder_factory(Cval):
        def _builder(_fold):
            return Pipeline(steps=[
                ("scaler", StandardScaler(with_mean=True, with_std=True)),
                ("clf", LogisticRegression(
                    solver="lbfgs",
                    max_iter=3000,
                    C=Cval,
                    class_weight="balanced",
                ))
            ])
        return _builder
    candidates.append((name, builder_factory(C)))

# (B) HistGradientBoosting grid kecil
for lr in [0.05, 0.08, 0.12]:
    for md in [3, 4]:
        for leaf in [31, 63]:
            name = f"hgb_lr{lr}_md{md}_leaf{leaf}"
            def builder_factory(lrval, mdval, leafval):
                def _builder(_fold):
                    return HistGradientBoostingClassifier(
                        learning_rate=lrval,
                        max_depth=mdval,
                        max_leaf_nodes=leafval,
                        max_iter=250,
                        early_stopping=False,
                        random_state=RANDOM_SEED,
                    )
                return _builder
            candidates.append((name, builder_factory(lr, md, leaf)))

# (C) ExtraTrees grid kecil
for n_est in [400, 800]:
    for md in [None, 24]:
        for msl in [1, 3]:
            name = f"et_n{n_est}_md{md if md is not None else 'None'}_msl{msl}"
            def builder_factory(n_est_val, md_val, msl_val):
                def _builder(_fold):
                    return ExtraTreesClassifier(
                        n_estimators=n_est_val,
                        max_depth=md_val,
                        min_samples_leaf=msl_val,
                        max_features="sqrt",
                        class_weight="balanced",
                        n_jobs=-1,
                        random_state=RANDOM_SEED,
                    )
                return _builder
            candidates.append((name, builder_factory(n_est, md, msl)))

print(f"\nTotal candidates: {len(candidates)}")
print("Scoring: OOF best F-beta (beta=0.5) is primary")

# ----------------------------
# 4) Run iterative search
# ----------------------------
OUT_DIR = Path("/kaggle/working/recodai_luc_gate_artifacts")
OPT_DIR = OUT_DIR / "opt_search"
OPT_DIR.mkdir(parents=True, exist_ok=True)

all_summaries = []
all_fold_rows = []
oof_store = {}  # store OOF preds for possible ensemble

for i, (name, builder) in enumerate(candidates, 1):
    print(f"[{i:03d}/{len(candidates)}] CV -> {name}")
    summ, fold_rows, oof = run_cv(builder, name, beta=BETA, save_oof=False)
    all_summaries.append(summ)
    all_fold_rows.extend(fold_rows)
    oof_store[name] = oof
    print("  oof_best_fbeta:", f"{summ['oof_best_fbeta']:.5f}",
          "| oof_auc:", f"{(summ['oof_auc'] if summ['oof_auc'] is not None else float('nan')):.5f}",
          "| oof_logloss:", f"{summ['oof_logloss']:.5f}",
          "| thr:", f"{summ['oof_best_thr']:.3f}")

df_sum = pd.DataFrame(all_summaries)
df_fold = pd.DataFrame(all_fold_rows)

# rank by primary metric (oof_best_fbeta), tie-break by logloss (lower better)
df_sum["rank_key"] = list(zip(-df_sum["oof_best_fbeta"], df_sum["oof_logloss"]))
df_sum = df_sum.sort_values(["oof_best_fbeta", "oof_logloss"], ascending=[False, True]).reset_index(drop=True)

print("\nTop 10 candidates:")
display(df_sum.head(10))

# save search results
df_sum.drop(columns=["rank_key"], errors="ignore").to_csv(OPT_DIR / "opt_results.csv", index=False)
with open(OPT_DIR / "opt_results.json", "w") as f:
    json.dump(df_sum.drop(columns=["rank_key"], errors="ignore").to_dict(orient="records"), f, indent=2)

# ----------------------------
# 5) Optional: simple ensemble of top-K (avg OOF)
# ----------------------------
TOPK = 3
top_names = df_sum["model"].head(TOPK).tolist()
if len(top_names) >= 2:
    oof_ens = np.mean([oof_store[nm] for nm in top_names], axis=0).astype(np.float32)
    ens_best = compute_best_fbeta(y, oof_ens, beta=BETA, grid=201)
    ens_auc  = safe_auc(y, oof_ens)
    ens_ll   = safe_logloss(y, oof_ens)

    ens_summary = {
        "model": f"ensemble_avg_top{TOPK}",
        "members": top_names,
        "oof_auc": ens_auc,
        "oof_logloss": ens_ll,
        "oof_best_fbeta": ens_best["fbeta"],
        "oof_best_thr": ens_best["thr"],
        "oof_best_prec": ens_best["precision"],
        "oof_best_rec": ens_best["recall"],
    }
    print("\nEnsemble OOF (avg) result:")
    print(ens_summary)

    # append to summary table (for visibility)
    df_sum2 = pd.concat([df_sum.drop(columns=["rank_key"], errors="ignore"),
                         pd.DataFrame([ens_summary])], ignore_index=True)
else:
    ens_summary = None
    df_sum2 = df_sum.drop(columns=["rank_key"], errors="ignore")

# save fold detail
df_fold.to_csv(OPT_DIR / "opt_fold_details.csv", index=False)

# ----------------------------
# 6) Select best model (compare best single vs ensemble)
# ----------------------------
best_single = df_sum.iloc[0].to_dict()
best_choice = {"type": "single", "model": best_single["model"], "summary": best_single}

if ens_summary is not None and ens_summary["oof_best_fbeta"] >= best_single["oof_best_fbeta"]:
    best_choice = {"type": "ensemble", "model": ens_summary["model"], "summary": ens_summary}

print("\nBest choice:")
print(best_choice)

# ----------------------------
# 7) Train final model(s) on FULL DATA and save bundle
# ----------------------------
def build_estimator_by_name(name: str):
    # rebuild the estimator from the name string we used above
    if name.startswith("logreg_C"):
        C = float(name.split("logreg_C", 1)[1])
        return Pipeline(steps=[
            ("scaler", StandardScaler(with_mean=True, with_std=True)),
            ("clf", LogisticRegression(
                solver="lbfgs",
                max_iter=3000,
                C=C,
                class_weight="balanced",
            ))
        ])

    if name.startswith("hgb_"):
        # parse: hgb_lr{lr}_md{md}_leaf{leaf}
        parts = name.split("_")
        lr = float(parts[1].replace("lr", ""))
        md = int(parts[2].replace("md", ""))
        leaf = int(parts[3].replace("leaf", ""))
        return HistGradientBoostingClassifier(
            learning_rate=lr,
            max_depth=md,
            max_leaf_nodes=leaf,
            max_iter=250,
            early_stopping=False,
            random_state=RANDOM_SEED,
        )

    if name.startswith("et_"):
        # parse: et_n{n}_md{md}_msl{msl}
        parts = name.split("_")
        n_est = int(parts[1].replace("n", ""))
        md_str = parts[2].replace("md", "")
        md = None if md_str == "None" else int(md_str)
        msl = int(parts[3].replace("msl", ""))
        return ExtraTreesClassifier(
            n_estimators=n_est,
            max_depth=md,
            min_samples_leaf=msl,
            max_features="sqrt",
            class_weight="balanced",
            n_jobs=-1,
            random_state=RANDOM_SEED,
        )

    raise ValueError(f"Unknown model name pattern: {name}")

bundle = {
    "type": best_choice["type"],
    "beta": BETA,
    "feature_cols": FEATURE_COLS,
}

# save top-K OOF csv for debugging (optional)
for nm in top_names[:3]:
    df_o = pd.DataFrame({"uid": uids, "y": y, "fold": folds, "oof_pred": oof_store[nm]})
    df_o.to_csv(OPT_DIR / f"oof_preds_{nm}.csv", index=False)

if best_choice["type"] == "single":
    best_name = best_choice["model"]
    est = build_estimator_by_name(best_name)

    fit_kwargs = {}
    if isinstance(est, HistGradientBoostingClassifier):
        pos = max(1, int(y.sum()))
        neg = max(1, int(len(y) - y.sum()))
        w_pos = neg / pos
        sw = np.where(y == 1, w_pos, 1.0).astype(np.float32)
        fit_kwargs["sample_weight"] = sw

    est.fit(X, y, **fit_kwargs)
    joblib.dump(est, OUT_DIR / "best_gate_model.joblib")

    bundle["model_name"] = best_name
    bundle["members"] = [best_name]
    bundle["oof_best_thr"] = best_choice["summary"]["oof_best_thr"]
    bundle["oof_best_fbeta"] = best_choice["summary"]["oof_best_fbeta"]

else:
    # ensemble: train each member on full data and save list
    members = best_choice["summary"]["members"]
    models = []
    for nm in members:
        est = build_estimator_by_name(nm)
        fit_kwargs = {}
        if isinstance(est, HistGradientBoostingClassifier):
            pos = max(1, int(y.sum()))
            neg = max(1, int(len(y) - y.sum()))
            w_pos = neg / pos
            sw = np.where(y == 1, w_pos, 1.0).astype(np.float32)
            fit_kwargs["sample_weight"] = sw
        est.fit(X, y, **fit_kwargs)
        models.append(est)

    joblib.dump(models, OUT_DIR / "best_gate_model.joblib")  # list of estimators

    bundle["model_name"] = best_choice["model"]
    bundle["members"] = members
    bundle["oof_best_thr"] = best_choice["summary"]["oof_best_thr"]
    bundle["oof_best_fbeta"] = best_choice["summary"]["oof_best_fbeta"]

with open(OUT_DIR / "best_gate_config.json", "w") as f:
    json.dump(bundle, f, indent=2)

print("\nSaved best artifacts:")
print("  best model  ->", OUT_DIR / "best_gate_model.joblib")
print("  best config ->", OUT_DIR / "best_gate_config.json")
print("  opt results ->", OPT_DIR / "opt_results.csv")
print("  fold detail ->", OPT_DIR / "opt_fold_details.csv")

# Export globals (untuk step berikutnya seperti calibration/threshold lebih lanjut jika mau)
BEST_GATE_BUNDLE = bundle


Optimize setup:
  rows=5176 | folds=5 | pos%=54.08 | n_features=47

Total candidates: 25
Scoring: OOF best F-beta (beta=0.5) is primary
[001/25] CV -> logreg_C0.25
  oof_best_fbeta: 0.59989 | oof_auc: 0.53703 | oof_logloss: 0.68981 | thr: 0.441
[002/25] CV -> logreg_C0.5
  oof_best_fbeta: 0.59997 | oof_auc: 0.53658 | oof_logloss: 0.69011 | thr: 0.426
[003/25] CV -> logreg_C1
  oof_best_fbeta: 0.59962 | oof_auc: 0.53667 | oof_logloss: 0.69039 | thr: 0.426
[004/25] CV -> logreg_C2
  oof_best_fbeta: 0.59984 | oof_auc: 0.53669 | oof_logloss: 0.69067 | thr: 0.426
[005/25] CV -> logreg_C4
  oof_best_fbeta: 0.60035 | oof_auc: 0.53695 | oof_logloss: 0.69095 | thr: 0.426
[006/25] CV -> hgb_lr0.05_md3_leaf31
  oof_best_fbeta: 0.60125 | oof_auc: 0.53916 | oof_logloss: 0.69015 | thr: 0.387
[007/25] CV -> hgb_lr0.05_md3_leaf63
  oof_best_fbeta: 0.60125 | oof_auc: 0.53916 | oof_logloss: 0.69015 | thr: 0.387
[008/25] CV -> hgb_lr0.05_md4_leaf31
  oof_best_fbeta: 0.59902 | oof_auc: 0.52809 | oof_loglo

# Final Training (Train on Full Data)

In [None]:
# ============================================================
# Step 5 — Final Training (Train on Full Data)
# - Tujuan: latih ulang model terbaik pada seluruh data train (full data)
# - Input: hasil Step 2 (df_train_tabular + FEATURE_COLS)
# - Ambil konfigurasi terbaik dari:
#     /kaggle/working/recodai_luc_gate_artifacts/best_gate_config.json
#   atau dari variabel BEST_GATE_BUNDLE (jika masih ada di memory)
# - Output:
#   /kaggle/working/recodai_luc_gate_artifacts/final_gate_model.joblib
#   /kaggle/working/recodai_luc_gate_artifacts/final_gate_bundle.json
#
# Catatan:
# - Tidak ada submission di sini
# - Untuk HGB: pakai sample_weight (balanced) karena tidak punya class_weight
# ============================================================

import json, gc, warnings
from pathlib import Path
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier, HistGradientBoostingClassifier
import joblib

# ----------------------------
# 0) Require outputs from Step 2
# ----------------------------
if "df_train_tabular" not in globals():
    raise RuntimeError("Missing `df_train_tabular`. Jalankan Step 2 — Build Training Table dulu.")
if "FEATURE_COLS" not in globals():
    raise RuntimeError("Missing `FEATURE_COLS`. Jalankan Step 2 — Build Training Table dulu.")

df_train_tabular = df_train_tabular.copy()
FEATURE_COLS = list(FEATURE_COLS)

X = df_train_tabular[FEATURE_COLS].values.astype(np.float32)
y = df_train_tabular["y"].values.astype(int)

print("Final training data:")
print("  rows:", len(y), "| pos%:", float(y.mean()) * 100.0, "| n_features:", X.shape[1])

# ----------------------------
# 1) Load best config (from disk or memory)
# ----------------------------
OUT_DIR = Path("/kaggle/working/recodai_luc_gate_artifacts")
OUT_DIR.mkdir(parents=True, exist_ok=True)

cfg_path = OUT_DIR / "best_gate_config.json"

if "BEST_GATE_BUNDLE" in globals() and isinstance(BEST_GATE_BUNDLE, dict):
    best_cfg = BEST_GATE_BUNDLE
    source = "memory(BEST_GATE_BUNDLE)"
elif cfg_path.exists():
    best_cfg = json.loads(cfg_path.read_text())
    source = str(cfg_path)
else:
    raise FileNotFoundError(
        "Best config not found. Jalankan Step 4 — Optimize Model & Hyperparameters dulu "
        "atau pastikan best_gate_config.json ada."
    )

print("\nLoaded best config from:", source)
print("  type      :", best_cfg.get("type"))
print("  model_name :", best_cfg.get("model_name"))
print("  members   :", best_cfg.get("members"))

# Optional: override if you want manual final choice
# FORCE_MEMBERS = None  # e.g. ["logreg_C1", "et_n800_mdNone_msl1"]
FORCE_MEMBERS = None

members = best_cfg.get("members", [])
if FORCE_MEMBERS is not None:
    members = list(FORCE_MEMBERS)

if not members:
    raise ValueError("No members found in best config. Expected 'members' list.")

# Use stored seed if present
RANDOM_SEED = int(best_cfg.get("random_seed", 2025))

# ----------------------------
# 2) Rebuild estimator from name (must match naming in Step 4)
# ----------------------------
def build_estimator_by_name(name: str):
    # Logistic Regression pattern: logreg_C{C}
    if name.startswith("logreg_C"):
        C = float(name.split("logreg_C", 1)[1])
        return Pipeline(steps=[
            ("scaler", StandardScaler(with_mean=True, with_std=True)),
            ("clf", LogisticRegression(
                solver="lbfgs",
                max_iter=3000,
                C=C,
                class_weight="balanced",
            ))
        ])

    # HistGradientBoosting pattern: hgb_lr{lr}_md{md}_leaf{leaf}
    if name.startswith("hgb_"):
        parts = name.split("_")
        lr = float(parts[1].replace("lr", ""))
        md = int(parts[2].replace("md", ""))
        leaf = int(parts[3].replace("leaf", ""))
        return HistGradientBoostingClassifier(
            learning_rate=lr,
            max_depth=md,
            max_leaf_nodes=leaf,
            max_iter=250,
            early_stopping=False,
            random_state=RANDOM_SEED,
        )

    # ExtraTrees pattern: et_n{n}_md{md}_msl{msl}
    if name.startswith("et_"):
        parts = name.split("_")
        n_est = int(parts[1].replace("n", ""))
        md_str = parts[2].replace("md", "")
        md = None if md_str == "None" else int(md_str)
        msl = int(parts[3].replace("msl", ""))
        return ExtraTreesClassifier(
            n_estimators=n_est,
            max_depth=md,
            min_samples_leaf=msl,
            max_features="sqrt",
            class_weight="balanced",
            n_jobs=-1,
            random_state=RANDOM_SEED,
        )

    raise ValueError(f"Unknown model name pattern: {name}")

def fit_with_optional_sample_weight(est, X, y):
    fit_kwargs = {}
    # HGB tidak punya class_weight -> pakai sample_weight balanced
    if isinstance(est, HistGradientBoostingClassifier):
        pos = max(1, int(y.sum()))
        neg = max(1, int(len(y) - y.sum()))
        w_pos = neg / pos
        sw = np.where(y == 1, w_pos, 1.0).astype(np.float32)
        fit_kwargs["sample_weight"] = sw
    est.fit(X, y, **fit_kwargs)
    return est

# ----------------------------
# 3) Train final model(s) on full data
# ----------------------------
trained_models = []
for i, nm in enumerate(members, 1):
    print(f"[{i}/{len(members)}] Training member: {nm}")
    est = build_estimator_by_name(nm)
    est = fit_with_optional_sample_weight(est, X, y)
    trained_models.append(est)
    gc.collect()

# If only one member, save single estimator; else save list (ensemble)
final_obj = trained_models[0] if len(trained_models) == 1 else trained_models

# ----------------------------
# 4) Save final artifacts
# ----------------------------
final_model_path = OUT_DIR / "final_gate_model.joblib"
joblib.dump(final_obj, final_model_path)

final_bundle = {
    "type": "single" if len(trained_models) == 1 else "ensemble",
    "model_name": "final_gate_model",
    "members": members,
    "random_seed": RANDOM_SEED,
    "feature_cols": FEATURE_COLS,
    # (Opsional) simpan threshold OOF terbaik dari step sebelumnya untuk referensi
    "oof_best_thr": best_cfg.get("oof_best_thr", None),
    "oof_best_fbeta": best_cfg.get("oof_best_fbeta", None),
    "train_rows": int(len(y)),
    "pos_rate": float(y.mean()),
}

final_bundle_path = OUT_DIR / "final_gate_bundle.json"
final_bundle_path.write_text(json.dumps(final_bundle, indent=2))

print("\nSaved final training artifacts:")
print("  model  ->", final_model_path)
print("  bundle ->", final_bundle_path)

# Export globals (untuk step berikutnya jika mau calibration/threshold lanjut)
FINAL_GATE_MODEL = final_obj
FINAL_GATE_BUNDLE = final_bundle


# Finalize & Save Model Bundle (Reproducible)

In [None]:
# ============================================================
# Step 6 — Finalize & Save Model Bundle (Reproducible)
# - Tujuan: satukan semua artefak penting menjadi 1 bundle yang mudah di-load ulang
# - Tidak ada submission di sini
#
# Bundle berisi:
#  1) final model (final_gate_model.joblib)
#  2) feature_cols.json
#  3) thresholds.json (placeholder / bisa diisi dari step tuning berikutnya)
#  4) training report (baseline + optimize + final summary)
#  5) metadata cfg (MATCH_CFG_DIR, PRED_CFG_DIR, DINO_CFG_DIR, roots)
#
# REQUIRE:
# - Step 2 (feature_cols.json sudah dibuat) dan Step 5 (final model sudah ada)
# ============================================================

import os, json, time, platform, warnings
from pathlib import Path
import numpy as np
import pandas as pd
import joblib

warnings.filterwarnings("ignore")

OUT_DIR = Path("/kaggle/working/recodai_luc_gate_artifacts")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------
# 0) Locate required artifacts
# ----------------------------
final_model_path = OUT_DIR / "final_gate_model.joblib"
final_bundle_path = OUT_DIR / "final_gate_bundle.json"
feature_cols_path = OUT_DIR / "feature_cols.json"

if not final_model_path.exists():
    raise FileNotFoundError(f"Missing final model: {final_model_path} (jalankan Step 5 dulu)")
if not feature_cols_path.exists():
    raise FileNotFoundError(f"Missing feature_cols: {feature_cols_path} (jalankan Step 2 dulu)")

# Optional artifacts
baseline_report_path = OUT_DIR / "baseline_cv_report.json"
opt_config_path = OUT_DIR / "best_gate_config.json"
opt_results_csv = OUT_DIR / "opt_search" / "opt_results.csv"
opt_fold_csv = OUT_DIR / "opt_search" / "opt_fold_details.csv"
oof_baseline_csv = OUT_DIR / "oof_baseline.csv"

# ----------------------------
# 1) Load minimal metadata
# ----------------------------
feature_cols = json.loads(feature_cols_path.read_text())

final_bundle = {}
if final_bundle_path.exists():
    final_bundle = json.loads(final_bundle_path.read_text())

baseline_report = None
if baseline_report_path.exists():
    try:
        baseline_report = json.loads(baseline_report_path.read_text())
    except Exception:
        baseline_report = None

opt_config = None
if opt_config_path.exists():
    try:
        opt_config = json.loads(opt_config_path.read_text())
    except Exception:
        opt_config = None

# ----------------------------
# 2) Threshold placeholders (can be updated later)
# ----------------------------
thresholds_path = OUT_DIR / "thresholds.json"
if thresholds_path.exists():
    thresholds = json.loads(thresholds_path.read_text())
else:
    # Placeholder: nanti bisa di-update dari step tuning threshold/guard
    thresholds = {
        "T_gate": final_bundle.get("oof_best_thr", 0.5),
        "beta_for_tuning": 0.5,
        "guards": {
            "min_area_frac": None,
            "max_area_frac": None,
            "max_components": None
        },
        "notes": "Placeholder. Update after calibration/threshold tuning."
    }
    thresholds_path.write_text(json.dumps(thresholds, indent=2))

# ----------------------------
# 3) Capture dataset/cfg metadata (if available)
# ----------------------------
cfg_meta = {}
if "PATHS" in globals() and isinstance(PATHS, dict):
    cfg_meta = {
        "COMP_ROOT": PATHS.get("COMP_ROOT", None),
        "OUT_DS_ROOT": PATHS.get("OUT_DS_ROOT", None),
        "OUT_ROOT": PATHS.get("OUT_ROOT", None),
        "MATCH_CFG_DIR": PATHS.get("MATCH_CFG_DIR", None),
        "PRED_CFG_DIR": PATHS.get("PRED_CFG_DIR", None),
        "DINO_CFG_DIR": PATHS.get("DINO_CFG_DIR", None),
        "DINO_LARGE_DIR": PATHS.get("DINO_LARGE_DIR", None),
        "PRED_FEAT_TRAIN": PATHS.get("PRED_FEAT_TRAIN", None),
        "MATCH_FEAT_TRAIN": PATHS.get("MATCH_FEAT_TRAIN", None),
        "DF_TRAIN_ALL": PATHS.get("DF_TRAIN_ALL", None),
        "CV_CASE_FOLDS": PATHS.get("CV_CASE_FOLDS", None),
        "IMG_PROFILE_TRAIN": PATHS.get("IMG_PROFILE_TRAIN", None),
    }

# ----------------------------
# 4) Build reproducible manifest
# ----------------------------
manifest = {
    "created_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "python": platform.python_version(),
    "platform": platform.platform(),
    "bundle_version": "v1",
    "task": "Recod.ai/LUC — Gate Model (authentic vs forged) — DINOv2 Large pipeline",
    "artifacts": {
        "final_model": str(final_model_path),
        "final_bundle": str(final_bundle_path) if final_bundle_path.exists() else None,
        "feature_cols": str(feature_cols_path),
        "thresholds": str(thresholds_path),
        "baseline_report": str(baseline_report_path) if baseline_report_path.exists() else None,
        "opt_config": str(opt_config_path) if opt_config_path.exists() else None,
        "opt_results_csv": str(opt_results_csv) if opt_results_csv.exists() else None,
        "opt_fold_details_csv": str(opt_fold_csv) if opt_fold_csv.exists() else None,
        "oof_baseline_csv": str(oof_baseline_csv) if oof_baseline_csv.exists() else None,
    },
    "cfg_meta": cfg_meta,
    "model_summary": {
        "type": final_bundle.get("type", None),
        "members": final_bundle.get("members", None),
        "train_rows": final_bundle.get("train_rows", None),
        "pos_rate": final_bundle.get("pos_rate", None),
        "feature_count": len(feature_cols),
    },
    "baseline_summary": (baseline_report.get("overall") if isinstance(baseline_report, dict) else None),
    "opt_summary": (opt_config if isinstance(opt_config, dict) else None),
}

manifest_path = OUT_DIR / "model_bundle_manifest.json"
manifest_path.write_text(json.dumps(manifest, indent=2))

# ----------------------------
# 5) Create a single "bundle pack" (joblib) for easy reload
# ----------------------------
# Note: we store model object bytes only via joblib file path to avoid duplication.
# Bundle pack contains pointers + key data.
bundle_pack = {
    "final_model_path": str(final_model_path),
    "final_bundle": final_bundle,
    "feature_cols": feature_cols,
    "thresholds": thresholds,
    "cfg_meta": cfg_meta,
    "manifest": manifest,
}

bundle_pack_path = OUT_DIR / "model_bundle_pack.joblib"
joblib.dump(bundle_pack, bundle_pack_path)

print("OK — Model bundle finalized")
print("  manifest  ->", manifest_path)
print("  pack      ->", bundle_pack_path)
print("  thresholds->", thresholds_path)

print("\nBundle summary:")
print("  type        :", manifest["model_summary"]["type"])
print("  members     :", manifest["model_summary"]["members"])
print("  feature_cnt :", manifest["model_summary"]["feature_count"])
print("  T_gate      :", thresholds.get("T_gate"))
