# Set Paths & Select Config (CFG)

In [43]:
# ============================================================
# STAGE — Set Paths & Select Config (CFG)  (ONE CELL) — FULL REVISION
# - Robust: find token cache even when /kaggle/working cache empty
# - No if/else (uses try/except)
# ============================================================

import os, json, random, hashlib
from pathlib import Path
from dataclasses import dataclass, asdict

# ----------------------------
# Helpers
# ----------------------------
def first_existing(paths):
    """Return the first candidate in ``paths`` that exists on disk, as a ``Path``.

    Args:
        paths: Iterable of path-like candidates, checked in order.

    Returns:
        ``Path`` of the first candidate for which ``Path.exists()`` is true.

    Raises:
        FileNotFoundError: If no candidate exists. The message lists every
            candidate that was tried so the missing dataset/bundle is obvious.
    """
    candidates = [Path(p) for p in paths]
    try:
        return next(p for p in candidates if p.exists())
    except StopIteration:
        # A bare StopIteration carries no context; surface what was searched.
        tried = ", ".join(str(p) for p in candidates)
        raise FileNotFoundError(f"None of the candidate paths exist: [{tried}]") from None

def jhash(obj) -> str:
    """Return a short, order-independent fingerprint of a JSON-serializable object.

    The object is dumped as compact JSON with sorted keys and ASCII escaping,
    hashed with MD5, and the first 12 hex characters of the digest are returned.
    """
    canonical = json.dumps(
        obj,
        separators=(",", ":"),
        sort_keys=True,
        ensure_ascii=True,
    )
    digest = hashlib.md5(canonical.encode("utf-8"))
    return digest.hexdigest()[:12]

def newest_token_cache_anywhere(parents):
    """Return the most recently modified *valid* DINOv2 token cache directory.

    Every ``dinov2_base_518_cfg_*`` directory directly under each parent is
    considered. A directory is valid only when it contains
    ``tokens_manifest_train.parquet`` AND at least one ``train/*.npz`` file.
    (Previously the manifest check's result was discarded, so a cache without
    a manifest could be selected.)

    Args:
        parents: Iterable of path-like directories to scan. Missing parents
            simply contribute no candidates.

    Returns:
        ``Path`` of the valid cache with the largest ``st_mtime``; on ties the
        first one encountered wins.

    Raises:
        FileNotFoundError: If no valid cache exists under any parent.
    """
    parents = [Path(p) for p in parents]

    def is_valid(cache_dir):
        # Best-effort probe: an unreadable directory is treated as invalid.
        try:
            return (cache_dir / "tokens_manifest_train.parquet").exists() and any(
                (cache_dir / "train").glob("*.npz")
            )
        except OSError:
            return False

    cands = [
        d
        for parent in parents
        for d in parent.glob("dinov2_base_518_cfg_*")
        if is_valid(d)
    ]
    try:
        return max(cands, key=lambda d: d.stat().st_mtime)
    except ValueError:
        searched = ", ".join(str(p) for p in parents)
        raise FileNotFoundError(
            f"No valid token cache (dinov2_base_518_cfg_*) found under: [{searched}]"
        ) from None

def newest_match_cache_anywhere(parents):
    """Return the most recently modified *valid* match cache directory.

    Every ``match_cfg_*`` directory directly under each parent is considered.
    A directory is valid only when it contains ``match_manifest_train.parquet``.
    (Previously the existence check's result was discarded, so every matching
    directory was accepted.)

    Args:
        parents: Iterable of path-like directories to scan. Missing parents
            simply contribute no candidates.

    Returns:
        ``Path`` of the valid cache with the largest ``st_mtime``; on ties the
        first one encountered wins.

    Raises:
        FileNotFoundError: If no valid cache exists under any parent.
    """
    parents = [Path(p) for p in parents]

    def is_valid(cache_dir):
        # Best-effort probe: an unreadable directory is treated as invalid.
        try:
            return (cache_dir / "match_manifest_train.parquet").exists()
        except OSError:
            return False

    cands = [
        d
        for parent in parents
        for d in parent.glob("match_cfg_*")
        if is_valid(d)
    ]
    try:
        return max(cands, key=lambda d: d.stat().st_mtime)
    except ValueError:
        searched = ", ".join(str(p) for p in parents)
        raise FileNotFoundError(
            f"No valid match cache (match_cfg_*) found under: [{searched}]"
        ) from None

# ----------------------------
# Core Paths (Competition + DINO)
# ----------------------------
# Fixed Kaggle input locations (not checked for existence here).
# DINO_DIR presumably holds the DINOv2-base weights — TODO confirm against the stage that loads it.
COMP_ROOT = Path("/kaggle/input/recodai-luc-scientific-image-forgery-detection")
DINO_DIR  = Path("/kaggle/input/dinov2/pytorch/base/1")

# Prefer the working bundle if it exists; fall back to the input dataset bundle.
# first_existing() fails if neither candidate is present.
BUNDLE_ROOT = first_existing([
    "/kaggle/working/recodai_luc",
    "/kaggle/input/recod-ailuc-dinov2-base/recodai_luc",
])

# Same working-first / input-fallback rule for the PROF directory.
PROF_DIR = first_existing([
    "/kaggle/working/recodai_luc_prof",
    "/kaggle/input/recod-ailuc-dinov2-base/recodai_luc_prof",
])

# Cache directory of whichever bundle was selected (may not exist or may be empty).
CACHE_ROOT = Path(BUNDLE_ROOT) / "cache"

# ----------------------------
# Existing Artifacts (from PROF)
# ----------------------------
TRAIN_MANIFEST = Path(PROF_DIR) / "train_manifest.parquet"
TEST_MANIFEST  = Path(PROF_DIR) / "test_manifest.parquet"
FOLDS_PATH     = Path(PROF_DIR) / "folds.parquet"

# ----------------------------
# Find VALID token/match caches (search working + input bundle cache)
# ----------------------------
# CACHE_ROOT always equals one of the two explicit entries below (it is derived
# from BUNDLE_ROOT), so one directory is scanned twice; listing both explicitly
# guarantees the input cache is searched even when the working bundle was chosen.
CACHE_PARENTS = [
    CACHE_ROOT,
    Path("/kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cache"),
    Path("/kaggle/working/recodai_luc/cache"),
]

# Newest token cache across all parents; the test-side paths are derived only
# and are not checked for existence here.
TOKEN_CACHE_ROOT = newest_token_cache_anywhere(CACHE_PARENTS)
TOKEN_MANIFEST_TRAIN = TOKEN_CACHE_ROOT / "tokens_manifest_train.parquet"
TOKEN_MANIFEST_TEST  = TOKEN_CACHE_ROOT / "tokens_manifest_test.parquet"
TOK_TRAIN_DIR = TOKEN_CACHE_ROOT / "train"
TOK_TEST_DIR  = TOKEN_CACHE_ROOT / "test"

# Newest match cache across all parents (selected independently of the token cache).
MATCH_CACHE_ROOT = newest_match_cache_anywhere(CACHE_PARENTS)
MATCH_MANIFEST_TRAIN = MATCH_CACHE_ROOT / "match_manifest_train.parquet"
MATCH_MANIFEST_TEST  = MATCH_CACHE_ROOT / "match_manifest_test.parquet"

# Pred ensemble dir (features)
# NOTE(review): hard-coded to the input dataset, unlike the caches above which
# are searched; a pred_ens written under CACHE_ROOT would not be picked up here — confirm intent.
PRED_DIR = Path("/kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cache/pred_ens")
PRED_FEATS_TRAIN = PRED_DIR / "pred_features_train.csv"
PRED_FEATS_TEST  = PRED_DIR / "pred_features_test.csv"

# ----------------------------
# SEG CFG (UNet+ASPP on token grid)
# ----------------------------
@dataclass
class SegCFG:
    """Hyper-parameters for the token-grid segmentation decoder.

    The full field dict is hashed into SEG_CFG_ID, so changing any value (or
    adding/removing a field) yields a different run directory name.
    """
    run_name: str = "seg_unet_aspp_tok37_dinov2b"  # prefix of the run directory name
    seed: int = 42                                 # seed for random / numpy / torch
    img_size: int = 518                            # 518 / 14 = 37 tokens per side
    patch: int = 14
    htok: int = 37                                 # token-grid height used for mask targets
    wtok: int = 37                                 # token-grid width used for mask targets
    dim: int = 768                                 # token channel count (model input channels)
    save_dtype: str = "float16"
    cv_n_splits: int = 5
    epochs: int = 35                               # upper bound; training may stop early
    batch_size: int = 16
    lr: float = 3e-4
    base_ch: int = 128                             # decoder base channel width

# Default configuration and its 12-hex-char fingerprint.
SEG_CFG = SegCFG()
SEG_CFG_ID = jhash(asdict(SEG_CFG))

# ----------------------------
# Gate CFG (for later stages)
# ----------------------------
@dataclass
class GateCFG:
    """Configuration for the gate classifier used by later stages.

    The full field dict (including lgbm_params) is hashed into GATE_CFG_ID.
    """
    run_name: str = "gate_lgbm_v1"
    seed: int = 42
    cv_n_splits: int = 5
    model_type: str = "lgbm"
    use_calibration: bool = True
    calibration_method: str = "sigmoid"
    threshold_strategy: str = "stability_f1"
    # None by default (avoids a shared mutable default); callers are expected
    # to pass an explicit dict, as GATE_CFG does below.
    lgbm_params: dict = None

# Parameter names look like LightGBM options — presumably consumed by the gate
# training stage; verify against that stage.
GATE_CFG = GateCFG(
    lgbm_params=dict(
        objective="binary",
        learning_rate=0.03,
        n_estimators=2500,
        num_leaves=63,
        max_depth=-1,
        min_data_in_leaf=80,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=1,
        lambda_l1=0.0,
        lambda_l2=1.0,
        random_state=42,
        n_jobs=-1,
    )
)
# 12-hex-char fingerprint of the gate configuration.
GATE_CFG_ID = jhash(asdict(GATE_CFG))

# ----------------------------
# Output dirs (new artifacts go here)
# ----------------------------
# New artifacts always go under /kaggle/working, even when BUNDLE_ROOT resolved
# to the input dataset. Run directories are keyed by run name + config hash.
WORK_ROOT = Path("/kaggle/working/recodai_luc")
SEG_RUN_DIR  = WORK_ROOT / "seg_runs"  / f"{SEG_CFG.run_name}_{SEG_CFG_ID}"
GATE_OUT_DIR = WORK_ROOT / "gate_runs" / f"{GATE_CFG.run_name}_{GATE_CFG_ID}"
# Side effect: creates both directories (and parents) if missing.
SEG_RUN_DIR.mkdir(parents=True, exist_ok=True)
GATE_OUT_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------
# Seeds
# ----------------------------
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so setting it
# here presumably only affects child processes, not this kernel — confirm intent.
os.environ["PYTHONHASHSEED"] = str(SEG_CFG.seed)
# Only the stdlib RNG is seeded in this cell.
random.seed(SEG_CFG.seed)

# ----------------------------
# Print summary
# ----------------------------
print("COMP_ROOT:", COMP_ROOT)
print("DINO_DIR :", DINO_DIR)
print("BUNDLE_ROOT:", BUNDLE_ROOT)
print("PROF_DIR  :", PROF_DIR)
print("CACHE_ROOT :", CACHE_ROOT)

print("\nTOKEN_CACHE_ROOT:", TOKEN_CACHE_ROOT)
print("TOK_TRAIN_DIR   :", TOK_TRAIN_DIR)
print("TOK_TEST_DIR    :", TOK_TEST_DIR)
print("TOKEN_MANIFEST_TRAIN:", TOKEN_MANIFEST_TRAIN.name)

print("\nMATCH_CACHE_ROOT:", MATCH_CACHE_ROOT)
print("MATCH_MANIFEST_TRAIN:", MATCH_MANIFEST_TRAIN.name)

print("\nPRED_DIR        :", PRED_DIR)
print("PRED_FEATS_TRAIN:", PRED_FEATS_TRAIN.name)
print("PRED_FEATS_TEST :", PRED_FEATS_TEST.name)

print("\nSEG_CFG_ID :", SEG_CFG_ID)
print("GATE_CFG_ID:", GATE_CFG_ID)
# Full configs as pretty-printed JSON for the run log.
print("\nSEG_CFG:", json.dumps(asdict(SEG_CFG), indent=2))
print("\nGATE_CFG:", json.dumps(asdict(GATE_CFG), indent=2))


COMP_ROOT: /kaggle/input/recodai-luc-scientific-image-forgery-detection
DINO_DIR : /kaggle/input/dinov2/pytorch/base/1
BUNDLE_ROOT: /kaggle/working/recodai_luc
PROF_DIR  : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc_prof
CACHE_ROOT : /kaggle/working/recodai_luc/cache

TOKEN_CACHE_ROOT: /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cache/dinov2_base_518_cfg_543289469500
TOK_TRAIN_DIR   : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cache/dinov2_base_518_cfg_543289469500/train
TOK_TEST_DIR    : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cache/dinov2_base_518_cfg_543289469500/test
TOKEN_MANIFEST_TRAIN: tokens_manifest_train.parquet

MATCH_CACHE_ROOT: /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cache/match_cfg_2ed747746f9c
MATCH_MANIFEST_TRAIN: match_manifest_train.parquet

PRED_DIR        : /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cache/pred_ens
PRED_FEATS_TRAIN: pred_features_train.csv
PRED_FEATS_TEST : pred_features_test.csv

SEG_CFG_ID : 898cfeeba43

# Train Segmentation Decoder (UNet+ASPP)

In [None]:
# ============================================================
# STAGE — Train Segmentation Decoder (UNet + ASPP)  [ONE CELL] — FULL REVISION
# Fix: mask .npy can be (H,W) or (1,H,W) or (K,H,W) -> force to (H,W) before interpolate
# ============================================================

import os, gc, json, math, random
from pathlib import Path
import numpy as np
import pandas as pd
from dataclasses import asdict
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# ----------------------------
# Device / seed
# ----------------------------
# Probe CUDA by actually allocating a tensor on it; any failure falls back to CPU.
try:
    _ = torch.zeros(1).cuda()
    DEVICE = torch.device("cuda")
except Exception:
    DEVICE = torch.device("cpu")

# Seed torch, stdlib random and numpy from the segmentation config.
torch.manual_seed(SEG_CFG.seed)
random.seed(SEG_CFG.seed)
np.random.seed(SEG_CFG.seed)

print("DEVICE:", DEVICE)
print("TOK_TRAIN_DIR:", TOK_TRAIN_DIR)
print("SEG_RUN_DIR  :", SEG_RUN_DIR)

# ----------------------------
# Build df_seg: (case_id, mask_path, npz_path, fold)
# ----------------------------
# Masks come from two competition folders; the file stem is used as case_id.
mask_train = sorted((Path(COMP_ROOT) / "train_masks").glob("*.npy"))
mask_supp  = sorted((Path(COMP_ROOT) / "supplemental_masks").glob("*.npy"))
mask_paths = mask_train + mask_supp

df_mask = pd.DataFrame({
    "case_id": [p.stem for p in mask_paths],
    "mask_path": [str(p) for p in mask_paths],
})
df_mask["case_id"] = df_mask["case_id"].astype(str)

# Fold assignments; case_id is normalized to str so the merge keys match.
df_folds = pd.read_parquet(FOLDS_PATH).copy()
df_folds["case_id"] = df_folds["case_id"].astype(str)
df_folds["fold"] = df_folds["fold"].astype(int)

# Inner join: masks without a fold assignment are dropped.
df_seg = df_mask.merge(df_folds[["case_id","fold"]], on="case_id", how="inner")
df_seg["npz_path"] = df_seg["case_id"].map(lambda s: str(Path(TOK_TRAIN_DIR) / f"{s}.npz"))

# Keep only cases whose token file exists; on duplicate case_id (e.g. the same
# stem in both mask folders) the first row wins, i.e. the train_masks entry.
df_seg["npz_exists"] = df_seg["npz_path"].map(lambda s: Path(s).exists())
df_seg = df_seg[df_seg["npz_exists"]].drop(columns=["npz_exists"]).drop_duplicates("case_id").reset_index(drop=True)

print("masks(train):", len(mask_train), "| masks(supp):", len(mask_supp))
print("df_seg:", df_seg.shape)
print("folds:", sorted(df_seg["fold"].unique().tolist()))
# display() is not imported in this cell — presumably provided by the notebook runtime.
display(df_seg.head(3))

# ----------------------------
# Model: UNet decoder + ASPP (token-grid)
# ----------------------------
class ConvGNAct(nn.Module):
    """Bias-free Conv2d followed by GroupNorm and an in-place SiLU.

    Args:
        in_ch: Input channel count.
        out_ch: Output channel count.
        k, s, p: Convolution kernel size, stride and padding.
        g: Requested number of GroupNorm groups (capped at ``out_ch``).
    """

    def __init__(self, in_ch, out_ch, k=3, s=1, p=1, g=16):
        super().__init__()
        n_groups = g if g < out_ch else out_ch
        # Attribute names (conv / gn / act) define the checkpoint keys.
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size=k, stride=s, padding=p, bias=False)
        self.gn = nn.GroupNorm(n_groups, out_ch)
        self.act = nn.SiLU(inplace=True)

    def forward(self, x):
        feats = self.conv(x)
        feats = self.gn(feats)
        return self.act(feats)

class ASPP(nn.Module):
    """Parallel convolution branches at several dilation rates, fused by a 1x1 conv.

    A rate of 1 yields a pointwise (1x1) branch; any other rate yields a 3x3
    convolution with ``dilation == padding == rate`` so spatial size is kept.
    Branch outputs are concatenated on the channel axis and projected back to
    ``out_ch`` channels.
    """

    def __init__(self, in_ch, out_ch, rates=(1, 4, 8, 12)):
        super().__init__()
        n_groups = min(16, out_ch)

        def make_branch(rate):
            if rate == 1:
                conv = nn.Conv2d(in_ch, out_ch, 1, bias=False)
            else:
                conv = nn.Conv2d(in_ch, out_ch, 3, padding=rate, dilation=rate, bias=False)
            return nn.Sequential(
                conv,
                nn.GroupNorm(num_groups=n_groups, num_channels=out_ch),
                nn.SiLU(inplace=True),
            )

        # Branch order follows `rates`; it determines the checkpoint key indices.
        self.branches = nn.ModuleList([make_branch(r) for r in rates])
        self.proj = nn.Sequential(
            nn.Conv2d(len(rates) * out_ch, out_ch, 1, bias=False),
            nn.GroupNorm(num_groups=n_groups, num_channels=out_ch),
            nn.SiLU(inplace=True),
        )

    def forward(self, x):
        stacked = torch.cat([branch(x) for branch in self.branches], dim=1)
        return self.proj(stacked)

class UNetASPP(nn.Module):
    """Three-level U-Net style decoder with an ASPP bottleneck; emits 1-channel logits.

    Channel widths are ``base_ch``, ``2*base_ch`` and ``4*base_ch``. Downsampling
    uses stride-2 3x3 convs, upsampling uses stride-2 transposed convs whose
    output is cropped to the matching encoder feature map before concatenation.
    """

    def __init__(self, in_ch=768, base_ch=128):
        super().__init__()
        c1 = base_ch
        c2 = 2 * base_ch
        c3 = 4 * base_ch

        # Encoder level 1 and its downsampler.
        self.enc1 = nn.Sequential(ConvGNAct(in_ch, c1), ConvGNAct(c1, c1))
        self.down1 = nn.Conv2d(c1, c1, kernel_size=3, stride=2, padding=1)

        # Encoder level 2 and its downsampler.
        self.enc2 = nn.Sequential(ConvGNAct(c1, c2), ConvGNAct(c2, c2))
        self.down2 = nn.Conv2d(c2, c2, kernel_size=3, stride=2, padding=1)

        # Bottleneck: encoder level 3 followed by ASPP.
        self.enc3 = nn.Sequential(ConvGNAct(c2, c3), ConvGNAct(c3, c3))
        self.aspp = ASPP(c3, c3)

        # Decoder level 2 (input = upsampled + skip, hence 2*c2 channels).
        self.up2 = nn.ConvTranspose2d(c3, c2, kernel_size=2, stride=2)
        self.dec2 = nn.Sequential(ConvGNAct(2 * c2, c2), ConvGNAct(c2, c2))

        # Decoder level 1 (input = upsampled + skip, hence 2*c1 channels).
        self.up1 = nn.ConvTranspose2d(c2, c1, kernel_size=2, stride=2)
        self.dec1 = nn.Sequential(ConvGNAct(2 * c1, c1), ConvGNAct(c1, c1))

        self.head = nn.Conv2d(c1, 1, kernel_size=1)

    @staticmethod
    def _crop_like(t, ref):
        """Crop the trailing rows/cols of ``t`` so its H, W match ``ref``."""
        return t[:, :, :ref.shape[2], :ref.shape[3]]

    def forward(self, x):
        # Shapes in comments are for a 37x37 input grid.
        skip1 = self.enc1(x)                     # (B,c1,37,37)
        skip2 = self.enc2(self.down1(skip1))     # (B,c2,19,19)
        bottom = self.enc3(self.down2(skip2))    # (B,c3,10,10)
        bottom = self.aspp(bottom)               # (B,c3,10,10)

        up = self._crop_like(self.up2(bottom), skip2)   # 20x20 -> 19x19
        mid = self.dec2(torch.cat([up, skip2], dim=1))

        up = self._crop_like(self.up1(mid), skip1)      # 38x38 -> 37x37
        top = self.dec1(torch.cat([up, skip1], dim=1))

        return self.head(top)                    # (B,1,37,37)

# ----------------------------
# Dataset: token npz + mask npy -> y_tok (37x37)
# ----------------------------
class TokMaskDS(Dataset):
    """Dataset pairing cached token grids (.npz) with ground-truth masks (.npy).

    Args:
        df: DataFrame with string-convertible ``case_id``, ``npz_path`` and
            ``mask_path`` columns.
        htok, wtok: Target token-grid size the mask is resized to.
        dim: Token channel count, used to detect the token array layout.
        aug: If true, apply a random horizontal / vertical / both flip
            (25% each; 25% unchanged) to tokens and mask together.

    Each item is ``(x, y)`` with ``x`` a float32 channel-first token tensor and
    ``y`` a float32 ``(htok, wtok)`` mask tensor.
    """

    def __init__(self, df: pd.DataFrame, htok=37, wtok=37, dim=768, aug=False):
        self.case_id = df["case_id"].astype(str).tolist()
        self.npz_path = df["npz_path"].astype(str).tolist()
        self.mask_path = df["mask_path"].astype(str).tolist()
        self.htok, self.wtok, self.dim = int(htok), int(wtok), int(dim)
        self.aug = bool(aug)

    def __len__(self):
        return len(self.case_id)

    def __getitem__(self, i):
        # Tokens: the first array stored in the npz file.
        with np.load(self.npz_path[i]) as z:
            a = np.asarray(z[z.files[0]])

        # Layout detection via an explicit shape check (not `assert`, which is
        # stripped under `python -O` and would silently mis-permute (C,H,W)):
        # channels-last (H,W,C) is moved to (C,H,W); anything else is kept as is.
        if a.ndim >= 1 and a.shape[-1] == self.dim:
            x = np.moveaxis(a, -1, 0)
        else:
            x = a
        x = torch.from_numpy(np.asarray(x, dtype=np.float32))  # (dim,H,W)

        # Mask: robust to (H,W), (1,H,W) or (K,H,W); K instance masks are
        # merged with a per-pixel max over the leading axis.
        m = np.squeeze(np.asarray(np.load(self.mask_path[i])))
        if m.ndim == 3:
            m = m.max(axis=0)
        m = np.asarray(m, dtype=np.float32)

        # Nearest-neighbour resize of the mask to the token grid.
        y = torch.from_numpy(m).unsqueeze(0).unsqueeze(0)      # (1,1,H,W)
        y = F.interpolate(y, size=(self.htok, self.wtok), mode="nearest")[0, 0]  # (htok,wtok)

        if self.aug:
            # Flip tokens and mask along the same spatial axes.
            r = random.random()
            if r < 0.25:
                x = torch.flip(x, dims=[2]); y = torch.flip(y, dims=[1])
            elif r < 0.50:
                x = torch.flip(x, dims=[1]); y = torch.flip(y, dims=[0])
            elif r < 0.75:
                x = torch.flip(x, dims=[1,2]); y = torch.flip(y, dims=[0,1])

        return x, y

# ----------------------------
# Loss (BCE + soft-dice)
# ----------------------------
def soft_dice_loss(logits, targets, eps=1e-6):
    """Soft Dice loss computed from raw logits.

    Probabilities are obtained with a sigmoid; overlap and mass are summed over
    the two spatial axes (dims 2 and 3), ``eps`` is added to numerator and
    denominator, and ``1 - dice`` is averaged over the remaining axes.
    """
    spatial = (2, 3)
    probs = logits.sigmoid()
    overlap = (probs * targets).sum(dim=spatial)
    mass = (probs + targets).sum(dim=spatial)
    dice = (2.0 * overlap + eps) / (mass + eps)
    return (1.0 - dice).mean()

# Binary cross-entropy on logits, combined with the Dice term during training.
bce = nn.BCEWithLogitsLoss()

# ----------------------------
# Train one fold
# ----------------------------
def train_one_fold(df_all, fold):
    """Train one cross-validation fold of the segmentation decoder.

    Rows with ``fold != fold`` are used for training (with flip augmentation),
    rows with ``fold == fold`` for validation. Training runs for at most
    ``SEG_CFG.epochs`` epochs with AdamW and stops early after 6 consecutive
    epochs without a new best validation loss.

    Args:
        df_all: DataFrame with ``case_id``, ``npz_path``, ``mask_path``, ``fold``.
        fold: Fold id held out for validation.

    Returns:
        ``(model, hist)`` where ``model`` carries the best-validation weights
        (or the last weights if validation loss never improved) and ``hist``
        is ``{"train_loss": [...], "val_loss": [...]}`` with one entry per epoch.
    """
    df_tr = df_all[df_all["fold"] != int(fold)].reset_index(drop=True)
    df_va = df_all[df_all["fold"] == int(fold)].reset_index(drop=True)

    ds_tr = TokMaskDS(df_tr, htok=SEG_CFG.htok, wtok=SEG_CFG.wtok, dim=SEG_CFG.dim, aug=True)
    ds_va = TokMaskDS(df_va, htok=SEG_CFG.htok, wtok=SEG_CFG.wtok, dim=SEG_CFG.dim, aug=False)

    # NOTE(review): with num_workers>0 in a notebook, worker teardown can emit
    # "can only test a child process" noise on stderr; presumably harmless —
    # consider num_workers=0 or persistent_workers=True if it matters.
    dl_tr = DataLoader(ds_tr, batch_size=SEG_CFG.batch_size, shuffle=True, num_workers=2, pin_memory=True, drop_last=True)
    dl_va = DataLoader(ds_va, batch_size=SEG_CFG.batch_size, shuffle=False, num_workers=2, pin_memory=True)

    model = UNetASPP(in_ch=SEG_CFG.dim, base_ch=SEG_CFG.base_ch).to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=SEG_CFG.lr, weight_decay=1e-4)

    def seg_loss(logits, y):
        # Same objective for training and validation: BCE + 0.5 * soft Dice.
        return bce(logits, y) + 0.5 * soft_dice_loss(logits, y)

    best = float("inf")
    best_state = None
    patience = 6
    bad = 0
    hist = {"train_loss": [], "val_loss": []}

    for epoch in range(1, SEG_CFG.epochs + 1):
        model.train()
        tr_loss = 0.0
        n_tr = 0

        for x, y in tqdm(dl_tr, desc=f"fold{fold} train ep{epoch}", leave=False):
            x = x.to(DEVICE, non_blocking=True)
            y = y.to(DEVICE, non_blocking=True).unsqueeze(1)   # (B,1,htok,wtok)

            opt.zero_grad(set_to_none=True)
            logits = model(x)
            loss = seg_loss(logits, y)
            loss.backward()
            opt.step()

            # Accumulate a sample-weighted sum so the epoch mean is per-sample.
            bs = x.size(0)
            tr_loss += loss.item() * bs
            n_tr += bs

        model.eval()
        va_loss = 0.0
        n_va = 0
        with torch.inference_mode():
            for x, y in dl_va:
                x = x.to(DEVICE, non_blocking=True)
                y = y.to(DEVICE, non_blocking=True).unsqueeze(1)
                loss = seg_loss(model(x), y)
                bs = x.size(0)
                va_loss += loss.item() * bs
                n_va += bs

        # max(1, n) guards against an empty loader (e.g. drop_last with few samples).
        tr = tr_loss / max(1, n_tr)
        va = va_loss / max(1, n_va)
        hist["train_loss"].append(tr)
        hist["val_loss"].append(va)

        print(f"[fold {fold}] ep{epoch:02d} train={tr:.5f} val={va:.5f}")

        if va < best:
            best = va
            # Snapshot on CPU so later epochs cannot mutate the best weights.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                break

    # best_state stays None when validation loss never improved (NaN loss, or
    # SEG_CFG.epochs == 0); load_state_dict(None) would crash in that case.
    if best_state is not None:
        model.load_state_dict(best_state, strict=True)
    else:
        print(f"[fold {fold}] WARNING: validation loss never improved; returning last weights")
    return model, hist

# ----------------------------
# Train CV + Save
# ----------------------------
# Train one model per fold id present in df_seg, in ascending order.
uniq_folds = sorted(df_seg["fold"].unique().tolist())

all_hist = {}      # fold id (as str) -> loss history returned by train_one_fold
model_paths = []   # checkpoint paths, in training order

for f in uniq_folds:
    model, hist = train_one_fold(df_seg, f)

    # Checkpoint bundles the weights with the config needed to rebuild the model.
    # The file name pattern is what the inference stage globs for.
    out_path = Path(SEG_RUN_DIR) / f"seg_unet_aspp_fold{int(f)}.pt"
    torch.save(
        {
            "state_dict": model.state_dict(),
            "seg_cfg": asdict(SEG_CFG),
            "seg_cfg_id": SEG_CFG_ID,
            "fold": int(f),
            "token_grid": {"htok": SEG_CFG.htok, "wtok": SEG_CFG.wtok, "dim": SEG_CFG.dim},
        },
        out_path
    )

    all_hist[str(int(f))] = hist
    model_paths.append(str(out_path))
    print("SAVED:", out_path)

# Single JSON summary of the whole CV run (config, checkpoint paths, histories).
summary_path = Path(SEG_RUN_DIR) / "seg_cv_history.json"
summary_path.write_text(json.dumps(
    {"seg_cfg": asdict(SEG_CFG), "seg_cfg_id": SEG_CFG_ID, "models": model_paths, "history": all_hist},
    indent=2
))
print("SAVED:", summary_path)

# Best-effort cleanup; the last fold's `model` is still referenced at this point.
gc.collect()
try:
    torch.cuda.empty_cache()
except Exception:
    pass


DEVICE: cpu
TOK_TRAIN_DIR: /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cache/dinov2_base_518_cfg_543289469500/train
SEG_RUN_DIR  : /kaggle/working/recodai_luc/seg_runs/seg_unet_aspp_tok37_dinov2b_898cfeeba43a
masks(train): 2751 | masks(supp): 48
df_seg: (2795, 4)
folds: [0, 1, 2, 3, 4]


Unnamed: 0,case_id,mask_path,fold,npz_path
0,10,/kaggle/input/recodai-luc-scientific-image-for...,3,/kaggle/input/recod-ailuc-dinov2-base/recodai_...
1,10015,/kaggle/input/recodai-luc-scientific-image-for...,4,/kaggle/input/recod-ailuc-dinov2-base/recodai_...
2,10017,/kaggle/input/recodai-luc-scientific-image-for...,2,/kaggle/input/recod-ailuc-dinov2-base/recodai_...




fold0 train ep1:   0%|          | 0/139 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

[fold 0] ep01 train=0.56343 val=0.50878


fold0 train ep2:   0%|          | 0/139 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

[fold 0] ep02 train=0.49327 val=0.46714


fold0 train ep3:   0%|          | 0/139 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

[fold 0] ep03 train=0.45646 val=0.44441


fold0 train ep4:   0%|          | 0/139 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

[fold 0] ep04 train=0.43185 val=0.43497


fold0 train ep5:   0%|          | 0/139 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

[fold 0] ep05 train=0.41134 val=0.42622


fold0 train ep6:   0%|          | 0/139 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 16

[fold 0] ep06 train=0.39456 val=0.41172


fold0 train ep7:   0%|          | 0/139 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^Exception ignored in: ^
<function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>AssertionError
: Traceback (most recent call last):
can only test a child process  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__

    Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>self._shutdown_workers()

[fold 0] ep07 train=0.38021 val=0.40858


fold0 train ep8:   0%|          | 0/139 [00:00<?, ?it/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
    if w.is_alive():
       Exception ignored in: ^<function _MultiProcessingDataLoaderIter.__del__ at 0x7cf50570c680>^^
^^Traceback (most recent call last):
^  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1664, in __del__
^^    ^self._shutdown_workers()^
^  File "/usr/local/lib/python3.12/dist-packages/torch/utils/data/dataloader.py", line 1647, in _shutdown_workers
^    
if w.is_alive():
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
      assert self._parent_pid == os.getpid(), 'can only test a child process'  
     ^  ^^ ^ ^ ^  ^  ^^^^^^^^^
^  File 

[fold 0] ep08 train=0.36798 val=0.40896


fold0 train ep9:   0%|          | 0/139 [00:00<?, ?it/s]

# Seg OOF/Test Inference + Post-Process + Export pred_features

In [None]:
# ============================================================
# STAGE — Seg OOF/Test Inference + Post-Process + Export pred_features  [ONE CELL]
# - Load fold models from SEG_RUN_DIR
# - Inference on TRAIN (OOF by fold) + TEST (ensemble)
# - Save prob token-grid as .npz in CACHE_ROOT/pred_ens/{train,test}/{case_id}.npz
# - Build pred_features_{train,test}.csv (schema compatible with your gate stage)
#
# REQUIRE globals:
#   COMP_ROOT, CACHE_ROOT, TRAIN_MANIFEST, TEST_MANIFEST, FOLDS_PATH
#   TOKEN_CACHE_ROOT or TOK_TRAIN_DIR/TOK_TEST_DIR
#   SEG_RUN_DIR, SEG_CFG, SEG_CFG_ID
# ============================================================

import os, re, gc, json, math
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn.functional as F

# ----------------------------
# Device
# ----------------------------
# Probe CUDA by actually allocating a tensor on it; any failure falls back to CPU.
try:
    _ = torch.zeros(1).cuda()
    DEVICE = torch.device("cuda")
except Exception:
    DEVICE = torch.device("cpu")

print("DEVICE:", DEVICE)
print("SEG_RUN_DIR:", SEG_RUN_DIR)

# ----------------------------
# Minimal model definition (must match training cell)
# ----------------------------
import torch.nn as nn

class ConvGNAct(nn.Module):
    """Bias-free Conv2d -> GroupNorm -> in-place SiLU (inference-cell copy).

    Attribute names (``conv`` / ``gn`` / ``act``) define the state-dict keys
    and must stay as they are for saved checkpoints to load.
    """

    def __init__(self, in_ch, out_ch, k=3, s=1, p=1, g=16):
        super().__init__()
        n_groups = g if g < out_ch else out_ch
        self.conv = nn.Conv2d(in_ch, out_ch, kernel_size=k, stride=s, padding=p, bias=False)
        self.gn = nn.GroupNorm(n_groups, out_ch)
        self.act = nn.SiLU(inplace=True)

    def forward(self, x):
        feats = self.conv(x)
        feats = self.gn(feats)
        return self.act(feats)

class ASPP(nn.Module):
    """Multi-rate convolution branches fused by a 1x1 projection (inference-cell copy).

    Rate 1 builds a pointwise branch; other rates build a 3x3 convolution with
    ``dilation == padding == rate``. Branch order follows ``rates`` and fixes
    the state-dict key indices.
    """

    def __init__(self, in_ch, out_ch, rates=(1, 4, 8, 12)):
        super().__init__()
        n_groups = min(16, out_ch)

        def make_branch(rate):
            if rate == 1:
                conv = nn.Conv2d(in_ch, out_ch, 1, bias=False)
            else:
                conv = nn.Conv2d(in_ch, out_ch, 3, padding=rate, dilation=rate, bias=False)
            return nn.Sequential(
                conv,
                nn.GroupNorm(num_groups=n_groups, num_channels=out_ch),
                nn.SiLU(inplace=True),
            )

        self.branches = nn.ModuleList([make_branch(r) for r in rates])
        self.proj = nn.Sequential(
            nn.Conv2d(len(rates) * out_ch, out_ch, 1, bias=False),
            nn.GroupNorm(num_groups=n_groups, num_channels=out_ch),
            nn.SiLU(inplace=True),
        )

    def forward(self, x):
        stacked = torch.cat([branch(x) for branch in self.branches], dim=1)
        return self.proj(stacked)

class UNetASPP(nn.Module):
    """Three-level U-Net style decoder with an ASPP bottleneck (inference-cell copy).

    Submodule names and creation order are kept as in the original definition
    so checkpoints load with ``strict=True``.
    """

    def __init__(self, in_ch=768, base_ch=128):
        super().__init__()
        c1 = base_ch
        c2 = 2 * base_ch
        c3 = 4 * base_ch

        self.enc1 = nn.Sequential(ConvGNAct(in_ch, c1), ConvGNAct(c1, c1))
        self.down1 = nn.Conv2d(c1, c1, kernel_size=3, stride=2, padding=1)

        self.enc2 = nn.Sequential(ConvGNAct(c1, c2), ConvGNAct(c2, c2))
        self.down2 = nn.Conv2d(c2, c2, kernel_size=3, stride=2, padding=1)

        self.enc3 = nn.Sequential(ConvGNAct(c2, c3), ConvGNAct(c3, c3))
        self.aspp = ASPP(c3, c3)

        self.up2 = nn.ConvTranspose2d(c3, c2, kernel_size=2, stride=2)
        self.dec2 = nn.Sequential(ConvGNAct(2 * c2, c2), ConvGNAct(c2, c2))

        self.up1 = nn.ConvTranspose2d(c2, c1, kernel_size=2, stride=2)
        self.dec1 = nn.Sequential(ConvGNAct(2 * c1, c1), ConvGNAct(c1, c1))

        self.head = nn.Conv2d(c1, 1, kernel_size=1)

    @staticmethod
    def _crop_like(t, ref):
        """Crop the trailing rows/cols of ``t`` so its H, W match ``ref``."""
        return t[:, :, :ref.shape[2], :ref.shape[3]]

    def forward(self, x):
        skip1 = self.enc1(x)
        skip2 = self.enc2(self.down1(skip1))
        bottom = self.aspp(self.enc3(self.down2(skip2)))

        # Transposed convs can overshoot odd sizes by one; crop to the skip map.
        up = self._crop_like(self.up2(bottom), skip2)
        mid = self.dec2(torch.cat([up, skip2], dim=1))

        up = self._crop_like(self.up1(mid), skip1)
        top = self.dec1(torch.cat([up, skip1], dim=1))

        return self.head(top)

# ----------------------------
# Load fold models
# ----------------------------
# Discover per-fold checkpoints written by the training stage.
ckpts = sorted(Path(SEG_RUN_DIR).glob("seg_unet_aspp_fold*.pt"))
print("Found ckpts:", len(ckpts))
fold_models = {}  # fold id (int, parsed from the file name) -> eval-mode model on DEVICE
for p in ckpts:
    # The glob pattern guarantees "fold" is present; digits after it are the fold id.
    m = re.search(r"fold(\d+)", p.name)
    fold = int(m.group(1))
    # Load on CPU first, then move the rebuilt model to DEVICE.
    pack = torch.load(p, map_location="cpu")
    # Architecture is rebuilt from the current SEG_CFG, not from pack["seg_cfg"];
    # strict=True makes a mismatch fail loudly.
    model = UNetASPP(in_ch=SEG_CFG.dim, base_ch=SEG_CFG.base_ch)
    model.load_state_dict(pack["state_dict"], strict=True)
    model.eval().to(DEVICE)
    fold_models[fold] = model

# Empty if no checkpoint was found; nothing here stops execution in that case.
fold_list = sorted(fold_models.keys())
print("Folds:", fold_list)

# ----------------------------
# Token loader (from token cache train/test dirs)
# - expects npz contains array (37,37,768) or (768,37,37)
# ----------------------------
# Prefer directories derived from TOKEN_CACHE_ROOT when that global exists;
# otherwise reuse the TOK_*_DIR globals from an earlier cell.
TOK_TRAIN_DIR = Path(TOKEN_CACHE_ROOT) / "train" if "TOKEN_CACHE_ROOT" in globals() else Path(TOK_TRAIN_DIR)
TOK_TEST_DIR  = Path(TOKEN_CACHE_ROOT) / "test"  if "TOKEN_CACHE_ROOT" in globals() else Path(TOK_TEST_DIR)

def load_token_npz(npz_path: str) -> np.ndarray:
    """Load the first array of an ``.npz`` file as a channel-first grid.

    Both layouts are accepted:

    * channel-last ``(H, W, C)``: the last axis is moved to the front;
    * channel-first ``(C, H, W)``: returned unchanged. It is recognised as a
      3-D array whose two trailing axes are equal and differ from the
      leading axis, e.g. ``(768, 37, 37)``.

    Arrays that match neither pattern keep the previous behavior (last axis
    moved to the front); 0-d arrays are returned as-is.

    Args:
        npz_path: Path to the ``.npz`` archive.

    Returns:
        The array, channel-first when the layout could be determined.
    """
    with np.load(npz_path) as z:
        a = np.asarray(z[z.files[0]])

    if a.ndim == 0:
        # Nothing to transpose (moveaxis would raise on a 0-d array).
        return a

    # Unconditionally moving the last axis would turn an already
    # channel-first (C, H, W) grid into (W, C, H).
    if a.ndim == 3 and a.shape[1] == a.shape[2] and a.shape[0] != a.shape[2]:
        return a

    return np.moveaxis(a, -1, 0)  # (H, W, C) -> (C, H, W)

# ----------------------------
# Inference helper
# ----------------------------
@torch.inference_mode()
def infer_probs(model, x):
    """Return sigmoid probabilities of ``model(x)`` as a float32 NumPy array.

    The batch is moved to ``DEVICE``; axis 1 of the model output is squeezed
    (dropped when it has size 1) before the result is copied back to the CPU.
    """
    batch = x.to(DEVICE, non_blocking=True)
    probs = torch.sigmoid(model(batch)).squeeze(1)
    host = probs.detach().cpu().numpy()
    return host.astype(np.float32)

def save_prob_tok(out_dir: Path, case_id: str, prob_tok: np.ndarray):
    """Write ``prob_tok`` as float16 to ``<out_dir>/<case_id>.npz``.

    The directory is created when missing. The array is stored compressed
    under the key ``prob_tok``; the file path is returned as a string.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    target = out_dir / f"{case_id}.npz"
    half_precision = prob_tok.astype(np.float16)
    np.savez_compressed(target, prob_tok=half_precision)
    return str(target)

# ----------------------------
# Feature extraction from token-prob map
# ----------------------------
def feats_from_prob_tok(prob_tok: np.ndarray, thr_tok=0.5):
    """Summarise a token probability map into four scalar features.

    Returns:
        ``(mean_prob_tok, area_frac_tok, has_prob, n_inst)`` where
        ``area_frac_tok`` is the fraction of tokens with probability
        ``>= thr_tok``, ``has_prob`` is 1 when that fraction is positive, and
        ``n_inst`` simply mirrors ``has_prob`` (no connected-component count).
    """
    above = prob_tok >= thr_tok
    area_frac_tok = float(np.mean(above))
    has_prob = 1 if area_frac_tok > 0.0 else 0
    return float(np.mean(prob_tok)), area_frac_tok, has_prob, has_prob

# ----------------------------
# Load manifests + folds + labels
# ----------------------------
# Read the train/test manifests and the fold assignment table.
df_train = pd.read_parquet(TRAIN_MANIFEST).copy()
df_test  = pd.read_parquet(TEST_MANIFEST).copy()
df_folds = pd.read_parquet(FOLDS_PATH).copy()

# The join key is compared as a string in all three frames.
for _frame in (df_train, df_test, df_folds):
    _frame["case_id"] = _frame["case_id"].astype(str)

# Attach the fold id to every training case (left join keeps manifest rows).
_fold_lookup = df_folds[["case_id", "fold"]]
df_train = df_train.merge(_fold_lookup, on="case_id", how="left")
df_train["fold"] = df_train["fold"].astype(int)

# Derive the binary target from the textual label only when the manifest
# does not already provide a y column (forged=1, anything else=0).
if "y" not in df_train.columns:
    _is_forged = df_train["label"].astype(str).str.lower() == "forged"
    df_train["y"] = _is_forged.astype(int)

# ----------------------------
# Output dirs (pred_ens)
# ----------------------------
# Prediction outputs live under <CACHE_ROOT>/pred_ens/{train,test}.
PRED_DIR = Path(CACHE_ROOT) / "pred_ens"
PRED_TRAIN_DIR = PRED_DIR / "train"
PRED_TEST_DIR  = PRED_DIR / "test"
for _out_dir in (PRED_DIR, PRED_TRAIN_DIR, PRED_TEST_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# ----------------------------
# OOF inference (train): use model of that fold only
# ----------------------------
rows_train = []
BATCH = 32

train_case = df_train["case_id"].tolist()
train_fold = df_train["fold"].tolist()

# Label lookup built once. The previous per-row boolean scan of df_train made
# the whole loop quadratic in the number of cases. setdefault keeps the first
# occurrence per case_id, matching the old `.iloc[0]` behavior.
y_by_case = {}
for _cid, _y in zip(train_case, df_train["y"].tolist()):
    y_by_case.setdefault(str(_cid), _y)

for i0 in tqdm(range(0, len(df_train), BATCH), desc="TRAIN OOF"):
    sl = slice(i0, min(len(df_train), i0 + BATCH))
    cids = train_case[sl]
    folds = train_fold[sl]

    # Out-of-fold rule: each case is scored only by the model of its own fold,
    # so the batch is split into one sub-batch per fold present in it.
    for fold in sorted(set(folds)):
        cid_sub = [cid for cid, ff in zip(cids, folds) if ff == fold]

        grids = [
            load_token_npz(str(TOK_TRAIN_DIR / f"{cid}.npz")).astype(np.float32)  # (C,H,W)
            for cid in cid_sub
        ]

        x = torch.from_numpy(np.stack(grids, axis=0))
        prob_batch = infer_probs(fold_models[int(fold)], x)

        for cid, prob_tok in zip(cid_sub, prob_batch):
            pred_npz = save_prob_tok(PRED_TRAIN_DIR, str(cid), prob_tok)
            mean_prob_tok, area_frac_tok, has_prob, n_inst = feats_from_prob_tok(prob_tok, thr_tok=0.5)

            rows_train.append({
                "case_id": str(cid),
                "uid_safe": str(cid),
                "split": "train",
                "y": int(y_by_case[str(cid)]),
                "fold": int(fold),
                "n_inst": n_inst,
                "area_frac": area_frac_tok,
                "area_frac_tok": area_frac_tok,
                "mean_prob_tok": mean_prob_tok,
                "has_prob": has_prob,
                "prob_exists": True,
                "npz_path": pred_npz,
            })

df_pred_train = pd.DataFrame(rows_train).sort_values(["case_id"]).reset_index(drop=True)
print("df_pred_train:", df_pred_train.shape)
display(df_pred_train.head(3))

# ----------------------------
# TEST inference (ensemble avg)
# ----------------------------
rows_test = []
test_case = df_test["case_id"].tolist()

for i0 in tqdm(range(0, len(df_test), BATCH), desc="TEST ensemble"):
    # Plain slicing clamps at the end of the list, so no explicit min() is needed.
    cids = test_case[i0:i0 + BATCH]

    grids = [
        load_token_npz(str(TOK_TEST_DIR / f"{cid}.npz")).astype(np.float32)
        for cid in cids
    ]
    x = torch.from_numpy(np.stack(grids, axis=0))

    # Ensemble = arithmetic mean of the per-fold probability maps,
    # accumulated in fold_list order.
    acc = infer_probs(fold_models[int(fold_list[0])], x)
    for fold in fold_list[1:]:
        acc = acc + infer_probs(fold_models[int(fold)], x)
    prob_tok_batch = acc / float(len(fold_list))

    for cid, prob_tok in zip(cids, prob_tok_batch):
        case_key = str(cid)
        pred_npz = save_prob_tok(PRED_TEST_DIR, case_key, prob_tok)
        mean_prob_tok, area_frac_tok, has_prob, n_inst = feats_from_prob_tok(prob_tok, thr_tok=0.5)

        # Test rows carry no label and no fold (NaN / -1 placeholders).
        rows_test.append({
            "case_id": case_key,
            "uid_safe": case_key,
            "split": "test",
            "y": np.nan,
            "fold": -1,
            "n_inst": n_inst,
            "area_frac": area_frac_tok,
            "area_frac_tok": area_frac_tok,
            "mean_prob_tok": mean_prob_tok,
            "has_prob": has_prob,
            "prob_exists": True,
            "npz_path": pred_npz,
        })

df_pred_test = pd.DataFrame(rows_test).sort_values(["case_id"]).reset_index(drop=True)
print("df_pred_test:", df_pred_test.shape)
display(df_pred_test.head(3))

# ----------------------------
# Export pred_features_{train,test}.csv (for Gate model)
# (Add a few safe placeholder cols expected by many gate notebooks)
# ----------------------------
def export_features(df_pred, out_csv: Path):
    """Write the gate-model feature table for one split to ``out_csv``.

    Works on a copy of ``df_pred``: adds constant placeholder match columns,
    fills missing numeric features with zero, keeps a fixed column order,
    saves the CSV without the index and returns the exported frame.
    """
    table = df_pred.copy()

    # Placeholder columns for a match stage that this pipeline does not run.
    for name, constant in (("has_match", 0), ("best_peak_score", 0), ("match_exists", False)):
        table[name] = constant

    # Integer-valued features.
    for name in ("n_inst", "has_prob"):
        table[name] = table[name].fillna(0).astype(int)

    # Float-valued features.
    for name in ("area_frac_tok", "area_frac", "mean_prob_tok"):
        table[name] = table[name].fillna(0.0).astype(float)

    ordered = [
        "case_id", "uid_safe", "split", "y", "fold",
        "n_inst", "area_frac", "area_frac_tok", "mean_prob_tok",
        "best_peak_score", "has_match", "has_prob",
        "match_exists", "prob_exists", "npz_path",
    ]
    table = table[ordered]
    table.to_csv(out_csv, index=False)
    return table

pred_train_csv = PRED_DIR / "pred_features_train.csv"
pred_test_csv  = PRED_DIR / "pred_features_test.csv"

# Export both splits with the same routine.
for _frame, _csv in ((df_pred_train, pred_train_csv), (df_pred_test, pred_test_csv)):
    _ = export_features(_frame, _csv)

print("SAVED:")
for _saved in (PRED_TRAIN_DIR, PRED_TEST_DIR, pred_train_csv, pred_test_csv):
    print(" -", _saved)

# Best-effort memory cleanup; the CUDA cache call is allowed to fail.
gc.collect()
try:
    torch.cuda.empty_cache()
except Exception:
    pass


# Build Training Table (X, y, folds)

In [None]:
# ============================================================
# STAGE — Build Training Table (X, y, folds)  [ONE CELL]
# - Input : pred_features_train.csv + train_manifest.parquet + folds.parquet
# - Output: gate_train_table_<CFG_ID>.parquet  (+ feature_cols.json)
#
# REQUIRE globals:
#   PRED_DIR (or PRED_FEATS_TRAIN), TRAIN_MANIFEST, FOLDS_PATH, GATE_OUT_DIR, GATE_CFG (+GATE_CFG_ID optional)
# ============================================================

import json
import numpy as np
import pandas as pd
from pathlib import Path

# ----------------------------
# Paths
# ----------------------------
# The feature CSV location is always derived from PRED_DIR here (any
# pre-existing PRED_FEATS_TRAIN global is overwritten).
PRED_FEATS_TRAIN = Path(PRED_DIR) / "pred_features_train.csv"
GATE_OUT_DIR = Path(GATE_OUT_DIR)
GATE_OUT_DIR.mkdir(parents=True, exist_ok=True)

# ----------------------------
# Load
# ----------------------------
df_feat  = pd.read_csv(PRED_FEATS_TRAIN)
df_man   = pd.read_parquet(TRAIN_MANIFEST)
df_folds = pd.read_parquet(FOLDS_PATH)

# ----------------------------
# Case id column normalize (supports uid_safe fallback)
# ----------------------------
# Pick whichever id column each table actually has; the fold table is
# expected to use "case_id".
cid_col_feat = "case_id" if "case_id" in df_feat.columns else "uid_safe"
cid_col_man  = "case_id" if "case_id" in df_man.columns else "uid"

# Ids are cast to str so the merges below compare like with like
# (read_csv may have parsed numeric-looking ids as integers).
df_feat[cid_col_feat] = df_feat[cid_col_feat].astype(str)
df_man[cid_col_man]   = df_man[cid_col_man].astype(str)
df_folds["case_id"]   = df_folds["case_id"].astype(str)

# ----------------------------
# Build y from manifest if missing/NaN in features
# ----------------------------
# Case 1: no y column at all -> derive it from the manifest label
# ("forged" -> 1, anything else -> 0).
if "y" not in df_feat.columns:
    df_feat = df_feat.merge(df_man[[cid_col_man, "label"]].rename(columns={cid_col_man: cid_col_feat}),
                            on=cid_col_feat, how="left")
    df_feat["y"] = (df_feat["label"].astype(str).str.lower() == "forged").astype(int)

# Case 2: y exists but some entries are missing or non-numeric -> coerce them
# to NaN and fill only those rows from the manifest label.
df_feat["y"] = pd.to_numeric(df_feat["y"], errors="coerce")
miss_y = int(df_feat["y"].isna().sum())
if miss_y > 0:
    df_tmp = df_man[[cid_col_man, "label"]].rename(columns={cid_col_man: cid_col_feat})
    # Drop any label column left over from case 1 to avoid a suffixed duplicate.
    df_feat = df_feat.drop(columns=["label"], errors="ignore").merge(df_tmp, on=cid_col_feat, how="left")
    df_feat["y"] = df_feat["y"].fillna((df_feat["label"].astype(str).str.lower() == "forged").astype(int))

df_feat["y"] = df_feat["y"].astype(int)

# ----------------------------
# Attach folds
# ----------------------------
# The feature CSV written by the prediction stage already carries a "fold"
# column. Merging the fold table on top of it would make pandas emit
# "fold_x"/"fold_y" and the df["fold"] access below would raise KeyError, so
# the stale column is dropped and folds.parquet is the single source of truth.
_fold_map = df_folds[["case_id", "fold"]].rename(columns={"case_id": cid_col_feat})
df = df_feat.drop(columns=["fold"], errors="ignore").merge(_fold_map, on=cid_col_feat, how="left")

# ----------------------------
# Finalize columns
# ----------------------------
df = df.rename(columns={cid_col_feat: "case_id"}).copy()
df["case_id"] = df["case_id"].astype(str)

# A left join leaves NaN for cases absent from the fold table; report them
# explicitly instead of failing inside the integer cast.
_n_no_fold = int(df["fold"].isna().sum())
if _n_no_fold > 0:
    raise ValueError(f"{_n_no_fold} training case(s) have no fold assignment in {FOLDS_PATH}")
df["fold"] = df["fold"].astype(int)

# Every column that is not an identifier, target, fold or path is a feature.
DROP = {"case_id", "uid_safe", "split", "label", "y", "fold", "npz_path"}
FEATURE_COLS = [col for col in df.columns if col not in DROP]

# Infinite values become NaN so downstream code sees a single "missing" marker.
df[FEATURE_COLS] = df[FEATURE_COLS].replace([np.inf, -np.inf], np.nan)

# ----------------------------
# Report folds
# ----------------------------
# Per-fold row count and share of forged cases (in percent).
_y_by_fold = df.groupby("fold")["y"]
fold_tab = pd.DataFrame({
    "count": _y_by_fold.count(),
    "forged%": 100.0 * _y_by_fold.mean(),
}).reset_index()

print("FEATURE_COLS:", len(FEATURE_COLS))
print(fold_tab.to_string(index=False))

# ----------------------------
# Save
# ----------------------------
CFG_ID = globals().get("GATE_CFG_ID", "gate")
train_tab_path = GATE_OUT_DIR / f"gate_train_table_{CFG_ID}.parquet"
feat_cols_path = GATE_OUT_DIR / "feature_cols.json"

_ordered_cols = ["case_id", "fold", "y"] + FEATURE_COLS
df_gate_train = df[_ordered_cols].copy()
df_gate_train.to_parquet(train_tab_path, index=False)
feat_cols_path.write_text(json.dumps(FEATURE_COLS, indent=2))

print("SAVED:")
for _saved in (train_tab_path, feat_cols_path):
    print(" -", _saved)

display(df_gate_train.head(3))


# Build & Export Test Feature Table

In [None]:
# ============================================================
# STAGE — Build & Export Test Feature Table  [ONE CELL]
# - Input : pred_features_test.csv
# - Output: gate_test_table_<CFG_ID>.parquet
#
# REQUIRE globals:
#   PRED_DIR (or PRED_FEATS_TEST), GATE_OUT_DIR, feature_cols.json exists (from train table stage)
# ============================================================

import json
import numpy as np
import pandas as pd
from pathlib import Path

# ----------------------------
# Paths
# ----------------------------
PRED_FEATS_TEST = Path(PRED_DIR) / "pred_features_test.csv"
GATE_OUT_DIR = Path(GATE_OUT_DIR)
GATE_OUT_DIR.mkdir(parents=True, exist_ok=True)

CFG_ID = globals().get("GATE_CFG_ID", "gate")
test_tab_path = GATE_OUT_DIR / f"gate_test_table_{CFG_ID}.parquet"
feat_cols_path = GATE_OUT_DIR / "feature_cols.json"

# ----------------------------
# Load + normalize case_id column
# ----------------------------
df_test_feat = pd.read_csv(PRED_FEATS_TEST)

# Use "case_id" when present, otherwise promote "uid_safe" to that name;
# ids are kept as strings.
cid_col = "case_id" if "case_id" in df_test_feat.columns else "uid_safe"
df_test_feat = df_test_feat.rename(columns={cid_col: "case_id"}).copy()
df_test_feat["case_id"] = df_test_feat["case_id"].astype(str)

# ----------------------------
# Ensure feature columns match training
# ----------------------------
FEATURE_COLS = json.loads(Path(feat_cols_path).read_text())

# Features the training table had but the test CSV lacks are added as zeros.
_absent = [col for col in FEATURE_COLS if col not in df_test_feat.columns]
for col in _absent:
    df_test_feat[col] = 0.0

# Unlike the training table, missing/infinite test values are zero-filled.
_clean = df_test_feat[FEATURE_COLS].replace([np.inf, -np.inf], np.nan)
df_test_feat[FEATURE_COLS] = _clean.fillna(0.0)

df_gate_test = df_test_feat[["case_id"] + FEATURE_COLS].copy()

# ----------------------------
# Save
# ----------------------------
df_gate_test.to_parquet(test_tab_path, index=False)

print("df_gate_test:", df_gate_test.shape)
print("SAVED:", test_tab_path)
display(df_gate_test.head(3))


# Train Baseline Model (Leakage-Safe CV)

In [None]:
# ============================================================
# STAGE — Train Baseline Gate Model (Leakage-Safe CV)  [ONE CELL]
# - Model: LightGBM (binary) + optional calibration (sigmoid/isotonic)
# - CV    : use fold column from gate_train_table_*.parquet
# - Output:
#   - gate_oof_pred_<CFG_ID>.parquet
#   - gate_cv_metrics_<CFG_ID>.json
#   - gate_models_<CFG_ID>.pkl
#   - gate_calibrators_<CFG_ID>.pkl (if enabled)
# ============================================================

import json, pickle
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression

# LightGBM (baseline)
import lightgbm as lgb

# ----------------------------
# Paths
# ----------------------------
CFG_ID = globals().get("GATE_CFG_ID", "gate")
GATE_OUT_DIR = Path(GATE_OUT_DIR)

# Inputs produced by the table-building stage.
train_tab_path = GATE_OUT_DIR / f"gate_train_table_{CFG_ID}.parquet"
feat_cols_path = GATE_OUT_DIR / "feature_cols.json"

# Outputs of this stage.
oof_path   = GATE_OUT_DIR / f"gate_oof_pred_{CFG_ID}.parquet"
metrics_p  = GATE_OUT_DIR / f"gate_cv_metrics_{CFG_ID}.json"
models_p   = GATE_OUT_DIR / f"gate_models_{CFG_ID}.pkl"
calibs_p   = GATE_OUT_DIR / f"gate_calibrators_{CFG_ID}.pkl"

# ----------------------------
# Load
# ----------------------------
FEATURE_COLS = json.loads(feat_cols_path.read_text())
df = pd.read_parquet(train_tab_path)

# Dense arrays: float32 features, int64 targets and fold ids.
X = df.loc[:, FEATURE_COLS].to_numpy(dtype=np.float32)
y = df["y"].to_numpy(dtype=np.int64)
folds = df["fold"].to_numpy(dtype=np.int64)

uniq_folds = sorted(np.unique(folds).tolist())
print("Train rows:", len(df))
print("FEATURE_COLS:", len(FEATURE_COLS))
print("Folds:", uniq_folds)

# ----------------------------
# Train CV
# ----------------------------
oof = np.zeros(len(df), dtype=np.float32)
fold_metrics = {}
models = {}
calibrators = {}

use_cal = bool(GATE_CFG.use_calibration)
cal_method = str(GATE_CFG.calibration_method)

for f in uniq_folds:
    # Hold out fold f, train on all remaining folds.
    va_idx = np.flatnonzero(folds == f)
    tr_idx = np.flatnonzero(folds != f)

    Xtr, ytr = X[tr_idx], y[tr_idx]
    Xva, yva = X[va_idx], y[va_idx]

    # Config-supplied parameters win; these are only fallbacks.
    params = dict(GATE_CFG.lgbm_params or {})
    for _key, _default in (
        ("objective", "binary"),
        ("random_state", int(GATE_CFG.seed)),
        ("n_jobs", -1),
    ):
        params.setdefault(_key, _default)

    clf = lgb.LGBMClassifier(**params)
    clf.fit(Xtr, ytr)

    # Uncalibrated validation probabilities of the positive class.
    p_va = clf.predict_proba(Xva)[:, 1].astype(np.float32)

    # With calibration enabled, a second estimator is cross-fitted (cv=3) on
    # the training folds and its probabilities replace the raw ones.
    if use_cal:
        cal = CalibratedClassifierCV(lgb.LGBMClassifier(**params), method=cal_method, cv=3)
        cal.fit(Xtr, ytr)
        p_va = cal.predict_proba(Xva)[:, 1].astype(np.float32)
        calibrators[int(f)] = cal

    oof[va_idx] = p_va
    models[int(f)] = clf

    # Fold metrics at a fixed 0.5 decision threshold.
    pred = (p_va >= 0.5).astype(int)
    _scores = {
        "auc": float(roc_auc_score(yva, p_va)),
        "f1@0.5": float(f1_score(yva, pred)),
        "precision@0.5": float(precision_score(yva, pred, zero_division=0)),
        "recall@0.5": float(recall_score(yva, pred, zero_division=0)),
    }
    fold_metrics[int(f)] = _scores
    print(
        f"[fold {f}] AUC={_scores['auc']:.5f} F1@0.5={_scores['f1@0.5']:.5f} "
        f"P={_scores['precision@0.5']:.5f} R={_scores['recall@0.5']:.5f}"
    )

# ----------------------------
# Save OOF + metrics
# ----------------------------
# Out-of-fold predictions next to their identifiers and targets.
df_oof = df[["case_id", "fold", "y"]].copy()
df_oof["oof_prob"] = oof

# Pooled metrics over all OOF predictions (0.5 threshold for F1).
oof_pred = (oof >= 0.5).astype(int)
oof_auc = float(roc_auc_score(y, oof))
oof_f1 = float(f1_score(y, oof_pred))

report = {
    "cfg_id": CFG_ID,
    "n_rows": int(len(df)),
    "n_features": int(len(FEATURE_COLS)),
    "oof_auc": oof_auc,
    "oof_f1@0.5": oof_f1,
    "fold_metrics": fold_metrics,
    "calibration": {"enabled": use_cal, "method": cal_method},
}

df_oof.to_parquet(oof_path, index=False)
metrics_p.write_text(json.dumps(report, indent=2))
models_p.write_bytes(pickle.dumps(models))

_saved_paths = [oof_path, metrics_p, models_p]
# Calibrators are only persisted when calibration was enabled.
if use_cal:
    calibs_p.write_bytes(pickle.dumps(calibrators))
    _saved_paths.append(calibs_p)

print("SAVED:")
for _saved in _saved_paths:
    print(" -", _saved)

display(df_oof.head(5))


# Optimize Model & Hyperparameters (Iterative)

In [None]:
# ============================================================
# STAGE — Optimize Model & Hyperparameters (Iterative) [ONE CELL]
# - Optuna hyperopt for LightGBM (leakage-safe CV using provided folds)
# - Objective: mean AUC across folds (fast + stable); stores best params for next stages
#
# REQUIRE globals:
#   GATE_OUT_DIR, GATE_CFG (seed), GATE_CFG_ID
#   gate_train_table_<CFG_ID>.parquet, feature_cols.json (from previous stage)
# ============================================================

import json, pickle, math
import numpy as np
import pandas as pd
from pathlib import Path

import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# ----------------------------
# Settings
# ----------------------------
# Search budget and boosting limits.
N_TRIALS = 40          # number of Optuna trials per run of this cell
NUM_BOOST_MAX = 6000   # largest value sampled for the boosting-round count
EARLY_STOP = 150       # early-stopping patience passed to LightGBM
SEED = int(getattr(GATE_CFG, "seed", 42))

CFG_ID = globals().get("GATE_CFG_ID", "gate")
GATE_OUT_DIR = Path(GATE_OUT_DIR)

# Inputs from the table-building stage.
train_tab_path = GATE_OUT_DIR / f"gate_train_table_{CFG_ID}.parquet"
feat_cols_path = GATE_OUT_DIR / "feature_cols.json"

# Artifacts written by this stage.
study_path = GATE_OUT_DIR / f"optuna_lgbm_study_{CFG_ID}.pkl"
best_params_path = GATE_OUT_DIR / f"best_lgbm_params_{CFG_ID}.json"
trials_csv_path  = GATE_OUT_DIR / f"optuna_trials_{CFG_ID}.csv"

# ----------------------------
# Load data
# ----------------------------
FEATURE_COLS = json.loads(feat_cols_path.read_text())
df = pd.read_parquet(train_tab_path)

X = df.loc[:, FEATURE_COLS].to_numpy(dtype=np.float32)
y = df["y"].to_numpy(dtype=np.int64)
folds = df["fold"].to_numpy(dtype=np.int64)
uniq_folds = sorted(np.unique(folds).tolist())

print("rows:", len(df), "| features:", len(FEATURE_COLS), "| folds:", uniq_folds)

# ----------------------------
# CV scorer
# ----------------------------
def cv_auc_lgb(params: dict) -> float:
    """Return the mean validation AUC of a LightGBM booster over the folds.

    For every fold in ``uniq_folds`` a booster is trained on the remaining
    folds, early-stopped on the held-out fold, and scored (ROC AUC) on that
    same held-out fold at its best iteration.

    Args:
        params: LightGBM parameters. An optional ``"num_boost_round"`` entry
            sets the maximum number of boosting rounds (default 2000).

    Returns:
        The arithmetic mean of the per-fold AUC values.
    """
    # The round budget is a training-loop argument. Leaving it inside the
    # parameter dict as well specifies it twice (LightGBM treats the key as an
    # alias of the iteration count), so it is passed only via num_boost_round.
    n_rounds = int(params.get("num_boost_round", 2000))
    booster_params = {k: v for k, v in params.items() if k != "num_boost_round"}

    aucs = []
    for f in uniq_folds:
        tr_idx = np.where(folds != f)[0]
        va_idx = np.where(folds == f)[0]

        Xtr, ytr = X[tr_idx], y[tr_idx]
        Xva, yva = X[va_idx], y[va_idx]

        dtr = lgb.Dataset(Xtr, label=ytr, free_raw_data=False)
        dva = lgb.Dataset(Xva, label=yva, reference=dtr, free_raw_data=False)

        bst = lgb.train(
            params=booster_params,
            train_set=dtr,
            num_boost_round=n_rounds,
            valid_sets=[dva],
            valid_names=["val"],
            callbacks=[lgb.early_stopping(EARLY_STOP, verbose=False)],
        )
        p = bst.predict(Xva, num_iteration=bst.best_iteration)
        aucs.append(float(roc_auc_score(yva, p)))
    return float(np.mean(aucs))

# ----------------------------
# Optuna (resume-able)
# ----------------------------
import optuna

# Resume from a previously pickled study when one can be loaded; on ANY
# failure (file missing, unreadable, incompatible pickle) start a fresh
# maximize-study instead.
# NOTE(review): pickle.load executes arbitrary code from the file — only
# resume from study files this notebook produced itself.
try:
    with open(study_path, "rb") as f:
        study = pickle.load(f)
    print("Loaded existing study:", study_path)
except Exception:
    study = optuna.create_study(direction="maximize", study_name=f"lgbm_gate_{CFG_ID}")

def objective(trial: optuna.Trial) -> float:
    """Sample one LightGBM configuration and return its cross-validated AUC."""
    # Settings shared by every trial.
    fixed = {
        "objective": "binary",
        "metric": "auc",
        "verbosity": -1,
        "seed": SEED,
        "feature_pre_filter": False,
    }

    # Hyperparameters proposed by the trial (sampling order is preserved).
    sampled = {
        # Tree shape and leaf constraints.
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.08, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 16, 256),
        "max_depth": trial.suggest_int("max_depth", -1, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 300),
        "min_sum_hessian_in_leaf": trial.suggest_float("min_sum_hessian_in_leaf", 1e-3, 5.0, log=True),
        # Row / column subsampling.
        "feature_fraction": trial.suggest_float("feature_fraction", 0.55, 0.95),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.55, 0.95),
        "bagging_freq": trial.suggest_int("bagging_freq", 0, 4),
        # Regularisation.
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 10.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 10.0),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0.0, 2.0),
        # Upper bound on boosting rounds (early stopping may end sooner).
        "num_boost_round": trial.suggest_int("num_boost_round", 800, NUM_BOOST_MAX),
    }

    return cv_auc_lgb({**fixed, **sampled})

study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

# ----------------------------
# Save artifacts
# ----------------------------
# The whole study is pickled so a later run of this cell can resume it.
with open(study_path, "wb") as f:
    pickle.dump(study, f)

best = study.best_params.copy()

# Fixed settings first, then the tuned values from the best trial.
BEST_LGBM_PARAMS = {
    "objective": "binary",
    "metric": "auc",
    "verbosity": -1,
    "seed": SEED,
    "feature_pre_filter": False,
    **best,
}

best_params_path.write_text(json.dumps(
    {"best_value_auc": float(study.best_value), "best_params": BEST_LGBM_PARAMS},
    indent=2
))

# trials table (best effort): a failure must not abort the stage, but it is
# reported, and the CSV is only listed as saved when it was actually written.
trials_saved = False
try:
    tdf = study.trials_dataframe(attrs=("number", "value", "state", "params"))
    tdf.to_csv(trials_csv_path, index=False)
    trials_saved = True
except Exception as exc:
    print("WARNING: could not export trials table:", repr(exc))

print("BEST AUC:", float(study.best_value))
print("BEST_LGBM_PARAMS:", json.dumps(BEST_LGBM_PARAMS, indent=2))
print("SAVED:")
print(" -", study_path)
print(" -", best_params_path)
if trials_saved:
    print(" -", trials_csv_path)

BEST_LGBM_PARAMS


# Final Training (Train on Full Data)

In [None]:
# ============================================================
# STAGE — Final Training (Train on Full Data)  [ONE CELL]
# - Train Gate model on ALL training rows using BEST params (from optuna json if exists)
# - Optional calibration on full data (sigmoid / isotonic)
# - Predict TEST probs
# - Output:
#   - gate_final_model_<CFG_ID>.pkl
#   - gate_test_pred_<CFG_ID>.parquet
#   - gate_final_bundle_<CFG_ID>.json (metadata)
#
# REQUIRE globals:
#   GATE_OUT_DIR, GATE_CFG, GATE_CFG_ID
#   gate_train_table_<CFG_ID>.parquet, gate_test_table_<CFG_ID>.parquet, feature_cols.json
# ============================================================

import json, pickle
import numpy as np
import pandas as pd
from pathlib import Path

import lightgbm as lgb
from sklearn.calibration import CalibratedClassifierCV

CFG_ID = globals().get("GATE_CFG_ID", "gate")
GATE_OUT_DIR = Path(GATE_OUT_DIR)

# Inputs from earlier stages.
train_tab_path = GATE_OUT_DIR / f"gate_train_table_{CFG_ID}.parquet"
test_tab_path  = GATE_OUT_DIR / f"gate_test_table_{CFG_ID}.parquet"
feat_cols_path = GATE_OUT_DIR / "feature_cols.json"
best_params_path = GATE_OUT_DIR / f"best_lgbm_params_{CFG_ID}.json"

# Outputs of this stage.
final_model_path = GATE_OUT_DIR / f"gate_final_model_{CFG_ID}.pkl"
test_pred_path   = GATE_OUT_DIR / f"gate_test_pred_{CFG_ID}.parquet"
bundle_path      = GATE_OUT_DIR / f"gate_final_bundle_{CFG_ID}.json"

# ----------------------------
# Load tables
# ----------------------------
FEATURE_COLS = json.loads(feat_cols_path.read_text())
df_tr = pd.read_parquet(train_tab_path)
df_te = pd.read_parquet(test_tab_path)

# Train and test matrices use the same feature columns in the same order.
Xtr = df_tr.loc[:, FEATURE_COLS].to_numpy(dtype=np.float32)
ytr = df_tr["y"].to_numpy(dtype=np.int64)
Xte = df_te.loc[:, FEATURE_COLS].to_numpy(dtype=np.float32)

print("Train:", Xtr.shape, " Test:", Xte.shape, " Features:", len(FEATURE_COLS))

# ----------------------------
# Params: prefer optuna best if exists, else fallback to GATE_CFG.lgbm_params
# ----------------------------
# Use the tuned parameters when the Optuna JSON can be read; on any failure
# fall back to the parameters stored in the gate config.
try:
    best_pack = json.loads(best_params_path.read_text())
    params = dict(best_pack["best_params"])
    print("Using BEST params from:", best_params_path.name)
except Exception:
    params = dict(GATE_CFG.lgbm_params or {})
    print("Using fallback params from GATE_CFG.lgbm_params")

# Defaults applied only where the chosen parameter set leaves a gap.
for _key, _default in (
    ("objective", "binary"),
    ("random_state", int(getattr(GATE_CFG, "seed", 42))),
    ("n_jobs", -1),
):
    params.setdefault(_key, _default)

# ----------------------------
# Train full model
# ----------------------------
# Calibration settings are read first so that only the model that will
# actually be used gets trained.
use_cal = bool(getattr(GATE_CFG, "use_calibration", True))
cal_method = str(getattr(GATE_CFG, "calibration_method", "sigmoid"))

base = lgb.LGBMClassifier(**params)

if use_cal:
    # ----------------------------
    # Calibrated model (cross-fitted on full data, cv=5)
    # ----------------------------
    # The calibrated ensemble is the final model, so fitting the plain `base`
    # estimator as well would be a full extra training run whose result is
    # thrown away.
    base2 = lgb.LGBMClassifier(**params)
    cal = CalibratedClassifierCV(base2, method=cal_method, cv=5)
    cal.fit(Xtr, ytr)
    final_model = cal
else:
    # ----------------------------
    # Train full model (no calibration)
    # ----------------------------
    base.fit(Xtr, ytr)
    final_model = base

# ----------------------------
# Predict test
# ----------------------------
# Positive-class ("forged") probability for every test case.
p_test = final_model.predict_proba(Xte)[:, 1].astype(np.float32)

df_out = df_te[["case_id"]].copy()
df_out["prob_forged"] = p_test
df_out.to_parquet(test_pred_path, index=False)

# ----------------------------
# Save model
# ----------------------------
# The pickle keeps the feature order next to the model so inference code can
# rebuild the matrix identically.
_model_pack = {"model": final_model, "feature_cols": FEATURE_COLS, "cfg_id": CFG_ID}
final_model_path.write_bytes(pickle.dumps(_model_pack))

# Human-readable metadata describing this run and where its artifacts live.
bundle = {
    "cfg_id": CFG_ID,
    "model_type": "lgbm_calibrated" if use_cal else "lgbm",
    "calibration": {"enabled": use_cal, "method": cal_method},
    "n_train": int(len(df_tr)),
    "n_test": int(len(df_te)),
    "n_features": int(len(FEATURE_COLS)),
    "feature_cols_path": str(feat_cols_path),
    "best_params_path": str(best_params_path),
    "final_model_path": str(final_model_path),
    "test_pred_path": str(test_pred_path),
}
bundle_path.write_text(json.dumps(bundle, indent=2))

print("SAVED:")
for _saved in (final_model_path, test_pred_path, bundle_path):
    print(" -", _saved)
display(df_out.head(5))


# Finalize & Save Model Bundle (Reproducible)

In [None]:
# ============================================================
# STAGE — Finalize & Save Model Bundle (Reproducible)  [ONE CELL]
# - Collects segmentation models + gate model + configs + feature cols into one portable ZIP
# - Output folder:
#   /kaggle/working/recodai_luc_artifacts_bundle/<BUNDLE_NAME>/
#   + <BUNDLE_NAME>.zip
#
# REQUIRE globals:
#   SEG_RUN_DIR, SEG_CFG, SEG_CFG_ID
#   GATE_OUT_DIR, GATE_CFG, GATE_CFG_ID
# ============================================================

import os, json, time, zipfile, hashlib, platform
from pathlib import Path
from dataclasses import asdict

# ----------------------------
# Output root
# ----------------------------
BUNDLE_NAME = f"recodai_luc_bundle_seg{SEG_CFG_ID}_gate{GATE_CFG_ID}"
OUT_ROOT = Path("/kaggle/working/recodai_luc_artifacts_bundle") / BUNDLE_NAME
OUT_ROOT.mkdir(parents=True, exist_ok=True)

# ----------------------------
# Files to include
# ----------------------------
_seg_dir = Path(SEG_RUN_DIR)
_gate_dir = Path(GATE_OUT_DIR)

seg_ckpts = sorted(_seg_dir.glob("seg_unet_aspp_fold*.pt"))
seg_hist  = _seg_dir / "seg_cv_history.json"

# Core gate artifacts.
gate_files = [
    _gate_dir / _name
    for _name in (
        "feature_cols.json",
        f"best_lgbm_params_{GATE_CFG_ID}.json",
        f"gate_final_model_{GATE_CFG_ID}.pkl",
        f"gate_final_bundle_{GATE_CFG_ID}.json",
    )
]

# optional (not fatal if missing): oof, test pred, optuna study
opt_files = [
    _gate_dir / _name
    for _name in (
        f"gate_oof_pred_{GATE_CFG_ID}.parquet",
        f"gate_test_pred_{GATE_CFG_ID}.parquet",
        f"optuna_lgbm_study_{GATE_CFG_ID}.pkl",
        f"optuna_trials_{GATE_CFG_ID}.csv",
    )
]

# ----------------------------
# Save metadata
# ----------------------------
# Snapshot of when/where the bundle was built plus both stage configs.
_seg_meta = {"seg_run_dir": str(SEG_RUN_DIR), "seg_cfg_id": SEG_CFG_ID, "seg_cfg": asdict(SEG_CFG)}
_gate_meta = {"gate_out_dir": str(GATE_OUT_DIR), "gate_cfg_id": GATE_CFG_ID, "gate_cfg": asdict(GATE_CFG)}
meta = {
    "bundle_name": BUNDLE_NAME,
    "created_utc": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    "platform": {"python": platform.python_version(), "system": platform.platform()},
    "seg": _seg_meta,
    "gate": _gate_meta,
}

meta_path = OUT_ROOT / "bundle_meta.json"
meta_path.write_text(json.dumps(meta, indent=2))

# ----------------------------
# Copy artifacts into OUT_ROOT (no if/else; copy only existing via try/except)
# ----------------------------
def copy_into(src: Path, dst_dir: Path):
    """Copy the file ``src`` into ``dst_dir`` under the same name.

    The contents are streamed with ``shutil.copyfile`` instead of being read
    into memory in one piece, which matters for large model checkpoints.

    Args:
        src: Existing file to copy.
        dst_dir: Existing destination directory.

    Returns:
        Path of the copy (``dst_dir / src.name``).

    Raises:
        OSError: If ``src`` cannot be read (e.g. ``FileNotFoundError``) or the
            destination cannot be written.
    """
    import shutil  # local import: keeps this helper self-contained in the notebook cell

    dst = dst_dir / src.name
    shutil.copyfile(src, dst)
    return dst

copied = []
missing = []

# Segmentation checkpoints and the CV history are mandatory: a failure here
# propagates and aborts the bundling.
for p in seg_ckpts:
    copied.append(str(copy_into(p, OUT_ROOT)))
copied.append(str(copy_into(seg_hist, OUT_ROOT)))

# Gate artifacts are copied best-effort, but every file that could not be
# bundled is recorded and reported instead of being skipped silently.
for p in gate_files + opt_files:
    try:
        copied.append(str(copy_into(p, OUT_ROOT)))
    except Exception as exc:
        missing.append(str(p))
        print("WARNING: not bundled:", p, "->", repr(exc))

# ----------------------------
# Checksums
# ----------------------------
def md5_file(p: Path) -> str:
    """Return the hexadecimal MD5 digest of the file at ``p``.

    The file is read in 1 MiB chunks so large files are never held in memory
    as a whole.
    """
    chunk_size = 1024 * 1024
    digest = hashlib.md5()
    with p.open("rb") as fh:
        while True:
            chunk = fh.read(chunk_size)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()

# One MD5 per bundled file that exists on disk, keyed by file name.
cksum = {}
for _entry in copied:
    _path = Path(_entry)
    if _path.exists():
        cksum[_path.name] = md5_file(_path)

_checksum_path = OUT_ROOT / "checksums_md5.json"
_checksum_path.write_text(json.dumps(cksum, indent=2))

# ----------------------------
# Zip bundle
# ----------------------------
# The archive sits next to the bundle folder; entries are stored relative to
# the parent directory so they unpack into a <BUNDLE_NAME>/ folder.
zip_path = OUT_ROOT.parent / f"{BUNDLE_NAME}.zip"
with zipfile.ZipFile(zip_path, "w", compression=zipfile.ZIP_DEFLATED) as z:
    for p in sorted(OUT_ROOT.rglob("*")):
        z.write(p, arcname=str(p.relative_to(OUT_ROOT.parent)))

print("BUNDLE_DIR:", OUT_ROOT)
print("ZIP       :", zip_path)
print("FILES COPIED:", len(copied))
print("Seg ckpts:", len(seg_ckpts))
print("SAVED:")
for _saved in (meta_path, OUT_ROOT / "checksums_md5.json", zip_path):
    print(" -", _saved)
