# Set Paths & Select Config (CFG)

In [5]:
import os, json, random, hashlib
from pathlib import Path
from dataclasses import dataclass, asdict

def first_existing(paths):
    for p in paths:
        p = Path(p)
        if p.exists():
            return p
    return None

def newest_valid_dir(parent: Path, prefix: str, must_have: str):
    if parent is None or (not parent.exists()):
        return None 
    cands = [d for d in parent.iterdie() if d.is_dir() and d.name.startswith(prefix)]
    ok = [d for d in cands if (d / must_have).exists()]
    if not ok:
        return None 
    ok.sort(key=lambda d: d.stat().st_time, reverse=True)
    return ok[0]

def jhash(obj) -> str:
    s = json.dumps(obj, sort_key=True, ensure_escii=True, separators=(",",":"))
    return hashlib.md5(s.encode("utf-8")).hexdigest()[:12]

In [9]:
COMP_ROOT = Path("/kaggle/input/recodai-luc-scientific-image-forgery-detection")
DINO_DIR  = Path("/kaggle/input/dinov2/pytorch/base/1")

BUNDLE_ROOT = first_existing([
    "/kaggle/working/recodai_luc",
    "/kaggle/input/recod-ailuc-dinov2-base/recodai_luc",
])

PROF_DIR = first_existing([
    "/kaggle/working/recodai_luc_prof",
    "/kaggle/input/recod-ailuc-dinov2-base/recodai_luc_prof",
])

CACHE_ROOT = Path(BUNDLE_ROOT) / "cahce"

TRAIN_MANIFEST = Path(PROF_DIR) / "train_manifest.parquet"
TEST_MANIFEST  = Path(PROF_DIR) / "test_manifest.parquet"
FOLDS_PATH     = Path(PROF_DIR) / "folds.parquet"

print("COMP_ROOT:", COMP_ROOT)
print("DINO_DIR :", DINO_DIR)
print("BUNDLE_ROOT:", BUNDLE_ROOT)
print("PROF_DIR:", PROF_DIR)
print("CACHE_ROOT:", CACHE_ROOT)

COMP_ROOT: /kaggle/input/recodai-luc-scientific-image-forgery-detection
DINO_DIR : /kaggle/input/dinov2/pytorch/base/1
BUNDLE_ROOT: /kaggle/input/recod-ailuc-dinov2-base/recodai_luc
PROF_DIR: /kaggle/input/recod-ailuc-dinov2-base/recodai_luc_prof
CACHE_ROOT: /kaggle/input/recod-ailuc-dinov2-base/recodai_luc/cahce


In [None]:
TOKEN_CACHE_ROOT = newest_valid_dir(CACHE_ROOT, "dinov2_base_518_cfg_", "tokens_manifest_train.parquet")
if TOKEN_CACHE_ROOT is None:
    TOKEN_CACHE_ROOT = newest_valid_dir(CACHE_ROOT, "dinov2_base_518_cfg_bind_", "tokens_manifest_train.parquet")

MATCH_CACHE_ROOT = newest_valid_dir(CACHE_ROOT, "match_cfg_", "match_manifest_train.parquet")

PRED_DIR = Path(CACHE_ROOT) / "pred_ens"
PRED_FEATS_TRAIN = PRED_DIR / "pred_features_train.csv"
PRED_FEATS_TEST  = PRED_DIR / "pred_features_test.csv"

GATE_OUT_DIR = Path("/kaggle/working/recodai_luc_gate_artifacts")

print("TOKEN_CACHE_ROOT:", TOKEN_CACHE_ROOT)
print("MATCH_CACHE_ROOT:", MATCH_CACHE_ROOT)
print("PRED_DIR:", PRED_DIR)
print("PRED_FEATS_TRAIN:", PRED_FEATS_TRAIN)
print("PRED_FEATS_TEST :", PRED_FEATS_TEST)
print("GATE_OUT_DIR:", GATE_OUT_DIR)

In [None]:
@dataclass
class GateCFG:
    run_name: str = "gate_lgbm_v1"
    seed: int = 42
    cv_n_splits: int = 5
    model_type: str = "lgbm"          # lgbm | logreg | catboost
    use_calibration: bool = True
    calibration_method: str = "sigmoid"  # sigmoid | isotonic
    threshold_strategy: str = "stability_f1"  # stability_f1 | best_f1
    lgbm_params: dict = None

CFG = GateCFG(
    lgbm_params=dict(
        objective="binary",
        learning_rate=0.03,
        n_estimators=2500,
        num_leaves=63,
        max_depth=-1,
        min_data_in_leaf=80,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=1,
        lambda_l1=0.0,
        lambda_l2=1.0,
        random_state=42,
        n_jobs=-1,
    )
)

CFG_ID = jhash(asdict(CFG))

os.environ["PYTHONHASHSEED"] = str(CFG.seed)
random.seed(CFG.seed)

print("CFG_ID:", CFG_ID)
print("CFG:", json.dumps(asdict(CFG), indent=2))

# Build Training Table (X, y, folds)

# Build & Export Test Feature Table

# Train Baseline Model (Leakage-Safe CV)

# Optimize Model & Hyperparameters (Iterative)

# Final Training (Train on Full Data)

# Finalize & Save Model Bundle (Reproducible)