# Pahami struktur data & indeks object_id

In [1]:
# ============================================================
# STAGE 1 (REVISI FULL) — Pahami struktur data & indeks object_id -> split
# + (OPSIONAL) Sequential scan per split (chunked) untuk ringkasan lightcurve files
#
# Root dataset:
#   D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge
#
# Output (artifacts/):
#   - train_log_clean.csv
#   - test_log_clean.csv
#   - index_object_split.csv
#   - splits_summary.csv
#   - split_files_summary.csv      (jika SCAN_SPLIT_FILES=True)
# ============================================================

import os, re
from pathlib import Path
import pandas as pd

# ----------------------------
# CONFIG
# ----------------------------
DATA_ROOT = Path(r"D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge")

# Sequential scan settings (opsional)
SCAN_SPLIT_FILES = True            # True = scan train/test_full_lightcurves.csv tiap split (chunked)
CHUNK_ROWS = 1_000_000             # makin besar makin cepat tapi lebih berat RAM
ONLY_SCAN_SPLITS_IN_LOG = True     # True = scan split yang muncul di log saja, bukan selalu 01..20

# ----------------------------
# PATHS
# ----------------------------
PATHS = {
    "root": DATA_ROOT,
    "train_log": DATA_ROOT / "train_log.csv",
    "test_log":  DATA_ROOT / "test_log.csv",
    "sample_submission": DATA_ROOT / "sample_submission.csv",
    "artifacts": DATA_ROOT / "artifacts",
}
PATHS["artifacts"].mkdir(parents=True, exist_ok=True)

def _fail(msg: str):
    raise RuntimeError(msg)

for k in ["train_log", "test_log", "sample_submission"]:
    if not PATHS[k].exists():
        _fail(f"Missing file: {PATHS[k]}")

# ----------------------------
# Helpers
# ----------------------------
_SPLIT_RE = re.compile(r"(\d+)")
def normalize_split(x) -> str:
    if pd.isna(x):
        return None
    s = str(x).strip()
    m = _SPLIT_RE.search(s)
    if not m:
        return s
    n = int(m.group(1))
    return f"split_{n:02d}"

def read_csv_safely(path: Path) -> pd.DataFrame:
    return pd.read_csv(path, low_memory=False)

def ensure_cols(df: pd.DataFrame, cols, name="df"):
    missing = [c for c in cols if c not in df.columns]
    if missing:
        _fail(f"{name} missing columns: {missing}\nFound columns: {list(df.columns)}")

def file_bytes(path: Path) -> int:
    try:
        return path.stat().st_size
    except Exception:
        return -1

def scan_object_ids_csv(csv_path: Path, chunk_rows: int = 1_000_000) -> dict:
    """
    Scan hanya kolom 'object_id' secara chunked.
    Return: total_rows, unique_object_ids (count), ok (bool), err (str)
    """
    res = {
        "path": str(csv_path),
        "exists": csv_path.exists(),
        "bytes": file_bytes(csv_path),
        "total_rows": 0,
        "unique_object_ids": 0,
        "ok": False,
        "err": "",
    }
    if not csv_path.exists():
        res["err"] = "file_missing"
        return res

    try:
        uniq = set()
        total = 0
        # Hanya baca kolom object_id agar ringan
        for chunk in pd.read_csv(csv_path, usecols=["object_id"], dtype={"object_id": "string"},
                                 chunksize=chunk_rows, low_memory=False):
            # dropna + convert to python str
            vals = chunk["object_id"].dropna().astype(str).tolist()
            total += len(vals)
            uniq.update(vals)

        res["total_rows"] = int(total)
        res["unique_object_ids"] = int(len(uniq))
        res["ok"] = True
        return res
    except ValueError as e:
        # biasanya terjadi jika kolom object_id tidak ditemukan
        res["err"] = f"ValueError: {e}"
        return res
    except Exception as e:
        res["err"] = f"{type(e).__name__}: {e}"
        return res

# ----------------------------
# Load logs
# ----------------------------
df_train_log = read_csv_safely(PATHS["train_log"])
df_test_log  = read_csv_safely(PATHS["test_log"])

ensure_cols(df_train_log, ["object_id", "Z", "EBV", "split", "target"], "train_log")
ensure_cols(df_test_log,  ["object_id", "Z", "EBV", "split"], "test_log")

df_train_log = df_train_log.copy()
df_test_log  = df_test_log.copy()

# Normalize split naming
df_train_log["split"] = df_train_log["split"].apply(normalize_split)
df_test_log["split"]  = df_test_log["split"].apply(normalize_split)

# Coerce types
df_train_log["object_id"] = df_train_log["object_id"].astype(str)
df_test_log["object_id"]  = df_test_log["object_id"].astype(str)

for col in ["Z", "EBV"]:
    df_train_log_CONFIRM = pd.to_numeric(df_train_log[col], errors="coerce")
    df_test_log_CONFIRM  = pd.to_numeric(df_test_log[col], errors="coerce")
    df_train_log[col] = df_train_log_CONFIRM
    df_test_log[col]  = df_test_log_CONFIRM

if "Z_err" in df_train_log.columns:
    df_train_log["Z_err"] = pd.to_numeric(df_train_log["Z_err"], errors="coerce")
if "Z_err" in df_test_log.columns:
    df_test_log["Z_err"] = pd.to_numeric(df_test_log["Z_err"], errors="coerce")

df_train_log["target"] = pd.to_numeric(df_train_log["target"], errors="coerce").astype("Int64")

# ----------------------------
# Basic sanity checks
# ----------------------------
dup_tr = int(df_train_log["object_id"].duplicated().sum())
dup_te = int(df_test_log["object_id"].duplicated().sum())
if dup_tr > 0 or dup_te > 0:
    print(f"[WARN] Duplicate object_id found | train={dup_tr}, test={dup_te}. Keeping first occurrence.")
    df_train_log = df_train_log.drop_duplicates("object_id", keep="first").reset_index(drop=True)
    df_test_log  = df_test_log.drop_duplicates("object_id", keep="first").reset_index(drop=True)

overlap = set(df_train_log["object_id"]).intersection(set(df_test_log["object_id"]))
if len(overlap) > 0:
    print(f"[WARN] Found {len(overlap)} object_id present in BOTH train and test (unexpected). Example: {list(sorted(overlap))[:3]}")

# ----------------------------
# Build split -> ids mapping
# ----------------------------
split_to_train_ids = df_train_log.groupby("split")["object_id"].apply(list).to_dict()
split_to_test_ids  = df_test_log.groupby("split")["object_id"].apply(list).to_dict()

all_splits_in_log = sorted(set(df_train_log["split"].dropna()).union(set(df_test_log["split"].dropna())))

# Optional: enforce split_01..split_20 existence check
missing_split_dirs = [s for s in all_splits_in_log if not (PATHS["root"] / s).exists()]
if missing_split_dirs:
    print("[WARN] Some split folders referenced in logs do not exist on disk:")
    for s in missing_split_dirs[:50]:
        print("  -", s)

# ----------------------------
# Combined index dataframe (1 row per object_id)
# ----------------------------
df_train_idx = df_train_log[["object_id", "split", "Z", "EBV"]].copy()
df_train_idx["is_train"] = 1
df_train_idx["target"] = df_train_log["target"]

df_test_idx = df_test_log[["object_id", "split", "Z", "EBV"]].copy()
df_test_idx["is_train"] = 0
df_test_idx["target"] = pd.NA

if "Z_err" in df_train_log.columns or "Z_err" in df_test_log.columns:
    df_train_idx["Z_err"] = df_train_log["Z_err"] if "Z_err" in df_train_log.columns else pd.NA
    df_test_idx["Z_err"]  = df_test_log["Z_err"]  if "Z_err" in df_test_log.columns  else pd.NA

df_index = pd.concat([df_train_idx, df_test_idx], ignore_index=True)
df_index = df_index.sort_values(["is_train", "split", "object_id"], ascending=[False, True, True]).reset_index(drop=True)

# ----------------------------
# Summaries
# ----------------------------
pos = int((df_train_log["target"] == 1).sum())
neg = int((df_train_log["target"] == 0).sum())
tot = len(df_train_log)
pos_rate = pos / max(tot, 1)

print("=== DATASET SUMMARY ===")
print(f"Root             : {PATHS['root']}")
print(f"Train objects    : {len(df_train_log):,}")
print(f"Test objects     : {len(df_test_log):,}")
print(f"Train target     : pos={pos:,} | neg={neg:,} | pos_rate={pos_rate:.4f}")
print(f"Splits in log    : {len(all_splits_in_log)} | example: {all_splits_in_log[:5]}")

def _split_summary(df_log: pd.DataFrame, is_train: int) -> pd.DataFrame:
    g = df_log.groupby("split").agg(
        n_objects=("object_id", "count"),
        z_mean=("Z", "mean"),
        z_std=("Z", "std"),
        ebv_mean=("EBV", "mean"),
        ebv_std=("EBV", "std"),
    ).reset_index()
    g["is_train"] = is_train
    if is_train and "target" in df_log.columns:
        gg = df_log.groupby("split")["target"].agg(
            pos=lambda x: int((x == 1).sum()),
            neg=lambda x: int((x == 0).sum()),
        ).reset_index()
        g = g.merge(gg, on="split", how="left")
        g["pos_rate"] = g["pos"] / g["n_objects"].clip(lower=1)
    return g

df_sum_train = _split_summary(df_train_log, 1)
df_sum_test  = _split_summary(df_test_log, 0)
df_splits_summary = pd.concat([df_sum_train, df_sum_test], ignore_index=True).sort_values(
    ["is_train","split"], ascending=[False, True]
)

print("\n=== SPLIT SUMMARY (top 10 rows) ===")
print(df_splits_summary.head(10).to_string(index=False))

# ----------------------------
# OPTIONAL: Sequential scan split files (chunked)
# ----------------------------
df_split_files_summary = None

if SCAN_SPLIT_FILES:
    print("\n=== SEQUENTIAL SCAN SPLIT FILES (chunked object_id only) ===")
    if ONLY_SCAN_SPLITS_IN_LOG:
        splits_to_scan = all_splits_in_log
    else:
        splits_to_scan = [f"split_{i:02d}" for i in range(1, 21)]

    rows = []
    for i, sp in enumerate(splits_to_scan, start=1):
        sp_dir = PATHS["root"] / sp
        train_lc = sp_dir / "train_full_lightcurves.csv"
        test_lc  = sp_dir / "test_full_lightcurves.csv"

        print(f"[{i}/{len(splits_to_scan)}] {sp} | scanning train/test lightcurves...")

        tr_scan = scan_object_ids_csv(train_lc, chunk_rows=CHUNK_ROWS)
        te_scan = scan_object_ids_csv(test_lc,  chunk_rows=CHUNK_ROWS)

        # Compare to logs (berapa object log yang seharusnya ada)
        tr_log_n = len(split_to_train_ids.get(sp, []))
        te_log_n = len(split_to_test_ids.get(sp, []))

        rows.append({
            "split": sp,
            "train_lc_exists": tr_scan["exists"],
            "train_lc_bytes": tr_scan["bytes"],
            "train_lc_total_rows": tr_scan["total_rows"],
            "train_lc_unique_object_ids": tr_scan["unique_object_ids"],
            "train_lc_ok": tr_scan["ok"],
            "train_lc_err": tr_scan["err"],
            "train_log_objects": tr_log_n,
            "train_coverage_ratio": (tr_scan["unique_object_ids"] / tr_log_n) if tr_log_n > 0 else pd.NA,

            "test_lc_exists": te_scan["exists"],
            "test_lc_bytes": te_scan["bytes"],
            "test_lc_total_rows": te_scan["total_rows"],
            "test_lc_unique_object_ids": te_scan["unique_object_ids"],
            "test_lc_ok": te_scan["ok"],
            "test_lc_err": te_scan["err"],
            "test_log_objects": te_log_n,
            "test_coverage_ratio": (te_scan["unique_object_ids"] / te_log_n) if te_log_n > 0 else pd.NA,
        })

    df_split_files_summary = pd.DataFrame(rows).sort_values("split").reset_index(drop=True)

    print("\n=== SPLIT FILES SUMMARY (top 10) ===")
    print(df_split_files_summary.head(10).to_string(index=False))

# ----------------------------
# Save artifacts
# ----------------------------
(df_train_log).to_csv(PATHS["artifacts"] / "train_log_clean.csv", index=False)
(df_test_log).to_csv(PATHS["artifacts"] / "test_log_clean.csv", index=False)
(df_index).to_csv(PATHS["artifacts"] / "index_object_split.csv", index=False)
(df_splits_summary).to_csv(PATHS["artifacts"] / "splits_summary.csv", index=False)

if df_split_files_summary is not None:
    df_split_files_summary.to_csv(PATHS["artifacts"] / "split_files_summary.csv", index=False)

print("\nSaved artifacts to:", PATHS["artifacts"])
print(" - train_log_clean.csv")
print(" - test_log_clean.csv")
print(" - index_object_split.csv")
print(" - splits_summary.csv")
if df_split_files_summary is not None:
    print(" - split_files_summary.csv")


  df_index = pd.concat([df_train_idx, df_test_idx], ignore_index=True)


=== DATASET SUMMARY ===
Root             : D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge
Train objects    : 3,043
Test objects     : 7,135
Train target     : pos=148 | neg=2,895 | pos_rate=0.0486
Splits in log    : 20 | example: ['split_01', 'split_02', 'split_03', 'split_04', 'split_05']

=== SPLIT SUMMARY (top 10 rows) ===
   split  n_objects   z_mean    z_std  ebv_mean  ebv_std  is_train  pos  neg  pos_rate
split_01        155 0.609961 0.553551  0.061677 0.080969         1   12  143  0.077419
split_02        170 0.677911 0.568435  0.053988 0.063161         1   12  158  0.070588
split_03        138 0.749656 0.499070  0.057594 0.065305         1    3  135  0.021739
split_04        145 0.763510 0.603055  0.051007 0.060881         1   12  133  0.082759
split_05        165 0.661696 0.504670  0.050903 0.045186         1    6  159  0.036364
split_06        155 0.611903 0.435373  0.055794 0.064640         1    9  146  0.058065
split_07       

# Baseline super cepat (cek pipeline benar)

In [2]:
# ============================================================
# STAGE 2 — BASELINE SUPER CEPAT (CEK PIPELINE BENAR) [REVISI]
# Model: Logistic Regression (CPU cepat)
# Fitur: hanya dari log (Z, EBV, Z_err + missing flags)
#
# Fix:
# - sample_submission.csv di dataset kamu pakai kolom 'prediction' (bukan 'target')
# - kode ini auto-detect nama kolom prediksi dari sample_submission
#
# Input:
#   artifacts/train_log_clean.csv
#   artifacts/test_log_clean.csv
#   sample_submission.csv
#
# Output:
#   artifacts/baseline_oof.csv
#   artifacts/baseline_threshold.txt
#   submissions/sub_baseline_logreg.csv
# ============================================================

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, confusion_matrix

# ----------------------------
# CONFIG
# ----------------------------
DATA_ROOT = Path(r"D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge")
ART_DIR   = DATA_ROOT / "artifacts"
SUB_DIR   = DATA_ROOT / "submissions"
SUB_DIR.mkdir(parents=True, exist_ok=True)

SEED = 2025
N_FOLDS = 5

# ----------------------------
# Load artifacts from Stage 1
# ----------------------------
train_path = ART_DIR / "train_log_clean.csv"
test_path  = ART_DIR / "test_log_clean.csv"
sample_sub_path = DATA_ROOT / "sample_submission.csv"

if not train_path.exists():
    raise FileNotFoundError(f"Missing: {train_path}. Jalankan Stage 1 dulu.")
if not test_path.exists():
    raise FileNotFoundError(f"Missing: {test_path}. Jalankan Stage 1 dulu.")
if not sample_sub_path.exists():
    raise FileNotFoundError(f"Missing: {sample_sub_path}")

df_tr = pd.read_csv(train_path, low_memory=False)
df_te = pd.read_csv(test_path, low_memory=False)
df_sub = pd.read_csv(sample_sub_path, low_memory=False)

# ----------------------------
# Detect submission prediction column name
# ----------------------------
if "object_id" not in df_sub.columns:
    raise ValueError(f"sample_submission tidak punya 'object_id'. Found: {list(df_sub.columns)}")

pred_cols = [c for c in df_sub.columns if c != "object_id"]
if len(pred_cols) != 1:
    raise ValueError(f"sample_submission harus punya 1 kolom prediksi selain object_id. Found: {list(df_sub.columns)}")

SUB_PRED_COL = pred_cols[0]  # di kasus kamu: 'prediction'
print(f"[INFO] sample_submission prediction column = '{SUB_PRED_COL}'")

# ----------------------------
# Minimal checks
# ----------------------------
need_tr = ["object_id", "Z", "EBV", "split", "target"]
need_te = ["object_id", "Z", "EBV", "split"]

for c in need_tr:
    if c not in df_tr.columns:
        raise ValueError(f"train missing col: {c} | found={list(df_tr.columns)}")
for c in need_te:
    if c not in df_te.columns:
        raise ValueError(f"test missing col: {c} | found={list(df_te.columns)}")

# Ensure numeric
for col in ["Z", "EBV"]:
    df_tr[col] = pd.to_numeric(df_tr[col], errors="coerce")
    df_te[col] = pd.to_numeric(df_te[col], errors="coerce")

# Z_err optional: train biasanya kosong/tidak ada, test ada
if "Z_err" not in df_tr.columns:
    df_tr["Z_err"] = np.nan
else:
    df_tr["Z_err"] = pd.to_numeric(df_tr["Z_err"], errors="coerce")

if "Z_err" not in df_te.columns:
    df_te["Z_err"] = np.nan
else:
    df_te["Z_err"] = pd.to_numeric(df_te["Z_err"], errors="coerce")

# Target
y = pd.to_numeric(df_tr["target"], errors="coerce").astype(int).values

# ----------------------------
# Feature set (log-only)
# ----------------------------
def build_log_features(df: pd.DataFrame) -> pd.DataFrame:
    X = pd.DataFrame({
        "Z": df["Z"].astype(float),
        "EBV": df["EBV"].astype(float),
        "Z_err": df["Z_err"].astype(float),
        "Z_isna": df["Z"].isna().astype(int),
        "EBV_isna": df["EBV"].isna().astype(int),
        "Zerr_isna": df["Z_err"].isna().astype(int),
    })

    # stabilisasi sederhana (optional)
    def _p99(a):
        a = np.asarray(a, dtype=float)
        a = a[np.isfinite(a)]
        if a.size == 0:
            return np.nan
        return float(np.nanpercentile(a, 99))

    z99 = _p99(X["Z"].values)
    e99 = _p99(X["EBV"].values)
    zerr99 = _p99(X["Z_err"].values)

    X["Z_clip"] = X["Z"].clip(lower=0, upper=z99 if np.isfinite(z99) else 10.0)
    X["EBV_clip"] = X["EBV"].clip(lower=0, upper=e99 if np.isfinite(e99) else 1.0)
    X["Zerr_clip"] = X["Z_err"].clip(lower=0, upper=zerr99 if np.isfinite(zerr99) else 1.0)
    return X

X_tr = build_log_features(df_tr)
X_te = build_log_features(df_te)

# ----------------------------
# Model pipeline (fast CPU)
# ----------------------------
clf = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=True, with_std=True)),
    ("model", LogisticRegression(
        solver="liblinear",
        class_weight="balanced",
        max_iter=500,
        random_state=SEED
    ))
])

# ----------------------------
# CV + OOF
# ----------------------------
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
oof_proba = np.zeros(len(X_tr), dtype=np.float32)

fold_scores = []
for fold, (tr_idx, va_idx) in enumerate(skf.split(X_tr, y), start=1):
    X_train, X_val = X_tr.iloc[tr_idx], X_tr.iloc[va_idx]
    y_train, y_val = y[tr_idx], y[va_idx]

    clf.fit(X_train, y_train)
    p_val = clf.predict_proba(X_val)[:, 1]
    oof_proba[va_idx] = p_val

    f1_05 = f1_score(y_val, (p_val >= 0.5).astype(int))
    fold_scores.append(f1_05)
    print(f"[FOLD {fold}/{N_FOLDS}] F1@0.50 = {f1_05:.5f}")

print("\n=== CV SUMMARY (threshold=0.50) ===")
print(f"Mean F1@0.50: {float(np.mean(fold_scores)):.5f} | Std: {float(np.std(fold_scores)):.5f}")

# ----------------------------
# Threshold tuning (global OOF)
# ----------------------------
thr_grid = np.linspace(0.01, 0.99, 99)
best_thr, best_f1 = 0.5, -1.0
for thr in thr_grid:
    f1 = f1_score(y, (oof_proba >= thr).astype(int))
    if f1 > best_f1:
        best_f1 = float(f1)
        best_thr = float(thr)

print("\n=== OOF THRESHOLD TUNING ===")
print(f"Best threshold = {best_thr:.2f}")
print(f"OOF F1(best)   = {best_f1:.5f}")

y_hat = (oof_proba >= best_thr).astype(int)
print("\nConfusion Matrix (OOF):")
print(confusion_matrix(y, y_hat))

print("\nClassification Report (OOF):")
print(classification_report(y, y_hat, digits=4))

# Save OOF artifact
df_oof = pd.DataFrame({
    "object_id": df_tr["object_id"].astype(str),
    "oof_proba": oof_proba,
    "target": y
})
df_oof.to_csv(ART_DIR / "baseline_oof.csv", index=False)

with open(ART_DIR / "baseline_threshold.txt", "w", encoding="utf-8") as f:
    f.write(f"{best_thr}\n")

# ----------------------------
# Train full + predict test + export submission
# ----------------------------
clf.fit(X_tr, y)
test_proba = clf.predict_proba(X_te)[:, 1]
test_pred = (test_proba >= best_thr).astype(int)

sub = df_sub[["object_id"]].copy()
sub["object_id"] = sub["object_id"].astype(str)

pred_df = pd.DataFrame({
    "object_id": df_te["object_id"].astype(str),
    SUB_PRED_COL: test_pred.astype(int)
})

sub = sub.merge(pred_df, on="object_id", how="left")

if sub[SUB_PRED_COL].isna().any():
    missing = sub[sub[SUB_PRED_COL].isna()]["object_id"].head(10).tolist()
    raise RuntimeError(f"Submission has NaN after merge. Example missing object_id: {missing}")

out_path = SUB_DIR / "sub_baseline_logreg.csv"
sub.to_csv(out_path, index=False)

print("\n=== DONE ===")
print("Saved:", out_path)
print("Saved:", ART_DIR / "baseline_oof.csv")
print("Saved:", ART_DIR / "baseline_threshold.txt")


[INFO] sample_submission prediction column = 'prediction'
[FOLD 1/5] F1@0.50 = 0.10160




[FOLD 2/5] F1@0.50 = 0.08673
[FOLD 3/5] F1@0.50 = 0.10918
[FOLD 4/5] F1@0.50 = 0.08696




[FOLD 5/5] F1@0.50 = 0.08978

=== CV SUMMARY (threshold=0.50) ===
Mean F1@0.50: 0.09485 | Std: 0.00900

=== OOF THRESHOLD TUNING ===
Best threshold = 0.44
OOF F1(best)   = 0.10441

Confusion Matrix (OOF):
[[ 574 2321]
 [  12  136]]

Classification Report (OOF):
              precision    recall  f1-score   support

           0     0.9795    0.1983    0.3298      2895
           1     0.0554    0.9189    0.1044       148

    accuracy                         0.2333      3043
   macro avg     0.5174    0.5586    0.2171      3043
weighted avg     0.9346    0.2333    0.3188      3043


=== DONE ===
Saved: D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge\submissions\sub_baseline_logreg.csv
Saved: D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge\artifacts\baseline_oof.csv
Saved: D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge\artifacts\baseline_threshold



# Koreksi extinction (de-extinct flux) + fitur statistik per band

In [3]:
# ============================================================
# STAGE 3 — DE-EXTINCT FLUX + FITUR STATISTIK PER BAND (CHUNKED)
#
# Output:
#   artifacts/features_lc_train.csv
#   artifacts/features_lc_test.csv
#   artifacts/features_log.csv              (log features per object)
#   artifacts/features_merged_train.csv     (log + lc)
#   artifacts/features_merged_test.csv      (log + lc)
#
# Notes:
# - Hemat RAM: baca lightcurve per split, per chunk
# - Statistik yang dihitung (per object_id x band):
#   n_obs, flux_mean, flux_std, flux_min, flux_max, amp, snr_mean, frac_snr_gt3, frac_snr_gt5, time_span
# ============================================================

import json, re
from pathlib import Path
import numpy as np
import pandas as pd

# ----------------------------
# CONFIG
# ----------------------------
DATA_ROOT = Path(r"D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge")
ART_DIR   = DATA_ROOT / "artifacts"
ART_DIR.mkdir(parents=True, exist_ok=True)

CHUNK_ROWS = 1_000_000
BANDS = ["u", "g", "r", "i", "z", "y"]

# ----------------------------
# Load logs (clean)
# ----------------------------
train_log_path = ART_DIR / "train_log_clean.csv"
test_log_path  = ART_DIR / "test_log_clean.csv"
if not train_log_path.exists() or not test_log_path.exists():
    raise FileNotFoundError("Missing cleaned logs. Jalankan Stage 1 dulu.")

df_tr = pd.read_csv(train_log_path, low_memory=False)
df_te = pd.read_csv(test_log_path, low_memory=False)

# numeric
for col in ["Z", "EBV"]:
    df_tr[col] = pd.to_numeric(df_tr[col], errors="coerce")
    df_te[col] = pd.to_numeric(df_te[col], errors="coerce")

if "Z_err" not in df_tr.columns:
    df_tr["Z_err"] = np.nan
else:
    df_tr["Z_err"] = pd.to_numeric(df_tr["Z_err"], errors="coerce")

if "Z_err" not in df_te.columns:
    df_te["Z_err"] = np.nan
else:
    df_te["Z_err"] = pd.to_numeric(df_te["Z_err"], errors="coerce")

df_tr["object_id"] = df_tr["object_id"].astype(str)
df_te["object_id"] = df_te["object_id"].astype(str)

# ----------------------------
# EBV map for de-extinction
# ----------------------------
ebv_map = pd.concat([
    df_tr[["object_id", "EBV"]],
    df_te[["object_id", "EBV"]],
], ignore_index=True).drop_duplicates("object_id")
ebv_dict = dict(zip(ebv_map["object_id"].values, ebv_map["EBV"].values))

# ----------------------------
# Auto-extract extinction coefficients from Using_the_Data notebook (if exists)
# ----------------------------
def try_extract_extinction_coeffs(root: Path):
    """
    Cari file *Using_the_Data*.ipynb, lalu coba ekstrak dict yang berisi key u,g,r,i,z,y dan value float.
    """
    ipynbs = list(root.rglob("*Using_the_Data*.ipynb"))
    if not ipynbs:
        return None, None

    for nb_path in ipynbs[:5]:
        try:
            nb = json.loads(nb_path.read_text(encoding="utf-8"))
            cells = nb.get("cells", [])
            text = []
            for c in cells:
                if c.get("cell_type") in ("code", "markdown"):
                    src = c.get("source", [])
                    if isinstance(src, list):
                        text.append("".join(src))
                    elif isinstance(src, str):
                        text.append(src)
            blob = "\n".join(text)

            # cari pattern dict: 'u': 4.0, 'g': 3.0, ...
            pairs = re.findall(r"['\"]([ugrizy])['\"]\s*:\s*([0-9]+(?:\.[0-9]+)?)", blob)
            if not pairs:
                continue

            d = {}
            for k, v in pairs:
                d[k] = float(v)

            # harus punya semua bands
            if all(b in d for b in ["u","g","r","i","z","y"]):
                return d, nb_path
        except Exception:
            continue

    return None, None

EXT_COEFF, SRC_NB = try_extract_extinction_coeffs(DATA_ROOT)

# fallback (jika notebook tidak ketemu / gagal parse)
# kamu boleh ganti angka ini bila ingin sama persis dengan notebook Using_the_Data.
FALLBACK_EXT_COEFF = {
    "u": 4.2,
    "g": 3.3,
    "r": 2.3,
    "i": 1.7,
    "z": 1.3,
    "y": 1.1,
}

if EXT_COEFF is None:
    EXT_COEFF = FALLBACK_EXT_COEFF
    print("[WARN] Tidak berhasil ekstrak koefisien extinction dari Using_the_Data notebook.")
    print("       Pakai FALLBACK_EXT_COEFF. Jika mau 100% sama, ganti nilai dict ini sesuai notebook.")
else:
    print(f"[INFO] Extinction coeffs loaded from: {SRC_NB}")
print("[INFO] EXT_COEFF =", EXT_COEFF)

def de_extinct_flux(flux: np.ndarray, ebv: np.ndarray, band: str) -> np.ndarray:
    """
    flux_corr = flux * 10^(0.4 * A_lambda), dengan A_lambda = EXT_COEFF[band] * EBV
    """
    R = float(EXT_COEFF.get(band, 0.0))
    A = R * ebv
    factor = np.power(10.0, 0.4 * A)
    return flux * factor

# ----------------------------
# Lightcurve column detection (robust to naming)
# ----------------------------
def _norm(s: str) -> str:
    s = s.lower()
    s = re.sub(r"[\s\(\)\[\]\-]+", "", s)
    s = s.replace("_", "")
    return s

def detect_lc_columns(csv_path: Path):
    head = pd.read_csv(csv_path, nrows=0)
    cols = list(head.columns)
    ncols = {_norm(c): c for c in cols}

    def pick(cands):
        for c in cands:
            if c in ncols:
                return ncols[c]
        return None

    col_object = pick(["objectid", "object_id"])
    col_time   = pick(["timemjd", "time", "mjd", "timemodifiedjuliandate"])
    col_flux   = pick(["flux"])
    col_ferr   = pick(["fluxerr", "flux_err", "fluxerror", "fluxunc", "fluxuncertainty"])
    col_filt   = pick(["filter", "band", "passband"])

    got = {
        "object_id": col_object,
        "mjd": col_time,
        "flux": col_flux,
        "flux_err": col_ferr,
        "filter": col_filt,
    }
    if any(v is None for v in got.values()):
        raise ValueError(f"Kolom lightcurve tidak terdeteksi lengkap di {csv_path}\n"
                         f"Detected mapping: {got}\nAll columns: {cols}")
    return got

# ----------------------------
# Chunked scan + partial aggregation
# ----------------------------
def partial_agg_from_file(csv_path: Path, object_ids_set: set, ebv_dict: dict, chunk_rows: int) -> pd.DataFrame:
    """
    Return partial aggregated stats per (object_id, filter) dari 1 file CSV lightcurve.
    """
    if not csv_path.exists():
        return pd.DataFrame()

    colmap = detect_lc_columns(csv_path)
    usecols = list(colmap.values())

    parts = []
    reader = pd.read_csv(csv_path, usecols=usecols, chunksize=chunk_rows, low_memory=False)

    for chunk in reader:
        chunk = chunk.rename(columns={v: k for k, v in colmap.items()})

        # types
        chunk["object_id"] = chunk["object_id"].astype(str)
        chunk = chunk[chunk["object_id"].isin(object_ids_set)]
        if chunk.empty:
            continue

        chunk["filter"] = chunk["filter"].astype(str).str.strip().str.lower()
        chunk = chunk[chunk["filter"].isin(BANDS)]
        if chunk.empty:
            continue

        chunk["mjd"] = pd.to_numeric(chunk["mjd"], errors="coerce")
        chunk["flux"] = pd.to_numeric(chunk["flux"], errors="coerce")
        chunk["flux_err"] = pd.to_numeric(chunk["flux_err"], errors="coerce")

        chunk = chunk.dropna(subset=["mjd", "flux", "flux_err"])
        if chunk.empty:
            continue

        # EBV mapping
        ebv = chunk["object_id"].map(ebv_dict).astype(float).values
        flx = chunk["flux"].astype(float).values

        # de-extinct per band (vectorized per group band)
        # lebih cepat: apply per band mask
        flux_corr = flx.copy()
        for b in BANDS:
            m = (chunk["filter"].values == b)
            if m.any():
                flux_corr[m] = de_extinct_flux(flux_corr[m], ebv[m], b)

        chunk["flux_corr"] = flux_corr

        # SNR
        ferr = chunk["flux_err"].astype(float).values
        snr = np.zeros_like(ferr, dtype=float)
        mpos = ferr > 0
        snr[mpos] = np.abs(flux_corr[mpos]) / ferr[mpos]
        chunk["snr"] = snr
        chunk["snr_gt3"] = (snr > 3.0).astype(int)
        chunk["snr_gt5"] = (snr > 5.0).astype(int)

        # partial groupby stats
        g = chunk.groupby(["object_id", "filter"]).agg(
            n_obs=("flux_corr", "size"),
            sum_flux=("flux_corr", "sum"),
            sum_flux2=("flux_corr", lambda x: float(np.sum(np.square(x.values)))),
            min_flux=("flux_corr", "min"),
            max_flux=("flux_corr", "max"),
            sum_snr=("snr", "sum"),
            cnt_snr_gt3=("snr_gt3", "sum"),
            cnt_snr_gt5=("snr_gt5", "sum"),
            min_time=("mjd", "min"),
            max_time=("mjd", "max"),
        ).reset_index()

        parts.append(g)

    if not parts:
        return pd.DataFrame()

    return pd.concat(parts, ignore_index=True)

def combine_partials(df_partials: pd.DataFrame) -> pd.DataFrame:
    """
    Combine partial stats menjadi final stats per (object_id, filter).
    """
    if df_partials.empty:
        return df_partials

    g = df_partials.groupby(["object_id", "filter"]).agg(
        n_obs=("n_obs", "sum"),
        sum_flux=("sum_flux", "sum"),
        sum_flux2=("sum_flux2", "sum"),
        min_flux=("min_flux", "min"),
        max_flux=("max_flux", "max"),
        sum_snr=("sum_snr", "sum"),
        cnt_snr_gt3=("cnt_snr_gt3", "sum"),
        cnt_snr_gt5=("cnt_snr_gt5", "sum"),
        min_time=("min_time", "min"),
        max_time=("max_time", "max"),
    ).reset_index()

    # derive features
    n = g["n_obs"].astype(float).values
    mean = g["sum_flux"].values / np.clip(n, 1.0, None)
    var = (g["sum_flux2"].values / np.clip(n, 1.0, None)) - np.square(mean)
    var = np.maximum(var, 0.0)
    std = np.sqrt(var)

    g["flux_mean"] = mean
    g["flux_std"] = std
    g["amp"] = g["max_flux"] - g["min_flux"]
    g["snr_mean"] = g["sum_snr"].values / np.clip(n, 1.0, None)
    g["frac_snr_gt3"] = g["cnt_snr_gt3"].values / np.clip(n, 1.0, None)
    g["frac_snr_gt5"] = g["cnt_snr_gt5"].values / np.clip(n, 1.0, None)
    g["time_span"] = g["max_time"] - g["min_time"]

    # keep only final columns
    keep = [
        "object_id", "filter",
        "n_obs", "flux_mean", "flux_std", "min_flux", "max_flux",
        "amp", "snr_mean", "frac_snr_gt3", "frac_snr_gt5", "time_span"
    ]
    return g[keep]

def pivot_band_features(df_band: pd.DataFrame) -> pd.DataFrame:
    """
    Pivot per band → 1 row per object_id (wide).
    """
    if df_band.empty:
        return pd.DataFrame(columns=["object_id"])

    feats = [c for c in df_band.columns if c not in ("object_id", "filter")]
    wide = df_band.pivot(index="object_id", columns="filter", values=feats)

    # flatten columns: (feat, band) -> f"{band}__{feat}"
    wide.columns = [f"{band}__{feat}" for (feat, band) in wide.columns]
    wide = wide.reset_index()
    return wide

# ----------------------------
# Process all splits (train + test)
# ----------------------------
all_splits = sorted(set(df_tr["split"].dropna().astype(str)).union(set(df_te["split"].dropna().astype(str))))
print(f"[INFO] Total splits to process: {len(all_splits)} | example: {all_splits[:5]}")

train_wides = []
test_wides = []

for i, sp in enumerate(all_splits, start=1):
    sp_dir = DATA_ROOT / sp
    tr_file = sp_dir / "train_full_lightcurves.csv"
    te_file = sp_dir / "test_full_lightcurves.csv"

    tr_ids = set(df_tr.loc[df_tr["split"].astype(str).eq(sp), "object_id"].astype(str).tolist())
    te_ids = set(df_te.loc[df_te["split"].astype(str).eq(sp), "object_id"].astype(str).tolist())

    print(f"\n[{i}/{len(all_splits)}] {sp}")
    print(f"  train_ids={len(tr_ids):,} | test_ids={len(te_ids):,}")
    print(f"  files: train_exists={tr_file.exists()} | test_exists={te_file.exists()}")

    # TRAIN split
    if len(tr_ids) > 0 and tr_file.exists():
        part_tr = partial_agg_from_file(tr_file, tr_ids, ebv_dict, CHUNK_ROWS)
        band_tr = combine_partials(part_tr)
        wide_tr = pivot_band_features(band_tr)
        train_wides.append(wide_tr)

    # TEST split
    if len(te_ids) > 0 and te_file.exists():
        part_te = partial_agg_from_file(te_file, te_ids, ebv_dict, CHUNK_ROWS)
        band_te = combine_partials(part_te)
        wide_te = pivot_band_features(band_te)
        test_wides.append(wide_te)

# concat all splits
df_feat_tr = pd.concat(train_wides, ignore_index=True) if train_wides else pd.DataFrame(columns=["object_id"])
df_feat_te = pd.concat(test_wides, ignore_index=True) if test_wides else pd.DataFrame(columns=["object_id"])

# de-dup (safety)
df_feat_tr = df_feat_tr.drop_duplicates("object_id", keep="first").reset_index(drop=True)
df_feat_te = df_feat_te.drop_duplicates("object_id", keep="first").reset_index(drop=True)

print("\n[INFO] Lightcurve features built:")
print("  train rows:", len(df_feat_tr), "| cols:", df_feat_tr.shape[1])
print("  test  rows:", len(df_feat_te), "| cols:", df_feat_te.shape[1])

# ----------------------------
# Log features (per object_id)
# ----------------------------
def build_log_features(df: pd.DataFrame) -> pd.DataFrame:
    out = pd.DataFrame({
        "object_id": df["object_id"].astype(str),
        "Z": df["Z"].astype(float),
        "EBV": df["EBV"].astype(float),
        "Z_err": df["Z_err"].astype(float),
        "Z_isna": df["Z"].isna().astype(int),
        "EBV_isna": df["EBV"].isna().astype(int),
        "Zerr_isna": df["Z_err"].isna().astype(int),
    })
    return out

df_log_tr = build_log_features(df_tr)
df_log_te = build_log_features(df_te)

df_log_all = pd.concat([df_log_tr.assign(is_train=1), df_log_te.assign(is_train=0)], ignore_index=True)
df_log_all.to_csv(ART_DIR / "features_log.csv", index=False)

# ----------------------------
# Merge log + lc features
# ----------------------------
df_train_merged = df_log_tr.merge(df_feat_tr, on="object_id", how="left")
df_test_merged  = df_log_te.merge(df_feat_te, on="object_id", how="left")

# save
df_feat_tr.to_csv(ART_DIR / "features_lc_train.csv", index=False)
df_feat_te.to_csv(ART_DIR / "features_lc_test.csv", index=False)
df_train_merged.to_csv(ART_DIR / "features_merged_train.csv", index=False)
df_test_merged.to_csv(ART_DIR / "features_merged_test.csv", index=False)

print("\n=== STAGE 3 DONE ===")
print("Saved:")
print(" -", ART_DIR / "features_lc_train.csv")
print(" -", ART_DIR / "features_lc_test.csv")
print(" -", ART_DIR / "features_log.csv")
print(" -", ART_DIR / "features_merged_train.csv")
print(" -", ART_DIR / "features_merged_test.csv")


[WARN] Tidak berhasil ekstrak koefisien extinction dari Using_the_Data notebook.
       Pakai FALLBACK_EXT_COEFF. Jika mau 100% sama, ganti nilai dict ini sesuai notebook.
[INFO] EXT_COEFF = {'u': 4.2, 'g': 3.3, 'r': 2.3, 'i': 1.7, 'z': 1.3, 'y': 1.1}
[INFO] Total splits to process: 20 | example: ['split_01', 'split_02', 'split_03', 'split_04', 'split_05']

[1/20] split_01
  train_ids=155 | test_ids=364
  files: train_exists=True | test_exists=True

[2/20] split_02
  train_ids=170 | test_ids=414
  files: train_exists=True | test_exists=True

[3/20] split_03
  train_ids=138 | test_ids=338
  files: train_exists=True | test_exists=True

[4/20] split_04
  train_ids=145 | test_ids=332
  files: train_exists=True | test_exists=True

[5/20] split_05
  train_ids=165 | test_ids=375
  files: train_exists=True | test_exists=True

[6/20] split_06
  train_ids=155 | test_ids=374
  files: train_exists=True | test_exists=True

[7/20] split_07
  train_ids=165 | test_ids=398
  files: train_exists=True | 

# Model utama CPU: LightGBM + CV yang benar

In [5]:
# ============================================================
# STAGE 4 — MODEL UTAMA CPU: CV + Threshold tuning (F1) [REVISI FULL: AUTO-FALLBACK]
#
# Backend priority:
# 1) lightgbm (jika terinstall)
# 2) xgboost  (jika terinstall)
# 3) sklearn HistGradientBoostingClassifier (fallback, no extra install)
#
# Output (tetap pakai nama file yang sama):
# - artifacts/lgbm_oof.csv
# - artifacts/lgbm_threshold.txt
# - artifacts/lgbm_cv_report.txt
# - artifacts/lgbm_feature_importance.csv
# - artifacts/lgbm_test_proba.csv
# - submissions/sub_lgbm_v01.csv
# ============================================================

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

# ----------------------------
# CONFIG
# ----------------------------
DATA_ROOT = Path(r"D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge")
ART_DIR   = DATA_ROOT / "artifacts"
SUB_DIR   = DATA_ROOT / "submissions"
SUB_DIR.mkdir(parents=True, exist_ok=True)

SEED    = 2025
N_FOLDS = 5

THR_STEP = 0.001  # threshold tuning grid step (F1)

# ----------------------------
# Backend selection (no crash)
# ----------------------------
BACKEND = None
lgb = None
xgb = None

try:
    import lightgbm as lgb  # type: ignore
    BACKEND = "lightgbm"
except Exception:
    try:
        import xgboost as xgb  # type: ignore
        BACKEND = "xgboost"
    except Exception:
        BACKEND = "sklearn_hgb"

print(f"[INFO] Backend used: {BACKEND}")

# sklearn fallback model
from sklearn.ensemble import HistGradientBoostingClassifier

# ----------------------------
# Load inputs
# ----------------------------
train_log_path  = ART_DIR / "train_log_clean.csv"
test_log_path   = ART_DIR / "test_log_clean.csv"
feat_tr_path    = ART_DIR / "features_merged_train.csv"
feat_te_path    = ART_DIR / "features_merged_test.csv"
sample_sub_path = DATA_ROOT / "sample_submission.csv"

for p in [train_log_path, test_log_path, feat_tr_path, feat_te_path, sample_sub_path]:
    if not p.exists():
        raise FileNotFoundError(f"Missing: {p}. Pastikan Stage 1 & 3 sudah dijalankan.")

df_tr_log  = pd.read_csv(train_log_path, low_memory=False)
df_te_log  = pd.read_csv(test_log_path, low_memory=False)
df_tr_feat = pd.read_csv(feat_tr_path, low_memory=False)
df_te_feat = pd.read_csv(feat_te_path, low_memory=False)
df_sub     = pd.read_csv(sample_sub_path, low_memory=False)

# ----------------------------
# Detect submission prediction column name
# ----------------------------
if "object_id" not in df_sub.columns:
    raise ValueError(f"sample_submission tidak punya 'object_id'. Found: {list(df_sub.columns)}")
pred_cols = [c for c in df_sub.columns if c != "object_id"]
if len(pred_cols) != 1:
    raise ValueError(f"sample_submission harus punya 1 kolom prediksi selain object_id. Found: {list(df_sub.columns)}")
SUB_PRED_COL = pred_cols[0]  # contoh: 'prediction'
print(f"[INFO] sample_submission prediction column = '{SUB_PRED_COL}'")

# ----------------------------
# Prepare train/test tables (merge target)
# ----------------------------
need_log_cols = ["object_id", "target"]
for c in need_log_cols:
    if c not in df_tr_log.columns:
        raise ValueError(f"train_log_clean missing '{c}'. Found: {list(df_tr_log.columns)}")

for d in [df_tr_log, df_te_log, df_tr_feat, df_te_feat]:
    d["object_id"] = d["object_id"].astype(str)

df_tr_log["target"] = pd.to_numeric(df_tr_log["target"], errors="coerce").astype(int)

df_train = df_tr_feat.merge(df_tr_log[["object_id", "target"]], on="object_id", how="left")
if df_train["target"].isna().any():
    bad = df_train[df_train["target"].isna()]["object_id"].head(10).tolist()
    raise RuntimeError(f"Ada object_id di features_merged_train yang tidak ketemu target. Example: {bad}")

df_test = df_te_feat.copy()

# ----------------------------
# Define features
# ----------------------------
drop_cols = {"object_id", "target"}
feature_cols = [c for c in df_train.columns if c not in drop_cols]

# Force numeric
for c in feature_cols:
    df_train[c] = pd.to_numeric(df_train[c], errors="coerce")
    df_test[c]  = pd.to_numeric(df_test[c], errors="coerce")

X_df = df_train[feature_cols]
y = df_train["target"].astype(int).values
X_test_df = df_test[feature_cols]

print(f"[INFO] X_train shape: {X_df.shape} | X_test shape: {X_test_df.shape}")
pos = max(int((y == 1).sum()), 1)
neg = max(int((y == 0).sum()), 1)
scale_pos_weight = neg / pos
print(f"[INFO] Pos rate: {(y==1).mean():.6f} | scale_pos_weight={scale_pos_weight:.4f}")

# ----------------------------
# Imputer (shared)
# ----------------------------
imputer = SimpleImputer(strategy="median")

# Fit imputer on full train (ok for CV karena median-only; kalau mau strict, fit per-fold, tapi lebih lambat)
X_all = imputer.fit_transform(X_df)
X_test_all = imputer.transform(X_test_df)

# Cast to float32 for speed/memory
X_all = X_all.astype(np.float32, copy=False)
X_test_all = X_test_all.astype(np.float32, copy=False)

# ----------------------------
# Model factory
# ----------------------------
def make_model(seed: int):
    if BACKEND == "lightgbm":
        params = dict(
            objective="binary",
            learning_rate=0.03,
            n_estimators=7000,
            num_leaves=64,
            min_child_samples=150,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            random_state=seed,
            n_jobs=-1,
            scale_pos_weight=scale_pos_weight,
        )
        return lgb.LGBMClassifier(**params)

    if BACKEND == "xgboost":
        # xgboost CPU params
        params = dict(
            objective="binary:logistic",
            learning_rate=0.03,
            n_estimators=8000,
            max_depth=6,
            min_child_weight=3.0,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            random_state=seed,
            n_jobs=-1,
            eval_metric="logloss",
            scale_pos_weight=scale_pos_weight,
            tree_method="hist",
        )
        return xgb.XGBClassifier(**params)

    # sklearn fallback (no extra install)
    # class imbalance handled via sample_weight in fit()
    return HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_depth=6,
        max_leaf_nodes=64,
        min_samples_leaf=60,
        l2_regularization=0.0,
        max_iter=2500,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=50,
        random_state=seed,
    )

def fit_model(model, X_tr, y_tr, X_va, y_va, seed: int):
    if BACKEND == "lightgbm":
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric="binary_logloss",
            callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)]
        )
        best_it = int(getattr(model, "best_iteration_", None) or model.n_estimators)
        return model, best_it

    if BACKEND == "xgboost":
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            verbose=False,
            early_stopping_rounds=200
        )
        best_it = int(getattr(model, "best_iteration", None) or model.n_estimators)
        return model, best_it

    # sklearn fallback: use sample_weight for imbalance
    sw = np.ones_like(y_tr, dtype=np.float32)
    sw[y_tr == 1] = float(scale_pos_weight)
    model.fit(X_tr, y_tr, sample_weight=sw)
    # best iteration approximation
    best_it = int(getattr(model, "n_iter_", None) or getattr(model, "max_iter", 0) or 0)
    return model, best_it

def predict_proba_pos(model, X):
    if BACKEND in ("lightgbm", "xgboost"):
        return model.predict_proba(X)[:, 1]
    # sklearn hgb has predict_proba
    return model.predict_proba(X)[:, 1]

# ----------------------------
# CV + OOF
# ----------------------------
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

oof_proba = np.zeros(X_all.shape[0], dtype=np.float32)
best_iters = []
fold_f1_05 = []

# feature importance accum (if backend supports)
feat_importance_accum = np.zeros(len(feature_cols), dtype=np.float64)

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_all, y), start=1):
    X_tr, X_va = X_all[tr_idx], X_all[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]

    model = make_model(SEED + fold)
    model, best_it = fit_model(model, X_tr, y_tr, X_va, y_va, seed=SEED + fold)

    p_va = predict_proba_pos(model, X_va).astype(np.float32)
    oof_proba[va_idx] = p_va

    f1_at_05 = f1_score(y_va, (p_va >= 0.5).astype(int))
    fold_f1_05.append(float(f1_at_05))
    best_iters.append(int(best_it))

    # importance (gain) if available
    try:
        if BACKEND == "lightgbm":
            fi = model.booster_.feature_importance(importance_type="gain")
            feat_importance_accum += fi
        elif BACKEND == "xgboost":
            booster = model.get_booster()
            score = booster.get_score(importance_type="gain")
            fi = np.array([score.get(f"f{i}", 0.0) for i in range(len(feature_cols))], dtype=np.float64)
            feat_importance_accum += fi
        else:
            # sklearn fallback: no native importance
            pass
    except Exception:
        pass

    print(f"[FOLD {fold}/{N_FOLDS}] best_iter={best_it} | F1@0.50={f1_at_05:.5f}")

print("\n=== CV SUMMARY (threshold=0.50) ===")
print(f"Mean F1@0.50: {float(np.mean(fold_f1_05)):.5f} | Std: {float(np.std(fold_f1_05)):.5f}")
print(f"Best iters   : min={int(np.min(best_iters))} | mean={float(np.mean(best_iters)):.1f} | max={int(np.max(best_iters))}")

# ----------------------------
# Threshold tuning (global OOF) for F1
# ----------------------------
thr_grid = np.arange(0.0, 1.0 + THR_STEP, THR_STEP)
best_thr, best_f1 = 0.5, -1.0
for thr in thr_grid:
    f1 = f1_score(y, (oof_proba >= thr).astype(int))
    if f1 > best_f1:
        best_f1 = float(f1)
        best_thr = float(thr)

print("\n=== OOF THRESHOLD TUNING ===")
print(f"Best threshold = {best_thr:.4f}")
print(f"OOF F1(best)   = {best_f1:.6f}")

y_hat = (oof_proba >= best_thr).astype(int)
print("\nConfusion Matrix (OOF):")
print(confusion_matrix(y, y_hat))

print("\nClassification Report (OOF):")
print(classification_report(y, y_hat, digits=4))

# ----------------------------
# Save OOF + threshold + report
# ----------------------------
df_oof = pd.DataFrame({
    "object_id": df_train["object_id"].astype(str),
    "oof_proba": oof_proba,
    "target": y
})
df_oof.to_csv(ART_DIR / "lgbm_oof.csv", index=False)

with open(ART_DIR / "lgbm_threshold.txt", "w", encoding="utf-8") as f:
    f.write(f"{best_thr}\n")

with open(ART_DIR / "lgbm_cv_report.txt", "w", encoding="utf-8") as f:
    f.write(f"BACKEND={BACKEND}\n")
    f.write("=== CV SUMMARY (threshold=0.50) ===\n")
    f.write(f"Mean F1@0.50: {float(np.mean(fold_f1_05)):.6f} | Std: {float(np.std(fold_f1_05)):.6f}\n")
    f.write(f"Best iters   : min={int(np.min(best_iters))} | mean={float(np.mean(best_iters)):.2f} | max={int(np.max(best_iters))}\n\n")
    f.write("=== OOF THRESHOLD TUNING ===\n")
    f.write(f"Best threshold = {best_thr:.6f}\n")
    f.write(f"OOF F1(best)   = {best_f1:.6f}\n")

# Feature importance (may be zeros if sklearn fallback)
fi_df = pd.DataFrame({
    "feature": feature_cols,
    "importance_gain_sum": feat_importance_accum
}).sort_values("importance_gain_sum", ascending=False).reset_index(drop=True)
fi_df.to_csv(ART_DIR / "lgbm_feature_importance.csv", index=False)

print("\nSaved artifacts:")
print(" -", ART_DIR / "lgbm_oof.csv")
print(" -", ART_DIR / "lgbm_threshold.txt")
print(" -", ART_DIR / "lgbm_cv_report.txt")
print(" -", ART_DIR / "lgbm_feature_importance.csv")

# ----------------------------
# Train final model on full train
# ----------------------------
final_model = make_model(SEED + 999)

# For final training:
# - lightgbm/xgboost: use mean best_iter
# - sklearn_hgb: keep its internal early stopping
mean_best_iter = int(max(200, round(float(np.mean(best_iters)))))

if BACKEND == "lightgbm":
    final_model.set_params(n_estimators=mean_best_iter)
    final_model.fit(X_all, y)
elif BACKEND == "xgboost":
    final_model.set_params(n_estimators=mean_best_iter)
    final_model.fit(X_all, y, verbose=False)
else:
    sw = np.ones_like(y, dtype=np.float32)
    sw[y == 1] = float(scale_pos_weight)
    final_model.fit(X_all, y, sample_weight=sw)

test_proba = predict_proba_pos(final_model, X_test_all).astype(np.float32)
test_pred  = (test_proba >= best_thr).astype(int)

# Save test probabilities (berguna untuk ensemble proba)
pd.DataFrame({"object_id": df_test["object_id"].astype(str), "proba": test_proba}).to_csv(
    ART_DIR / "lgbm_test_proba.csv", index=False
)

# ----------------------------
# Build submission
# ----------------------------
sub = df_sub[["object_id"]].copy()
sub["object_id"] = sub["object_id"].astype(str)

pred_df = pd.DataFrame({
    "object_id": df_test["object_id"].astype(str),
    SUB_PRED_COL: test_pred.astype(int),
})

sub = sub.merge(pred_df, on="object_id", how="left")

if sub[SUB_PRED_COL].isna().any():
    missing = sub[sub[SUB_PRED_COL].isna()]["object_id"].head(10).tolist()
    raise RuntimeError(f"Submission has NaN after merge. Example missing object_id: {missing}")

out_path = SUB_DIR / "sub_lgbm_v01.csv"
sub.to_csv(out_path, index=False)

print("\n=== DONE ===")
print("Backend:", BACKEND)
print("Final n_estimators (if applicable):", mean_best_iter)
print("Saved submission:", out_path)
print("Saved test proba:", ART_DIR / "lgbm_test_proba.csv")


[INFO] Backend used: sklearn_hgb
[INFO] sample_submission prediction column = 'prediction'
[INFO] X_train shape: (3043, 66) | X_test shape: (7135, 66)
[INFO] Pos rate: 0.048636 | scale_pos_weight=19.5608




[FOLD 1/5] best_iter=82 | F1@0.50=0.38835
[FOLD 2/5] best_iter=81 | F1@0.50=0.35789
[FOLD 3/5] best_iter=70 | F1@0.50=0.45833
[FOLD 4/5] best_iter=96 | F1@0.50=0.43038
[FOLD 5/5] best_iter=76 | F1@0.50=0.35955

=== CV SUMMARY (threshold=0.50) ===
Mean F1@0.50: 0.39890 | Std: 0.03966
Best iters   : min=70 | mean=81.0 | max=96

=== OOF THRESHOLD TUNING ===
Best threshold = 0.6680
OOF F1(best)   = 0.422535

Confusion Matrix (OOF):
[[2763  132]
 [  73   75]]

Classification Report (OOF):
              precision    recall  f1-score   support

           0     0.9743    0.9544    0.9642      2895
           1     0.3623    0.5068    0.4225       148

    accuracy                         0.9326      3043
   macro avg     0.6683    0.7306    0.6934      3043
weighted avg     0.9445    0.9326    0.9379      3043


Saved artifacts:
 - D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge\artifacts\lgbm_oof.csv
 - D:\MALLORN Astronomical Classification Cha

# Threshold tuning khusus F1

In [6]:
# ============================================================
# STAGE 5 — THRESHOLD TUNING KHUSUS F1 (LEBIH HALUS)
#
# Tujuan:
# - Cari threshold terbaik untuk memaksimalkan F1 berdasarkan OOF proba
# - Opsional: threshold per-fold (lebih robust), lalu voting / averaging threshold
#
# Input:
# - artifacts/lgbm_oof.csv            (from STAGE 4)
#
# Output:
# - artifacts/lgbm_threshold_grid.csv
# - artifacts/lgbm_threshold_best.txt
# - artifacts/lgbm_threshold_report.txt
# - (opsional) artifacts/lgbm_threshold_per_fold.csv  (jika fold tersedia di oof)
#
# Catatan:
# - STAGE 4 di atas menyimpan oof_proba + target, tapi tidak simpan fold.
# - Jadi tuning per-fold hanya bisa jika kamu juga punya kolom fold.
#   Kalau belum ada, skrip ini fokus global OOF threshold (yang sudah cukup kuat).
# ============================================================

from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score

# ----------------------------
# CONFIG
# ----------------------------
DATA_ROOT = Path(r"D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge")
ART_DIR   = DATA_ROOT / "artifacts"

OOF_PATH = ART_DIR / "lgbm_oof.csv"
if not OOF_PATH.exists():
    raise FileNotFoundError(f"Missing: {OOF_PATH}. Jalankan STAGE 4 dulu.")

df_oof = pd.read_csv(OOF_PATH, low_memory=False)

need = ["oof_proba", "target"]
for c in need:
    if c not in df_oof.columns:
        raise ValueError(f"OOF missing '{c}'. Found: {list(df_oof.columns)}")

p = pd.to_numeric(df_oof["oof_proba"], errors="coerce").astype(float).values
y = pd.to_numeric(df_oof["target"], errors="coerce").astype(int).values

# Drop NaN safety
mask = np.isfinite(p)
if mask.mean() < 0.999:
    df_oof = df_oof.loc[mask].reset_index(drop=True)
    p = p[mask]
    y = y[mask]

# ----------------------------
# 1) Coarse grid (0.00..1.00 step 0.01)
# ----------------------------
thr_grid_coarse = np.linspace(0.0, 1.0, 1001)  # step 0.001 (lebih halus dari 0.01)
rows = []

best_thr = 0.5
best_f1 = -1.0

for thr in thr_grid_coarse:
    pred = (p >= thr).astype(int)
    f1 = f1_score(y, pred)
    if f1 > best_f1:
        best_f1 = float(f1)
        best_thr = float(thr)

    # simpan ringkas (biar file tidak kegedean, simpan tiap 0.005)
    # tapi kita tetap cari best pakai full 0.001 grid
    if abs((thr * 1000) % 5) < 1e-9:  # every 0.005
        prec = precision_score(y, pred, zero_division=0)
        rec  = recall_score(y, pred, zero_division=0)
        rows.append((thr, f1, prec, rec))

df_grid = pd.DataFrame(rows, columns=["threshold", "f1", "precision", "recall"])
df_grid.to_csv(ART_DIR / "lgbm_threshold_grid.csv", index=False)

print("=== THRESHOLD TUNING (GLOBAL OOF) ===")
print(f"Best threshold (grid 0.001) = {best_thr:.3f}")
print(f"Best OOF F1                 = {best_f1:.6f}")

# ----------------------------
# 2) Fine local search around best (optional)
#    - cari di sekitar best_thr ± 0.02 dengan step 0.0002
# ----------------------------
lo = max(0.0, best_thr - 0.02)
hi = min(1.0, best_thr + 0.02)
thr_grid_fine = np.linspace(lo, hi, int(round((hi - lo) / 0.0002)) + 1)

best_thr2 = best_thr
best_f12 = best_f1

for thr in thr_grid_fine:
    pred = (p >= thr).astype(int)
    f1 = f1_score(y, pred)
    if f1 > best_f12:
        best_f12 = float(f1)
        best_thr2 = float(thr)

print("\n=== FINE SEARCH ===")
print(f"Best threshold (fine) = {best_thr2:.4f}")
print(f"Best OOF F1 (fine)    = {best_f12:.6f}")

# Save best threshold
with open(ART_DIR / "lgbm_threshold_best.txt", "w", encoding="utf-8") as f:
    f.write(f"{best_thr2}\n")

# Report
pred_best = (p >= best_thr2).astype(int)
prec_best = precision_score(y, pred_best, zero_division=0)
rec_best  = recall_score(y, pred_best, zero_division=0)

report = []
report.append("=== THRESHOLD TUNING REPORT (GLOBAL OOF) ===")
report.append(f"OOF samples         : {len(y)}")
report.append(f"Positive rate (y=1) : {float((y==1).mean()):.6f}")
report.append("")
report.append(f"Best threshold      : {best_thr2:.6f}")
report.append(f"F1                  : {best_f12:.6f}")
report.append(f"Precision           : {float(prec_best):.6f}")
report.append(f"Recall              : {float(rec_best):.6f}")
report.append("")
report.append("Saved files:")
report.append(f"- {ART_DIR / 'lgbm_threshold_grid.csv'}")
report.append(f"- {ART_DIR / 'lgbm_threshold_best.txt'}")

(ART_DIR / "lgbm_threshold_report.txt").write_text("\n".join(report), encoding="utf-8")

print("\n".join(report[-6:]))

# ----------------------------
# (Optional) If user later wants: apply this threshold to create a new submission
# - Submission dibuat di STAGE 4. Kalau kamu ingin regenerate submission
#   dengan threshold baru, tinggal rerun STAGE 4 bagian inferensi,
#   atau aku buatkan STAGE 5b khusus "re-export submission from saved proba".
# ----------------------------


=== THRESHOLD TUNING (GLOBAL OOF) ===
Best threshold (grid 0.001) = 0.668
Best OOF F1                 = 0.422535

=== FINE SEARCH ===
Best threshold (fine) = 0.6680
Best OOF F1 (fine)    = 0.422535
Precision           : 0.362319
Recall              : 0.506757

Saved files:
- D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge\artifacts\lgbm_threshold_grid.csv
- D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge\artifacts\lgbm_threshold_best.txt


# Tangani “domain shift” redshift (train spec-z vs test photo-z)

In [8]:
# ============================================================
# STAGE 6 — DOMAIN SHIFT REDSHIFT (TRAIN spec-z vs TEST photo-z)
# [REVISI FULL: AUTO-FALLBACK TANPA LIGHTGBM]
#
# Backend priority:
# 1) lightgbm (jika terinstall)
# 2) xgboost  (jika terinstall)
# 3) sklearn HistGradientBoostingClassifier (fallback, no extra install)
#
# Fitur domain-shift:
# - Z_filled, Zerr_filled, Z_missing, Zerr_missing
# - log1pZ, log1pZerr, inv1pZ, Z_div_Zerr
# - Z_aug (train noisy), Z_aug_absdiff
#
# Prasyarat:
# - artifacts/train_log_clean.csv
# - artifacts/test_log_clean.csv
# - artifacts/features_merged_train.csv  (STAGE 3)
# - artifacts/features_merged_test.csv   (STAGE 3)
# - sample_submission.csv
#
# Output (nama tetap):
# - artifacts/lgbm_zaug_oof.csv
# - artifacts/lgbm_zaug_threshold.txt
# - artifacts/lgbm_zaug_cv_report.txt
# - artifacts/lgbm_zaug_feature_importance.csv
# - artifacts/lgbm_zaug_test_proba.csv
# - submissions/sub_lgbm_zaug_v02.csv
# ============================================================

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingClassifier

# ----------------------------
# CONFIG
# ----------------------------
DATA_ROOT = Path(r"D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge")
ART_DIR   = DATA_ROOT / "artifacts"
SUB_DIR   = DATA_ROOT / "submissions"
SUB_DIR.mkdir(parents=True, exist_ok=True)

SEED    = 2025
N_FOLDS = 5

# Noise augmentation strength
AUG_COPIES = 1          # 0=off, 1=duplikasi 1x (jadi 2x data), 2=jadi 3x data, dst (CPU lebih berat)
AUG_FLOOR  = 0.01       # floor noise (absolute)
AUG_REL    = 0.02       # noise tambahan proporsional (1+z)

# Threshold tuning grid
THR_STEP = 0.001

# ----------------------------
# Backend selection (no crash)
# ----------------------------
BACKEND = None
lgb = None
xgb = None

try:
    import lightgbm as lgb  # type: ignore
    BACKEND = "lightgbm"
except Exception:
    try:
        import xgboost as xgb  # type: ignore
        BACKEND = "xgboost"
    except Exception:
        BACKEND = "sklearn_hgb"

print(f"[INFO] Backend used: {BACKEND}")

# ----------------------------
# Load inputs
# ----------------------------
train_log_path  = ART_DIR / "train_log_clean.csv"
test_log_path   = ART_DIR / "test_log_clean.csv"
feat_tr_path    = ART_DIR / "features_merged_train.csv"
feat_te_path    = ART_DIR / "features_merged_test.csv"
sample_sub_path = DATA_ROOT / "sample_submission.csv"

for p in [train_log_path, test_log_path, feat_tr_path, feat_te_path, sample_sub_path]:
    if not p.exists():
        raise FileNotFoundError(f"Missing: {p}. Jalankan STAGE 1 & 3 dulu.")

df_tr_log  = pd.read_csv(train_log_path, low_memory=False)
df_te_log  = pd.read_csv(test_log_path, low_memory=False)
df_tr_feat = pd.read_csv(feat_tr_path, low_memory=False)
df_te_feat = pd.read_csv(feat_te_path, low_memory=False)
df_sub     = pd.read_csv(sample_sub_path, low_memory=False)

# Detect submission prediction column name
if "object_id" not in df_sub.columns:
    raise ValueError(f"sample_submission tidak punya 'object_id'. Found: {list(df_sub.columns)}")
pred_cols = [c for c in df_sub.columns if c != "object_id"]
if len(pred_cols) != 1:
    raise ValueError(f"sample_submission harus punya 1 kolom prediksi selain object_id. Found: {list(df_sub.columns)}")
SUB_PRED_COL = pred_cols[0]
print(f"[INFO] sample_submission prediction column = '{SUB_PRED_COL}'")

# Normalize ids
for d in [df_tr_log, df_te_log, df_tr_feat, df_te_feat]:
    d["object_id"] = d["object_id"].astype(str)

# Target
if "target" not in df_tr_log.columns:
    raise ValueError(f"train_log_clean missing target. Found: {list(df_tr_log.columns)}")
df_tr_log["target"] = pd.to_numeric(df_tr_log["target"], errors="coerce").astype(int)

df_train_base = df_tr_feat.merge(df_tr_log[["object_id", "target"]], on="object_id", how="left")
if df_train_base["target"].isna().any():
    bad = df_train_base[df_train_base["target"].isna()]["object_id"].head(10).tolist()
    raise RuntimeError(f"Ada object_id train features yang tidak ketemu target. Example: {bad}")

df_test_base = df_te_feat.copy()

# ----------------------------
# Compute Z_err reference from TEST (photo-z error distribution)
# ----------------------------
def _to_num(s):
    return pd.to_numeric(s, errors="coerce")

zerr_test = None
if "Z_err" in df_te_log.columns:
    zerr_test = _to_num(df_te_log["Z_err"])
elif "Z_err" in df_test_base.columns:
    zerr_test = _to_num(df_test_base["Z_err"])

if zerr_test is None:
    zerr_fill = 0.05
    print("[WARN] Tidak menemukan Z_err di test. Pakai zerr_fill=0.05")
else:
    zerr_fill = float(np.nanmedian(zerr_test.astype(float).values))
    if not np.isfinite(zerr_fill) or zerr_fill <= 0:
        zerr_fill = 0.05
    print(f"[INFO] zerr_fill (median test Z_err) = {zerr_fill:.6f}")

# ----------------------------
# Feature engineering: robust Z/Z_err + Z augmentation column
# ----------------------------
def add_redshift_domainshift_features(df: pd.DataFrame, zerr_fill: float, is_train: bool, rng: np.random.Generator):
    out = df.copy()

    if "Z" not in out.columns:
        out["Z"] = np.nan
    if "Z_err" not in out.columns:
        out["Z_err"] = np.nan

    out["Z"] = pd.to_numeric(out["Z"], errors="coerce")
    out["Z_err"] = pd.to_numeric(out["Z_err"], errors="coerce")

    z = out["Z"].astype(float).values
    zerr = out["Z_err"].astype(float).values

    z_isna = ~np.isfinite(z)
    zerr_isna = ~np.isfinite(zerr)

    z_f = z.copy()
    z_f[z_isna] = 0.0

    zerr_f = zerr.copy()
    zerr_f[zerr_isna] = zerr_fill
    zerr_f = np.clip(zerr_f, 0.0, None)

    out["Z_filled"] = z_f
    out["Zerr_filled"] = zerr_f
    out["Z_missing"] = z_isna.astype(int)
    out["Zerr_missing"] = zerr_isna.astype(int)

    out["log1pZ"] = np.log1p(np.clip(z_f, 0.0, None))
    out["log1pZerr"] = np.log1p(np.clip(zerr_f, 0.0, None))
    out["inv1pZ"] = 1.0 / (1.0 + np.clip(z_f, 0.0, None))
    out["Z_div_Zerr"] = z_f / (zerr_f + 1e-6)

    if is_train:
        sigma = np.sqrt(np.square(zerr_f) + np.square(AUG_FLOOR) + np.square(AUG_REL * (1.0 + z_f)))
        noise = rng.normal(loc=0.0, scale=sigma, size=z_f.shape[0])
        z_aug = np.clip(z_f + noise, 0.0, None)
        out["Z_aug"] = z_aug
        out["Z_aug_absdiff"] = np.abs(z_aug - z_f)
    else:
        out["Z_aug"] = z_f
        out["Z_aug_absdiff"] = 0.0

    return out

rng = np.random.default_rng(SEED)

df_train_fe = add_redshift_domainshift_features(df_train_base, zerr_fill=zerr_fill, is_train=True, rng=rng)
df_test_fe  = add_redshift_domainshift_features(df_test_base,  zerr_fill=zerr_fill, is_train=False, rng=rng)

# ----------------------------
# Optional: Data augmentation by duplicating TRAIN with different noise draws
# ----------------------------
if AUG_COPIES > 0:
    aug_list = [df_train_fe]
    for k in range(AUG_COPIES):
        rng_k = np.random.default_rng(SEED + 1000 * (k + 1))
        df_k = add_redshift_domainshift_features(df_train_base, zerr_fill=zerr_fill, is_train=True, rng=rng_k)
        aug_list.append(df_k)
    df_train_fe = pd.concat(aug_list, ignore_index=True)
    print(f"[INFO] Augmented train rows: {len(df_train_fe):,} (AUG_COPIES={AUG_COPIES})")

# ----------------------------
# Build X/y
# ----------------------------
y = df_train_fe["target"].astype(int).values
drop_cols = {"object_id", "target"}
feature_cols = [c for c in df_train_fe.columns if c not in drop_cols]

# Numeric coercion
for c in feature_cols:
    df_train_fe[c] = pd.to_numeric(df_train_fe[c], errors="coerce")
    df_test_fe[c]  = pd.to_numeric(df_test_fe[c], errors="coerce")

X_df = df_train_fe[feature_cols]
X_test_df = df_test_fe[feature_cols]

pos = max(int((y == 1).sum()), 1)
neg = max(int((y == 0).sum()), 1)
scale_pos_weight = neg / pos

print(f"[INFO] X_train shape: {X_df.shape} | X_test shape: {X_test_df.shape}")
print(f"[INFO] Pos rate: {(y==1).mean():.6f} | scale_pos_weight={scale_pos_weight:.4f}")

# ----------------------------
# Imputer
# ----------------------------
imputer = SimpleImputer(strategy="median")
X_all = imputer.fit_transform(X_df).astype(np.float32, copy=False)
X_test_all = imputer.transform(X_test_df).astype(np.float32, copy=False)

# ----------------------------
# Model factory
# ----------------------------
def make_model(seed: int):
    if BACKEND == "lightgbm":
        params = dict(
            objective="binary",
            learning_rate=0.03,
            n_estimators=9000,
            num_leaves=96,
            min_child_samples=120,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            random_state=seed,
            n_jobs=-1,
            scale_pos_weight=scale_pos_weight,
        )
        return lgb.LGBMClassifier(**params)

    if BACKEND == "xgboost":
        params = dict(
            objective="binary:logistic",
            learning_rate=0.03,
            n_estimators=10000,
            max_depth=6,
            min_child_weight=3.0,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            random_state=seed,
            n_jobs=-1,
            eval_metric="logloss",
            scale_pos_weight=scale_pos_weight,
            tree_method="hist",
        )
        return xgb.XGBClassifier(**params)

    return HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_depth=6,
        max_leaf_nodes=96,
        min_samples_leaf=60,
        l2_regularization=0.0,
        max_iter=3500,
        early_stopping=True,
        validation_fraction=0.1,
        n_iter_no_change=80,
        random_state=seed,
    )

def fit_model(model, X_tr, y_tr, X_va, y_va):
    if BACKEND == "lightgbm":
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric="binary_logloss",
            callbacks=[lgb.early_stopping(stopping_rounds=250, verbose=False)]
        )
        best_it = int(getattr(model, "best_iteration_", None) or model.n_estimators)
        return model, best_it

    if BACKEND == "xgboost":
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            verbose=False,
            early_stopping_rounds=250
        )
        best_it = int(getattr(model, "best_iteration", None) or model.n_estimators)
        return model, best_it

    sw = np.ones_like(y_tr, dtype=np.float32)
    sw[y_tr == 1] = float(scale_pos_weight)
    model.fit(X_tr, y_tr, sample_weight=sw)
    best_it = int(getattr(model, "n_iter_", None) or getattr(model, "max_iter", 0) or 0)
    return model, best_it

def predict_proba_pos(model, X):
    return model.predict_proba(X)[:, 1]

# ----------------------------
# CV + OOF
# ----------------------------
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

oof_proba = np.zeros(X_all.shape[0], dtype=np.float32)
best_iters = []
fold_f1_05 = []
feat_importance_accum = np.zeros(len(feature_cols), dtype=np.float64)

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_all, y), start=1):
    X_tr, X_va = X_all[tr_idx], X_all[va_idx]
    y_tr, y_va = y[tr_idx], y[va_idx]

    model = make_model(SEED + 10 * fold)
    model, best_it = fit_model(model, X_tr, y_tr, X_va, y_va)

    p_va = predict_proba_pos(model, X_va).astype(np.float32)
    oof_proba[va_idx] = p_va

    f1_at_05 = f1_score(y_va, (p_va >= 0.5).astype(int))
    fold_f1_05.append(float(f1_at_05))
    best_iters.append(int(best_it))

    # importance (gain)
    try:
        if BACKEND == "lightgbm":
            fi = model.booster_.feature_importance(importance_type="gain")
            feat_importance_accum += fi
        elif BACKEND == "xgboost":
            booster = model.get_booster()
            score = booster.get_score(importance_type="gain")
            fi = np.array([score.get(f"f{i}", 0.0) for i in range(len(feature_cols))], dtype=np.float64)
            feat_importance_accum += fi
    except Exception:
        pass

    print(f"[FOLD {fold}/{N_FOLDS}] best_iter={best_it} | F1@0.50={f1_at_05:.5f}")

print("\n=== CV SUMMARY (threshold=0.50) ===")
print(f"Mean F1@0.50: {float(np.mean(fold_f1_05)):.5f} | Std: {float(np.std(fold_f1_05)):.5f}")
print(f"Best iters   : min={int(np.min(best_iters))} | mean={float(np.mean(best_iters)):.1f} | max={int(np.max(best_iters))}")

# ----------------------------
# Threshold tuning (global OOF) for F1
# ----------------------------
thr_grid = np.arange(0.0, 1.0 + THR_STEP, THR_STEP)
best_thr, best_f1 = 0.5, -1.0
for thr in thr_grid:
    f1 = f1_score(y, (oof_proba >= thr).astype(int))
    if f1 > best_f1:
        best_f1 = float(f1)
        best_thr = float(thr)

print("\n=== OOF THRESHOLD TUNING ===")
print(f"Best threshold = {best_thr:.4f}")
print(f"OOF F1(best)   = {best_f1:.6f}")

y_hat = (oof_proba >= best_thr).astype(int)
print("\nConfusion Matrix (OOF):")
print(confusion_matrix(y, y_hat))
print("\nClassification Report (OOF):")
print(classification_report(y, y_hat, digits=4))

# ----------------------------
# Save artifacts
# ----------------------------
oof_path = ART_DIR / "lgbm_zaug_oof.csv"
pd.DataFrame({
    "object_id": df_train_fe["object_id"].astype(str),
    "oof_proba": oof_proba,
    "target": y
}).to_csv(oof_path, index=False)

thr_path = ART_DIR / "lgbm_zaug_threshold.txt"
thr_path.write_text(f"{best_thr}\n", encoding="utf-8")

rep_path = ART_DIR / "lgbm_zaug_cv_report.txt"
with open(rep_path, "w", encoding="utf-8") as f:
    f.write(f"BACKEND={BACKEND}\n")
    f.write("=== CV SUMMARY (threshold=0.50) ===\n")
    f.write(f"Mean F1@0.50: {float(np.mean(fold_f1_05)):.6f} | Std: {float(np.std(fold_f1_05)):.6f}\n")
    f.write(f"Best iters   : min={int(np.min(best_iters))} | mean={float(np.mean(best_iters)):.2f} | max={int(np.max(best_iters))}\n\n")
    f.write("=== OOF THRESHOLD TUNING ===\n")
    f.write(f"Best threshold = {best_thr:.6f}\n")
    f.write(f"OOF F1(best)   = {best_f1:.6f}\n")

fi_path = ART_DIR / "lgbm_zaug_feature_importance.csv"
pd.DataFrame({
    "feature": feature_cols,
    "importance_gain_sum": feat_importance_accum
}).sort_values("importance_gain_sum", ascending=False).reset_index(drop=True).to_csv(fi_path, index=False)

print("\nSaved artifacts:")
print(" -", oof_path)
print(" -", thr_path)
print(" -", rep_path)
print(" -", fi_path)

# ----------------------------
# Train final model on full train
# ----------------------------
final_model = make_model(SEED + 999)

mean_best_iter = int(max(300, round(float(np.mean(best_iters)))))

if BACKEND == "lightgbm":
    final_model.set_params(n_estimators=mean_best_iter)
    final_model.fit(X_all, y)
elif BACKEND == "xgboost":
    final_model.set_params(n_estimators=mean_best_iter)
    final_model.fit(X_all, y, verbose=False)
else:
    sw = np.ones_like(y, dtype=np.float32)
    sw[y == 1] = float(scale_pos_weight)
    final_model.fit(X_all, y, sample_weight=sw)

test_proba = predict_proba_pos(final_model, X_test_all).astype(np.float32)
test_pred = (test_proba >= best_thr).astype(int)

# Save test proba (buat ensemble proba)
proba_path = ART_DIR / "lgbm_zaug_test_proba.csv"
pd.DataFrame({"object_id": df_test_fe["object_id"].astype(str), "proba": test_proba}).to_csv(proba_path, index=False)

# ----------------------------
# Build submission
# ----------------------------
sub = df_sub[["object_id"]].copy()
sub["object_id"] = sub["object_id"].astype(str)

pred_df = pd.DataFrame({
    "object_id": df_test_fe["object_id"].astype(str),
    SUB_PRED_COL: test_pred.astype(int),
})

sub = sub.merge(pred_df, on="object_id", how="left")

if sub[SUB_PRED_COL].isna().any():
    missing = sub[sub[SUB_PRED_COL].isna()]["object_id"].head(10).tolist()
    raise RuntimeError(f"Submission has NaN after merge. Example missing object_id: {missing}")

out_path = SUB_DIR / "sub_lgbm_zaug_v02.csv"
sub.to_csv(out_path, index=False)

print("\n=== DONE ===")
print("Backend:", BACKEND)
print("Final n_estimators (if applicable):", mean_best_iter)
print("Saved submission:", out_path)
print("Saved test proba:", proba_path)


[INFO] Backend used: sklearn_hgb
[INFO] sample_submission prediction column = 'prediction'
[INFO] zerr_fill (median test Z_err) = 0.029650
[INFO] Augmented train rows: 6,086 (AUG_COPIES=1)
[INFO] X_train shape: (6086, 76) | X_test shape: (7135, 76)
[INFO] Pos rate: 0.048636 | scale_pos_weight=19.5608




[FOLD 1/5] best_iter=160 | F1@0.50=0.79688
[FOLD 2/5] best_iter=179 | F1@0.50=0.77519
[FOLD 3/5] best_iter=254 | F1@0.50=0.85965
[FOLD 4/5] best_iter=193 | F1@0.50=0.79365
[FOLD 5/5] best_iter=169 | F1@0.50=0.71318

=== CV SUMMARY (threshold=0.50) ===
Mean F1@0.50: 0.78771 | Std: 0.04692
Best iters   : min=160 | mean=191.0 | max=254

=== OOF THRESHOLD TUNING ===
Best threshold = 0.7400
OOF F1(best)   = 0.846011

Confusion Matrix (OOF):
[[5775   15]
 [  68  228]]

Classification Report (OOF):
              precision    recall  f1-score   support

           0     0.9884    0.9974    0.9929      5790
           1     0.9383    0.7703    0.8460       296

    accuracy                         0.9864      6086
   macro avg     0.9633    0.8838    0.9194      6086
weighted avg     0.9859    0.9864    0.9857      6086


Saved artifacts:
 - D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge\artifacts\lgbm_zaug_oof.csv
 - D:\MALLORN Astronomical Class

# Feature upgrade yang sering menang di lightcurve (masih CPU-friendly)

In [9]:
# ============================================================
# STAGE 7 — FEATURE UPGRADE (CPU-FRIENDLY)
# Upgrade fitur lightcurve TANPA baca ulang CSV besar:
# - Derive "shape-ish" features per band dari fitur STAGE 3 (mean/std/min/max/amp/snr/time_span)
# - Tambah cross-band features (ratio/diff antar band)
# - Tambah global aggregation across bands (jumlah band hadir, total n_obs, peak band, dll)
#
# Input (STAGE 3):
#   artifacts/features_merged_train.csv
#   artifacts/features_merged_test.csv
#
# Output:
#   artifacts/features_upg_train.csv
#   artifacts/features_upg_test.csv
# ============================================================

from pathlib import Path
import numpy as np
import pandas as pd

# ----------------------------
# CONFIG
# ----------------------------
DATA_ROOT = Path(r"D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge")
ART_DIR   = DATA_ROOT / "artifacts"

IN_TR = ART_DIR / "features_merged_train.csv"
IN_TE = ART_DIR / "features_merged_test.csv"
OUT_TR = ART_DIR / "features_upg_train.csv"
OUT_TE = ART_DIR / "features_upg_test.csv"

BANDS = ["u", "g", "r", "i", "z", "y"]
BASE_FEATS = ["n_obs", "flux_mean", "flux_std", "min_flux", "max_flux", "amp",
              "snr_mean", "frac_snr_gt3", "frac_snr_gt5", "time_span"]

EPS = 1e-9

for p in [IN_TR, IN_TE]:
    if not p.exists():
        raise FileNotFoundError(f"Missing: {p}. Jalankan STAGE 3 dulu.")

df_tr = pd.read_csv(IN_TR, low_memory=False)
df_te = pd.read_csv(IN_TE, low_memory=False)

if "object_id" not in df_tr.columns or "object_id" not in df_te.columns:
    raise ValueError("features_merged_* wajib punya kolom object_id")

df_tr["object_id"] = df_tr["object_id"].astype(str)
df_te["object_id"] = df_te["object_id"].astype(str)

# ----------------------------
# Helpers
# ----------------------------
def colname(band: str, feat: str) -> str:
    return f"{band}__{feat}"

def ensure_numeric(df: pd.DataFrame, cols: list):
    for c in cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

def safe_series(df: pd.DataFrame, c: str, default=np.nan):
    if c in df.columns:
        return pd.to_numeric(df[c], errors="coerce")
    return pd.Series(default, index=df.index, dtype="float64")

def add_band_derived(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    # Pastikan base columns numeric
    cols = [colname(b, f) for b in BANDS for f in BASE_FEATS if colname(b, f) in out.columns]
    ensure_numeric(out, cols)

    for b in BANDS:
        n_obs = safe_series(out, colname(b, "n_obs"), default=0.0).fillna(0.0)
        mu    = safe_series(out, colname(b, "flux_mean"))
        sd    = safe_series(out, colname(b, "flux_std"))
        mn    = safe_series(out, colname(b, "min_flux"))
        mx    = safe_series(out, colname(b, "max_flux"))
        amp   = safe_series(out, colname(b, "amp"))
        snr_m = safe_series(out, colname(b, "snr_mean"))
        tspan = safe_series(out, colname(b, "time_span"))

        present = (n_obs > 0).astype(int)
        out[colname(b, "present")] = present

        # Coef of variation (robust)
        out[colname(b, "cv_absmean")] = sd / (np.abs(mu) + EPS)

        # Peak positivity (untuk ratio/cross-band yang lebih stabil)
        peak_pos = np.clip(mx, 0.0, None)
        out[colname(b, "peak_pos")] = peak_pos
        out[colname(b, "log1p_peak_pos")] = np.log1p(peak_pos)

        # Peak-to-mean ratio (pakai abs mean biar stabil)
        out[colname(b, "peak_over_absmean")] = mx / (np.abs(mu) + EPS)

        # Amp normalized
        out[colname(b, "amp_over_absmean")] = amp / (np.abs(mu) + EPS)
        out[colname(b, "amp_over_std")] = amp / (sd + EPS)

        # Negative flux indicator
        out[colname(b, "has_negative")] = (mn < 0).astype(int)

        # Time density proxy (karena gap exact sulit tanpa sort)
        out[colname(b, "cadence_proxy")] = tspan / (n_obs.clip(lower=1.0))

        # SNR density proxy
        out[colname(b, "snr_per_obs")] = snr_m  # already per obs-ish, keep alias

    return out

def add_cross_band(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()

    # Convenience arrays for cross-band operations
    peak_pos_cols = [colname(b, "peak_pos") for b in BANDS]
    amp_cols      = [colname(b, "amp") for b in BANDS]
    nobs_cols     = [colname(b, "n_obs") for b in BANDS]
    pres_cols     = [colname(b, "present") for b in BANDS]
    tspan_cols    = [colname(b, "time_span") for b in BANDS]

    ensure_numeric(out, [c for c in peak_pos_cols+amp_cols+nobs_cols+tspan_cols if c in out.columns])

    # Global counts
    pres_mat = np.column_stack([safe_series(out, c, default=0.0).fillna(0.0).values for c in pres_cols])
    out["n_bands_present"] = pres_mat.sum(axis=1).astype(int)

    nobs_mat = np.column_stack([safe_series(out, c, default=0.0).fillna(0.0).values for c in nobs_cols])
    out["total_n_obs"] = nobs_mat.sum(axis=1).astype(float)
    out["mean_n_obs_per_band_present"] = out["total_n_obs"] / (out["n_bands_present"].clip(lower=1).astype(float))

    # Peak across bands (positive peak)
    peak_mat = np.column_stack([safe_series(out, c, default=np.nan).values for c in peak_pos_cols])
    peak_mat = np.where(np.isfinite(peak_mat), peak_mat, -np.inf)

    max_peak = np.max(peak_mat, axis=1)
    sum_peak = np.sum(np.where(np.isfinite(peak_mat) & (peak_mat > -np.inf), np.clip(peak_mat, 0.0, None), 0.0), axis=1)
    argmax_peak = np.argmax(peak_mat, axis=1)

    out["max_peak_pos"] = np.where(np.isfinite(max_peak), max_peak, np.nan)
    out["sum_peak_pos"] = sum_peak
    out["peak_concentration"] = out["max_peak_pos"] / (out["sum_peak_pos"] + EPS)

    # Encode peak band as integer 0..5 + one-hot
    out["peak_band_idx"] = argmax_peak.astype(int)
    for i, b in enumerate(BANDS):
        out[f"peak_band_is_{b}"] = (out["peak_band_idx"] == i).astype(int)

    # Amp across bands
    amp_mat = np.column_stack([safe_series(out, c, default=np.nan).values for c in amp_cols])
    amp_mat = np.where(np.isfinite(amp_mat), amp_mat, -np.inf)
    out["max_amp"] = np.where(np.isfinite(np.max(amp_mat, axis=1)), np.max(amp_mat, axis=1), np.nan)
    out["mean_amp"] = np.nanmean(np.where(amp_mat > -np.inf, amp_mat, np.nan), axis=1)
    out["amp_concentration"] = out["max_amp"] / (np.nansum(np.where(np.isfinite(amp_mat) & (amp_mat > -np.inf), np.clip(amp_mat, 0.0, None), 0.0), axis=1) + EPS)

    # Time span across bands
    tspan_mat = np.column_stack([safe_series(out, c, default=np.nan).values for c in tspan_cols])
    out["max_time_span"] = np.nanmax(tspan_mat, axis=1)
    out["mean_time_span"] = np.nanmean(tspan_mat, axis=1)

    # Adjacent band ratios (peak_pos & amp) + log ratios (lebih stabil)
    adj_pairs = [("u","g"), ("g","r"), ("r","i"), ("i","z"), ("z","y")]
    for b1, b2 in adj_pairs:
        p1 = safe_series(out, colname(b1, "peak_pos"))
        p2 = safe_series(out, colname(b2, "peak_pos"))
        out[f"peakpos_ratio_{b1}{b2}"] = (p1 + EPS) / (p2 + EPS)
        out[f"log_peakpos_ratio_{b1}{b2}"] = np.log1p(np.clip(p1, 0.0, None)) - np.log1p(np.clip(p2, 0.0, None))

        a1 = safe_series(out, colname(b1, "amp"))
        a2 = safe_series(out, colname(b2, "amp"))
        out[f"amp_ratio_{b1}{b2}"] = (a1 + EPS) / (a2 + EPS)
        out[f"log_amp_ratio_{b1}{b2}"] = np.log1p(np.clip(a1, 0.0, None)) - np.log1p(np.clip(a2, 0.0, None))

    # Broad color-like: blue vs red (g+r) vs (i+z+y)
    blue = safe_series(out, colname("g","peak_pos")) + safe_series(out, colname("r","peak_pos"))
    red  = safe_series(out, colname("i","peak_pos")) + safe_series(out, colname("z","peak_pos")) + safe_series(out, colname("y","peak_pos"))
    out["peakpos_blue_over_red"] = (blue + EPS) / (red + EPS)
    out["log_peakpos_blue_over_red"] = np.log1p(np.clip(blue, 0.0, None)) - np.log1p(np.clip(red, 0.0, None))

    return out

# ----------------------------
# Build upgraded features
# ----------------------------
df_tr_upg = add_cross_band(add_band_derived(df_tr))
df_te_upg = add_cross_band(add_band_derived(df_te))

# Final sanity: keep column order stable-ish (object_id first)
def reorder(df: pd.DataFrame) -> pd.DataFrame:
    cols = list(df.columns)
    if "object_id" in cols:
        cols = ["object_id"] + [c for c in cols if c != "object_id"]
        return df[cols]
    return df

df_tr_upg = reorder(df_tr_upg)
df_te_upg = reorder(df_te_upg)

df_tr_upg.to_csv(OUT_TR, index=False)
df_te_upg.to_csv(OUT_TE, index=False)

print("=== STAGE 7 DONE ===")
print("Saved:", OUT_TR)
print("Saved:", OUT_TE)
print("Train shape:", df_tr_upg.shape, "| Test shape:", df_te_upg.shape)

# Quick peek of new columns count
new_cols = set(df_tr_upg.columns) - set(df_tr.columns)
print("Added features:", len(new_cols))
print("Example added:", sorted(list(new_cols))[:30])


  out[colname(b, "has_negative")] = (mn < 0).astype(int)
  out[colname(b, "cadence_proxy")] = tspan / (n_obs.clip(lower=1.0))
  out[colname(b, "snr_per_obs")] = snr_m  # already per obs-ish, keep alias
  out[colname(b, "present")] = present
  out[colname(b, "cv_absmean")] = sd / (np.abs(mu) + EPS)
  out[colname(b, "peak_pos")] = peak_pos
  out[colname(b, "log1p_peak_pos")] = np.log1p(peak_pos)
  out[colname(b, "peak_over_absmean")] = mx / (np.abs(mu) + EPS)
  out[colname(b, "amp_over_absmean")] = amp / (np.abs(mu) + EPS)
  out[colname(b, "amp_over_std")] = amp / (sd + EPS)
  out[colname(b, "has_negative")] = (mn < 0).astype(int)
  out[colname(b, "cadence_proxy")] = tspan / (n_obs.clip(lower=1.0))
  out[colname(b, "snr_per_obs")] = snr_m  # already per obs-ish, keep alias
  out[colname(b, "present")] = present
  out[colname(b, "cv_absmean")] = sd / (np.abs(mu) + EPS)
  out[colname(b, "peak_pos")] = peak_pos
  out[colname(b, "log1p_peak_pos")] = np.log1p(peak_pos)
  out[colname(b, "peak_

=== STAGE 7 DONE ===
Saved: D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge\artifacts\features_upg_train.csv
Saved: D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge\artifacts\features_upg_test.csv
Train shape: (3043, 167) | Test shape: (7135, 167)
Added features: 100
Example added: ['amp_concentration', 'amp_ratio_gr', 'amp_ratio_iz', 'amp_ratio_ri', 'amp_ratio_ug', 'amp_ratio_zy', 'g__amp_over_absmean', 'g__amp_over_std', 'g__cadence_proxy', 'g__cv_absmean', 'g__has_negative', 'g__log1p_peak_pos', 'g__peak_over_absmean', 'g__peak_pos', 'g__present', 'g__snr_per_obs', 'i__amp_over_absmean', 'i__amp_over_std', 'i__cadence_proxy', 'i__cv_absmean', 'i__has_negative', 'i__log1p_peak_pos', 'i__peak_over_absmean', 'i__peak_pos', 'i__present', 'i__snr_per_obs', 'log_amp_ratio_gr', 'log_amp_ratio_iz', 'log_amp_ratio_ri', 'log_amp_ratio_ug']


# Ensemble ringan

In [10]:
# ============================================================
# STAGE 8 — ENSEMBLE RINGAN (CPU-FRIENDLY)
# Tujuan:
# - Gabungkan prediksi dari beberapa model/varian (mis: LGBM v01, LGBM z-aug v02, LGBM fitur upgrade)
# - Ensemble dengan average probability (lebih stabil)
# - Threshold tuning F1 dari OOF ensemble (jika OOF tersedia untuk semua)
# - Export submission ensemble (mengikuti kolom sample_submission: 'prediction' atau lainnya)
#
# Cara pakai (paling aman):
# 1) Pastikan kamu sudah punya minimal 2 file OOF:
#    - artifacts/lgbm_oof.csv
#    - artifacts/lgbm_zaug_oof.csv  (atau oof lain)
#    dan 2 file proba test (opsional):
#    - artifacts/lgbm_test_proba.csv
#    - artifacts/lgbm_zaug_test_proba.csv
#
# Kalau kamu belum simpan test_proba, skrip ini tetap bisa ensemble submission
# dari file submission masing-masing (majority vote / average label),
# tapi yang optimal adalah average PROBABILITY.
#
# Output:
# - artifacts/ens_oof.csv
# - artifacts/ens_threshold.txt
# - artifacts/ens_report.txt
# - submissions/sub_ensemble_v01.csv
# ============================================================

from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score

# ----------------------------
# CONFIG
# ----------------------------
DATA_ROOT = Path(r"D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge")
ART_DIR   = DATA_ROOT / "artifacts"
SUB_DIR   = DATA_ROOT / "submissions"
SUB_DIR.mkdir(parents=True, exist_ok=True)

SAMPLE_SUB_PATH = DATA_ROOT / "sample_submission.csv"
if not SAMPLE_SUB_PATH.exists():
    raise FileNotFoundError(f"Missing: {SAMPLE_SUB_PATH}")
df_sub = pd.read_csv(SAMPLE_SUB_PATH, low_memory=False)

if "object_id" not in df_sub.columns:
    raise ValueError(f"sample_submission tidak punya 'object_id'. Found: {list(df_sub.columns)}")
pred_cols = [c for c in df_sub.columns if c != "object_id"]
if len(pred_cols) != 1:
    raise ValueError(f"sample_submission harus punya 1 kolom prediksi selain object_id. Found: {list(df_sub.columns)}")
SUB_PRED_COL = pred_cols[0]
print(f"[INFO] sample_submission prediction column = '{SUB_PRED_COL}'")

# ----------------------------
# Pilih sumber ensemble (edit daftar ini sesuai file kamu)
# ----------------------------
# OOF sources (harus ada kolom: object_id, oof_proba, target)
OOF_FILES = [
    ART_DIR / "lgbm_oof.csv",
    ART_DIR / "lgbm_zaug_oof.csv",
    # ART_DIR / "lgbm_upg_oof.csv",   # kalau nanti kamu buat
]

# TEST proba sources (recommended) (harus ada: object_id, proba)
# Kalau belum ada, isi [] dan pakai submission-based ensemble di bawah.
TEST_PROBA_FILES = [
    # ART_DIR / "lgbm_test_proba.csv",
    # ART_DIR / "lgbm_zaug_test_proba.csv",
]

# Submission sources (fallback) (harus mengikuti sample_submission, kolom prediksi 0/1)
SUB_FILES_FALLBACK = [
    SUB_DIR / "sub_lgbm_v01.csv",
    SUB_DIR / "sub_lgbm_zaug_v02.csv",
    # SUB_DIR / "sub_lgbm_upg_v03.csv",
]

# Ensemble weights (opsional)
# - Jika None, semua model bobot sama.
# - Jika list, panjang harus sama dengan jumlah model yang dipakai.
ENSEMBLE_WEIGHTS = None

# Threshold tuning grid
THR_STEP = 0.001

# ============================================================
# Helper loaders
# ============================================================
def load_oof(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, low_memory=False)
    need = {"object_id", "oof_proba", "target"}
    if not need.issubset(df.columns):
        raise ValueError(f"OOF file {path} missing {need - set(df.columns)} | found={list(df.columns)}")
    df = df[["object_id", "oof_proba", "target"]].copy()
    df["object_id"] = df["object_id"].astype(str)
    df["oof_proba"] = pd.to_numeric(df["oof_proba"], errors="coerce")
    df["target"] = pd.to_numeric(df["target"], errors="coerce").astype(int)
    df = df.dropna(subset=["oof_proba"])
    return df

def load_test_proba(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, low_memory=False)
    # accept proba column name variations
    if "object_id" not in df.columns:
        raise ValueError(f"Test proba file {path} missing object_id")
    proba_col = None
    for c in ["proba", "pred_proba", "prediction_proba", "test_proba"]:
        if c in df.columns:
            proba_col = c
            break
    if proba_col is None:
        # try: any column besides object_id
        cand = [c for c in df.columns if c != "object_id"]
        if len(cand) == 1:
            proba_col = cand[0]
        else:
            raise ValueError(f"Cannot detect proba column in {path}. Found={list(df.columns)}")
    df = df[["object_id", proba_col]].copy()
    df = df.rename(columns={proba_col: "proba"})
    df["object_id"] = df["object_id"].astype(str)
    df["proba"] = pd.to_numeric(df["proba"], errors="coerce")
    df = df.dropna(subset=["proba"])
    return df

def load_submission(path: Path, pred_col: str) -> pd.DataFrame:
    df = pd.read_csv(path, low_memory=False)
    if "object_id" not in df.columns or pred_col not in df.columns:
        raise ValueError(f"Submission {path} must contain ['object_id','{pred_col}']. Found={list(df.columns)}")
    df = df[["object_id", pred_col]].copy()
    df["object_id"] = df["object_id"].astype(str)
    df[pred_col] = pd.to_numeric(df[pred_col], errors="coerce").astype(int)
    return df

def weighted_mean(mat: np.ndarray, w: np.ndarray) -> np.ndarray:
    return (mat * w[None, :]).sum(axis=1) / (w.sum() + 1e-12)

# ============================================================
# 1) OOF Ensemble (threshold tuning terbaik)
# ============================================================
oofs = []
for p in OOF_FILES:
    if not p.exists():
        print(f"[WARN] Missing OOF: {p} (skip)")
        continue
    oofs.append(load_oof(p))

if len(oofs) < 2:
    print("[WARN] OOF ensemble butuh >=2 OOF files. Akan lanjut ke submission-based ensemble saja.")
    HAVE_OOF = False
else:
    HAVE_OOF = True

best_thr = 0.5

if HAVE_OOF:
    # Inner join by object_id so rows align
    base = oofs[0][["object_id", "target"]].copy()
    for i, df in enumerate(oofs):
        base = base.merge(df[["object_id", "oof_proba"]].rename(columns={"oof_proba": f"p{i}"}),
                          on="object_id", how="inner")

    y = base["target"].astype(int).values
    p_cols = [c for c in base.columns if c.startswith("p")]
    P = base[p_cols].to_numpy(dtype=float)

    m = P.shape[1]
    if ENSEMBLE_WEIGHTS is None:
        w = np.ones(m, dtype=float)
    else:
        w = np.asarray(ENSEMBLE_WEIGHTS, dtype=float)
        if w.shape[0] != m:
            raise ValueError(f"ENSEMBLE_WEIGHTS length {w.shape[0]} != #models {m}")

    ens_p = weighted_mean(P, w)

    # threshold tuning (fine)
    thr_grid = np.arange(0.0, 1.0 + THR_STEP, THR_STEP)
    best_f1 = -1.0
    for thr in thr_grid:
        f1 = f1_score(y, (ens_p >= thr).astype(int))
        if f1 > best_f1:
            best_f1 = float(f1)
            best_thr = float(thr)

    pred = (ens_p >= best_thr).astype(int)
    prec = precision_score(y, pred, zero_division=0)
    rec = recall_score(y, pred, zero_division=0)

    print("\n=== OOF ENSEMBLE RESULT ===")
    print(f"Models used      : {m}")
    print(f"Best threshold   : {best_thr:.4f}")
    print(f"OOF F1           : {best_f1:.6f}")
    print(f"OOF Precision    : {prec:.6f}")
    print(f"OOF Recall       : {rec:.6f}")

    # save OOF ensemble artifact
    df_ens_oof = pd.DataFrame({
        "object_id": base["object_id"].astype(str),
        "ens_oof_proba": ens_p.astype(np.float32),
        "target": y
    })
    df_ens_oof.to_csv(ART_DIR / "ens_oof.csv", index=False)

    (ART_DIR / "ens_threshold.txt").write_text(f"{best_thr}\n", encoding="utf-8")

    report = []
    report.append("=== ENSEMBLE REPORT ===")
    report.append(f"Models used: {m}")
    report.append(f"OOF rows (inner-join): {len(base):,}")
    report.append(f"Best threshold: {best_thr:.6f}")
    report.append(f"OOF F1: {best_f1:.6f}")
    report.append(f"OOF Precision: {prec:.6f}")
    report.append(f"OOF Recall: {rec:.6f}")
    report.append("")
    report.append("OOF files:")
    for p in OOF_FILES:
        report.append(f"- {p}")
    (ART_DIR / "ens_report.txt").write_text("\n".join(report), encoding="utf-8")

# ============================================================
# 2) TEST Ensemble: preferred from test probability files
# ============================================================
have_test_proba = all(p.exists() for p in TEST_PROBA_FILES) and len(TEST_PROBA_FILES) >= 2

if have_test_proba:
    test_list = [load_test_proba(p) for p in TEST_PROBA_FILES]

    base = test_list[0][["object_id"]].copy()
    for i, df in enumerate(test_list):
        base = base.merge(df.rename(columns={"proba": f"p{i}"}), on="object_id", how="inner")

    p_cols = [c for c in base.columns if c.startswith("p")]
    P = base[p_cols].to_numpy(dtype=float)
    m = P.shape[1]

    if ENSEMBLE_WEIGHTS is None:
        w = np.ones(m, dtype=float)
    else:
        w = np.asarray(ENSEMBLE_WEIGHTS, dtype=float)
        if w.shape[0] != m:
            raise ValueError(f"ENSEMBLE_WEIGHTS length {w.shape[0]} != #models {m}")

    ens_test_p = weighted_mean(P, w)
    ens_test_pred = (ens_test_p >= best_thr).astype(int)

    pred_df = pd.DataFrame({"object_id": base["object_id"].astype(str), SUB_PRED_COL: ens_test_pred})
else:
    # ============================================================
    # 3) FALLBACK: Submission-based ensemble (vote / average label)
    # ============================================================
    subs = []
    for p in SUB_FILES_FALLBACK:
        if not p.exists():
            print(f"[WARN] Missing submission: {p} (skip)")
            continue
        subs.append(load_submission(p, SUB_PRED_COL))

    if len(subs) < 2:
        raise RuntimeError("Butuh >=2 model untuk ensemble. Tambahkan OOF/TEST proba/submission lain.")

    base = subs[0][["object_id"]].copy()
    for i, df in enumerate(subs):
        base = base.merge(df.rename(columns={SUB_PRED_COL: f"y{i}"}), on="object_id", how="inner")

    y_cols = [c for c in base.columns if c.startswith("y")]
    Y = base[y_cols].to_numpy(dtype=float)

    # average label -> then apply 0.5 vote threshold
    avg_label = Y.mean(axis=1)
    ens_test_pred = (avg_label >= 0.5).astype(int)

    pred_df = pd.DataFrame({"object_id": base["object_id"].astype(str), SUB_PRED_COL: ens_test_pred})

# ============================================================
# 4) Build final submission
# ============================================================
sub = df_sub[["object_id"]].copy()
sub["object_id"] = sub["object_id"].astype(str)

sub = sub.merge(pred_df, on="object_id", how="left")

if sub[SUB_PRED_COL].isna().any():
    missing = sub[sub[SUB_PRED_COL].isna()]["object_id"].head(10).tolist()
    raise RuntimeError(f"Submission has NaN after merge. Example missing object_id: {missing}")

out_path = SUB_DIR / "sub_ensemble_v01.csv"
sub.to_csv(out_path, index=False)

print("\n=== DONE ===")
print("Saved submission:", out_path)
print("Used threshold   :", best_thr)


[INFO] sample_submission prediction column = 'prediction'

=== OOF ENSEMBLE RESULT ===
Models used      : 2
Best threshold   : 0.6310
OOF F1           : 0.687943
OOF Precision    : 0.723881
OOF Recall       : 0.655405

=== DONE ===
Saved submission: D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge\submissions\sub_ensemble_v01.csv
Used threshold   : 0.631


# Train full & buat submission

In [12]:
# ============================================================
# FINAL STAGE — TRAIN FULL (CPU) + EXPORT SUBMISSION  [REVISI FULL: AUTO-FALLBACK]
#
# Default behavior:
# - Pakai fitur paling baru jika ada: features_upg_* (STAGE 7), kalau tidak ada pakai features_merged_* (STAGE 3)
# - Aktifkan fitur domain-shift redshift (robust) + Z noise augmentation ringan
# - Ambil threshold terbaik dari artifacts:
#     1) ens_threshold.txt (jika ada)
#     2) lgbm_threshold_best.txt
#     3) lgbm_threshold.txt / lgbm_zaug_threshold.txt
#     4) fallback 0.50
# - Export submission mengikuti kolom sample_submission (mis. 'prediction')
# - Simpan juga test probabilities untuk ensemble berikutnya
#
# Backend priority (tanpa crash kalau lgbm belum ada):
# 1) lightgbm
# 2) xgboost
# 3) sklearn HistGradientBoostingClassifier (no extra install)
# ============================================================

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import HistGradientBoostingClassifier

# ----------------------------
# CONFIG
# ----------------------------
DATA_ROOT = Path(r"D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge")
ART_DIR   = DATA_ROOT / "artifacts"
SUB_DIR   = DATA_ROOT / "submissions"
SUB_DIR.mkdir(parents=True, exist_ok=True)

SEED = 2025

# Domain-shift redshift settings (ringan)
USE_REDSHIFT_ROBUST_FE = True
AUG_FLOOR = 0.01
AUG_REL   = 0.02

# Threshold tuning fallback if none exists
DEFAULT_THR = 0.50

# ----------------------------
# Backend selection (no crash)
# ----------------------------
BACKEND = None
lgb = None
xgb = None

try:
    import lightgbm as lgb  # type: ignore
    BACKEND = "lightgbm"
except Exception:
    try:
        import xgboost as xgb  # type: ignore
        BACKEND = "xgboost"
    except Exception:
        BACKEND = "sklearn_hgb"

print(f"[INFO] Backend used: {BACKEND}")

# ----------------------------
# Paths
# ----------------------------
train_log_path  = ART_DIR / "train_log_clean.csv"
test_log_path   = ART_DIR / "test_log_clean.csv"
sample_sub_path = DATA_ROOT / "sample_submission.csv"

feat_tr_candidates = [ART_DIR / "features_upg_train.csv", ART_DIR / "features_merged_train.csv"]
feat_te_candidates = [ART_DIR / "features_upg_test.csv",  ART_DIR / "features_merged_test.csv"]

def pick_existing(paths):
    for p in paths:
        if p.exists():
            return p
    return None

feat_tr_path = pick_existing(feat_tr_candidates)
feat_te_path = pick_existing(feat_te_candidates)

for p in [train_log_path, test_log_path, sample_sub_path, feat_tr_path, feat_te_path]:
    if p is None or not Path(p).exists():
        raise FileNotFoundError(f"Missing required file: {p}")

print("[INFO] Feature train:", feat_tr_path)
print("[INFO] Feature test :", feat_te_path)

# ----------------------------
# Load data
# ----------------------------
df_tr_log  = pd.read_csv(train_log_path, low_memory=False)
df_te_log  = pd.read_csv(test_log_path, low_memory=False)
df_tr_feat = pd.read_csv(feat_tr_path, low_memory=False)
df_te_feat = pd.read_csv(feat_te_path, low_memory=False)
df_sub     = pd.read_csv(sample_sub_path, low_memory=False)

# Detect submission prediction column
if "object_id" not in df_sub.columns:
    raise ValueError(f"sample_submission tidak punya 'object_id'. Found: {list(df_sub.columns)}")
pred_cols = [c for c in df_sub.columns if c != "object_id"]
if len(pred_cols) != 1:
    raise ValueError(f"sample_submission harus punya 1 kolom prediksi selain object_id. Found: {list(df_sub.columns)}")
SUB_PRED_COL = pred_cols[0]
print(f"[INFO] sample_submission prediction column = '{SUB_PRED_COL}'")

# Normalize ids
for d in [df_tr_log, df_te_log, df_tr_feat, df_te_feat, df_sub]:
    d["object_id"] = d["object_id"].astype(str)

# Target
if "target" not in df_tr_log.columns:
    raise ValueError(f"train_log_clean missing 'target'. Found: {list(df_tr_log.columns)}")
df_tr_log["target"] = pd.to_numeric(df_tr_log["target"], errors="coerce").astype(int)

df_train = df_tr_feat.merge(df_tr_log[["object_id", "target"]], on="object_id", how="left")
if df_train["target"].isna().any():
    bad = df_train[df_train["target"].isna()]["object_id"].head(10).tolist()
    raise RuntimeError(f"Ada object_id train features yang tidak ketemu target. Example: {bad}")

df_test = df_te_feat.copy()

# ----------------------------
# Threshold loader
# ----------------------------
def load_threshold():
    cand = [
        ART_DIR / "ens_threshold.txt",
        ART_DIR / "lgbm_threshold_best.txt",
        ART_DIR / "lgbm_threshold.txt",
        ART_DIR / "lgbm_zaug_threshold.txt",
    ]
    for p in cand:
        if p.exists():
            try:
                v = float(p.read_text(encoding="utf-8").strip().splitlines()[0])
                if 0.0 <= v <= 1.0:
                    print("[INFO] Using threshold from:", p)
                    return v
            except Exception:
                pass
    print(f"[WARN] No threshold file found. Using {DEFAULT_THR:.2f}")
    return float(DEFAULT_THR)

THR = load_threshold()

# ----------------------------
# Domain-shift redshift robust features
# ----------------------------
def _to_num(s):
    return pd.to_numeric(s, errors="coerce")

def get_zerr_fill_from_test(df_te_log, df_test):
    zerr = None
    if "Z_err" in df_te_log.columns:
        zerr = _to_num(df_te_log["Z_err"])
    elif "Z_err" in df_test.columns:
        zerr = _to_num(df_test["Z_err"])
    if zerr is None:
        return 0.05
    v = float(np.nanmedian(zerr.values))
    if not np.isfinite(v) or v <= 0:
        v = 0.05
    return v

def add_redshift_domainshift_features(df, zerr_fill, is_train, seed):
    out = df.copy()

    if "Z" not in out.columns:
        out["Z"] = np.nan
    if "Z_err" not in out.columns:
        out["Z_err"] = np.nan

    z = _to_num(out["Z"]).astype(float).values
    zerr = _to_num(out["Z_err"]).astype(float).values

    z_isna = ~np.isfinite(z)
    zerr_isna = ~np.isfinite(zerr)

    z_f = z.copy()
    z_f[z_isna] = 0.0

    zerr_f = zerr.copy()
    zerr_f[zerr_isna] = zerr_fill
    zerr_f = np.clip(zerr_f, 0.0, None)

    out["Z_filled"] = z_f
    out["Zerr_filled"] = zerr_f
    out["Z_missing"] = z_isna.astype(int)
    out["Zerr_missing"] = zerr_isna.astype(int)

    out["log1pZ"] = np.log1p(np.clip(z_f, 0.0, None))
    out["log1pZerr"] = np.log1p(np.clip(zerr_f, 0.0, None))
    out["inv1pZ"] = 1.0 / (1.0 + np.clip(z_f, 0.0, None))
    out["Z_div_Zerr"] = z_f / (zerr_f + 1e-6)

    if is_train:
        rng = np.random.default_rng(seed)
        sigma = np.sqrt(np.square(zerr_f) + np.square(AUG_FLOOR) + np.square(AUG_REL * (1.0 + z_f)))
        noise = rng.normal(0.0, sigma, size=z_f.shape[0])
        z_aug = np.clip(z_f + noise, 0.0, None)
        out["Z_aug"] = z_aug
        out["Z_aug_absdiff"] = np.abs(z_aug - z_f)
    else:
        out["Z_aug"] = z_f
        out["Z_aug_absdiff"] = 0.0

    return out

if USE_REDSHIFT_ROBUST_FE:
    zerr_fill = get_zerr_fill_from_test(df_te_log, df_test)
    print(f"[INFO] zerr_fill (from test median) = {zerr_fill:.6f}")
    df_train = add_redshift_domainshift_features(df_train, zerr_fill, is_train=True, seed=SEED)
    df_test  = add_redshift_domainshift_features(df_test,  zerr_fill, is_train=False, seed=SEED)

# ----------------------------
# Build X/y
# ----------------------------
y = df_train["target"].astype(int).values
drop_cols = {"object_id", "target"}
feature_cols = [c for c in df_train.columns if c not in drop_cols]

for c in feature_cols:
    df_train[c] = pd.to_numeric(df_train[c], errors="coerce")
    df_test[c]  = pd.to_numeric(df_test[c], errors="coerce")

X_df = df_train[feature_cols]
X_test_df = df_test[feature_cols]

pos = max(int((y == 1).sum()), 1)
neg = max(int((y == 0).sum()), 1)
scale_pos_weight = neg / pos

print(f"[INFO] X_train: {X_df.shape} | X_test: {X_test_df.shape}")
print(f"[INFO] pos_rate={float((y==1).mean()):.6f} | scale_pos_weight={scale_pos_weight:.4f}")
print(f"[INFO] threshold={THR:.6f}")

# ----------------------------
# Imputer (median) + float32
# ----------------------------
imputer = SimpleImputer(strategy="median")
X_all = imputer.fit_transform(X_df).astype(np.float32, copy=False)
X_test_all = imputer.transform(X_test_df).astype(np.float32, copy=False)

# ----------------------------
# Model factory + helpers
# ----------------------------
def make_model(seed: int):
    if BACKEND == "lightgbm":
        params = dict(
            objective="binary",
            learning_rate=0.03,
            n_estimators=12000,
            num_leaves=96,
            min_child_samples=120,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            random_state=seed,
            n_jobs=-1,
            scale_pos_weight=scale_pos_weight,
        )
        return lgb.LGBMClassifier(**params)

    if BACKEND == "xgboost":
        params = dict(
            objective="binary:logistic",
            learning_rate=0.03,
            n_estimators=15000,
            max_depth=6,
            min_child_weight=3.0,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            random_state=seed,
            n_jobs=-1,
            eval_metric="logloss",
            scale_pos_weight=scale_pos_weight,
            tree_method="hist",
        )
        return xgb.XGBClassifier(**params)

    return HistGradientBoostingClassifier(
        learning_rate=0.05,
        max_depth=6,
        max_leaf_nodes=96,
        min_samples_leaf=60,
        l2_regularization=0.0,
        max_iter=4000,
        early_stopping=True,
        validation_fraction=0.12,
        n_iter_no_change=100,
        random_state=seed,
    )

def predict_proba_pos(model, X):
    return model.predict_proba(X)[:, 1]

# ----------------------------
# Find best_iter via small holdout (early stop)
# - lightgbm/xgboost: early stopping -> best_iteration
# - sklearn_hgb: internal early stopping -> n_iter_
# ----------------------------
X_tr, X_va, y_tr, y_va = train_test_split(
    X_all, y, test_size=0.12, stratify=y, random_state=SEED
)

tmp_model = make_model(SEED + 7)

if BACKEND == "lightgbm":
    tmp_model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="binary_logloss",
        callbacks=[lgb.early_stopping(stopping_rounds=300, verbose=False)]
    )
    best_iter = int(getattr(tmp_model, "best_iteration_", None) or tmp_model.n_estimators)

elif BACKEND == "xgboost":
    tmp_model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        verbose=False,
        early_stopping_rounds=300
    )
    best_iter = int(getattr(tmp_model, "best_iteration", None) or tmp_model.n_estimators)

else:
    sw = np.ones_like(y_tr, dtype=np.float32)
    sw[y_tr == 1] = float(scale_pos_weight)
    tmp_model.fit(X_tr, y_tr, sample_weight=sw)
    best_iter = int(getattr(tmp_model, "n_iter_", None) or getattr(tmp_model, "max_iter", 2000) or 2000)

print(f"[INFO] best_iteration (holdout) = {best_iter}")

p_va = predict_proba_pos(tmp_model, X_va)
f1_va = f1_score(y_va, (p_va >= THR).astype(int))
print(f"[INFO] holdout F1@THR = {f1_va:.6f}")

# ----------------------------
# Train FULL with best_iter (if applicable)
# ----------------------------
final_model = make_model(SEED + 999)

if BACKEND == "lightgbm":
    final_model.set_params(n_estimators=max(200, best_iter))
    final_model.fit(X_all, y)

elif BACKEND == "xgboost":
    final_model.set_params(n_estimators=max(200, best_iter))
    final_model.fit(X_all, y, verbose=False)

else:
    sw = np.ones_like(y, dtype=np.float32)
    sw[y == 1] = float(scale_pos_weight)
    final_model.fit(X_all, y, sample_weight=sw)

# ----------------------------
# Predict test (proba + label)
# ----------------------------
test_proba = predict_proba_pos(final_model, X_test_all).astype(np.float32)
test_pred  = (test_proba >= THR).astype(int)

# Save test probabilities (buat ensemble berikutnya)
proba_path = ART_DIR / "final_test_proba.csv"
pd.DataFrame({"object_id": df_test["object_id"].astype(str), "proba": test_proba}).to_csv(proba_path, index=False)
print("[INFO] Saved test proba:", proba_path)

# ----------------------------
# Build submission
# ----------------------------
sub = df_sub[["object_id"]].copy()
sub["object_id"] = sub["object_id"].astype(str)

pred_df = pd.DataFrame({
    "object_id": df_test["object_id"].astype(str),
    SUB_PRED_COL: test_pred.astype(int),
})

sub = sub.merge(pred_df, on="object_id", how="left")

if sub[SUB_PRED_COL].isna().any():
    missing = sub[sub[SUB_PRED_COL].isna()]["object_id"].head(10).tolist()
    raise RuntimeError(f"Submission has NaN after merge. Example missing object_id: {missing}")

out_path = SUB_DIR / "sub_final_cpu.csv"
sub.to_csv(out_path, index=False)

print("\n=== DONE ===")
print("Backend:", BACKEND)
print("Prediction column:", SUB_PRED_COL)
print("Saved submission:", out_path)


[INFO] Backend used: sklearn_hgb
[INFO] Feature train: D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge\artifacts\features_upg_train.csv
[INFO] Feature test : D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge\artifacts\features_upg_test.csv


[INFO] sample_submission prediction column = 'prediction'
[INFO] Using threshold from: D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge\artifacts\ens_threshold.txt
[INFO] zerr_fill (from test median) = 0.029650
[INFO] X_train: (3043, 176) | X_test: (7135, 176)
[INFO] pos_rate=0.048636 | scale_pos_weight=19.5608
[INFO] threshold=0.631000




[INFO] best_iteration (holdout) = 134
[INFO] holdout F1@THR = 0.611111
[INFO] Saved test proba: D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge\artifacts\final_test_proba.csv

=== DONE ===
Backend: sklearn_hgb
Prediction column: prediction
Saved submission: D:\MALLORN Astronomical Classification Challenge\mallorn-astronomical-classification-challenge\submissions\sub_final_cpu.csv
