In [None]:
# Cell 0
!pip install -q tsfresh
!pip install -q lightgbm

In [None]:
# Cell 1 – Imports
import os
import numpy as np
import pandas as pd

from tsfresh import extract_features, select_features
from tsfresh.feature_extraction import EfficientFCParameters

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss, f1_score
from sklearn.preprocessing import StandardScaler # Dòng mới
from sklearn.linear_model import LogisticRegression # Dòng mới
import xgboost as xgb
import lightgbm as lgb # Dòng mới

In [None]:
# Cell 2 – Locate DATA_DIR
# Dataset roots to probe, in priority order.
candidate_dirs = [
    "/kaggle/input/dataset-macc/dataset_mallorn-astronomical-classification-challenge"
]

# First candidate that actually contains train_log.csv, or None when none does.
DATA_DIR = next(
    (
        root
        for root in candidate_dirs
        if os.path.exists(os.path.join(root, "train_log.csv"))
    ),
    None,
)

# Fail fast with an actionable message instead of erroring later on a bad path.
if DATA_DIR is None:
    raise FileNotFoundError(
        "Could not find train_log.csv. "
        "Check /kaggle/input and adjust DATA_DIR accordingly."
    )

print("Using DATA_DIR:", DATA_DIR)
print("Files:", os.listdir(DATA_DIR))


In [None]:
# Cell 3 – Load metadata
# Read the three top-level CSVs that live directly under DATA_DIR.
train_log, test_log, sample_sub = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ("train_log.csv", "test_log.csv", "sample_submission.csv")
)

# Quick shape report for each table.
for label, table in (
    ("train_log", train_log),
    ("test_log", test_log),
    ("sample_submission", sample_sub),
):
    print(f"{label}:", table.shape)

# Class balance of the training target, as fractions.
print("\nTarget distribution:")
print(train_log["target"].value_counts(normalize=True))


In [None]:
# Cell 4 – Lightcurve feature function (UPDATED: cadence + peak + noise-aware)
def make_features(
    lc_df: pd.DataFrame,
    clip_flux: bool = True,
    clip_low: float = -5.0,
    clip_high: float = 5.0,
    drop_fluxerr_quantile: float = 0.99,
) -> pd.DataFrame:
    """
    Collapse long-format lightcurves into one feature row per object_id.

    Expected input columns:
        object_id, Time (MJD), Flux, Flux_err, Filter

    Pipeline:
      1. Drop exact duplicate rows, map +/-inf to NaN and drop rows with a
         missing value in any of the five required columns.
      2. Optionally drop, per Filter, the rows whose Flux_err exceeds the
         `drop_fluxerr_quantile` quantile. The threshold is computed over all
         objects present in `lc_df`, not per object.
      3. Per (object_id, Filter): summary statistics of Flux / clipped Flux /
         Flux_err / SNR, plus cadence, peak and shape features
         (max_gap, gap_std, obs_density, peak_flux/time, time_to_peak,
         rise/decay slopes, asymmetry, n_local_peaks, auc_pos/abs,
         smoothness, snr_peak). These are pivoted wide as `<stat>_f<band>`.
      4. Per object_id: global statistics with an `_all` suffix.
      5. Cross-filter peak time differences and peak flux ratios for the
         band pairs (r,i), (i,z), (r,z), (g,r) when both bands are present.

    Args:
        lc_df: Long-format lightcurve table (not modified).
        clip_flux: When True, `Flux_clip` is Flux clipped to
            [clip_low, clip_high]; otherwise `Flux_clip` equals Flux.
        clip_low: Lower clipping bound for `Flux_clip`.
        clip_high: Upper clipping bound for `Flux_clip`.
        drop_fluxerr_quantile: Per-filter Flux_err quantile above which rows
            are discarded; pass None to keep every row.

    Returns:
        DataFrame with one row per object_id. Remaining NaN / inf values are
        replaced by 0.0. If no rows survive cleaning, an empty frame holding
        only the `object_id` column is returned.
    """
    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; resolve
    # whichever the installed NumPy provides so the AUC features keep working.
    _trapezoid = getattr(np, "trapezoid", None) or np.trapz

    lc_df = lc_df.drop_duplicates().copy()
    lc_df = lc_df.replace([np.inf, -np.inf], np.nan)
    lc_df = lc_df.dropna(subset=["object_id", "Time (MJD)", "Flux", "Flux_err", "Filter"])

    # Nothing to aggregate: return an empty, correctly-keyed frame so callers
    # can still left-merge on object_id.
    if lc_df.empty:
        return lc_df[["object_id"]].copy()

    # Drop top 1% Flux_err per filter (noise-aware)
    if drop_fluxerr_quantile is not None:
        thr = lc_df.groupby("Filter")["Flux_err"].transform(lambda s: s.quantile(drop_fluxerr_quantile))
        lc_df = lc_df[lc_df["Flux_err"] <= thr]

    # Clip heavy tails for robust peak/shape
    lc_df["Flux_clip"] = lc_df["Flux"].clip(clip_low, clip_high) if clip_flux else lc_df["Flux"]

    # Per-measurement helpers (a zero Flux_err yields SNR 0 rather than inf)
    lc_df["snr"] = lc_df["Flux"] / lc_df["Flux_err"].replace(0, np.nan)
    lc_df["snr"] = lc_df["snr"].fillna(0.0)
    lc_df["snr_clip"] = lc_df["Flux_clip"] / lc_df["Flux_err"].replace(0, np.nan)
    lc_df["snr_clip"] = lc_df["snr_clip"].fillna(0.0)
    lc_df["is_pos"] = (lc_df["Flux"] > 0).astype(float)

    # --- 1) Per-band aggregates ---
    agg = lc_df.groupby(["object_id", "Filter"]).agg(
        flux_mean=("Flux", "mean"),
        flux_std=("Flux", "std"),
        flux_min=("Flux", "min"),
        flux_max=("Flux", "max"),
        flux_median=("Flux", "median"),
        flux_count=("Flux", "count"),
        flux_abs_mean=("Flux", lambda x: float(np.mean(np.abs(x)))),

        flux_clip_mean=("Flux_clip", "mean"),
        flux_clip_std=("Flux_clip", "std"),
        flux_clip_min=("Flux_clip", "min"),
        flux_clip_max=("Flux_clip", "max"),

        time_min=("Time (MJD)", "min"),
        time_max=("Time (MJD)", "max"),

        err_mean=("Flux_err", "mean"),
        err_std=("Flux_err", "std"),

        snr_mean=("snr", "mean"),
        snr_std=("snr", "std"),
        snr_max=("snr", "max"),
        snr_clip_max=("snr_clip", "max"),

        pos_frac=("is_pos", "mean"),
    ).reset_index()

    agg["time_range"] = agg["time_max"] - agg["time_min"]
    agg["flux_amp"] = agg["flux_max"] - agg["flux_min"]
    agg["flux_clip_amp"] = agg["flux_clip_max"] - agg["flux_clip_min"]
    agg["obs_rate"] = agg["flux_count"] / agg["time_range"].replace(0, np.nan)
    agg["obs_rate"] = agg["obs_rate"].fillna(0.0)

    # --- 2) Extra per-band (cadence + peak + shape) ---
    def _extra_band_feats(group: pd.DataFrame) -> pd.Series:
        """Cadence, peak and shape features for one (object_id, Filter) group."""
        t = group["Time (MJD)"].values
        f = group["Flux_clip"].values
        snr = group["snr"].values

        # Work on time-ordered arrays
        if len(t) > 0:
            order = np.argsort(t)
            t = t[order]
            f = f[order]
            snr = snr[order]

        # cadence / slopes
        slope_max = slope_min = slope_mean = 0.0
        dt_mean = dt_max = dt_std = 0.0
        obs_density = 0.0

        if len(t) > 1:
            dt = np.diff(t)
            valid = dt > 0  # skip simultaneous observations (zero time step)
            if np.any(valid):
                dflux = np.diff(f)
                slopes = dflux[valid] / dt[valid]
                slope_max = float(np.max(slopes))
                slope_min = float(np.min(slopes))
                slope_mean = float(np.mean(slopes))
                dt_mean = float(np.mean(dt[valid]))
                dt_max  = float(np.max(dt[valid]))
                dt_std  = float(np.std(dt[valid]))

            duration = float(t[-1] - t[0])
            obs_density = float(len(t) / duration) if duration > 0 else 0.0

        # robust amplitude
        if len(f) > 0:
            flux_p10 = float(np.percentile(f, 10))
            flux_p90 = float(np.percentile(f, 90))
            flux_p90_p10 = float(flux_p90 - flux_p10)
        else:
            flux_p10 = flux_p90 = flux_p90_p10 = 0.0

        # peak features
        peak_flux = peak_time = time_to_peak = 0.0
        rise_slope = decay_slope = asymmetry = 0.0
        snr_peak = 0.0

        if len(f) > 0:
            i_peak = int(np.argmax(f))
            peak_flux = float(f[i_peak])
            peak_time = float(t[i_peak]) if len(t) > 0 else 0.0
            time_to_peak = float(peak_time - t[0]) if len(t) > 0 else 0.0

            def lin_slope(tt, ff):
                # Least-squares slope; 0.0 when the segment has <2 points or no time span
                if len(tt) < 2: return 0.0
                if float(tt[-1] - tt[0]) == 0.0: return 0.0
                return float(np.polyfit(tt, ff, 1)[0])

            rise_slope = lin_slope(t[:i_peak+1], f[:i_peak+1])
            decay_slope = lin_slope(t[i_peak:], f[i_peak:])

            rise_time = float(t[i_peak] - t[0]) if len(t) > 0 else 0.0
            decay_time = float(t[-1] - t[i_peak]) if len(t) > 0 else 0.0
            # 1e-6 keeps the ratio finite when the peak is the last observation
            asymmetry = float(rise_time / (decay_time + 1e-6))

            if len(snr) > i_peak:
                snr_peak = float(snr[i_peak])

        # local peaks (simple + robust threshold): interior points higher than
        # both neighbours by more than half the flux standard deviation
        n_local_peaks = 0
        if len(f) >= 3:
            loc = (f[1:-1] > f[:-2]) & (f[1:-1] > f[2:])
            prom = f[1:-1] - np.maximum(f[:-2], f[2:])
            thr = 0.5 * float(np.std(f)) if float(np.std(f)) > 0 else 0.0
            n_local_peaks = int(np.sum(loc & (prom > thr)))

        # AUC and smoothness
        auc_pos = auc_abs = smoothness = 0.0
        if len(f) >= 2:
            auc_pos = float(_trapezoid(np.clip(f, 0, None), t))
            auc_abs = float(_trapezoid(np.abs(f), t))
            if len(f) >= 3:
                # mean |second difference| normalised by the p90-p10 amplitude
                smoothness = float(np.mean(np.abs(np.diff(f, 2)))) / (flux_p90_p10 + 1e-6)

        return pd.Series({
            "slope_max": slope_max,
            "slope_min": slope_min,
            "slope_mean": slope_mean,
            "dt_mean": dt_mean,
            "max_gap": dt_max,
            "gap_std": dt_std,
            "obs_density": obs_density,

            "flux_p10": flux_p10,
            "flux_p90": flux_p90,
            "flux_p90_p10": flux_p90_p10,

            "peak_flux": peak_flux,
            "peak_time": peak_time,
            "time_to_peak": time_to_peak,
            "rise_slope": rise_slope,
            "decay_slope": decay_slope,
            "asymmetry": asymmetry,
            "snr_peak": snr_peak,

            "n_local_peaks": n_local_peaks,
            "auc_pos": auc_pos,
            "auc_abs": auc_abs,
            "smoothness": smoothness,
        })

    extra = (
        lc_df.groupby(["object_id", "Filter"])[["Time (MJD)", "Flux_clip", "snr"]]
        .apply(_extra_band_feats)
        .reset_index()
    )
    agg = agg.merge(extra, on=["object_id", "Filter"], how="left")

    # Pivot to wide: one column per (stat, band), named "<stat>_f<band>"
    feat_pivot = agg.pivot(
        index="object_id",
        columns="Filter",
        values=[
            "flux_mean", "flux_std", "flux_min", "flux_max", "flux_median", "flux_count", "flux_abs_mean",
            "flux_clip_mean", "flux_clip_std", "flux_clip_min", "flux_clip_max",
            "time_min", "time_max", "time_range",
            "flux_amp", "flux_clip_amp", "obs_rate",
            "err_mean", "err_std",
            "snr_mean", "snr_std", "snr_max", "snr_clip_max",
            "pos_frac",
            "slope_max", "slope_min", "slope_mean",
            "dt_mean", "max_gap", "gap_std", "obs_density",
            "flux_p10", "flux_p90", "flux_p90_p10",
            "peak_flux", "peak_time", "time_to_peak", "rise_slope", "decay_slope", "asymmetry", "snr_peak",
            "n_local_peaks", "auc_pos", "auc_abs", "smoothness",
        ],
    )
    feat_pivot.columns = [f"{stat}_f{band}" for stat, band in feat_pivot.columns]
    feat_pivot = feat_pivot.reset_index()

    # --- 3) Global stats ---
    g = lc_df.groupby("object_id").agg(
        flux_mean_all=("Flux", "mean"),
        flux_std_all=("Flux", "std"),
        flux_min_all=("Flux", "min"),
        flux_max_all=("Flux", "max"),
        flux_median_all=("Flux", "median"),
        flux_count_all=("Flux", "count"),
        flux_abs_mean_all=("Flux", lambda x: float(np.mean(np.abs(x)))),
        snr_mean_all=("snr", "mean"),
        snr_std_all=("snr", "std"),
        snr_max_all=("snr", "max"),
        pos_frac_all=("is_pos", "mean"),
        time_min_all=("Time (MJD)", "min"),
        time_max_all=("Time (MJD)", "max"),
    ).reset_index()
    g["time_range_all"] = g["time_max_all"] - g["time_min_all"]

    out = g.merge(feat_pivot, on="object_id", how="left")

    # Cross-filter features (peak time lag + peak flux ratio)
    eps = 1e-6
    for a, b in [("r", "i"), ("i", "z"), ("r", "z"), ("g", "r")]:
        ta, tb = f"peak_time_f{a}", f"peak_time_f{b}"
        fa, fb = f"peak_flux_f{a}", f"peak_flux_f{b}"
        if ta in out.columns and tb in out.columns:
            out[f"peak_time_diff_{a}_{b}"] = out[ta] - out[tb]
        if fa in out.columns and fb in out.columns:
            out[f"peak_flux_ratio_{a}_{b}"] = (out[fa] + eps) / (out[fb] + eps)

    # Bands an object was never observed in end up as 0.0 here
    out = out.replace([np.inf, -np.inf], np.nan).fillna(0.0)
    return out


In [None]:
# Cell 5 – Meta features: Z, EBV, colors, rest-frame times
def add_meta_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add features that depend on meta-data: Z, EBV, colors, rest-frame times.

    The input is copied, so the caller's frame is left untouched. Calling the
    function again on its own output only overwrites the derived columns; it
    does not create additional ones.

    Args:
        df: One row per object. Must contain `Z` and `EBV`. Optional columns
            that trigger extra features: `split`, `time_range_all`,
            `time_range_f<band>` and `flux_abs_mean_f<band>`.

    Returns:
        Copy of `df` with these columns added (when their inputs exist):
          - split_id: first run of digits found in `split` (0 if none).
          - time_range_all_rest, time_range_f<band>_rest: durations / (1 + z).
          - log_z_plus1, ebv, ebv_z.
          - color_<b1><b2>: -2.5 * log10 of the ratio of per-band mean |flux|.
          - color_<b1><b2>_deext: the same colour minus EBV * (R[b1] - R[b2]).
    """
    df = df.copy()

    # Add split_id as numeric feature (keep split string dropped later)
    if "split" in df.columns:
        df["split_id"] = (
            df["split"].astype(str).str.extract(r"(\d+)")[0].astype(float).fillna(0).astype(int)
        )

    # Missing redshift counts as 0; negative values are clipped to 0 (as
    # enrich_for_tsfresh does) so 1 + z stays positive and log1p stays defined.
    z = df["Z"].fillna(0.0).clip(lower=0.0)
    ebv = df["EBV"].fillna(0.0)
    one_plus_z = 1.0 + z

    # Rest-frame durations
    if "time_range_all" in df.columns:
        df["time_range_all_rest"] = df["time_range_all"] / one_plus_z

    # Only observed-frame columns are converted; skipping "*_rest" prevents
    # "*_rest_rest" columns when the function is applied to its own output.
    observed_range_cols = [
        col for col in df.columns
        if col.startswith("time_range_f") and not col.endswith("_rest")
    ]
    for col in observed_range_cols:
        df[col + "_rest"] = df[col] / one_plus_z

    # Simple transforms of meta
    df["log_z_plus1"] = np.log1p(z)
    df["ebv"] = ebv
    df["ebv_z"] = ebv * z

    # Color features from flux_abs_mean per band (proxy for brightness)
    # Approx extinction coefficients A_lambda / E(B-V)
    R = {"u": 4.239, "g": 3.303, "r": 2.285, "i": 1.698, "z": 1.263, "y": 1.088}

    eps = 1e-6  # keeps the flux ratio finite when a band has zero mean |flux|
    pairs = [("u", "g"), ("g", "r"), ("r", "i"), ("i", "z"), ("z", "y"),
             ("g", "i"), ("g", "z")]

    for b1, b2 in pairs:
        c1 = f"flux_abs_mean_f{b1}"
        c2 = f"flux_abs_mean_f{b2}"
        if c1 in df.columns and c2 in df.columns:
            f1 = np.abs(df[c1]) + eps
            f2 = np.abs(df[c2]) + eps
            cname = f"color_{b1}{b2}"
            df[cname] = -2.5 * np.log10(f1 / f2)

            # EBV-corrected color
            delta_R = R[b1] - R[b2]
            df[cname + "_deext"] = df[cname] - ebv * delta_R

    return df


In [None]:
# Cell 6 – Build per-split features + global lightcurves
# Per-split feature tables (one entry per split) ...
train_feat_list, test_feat_list = [], []
# ... and the raw lightcurves, kept for the tsfresh extraction further down.
full_train_lc_list, full_test_lc_list = [], []

for split_name in (f"split_{idx:02d}" for idx in range(1, 21)):
    print("Processing", split_name)
    split_dir = os.path.join(DATA_DIR, split_name)

    lc_tr = pd.read_csv(os.path.join(split_dir, "train_full_lightcurves.csv"))
    lc_te = pd.read_csv(os.path.join(split_dir, "test_full_lightcurves.csv"))

    full_train_lc_list.append(lc_tr)
    full_test_lc_list.append(lc_te)

    # Same recipe for both partitions: lightcurve features, left-joined onto
    # the split's metadata rows, then meta-derived features on top.
    for lightcurves, log_table, sink in (
        (lc_tr, train_log, train_feat_list),
        (lc_te, test_log, test_feat_list),
    ):
        lc_features = make_features(lightcurves)
        split_meta = log_table[log_table["split"] == split_name]
        merged = split_meta.merge(lc_features, on="object_id", how="left")
        sink.append(add_meta_features(merged))

train_df = pd.concat(train_feat_list, ignore_index=True)
test_df = pd.concat(test_feat_list, ignore_index=True)
for label, frame in (("train_df", train_df), ("test_df", test_df)):
    print(f"{label}:", frame.shape)

full_train_lc = pd.concat(full_train_lc_list, ignore_index=True)
full_test_lc = pd.concat(full_test_lc_list, ignore_index=True)
for label, frame in (("full_train_lc", full_train_lc), ("full_test_lc", full_test_lc)):
    print(f"{label}:", frame.shape)


In [None]:
# ---- NEW: Enrich full lightcurves for corrected + rest-frame tsfresh channels ----
# Approximate extinction coefficients A_lambda / E(B-V) per filter.
R_COEFF = {"u": 4.239, "g": 3.303, "r": 2.285, "i": 1.698, "z": 1.263, "y": 1.088}

def enrich_for_tsfresh(full_lc: pd.DataFrame, log_df: pd.DataFrame) -> pd.DataFrame:
    """
    Attach redshift / extinction corrected channels to long-format lightcurves.

    Args:
        full_lc: Lightcurve rows with `object_id`, `Time (MJD)`, `Flux`,
            `Flux_err` and `Filter`. Not modified. May already be the output
            of a previous call: stale `Z` / `EBV` columns are replaced.
        log_df: Per-object metadata with `object_id`, `Z` and `EBV`.

    Returns:
        New DataFrame with the input columns plus:
          - Z (NaN -> 0, clipped at 0) and EBV (NaN -> 0)
          - t_rest: time since the object's first observation, / (1 + Z)
          - flux_deext, fluxerr_deext: Flux / Flux_err * 10**(0.4 * EBV * R)
            (filters missing from R_COEFF are left uncorrected)
          - snr_deext: flux_deext / fluxerr_deext, 0 where the error is 0
          - logflux_deext: sign(flux) * log1p(|flux|)
          - flux_deext_clip: flux_deext clipped to [-5, 5]
    """
    # The calling cell overwrites its own input, so a re-run would feed frames
    # that already carry Z/EBV; the merge would then emit Z_x/Z_y and the
    # lookups below would raise KeyError. Dropping them keeps re-runs safe.
    base_lc = full_lc.drop(columns=["Z", "EBV"], errors="ignore")

    lc = base_lc.merge(log_df[["object_id", "Z", "EBV"]], on="object_id", how="left")
    lc["Z"] = lc["Z"].fillna(0.0).clip(lower=0.0)
    lc["EBV"] = lc["EBV"].fillna(0.0)

    # rest-frame time (relative to each object's first obs)
    t0 = lc.groupby("object_id")["Time (MJD)"].transform("min")
    lc["t_rest"] = (lc["Time (MJD)"] - t0) / (1.0 + lc["Z"])

    # de-extinction on flux + error
    A = lc["EBV"] * lc["Filter"].map(R_COEFF).fillna(0.0)
    fac = np.power(10.0, 0.4 * A)

    lc["flux_deext"] = lc["Flux"] * fac
    lc["fluxerr_deext"] = lc["Flux_err"] * fac
    lc["snr_deext"] = lc["flux_deext"] / lc["fluxerr_deext"].replace(0, np.nan)
    lc["snr_deext"] = lc["snr_deext"].fillna(0.0)

    # stable transform for negatives
    lc["logflux_deext"] = np.sign(lc["flux_deext"]) * np.log1p(np.abs(lc["flux_deext"]))

    # clipped deext channel (robust)
    lc["flux_deext_clip"] = lc["flux_deext"].clip(-5, 5)

    return lc

# Add the corrected channels to both partitions, each with its own metadata log.
full_train_lc, full_test_lc = (
    enrich_for_tsfresh(lightcurves, log_table)
    for lightcurves, log_table in (
        (full_train_lc, train_log),
        (full_test_lc, test_log),
    )
)

for label, frame in (
    ("Enriched full_train_lc", full_train_lc),
    ("Enriched full_test_lc", full_test_lc),
):
    print(f"{label}:", frame.shape)


In [None]:
# Cell 7 – tsfresh features (TRAIN) multi-channel + corrected time/flux

# tsfresh feature-calculator settings, passed as default_fc_parameters to both
# the train and the test extract_features calls so they compute the same set.
# NOTE(review): per the tsfresh docs this preset is the comprehensive set minus
# the calculators flagged as computationally expensive — confirm for the installed version.
fc_params = EfficientFCParameters()

def build_tsfresh_long(lc: pd.DataFrame) -> pd.DataFrame:
    """
    Stack three value channels into the long format tsfresh consumes.

    For each of `flux_deext_clip`, `snr_deext` and `logflux_deext` the rows of
    `lc` are emitted as (object_id, time, kind, value), where `time` is
    `t_rest` and `kind` is "<Filter><suffix>" with suffix `_flux`, `_snr` or
    `_logflux`. The channels are concatenated in that order with a fresh
    integer index, +/-inf is turned into NaN, and rows with any missing field
    are dropped.
    """
    channel_suffix = {
        "flux_deext_clip": "_flux",
        "snr_deext": "_snr",
        "logflux_deext": "_logflux",
    }
    long_columns = ["object_id", "time", "kind", "value"]

    channel_frames = []
    for source_col, suffix in channel_suffix.items():
        channel = lc[["object_id", "t_rest", "Filter", source_col]].rename(
            columns={"t_rest": "time", source_col: "value"}
        )
        channel["kind"] = channel["Filter"].astype(str) + suffix
        channel_frames.append(channel[long_columns])

    stacked = pd.concat(channel_frames, ignore_index=True)
    stacked = stacked.replace([np.inf, -np.inf], np.nan)
    return stacked.dropna(subset=long_columns)

train_ts = build_tsfresh_long(full_train_lc)
print("train_ts after cleaning:", train_ts.shape)

# Column roles and extraction settings for the long-format table.
tsfresh_train_kwargs = dict(
    column_id="object_id",
    column_sort="time",
    column_kind="kind",
    column_value="value",
    default_fc_parameters=fc_params,
    n_jobs=1,
    disable_progressbar=False,
)

# Extract, neutralise inf/NaN, then put the rows in train_df order; objects
# that produced no tsfresh row come out of the reindex as all-zero rows.
X_ts = (
    extract_features(train_ts, **tsfresh_train_kwargs)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0.0)
    .reindex(train_df["object_id"])
    .fillna(0.0)
)

# Relevance filtering against the training target.
# NOTE(review): selection sees every training label before the CV split in the
# modelling cell, so OOF scores may be optimistic — confirm this is acceptable.
y_target = train_df["target"].values
X_ts_selected = select_features(X_ts, y_target)
print("tsfresh train shape (after selection):", X_ts_selected.shape)

# Selected column names, reused to align the test-side extraction.
ts_cols = X_ts_selected.columns.tolist()

# Join on object_id; fillna applies to the whole merged frame, not only the
# tsfresh columns.
train_df_ts = train_df.merge(
    X_ts_selected, left_on="object_id", right_index=True, how="left"
).fillna(0.0)


In [None]:
# Cell 8 – tsfresh features (TEST) multi-channel + corrected time/flux

test_ts = build_tsfresh_long(full_test_lc)
print("test_ts after cleaning:", test_ts.shape)

# Same extraction settings as on the training side.
X_ts_test = (
    extract_features(
        test_ts,
        column_id="object_id",
        column_sort="time",
        column_kind="kind",
        column_value="value",
        default_fc_parameters=fc_params,
        n_jobs=1,
        disable_progressbar=False,
    )
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0.0)
)

# Keep exactly the columns selected on train, in the same order; any column
# the test extraction did not produce is created and filled with 0.0.
X_ts_test = X_ts_test.reindex(columns=ts_cols, fill_value=0.0)

# Join on object_id; fillna applies to the whole merged frame.
test_df_ts = test_df.merge(
    X_ts_test, left_on="object_id", right_index=True, how="left"
).fillna(0.0)

# From here on train_df / test_df refer to the tsfresh-enriched tables.
train_df, test_df = train_df_ts, test_df_ts

for label, frame in (("Final train_df", train_df), ("Final test_df", test_df)):
    print(f"{label}:", frame.shape)


In [None]:
# Cell 9 – Build X, y, X_test

# Identifier, label and free-text columns that must not reach the models.
drop_cols = ["object_id", "split", "SpecType", "English Translation", "target"]

# Every remaining train column, in its original order, is a feature.
feature_cols = train_df.columns[~train_df.columns.isin(drop_cols)].tolist()

# Map +/-inf to NaN, in place, on both tables. The boosters accept NaN, though
# the tsfresh merge cells already zero-filled missing values in these frames.
for frame in (train_df, test_df):
    frame[feature_cols] = frame[feature_cols].replace([np.inf, -np.inf], np.nan)

# Dense float32 matrices for the boosters, plus the label vector.
X = train_df[feature_cols].astype(np.float32).to_numpy()
X_test = test_df[feature_cols].astype(np.float32).to_numpy()
y = train_df["target"].values

print("Number of features:", len(feature_cols))


In [None]:
# Cell 10 training XGBoost + LightGBM + Ensemble (UPDATED: no scaling + GPU XGB + best blend)

# One stratified 5-fold split, shared by both boosters so their OOF vectors line up.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Class imbalance: weight positives by the negative/positive ratio.
n_pos = (y == 1).sum()
n_neg = (y == 0).sum()
scale_pos_weight = n_neg / max(n_pos, 1)  # max() guards against a fold-free zero-positive set
print("Positives:", n_pos, "Negatives:", n_neg, "scale_pos_weight:", scale_pos_weight)

# XGBoost (GPU)
# NOTE(review): recent XGBoost releases document `device="cuda"` with
# `tree_method="hist"` as the replacement for gpu_hist / gpu_id / predictor —
# confirm against the installed version before changing.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "aucpr",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "gpu_id": 0,
    "max_depth": 6,
    "eta": 0.03,
    "subsample": 0.85,
    "colsample_bytree": 0.85,
    "lambda": 2.0,
    "alpha": 0.0,
    "min_child_weight": 5.0,
    "gamma": 0.1,
    "scale_pos_weight": scale_pos_weight,
    "seed": 42,
}

# LightGBM (CPU)
lgb_params = {
    "objective": "binary",
    "metric": "average_precision",
    "learning_rate": 0.035,
    "num_leaves": 120,
    "max_depth": 11,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "min_data_in_leaf": 40,
    "lambda_l1": 0.0,
    "lambda_l2": 2.0,
    "scale_pos_weight": scale_pos_weight,
    "verbosity": -1,
}

# Out-of-fold predictions (filled fold by fold) and fold-averaged test predictions.
n_train_rows, n_test_rows = len(train_df), len(test_df)
oof_pred_xgb, oof_pred_lgb = np.zeros(n_train_rows), np.zeros(n_train_rows)
test_pred_xgb, test_pred_lgb = np.zeros(n_test_rows), np.zeros(n_test_rows)

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y), start=1):
    print(f"\n========== Fold {fold} ==========")
    X_tr, y_tr = X[tr_idx], y[tr_idx]
    X_val, y_val = X[val_idx], y[val_idx]

    # --- XGBoost ---
    dtrain = xgb.DMatrix(X_tr, label=y_tr, missing=np.nan)
    dval = xgb.DMatrix(X_val, label=y_val, missing=np.nan)
    dtest = xgb.DMatrix(X_test, missing=np.nan)

    # "valid" is listed last in evals, which is the set early stopping monitors.
    model_xgb = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=2500,
        evals=[(dtrain, "train"), (dval, "valid")],
        early_stopping_rounds=200,
        verbose_eval=200,
    )

    # Predict with trees up to and including the best round when one is recorded.
    best_iter_xgb = model_xgb.best_iteration
    xgb_predict_kwargs = (
        {} if best_iter_xgb is None else {"iteration_range": (0, best_iter_xgb + 1)}
    )
    oof_pred_xgb[val_idx] = model_xgb.predict(dval, **xgb_predict_kwargs)
    test_pred_xgb += model_xgb.predict(dtest, **xgb_predict_kwargs) / skf.n_splits

    # --- LightGBM ---
    lgb_train = lgb.Dataset(X_tr, label=y_tr)
    lgb_valid = lgb.Dataset(X_val, label=y_val)

    model_lgb = lgb.train(
        params=lgb_params,
        train_set=lgb_train,
        num_boost_round=7000,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=["train", "valid"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=400, first_metric_only=True),
            lgb.log_evaluation(period=200),
        ],
    )

    best_iter_lgb = model_lgb.best_iteration
    oof_pred_lgb[val_idx] = model_lgb.predict(X_val, num_iteration=best_iter_lgb)
    test_pred_lgb += model_lgb.predict(X_test, num_iteration=best_iter_lgb) / skf.n_splits


def best_f1_threshold(oof_probs, y_true):
    """
    Scan decision thresholds and return the best F1 with its threshold.

    Thresholds 0.01, 0.02, ..., 0.99 are tried in increasing order; a
    prediction is positive when its probability is >= the threshold. On ties
    the lowest threshold wins, and when no threshold scores above 0 the
    result is (0.0, 0.5).

    Returns:
        (best_f1, best_threshold)
    """
    # Seed with the fallback so a threshold must strictly beat F1 = 0.0; max()
    # keeps the earliest of equal scores, i.e. the lowest threshold.
    scored = [(0.0, 0.5)]
    scored.extend(
        (f1_score(y_true, (oof_probs >= cut).astype(int)), cut)
        for cut in np.linspace(0.01, 0.99, 99)
    )
    return max(scored, key=lambda pair: pair[0])


# Evaluate base models
f1_xgb, th_xgb = best_f1_threshold(oof_pred_xgb, y)
f1_lgb, th_lgb = best_f1_threshold(oof_pred_lgb, y)

print(f"\n[XGBoost]  Best OOF F1: {f1_xgb:.4f} at threshold={th_xgb:.3f}")
print(f"[LightGBM] Best OOF F1: {f1_lgb:.4f} at threshold={th_lgb:.3f}")

# Search best blend weight. The grid includes w=0.0 (pure LightGBM) and w=1.0
# (pure XGBoost), so the winning blend is never worse than either base model
# on OOF F1. Ties keep the smallest weight (strict ">").
# NOTE(review): weight and threshold are tuned on the same OOF predictions they
# are scored on, so the reported F1 is likely optimistic.
best_w = 0.5
best_f1 = -1.0
best_th = 0.5
for w in np.linspace(0.0, 1.0, 21):
    oof_blend = w * oof_pred_xgb + (1 - w) * oof_pred_lgb
    f1b, thb = best_f1_threshold(oof_blend, y)
    if f1b > best_f1:
        best_f1 = f1b
        best_th = thb
        best_w = w

oof_pred_ens = best_w * oof_pred_xgb + (1 - best_w) * oof_pred_lgb
test_pred_ens = best_w * test_pred_xgb + (1 - best_w) * test_pred_lgb

print(f"\n[Blend] Best OOF F1: {best_f1:.4f} at threshold={best_th:.3f} with w={best_w:.2f}*XGB + {1-best_w:.2f}*LGB")

# Final predictor. Because the grid already contains both pure models, a
# separate "is XGB/LGB better than the blend?" comparison can never succeed;
# the blend is always used and is only labelled by what its weight amounts to.
best_test_pred = test_pred_ens
if best_w == 1.0:
    best_model_name = "XGBoost"
elif best_w == 0.0:
    best_model_name = "LightGBM"
else:
    best_model_name = "Blend"

print(f"\n>> Using {best_model_name} with OOF F1 = {best_f1:.4f} and threshold = {best_th:.3f}")


In [None]:
# Cell 11 – Build submission.csv
# Hard labels: positive when the chosen model's probability reaches the tuned threshold.
binary_prediction = (best_test_pred >= best_th).astype(int)

pred_df = test_df[["object_id"]].assign(prediction=binary_prediction)

# Follow the sample submission's ids and order; ids without a prediction get 0.
submission = (
    sample_sub[["object_id"]]
    .merge(pred_df, on="object_id", how="left")
    .assign(prediction=lambda frame: frame["prediction"].fillna(0).astype(int))
)

print("Prediction value counts:")
print(submission["prediction"].value_counts())

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")
submission.head()


In [None]:
import numpy as np
import pandas as pd

def count_pos(p, th):
    """Return, as a plain int, how many entries of `p` are >= the threshold `th`."""
    is_positive = p >= th
    return int(np.count_nonzero(is_positive))

print("\n==== SUMMARY ====")
print("Features:", len(feature_cols))
print("Train size:", len(train_df), "Pos:", int((y==1).sum()), "Neg:", int((y==0).sum()))

print("\n==== OOF METRICS ====")
print(f"XGB  OOF F1={f1_xgb:.5f}  th={th_xgb:.3f}  OOF_pos={count_pos(oof_pred_xgb, th_xgb)}")
print(f"LGB  OOF F1={f1_lgb:.5f}  th={th_lgb:.3f}  OOF_pos={count_pos(oof_pred_lgb, th_lgb)}")
print(f"BLND OOF F1={best_f1:.5f} th={best_th:.3f}  OOF_pos={count_pos(oof_pred_ens, best_th)}")

print("\n==== TEST PRED COUNTS ====")
print("Final model:", best_model_name)
print("Test positives predicted:", count_pos(best_test_pred, best_th), "out of", len(best_test_pred))


def _xgb_feature_label(key):
    """Translate an auto-generated XGBoost name ("f<idx>") to its feature_cols entry."""
    # The DMatrix objects are built from bare arrays, so XGBoost reports
    # positional names; anything that is not "f<valid index>" is left as is.
    if key.startswith("f") and key[1:].isdigit() and int(key[1:]) < len(feature_cols):
        return feature_cols[int(key[1:])]
    return key


# Feature importance snapshots. model_xgb / model_lgb are whatever the training
# loop left behind, i.e. the models of the last fold only. Best-effort: any
# failure is reported and skipped rather than aborting the summary.
try:
    xgb_gain = model_xgb.get_score(importance_type="gain")
    xgb_imp = pd.Series({_xgb_feature_label(name): gain for name, gain in xgb_gain.items()})
    xgb_imp = xgb_imp.sort_values(ascending=False).head(30)
    print("\n==== TOP 30 XGB GAIN FEATURES ====")
    print(xgb_imp)

    lgb_imp = pd.Series(model_lgb.feature_importance(importance_type="gain"), index=feature_cols)
    lgb_imp = lgb_imp.sort_values(ascending=False).head(30)
    print("\n==== TOP 30 LGB GAIN FEATURES ====")
    print(lgb_imp)
except Exception as e:
    print("\n(Feature importance skipped:", e, ")")