In [None]:
# Cell 0
!pip install -q tsfresh
!pip install -q lightgbm

In [None]:
# Cell 1 – Imports
import os
import numpy as np
import pandas as pd

from tsfresh import extract_features, select_features
from tsfresh.feature_extraction import EfficientFCParameters

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, log_loss, f1_score
from sklearn.preprocessing import StandardScaler # Dòng mới
from sklearn.linear_model import LogisticRegression # Dòng mới
import xgboost as xgb
import lightgbm as lgb # Dòng mới

In [None]:
# Cell 2 – Locate DATA_DIR
# Dataset roots to probe, in priority order. The first root that contains
# train_log.csv becomes DATA_DIR; if none does, the cell fails loudly.
candidate_dirs = [
    "/kaggle/input/dataset-macc/dataset_mallorn-astronomical-classification-challenge"
]

DATA_DIR = None
for root in candidate_dirs:
    if not os.path.exists(os.path.join(root, "train_log.csv")):
        continue
    DATA_DIR = root
    break
else:
    # Reached only when the loop never hit `break`, i.e. no root matched.
    raise FileNotFoundError(
        "Could not find train_log.csv. "
        "Check /kaggle/input and adjust DATA_DIR accordingly."
    )

print("Using DATA_DIR:", DATA_DIR)
print("Files:", os.listdir(DATA_DIR))


In [None]:
# Cell 3 – Load metadata
# The three metadata tables live directly under DATA_DIR.
train_log, test_log, sample_sub = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ("train_log.csv", "test_log.csv", "sample_submission.csv")
)

# Quick shape report for each table.
for label, frame in (
    ("train_log:", train_log),
    ("test_log:", test_log),
    ("sample_submission:", sample_sub),
):
    print(label, frame.shape)

# Class balance of the training target, as fractions.
print("\nTarget distribution:")
print(train_log["target"].value_counts(normalize=True))


In [None]:
# Cell 4 – Lightcurve feature function
def make_features(lc_df: pd.DataFrame) -> pd.DataFrame:
    """
    Collapse raw light curves into one feature row per object.

    Parameters
    ----------
    lc_df : pd.DataFrame
        Long-format photometry with the columns ``object_id``,
        ``Time (MJD)``, ``Flux``, ``Flux_err`` and ``Filter``.
        The input frame is not modified.

    Returns
    -------
    pd.DataFrame
        One row per ``object_id``: global statistics over all filters
        (``*_all`` columns) followed by per-band statistics named
        ``<stat>_f<Filter>``. Any NaN left after aggregation (e.g. the std of
        a single-point band, or a band the object has no rows for) is
        replaced by 0.0.

    Notes
    -----
    The per-band slope / percentile features ignore samples whose time or
    flux is NaN or infinite, so they agree with the pandas aggregates (which
    skip NaN) instead of collapsing to 0 for the whole band when a single
    measurement is missing.
    """
    lc_df = lc_df.drop_duplicates().copy()

    # Basic per-measurement helpers.
    # A zero error is mapped to NaN first so the division cannot produce inf;
    # the resulting undefined S/N values are then set to 0.
    lc_df["snr"] = lc_df["Flux"] / lc_df["Flux_err"].replace(0, np.nan)
    lc_df["snr"] = lc_df["snr"].fillna(0.0)
    lc_df["is_pos"] = (lc_df["Flux"] > 0).astype(float)

    # --- 1) Standard per-band aggregates ---
    agg = lc_df.groupby(["object_id", "Filter"]).agg(
        flux_mean=("Flux", "mean"),
        flux_std=("Flux", "std"),
        flux_min=("Flux", "min"),
        flux_max=("Flux", "max"),
        flux_median=("Flux", "median"),
        flux_count=("Flux", "count"),
        flux_abs_mean=("Flux", lambda x: np.mean(np.abs(x))),
        time_min=("Time (MJD)", "min"),
        time_max=("Time (MJD)", "max"),
        snr_mean=("snr", "mean"),
        snr_std=("snr", "std"),
        snr_max=("snr", "max"),
        pos_frac=("is_pos", "mean"),
    ).reset_index()

    agg["time_range"] = agg["time_max"] - agg["time_min"]
    agg["flux_amp"] = agg["flux_max"] - agg["flux_min"]
    # A band observed at a single epoch has zero time range; its rate is 0.
    agg["obs_rate"] = agg["flux_count"] / agg["time_range"].replace(0, np.nan)
    agg["obs_rate"] = agg["obs_rate"].fillna(0.0)

    # --- 2) Extra time-series features per band: slopes + robust amplitude ---
    def _extra_band_feats(group: pd.DataFrame) -> pd.Series:
        """Slope statistics and 10/90 percentile amplitude for one (object, band)."""
        t = group["Time (MJD)"].to_numpy(dtype=float)
        f = group["Flux"].to_numpy(dtype=float)

        # numpy reductions propagate NaN/inf, so one bad sample would wipe out
        # every feature of the band; keep only usable (time, flux) pairs.
        usable = np.isfinite(t) & np.isfinite(f)
        t = t[usable]
        f = f[usable]

        slope_max = slope_min = slope_mean = 0.0
        dt_mean = 0.0

        if len(t) > 1:
            order = np.argsort(t)
            dt = np.diff(t[order])
            df = np.diff(f[order])
            # Simultaneous epochs (dt == 0) have no defined slope.
            valid = dt > 0

            if np.any(valid):
                slopes = df[valid] / dt[valid]
                slope_max = slopes.max()
                slope_min = slopes.min()
                slope_mean = slopes.mean()
                # Mean cadence over all consecutive pairs, zero gaps included.
                dt_mean = dt.mean()

        # Robust amplitude
        if len(f) > 0:
            flux_p10 = np.percentile(f, 10)
            flux_p90 = np.percentile(f, 90)
            flux_p90_p10 = flux_p90 - flux_p10
        else:
            flux_p10 = flux_p90 = flux_p90_p10 = 0.0

        return pd.Series(
            {
                "slope_max": slope_max,
                "slope_min": slope_min,
                "slope_mean": slope_mean,
                "dt_mean": dt_mean,
                "flux_p10": flux_p10,
                "flux_p90": flux_p90,
                "flux_p90_p10": flux_p90_p10,
            }
        )

    extra = (
        lc_df.groupby(["object_id", "Filter"])[["Time (MJD)", "Flux"]]
        .apply(_extra_band_feats)
        .reset_index()
    )

    agg = agg.merge(extra, on=["object_id", "Filter"], how="left")

    # Pivot: (stat, band) -> one column per pair, named "<stat>_f<band>".
    feat_pivot = agg.pivot(
        index="object_id",
        columns="Filter",
        values=[
            "flux_mean",
            "flux_std",
            "flux_min",
            "flux_max",
            "flux_median",
            "flux_count",
            "flux_abs_mean",
            "time_min",
            "time_max",
            "time_range",
            "flux_amp",
            "obs_rate",
            "snr_mean",
            "snr_std",
            "snr_max",
            "pos_frac",
            "slope_max",
            "slope_min",
            "slope_mean",
            "dt_mean",
            "flux_p10",
            "flux_p90",
            "flux_p90_p10",
        ],
    )

    feat_pivot.columns = [f"{stat}_f{band}" for stat, band in feat_pivot.columns]
    feat_pivot = feat_pivot.reset_index()

    # --- 3) Global stats over all filters ---
    g = lc_df.groupby("object_id").agg(
        flux_mean_all=("Flux", "mean"),
        flux_std_all=("Flux", "std"),
        flux_min_all=("Flux", "min"),
        flux_max_all=("Flux", "max"),
        flux_median_all=("Flux", "median"),
        flux_count_all=("Flux", "count"),
        flux_abs_mean_all=("Flux", lambda x: np.mean(np.abs(x))),
        snr_mean_all=("snr", "mean"),
        snr_std_all=("snr", "std"),
        snr_max_all=("snr", "max"),
        pos_frac_all=("is_pos", "mean"),
        time_min_all=("Time (MJD)", "min"),
        time_max_all=("Time (MJD)", "max"),
    ).reset_index()

    g["time_range_all"] = g["time_max_all"] - g["time_min_all"]

    out = g.merge(feat_pivot, on="object_id", how="left")

    # Fill NaNs from std in singleton groups, bands missing for an object, etc.
    out = out.fillna(0.0)

    return out


In [None]:
# Cell 5 – Meta features: Z, EBV, colors, rest-frame times
def add_meta_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` extended with metadata-driven features.

    Reads the ``Z`` and ``EBV`` columns (missing values count as 0) and adds:
    rest-frame versions of every ``time_range_all`` / ``time_range_f*``
    column, ``log_z_plus1``, ``ebv``, ``ebv_z``, and for each band pair whose
    ``flux_abs_mean_f<band>`` columns are both present a ``color_<b1><b2>``
    column plus its ``_deext`` (extinction-corrected) counterpart.
    """
    out = df.copy()

    redshift = out["Z"].fillna(0.0)
    reddening = out["EBV"].fillna(0.0)
    stretch = 1.0 + redshift

    # Rest-frame durations: observed duration divided by (1 + z).
    if "time_range_all" in out.columns:
        out["time_range_all_rest"] = out["time_range_all"] / stretch

    # Snapshot the matching names first so the columns added below are not revisited.
    band_range_cols = [name for name in out.columns if name.startswith("time_range_f")]
    for name in band_range_cols:
        out[name + "_rest"] = out[name] / stretch

    # Simple transforms of meta
    out["log_z_plus1"] = np.log1p(redshift)
    out["ebv"] = reddening
    out["ebv_z"] = reddening * redshift

    # Approx extinction coefficients A_lambda / E(B-V), keyed by band.
    extinction = {
        "u": 4.239,
        "g": 3.303,
        "r": 2.285,
        "i": 1.698,
        "z": 1.263,
        "y": 1.088,
    }
    # Keeps the flux ratio finite when a band's mean |flux| is 0.
    tiny = 1e-6
    band_pairs = (
        ("u", "g"),
        ("g", "r"),
        ("r", "i"),
        ("i", "z"),
        ("z", "y"),
        ("g", "i"),
        ("g", "z"),
    )

    # Colors built from flux_abs_mean per band (used as a brightness proxy).
    for first, second in band_pairs:
        first_col = f"flux_abs_mean_f{first}"
        second_col = f"flux_abs_mean_f{second}"
        if first_col not in out.columns or second_col not in out.columns:
            continue

        color_name = f"color_{first}{second}"
        ratio = (out[first_col].abs() + tiny) / (out[second_col].abs() + tiny)
        out[color_name] = -2.5 * np.log10(ratio)

        # EBV-corrected color: color_corr = color_obs - E(B-V) * (R1 - R2)
        out[color_name + "_deext"] = out[color_name] - reddening * (
            extinction[first] - extinction[second]
        )

    return out


In [None]:
# Cell 6 – Build per-split features + global lightcurves
def _featurize_split(meta_log, lightcurves, split_name):
    """Join one split's metadata rows with its light-curve and meta features."""
    lc_features = make_features(lightcurves)
    split_meta = meta_log[meta_log["split"] == split_name]
    joined = split_meta.merge(lc_features, on="object_id", how="left")
    return add_meta_features(joined)


# Per-split feature tables, concatenated once after the loop.
train_feat_list, test_feat_list = [], []

# Raw light curves of every split, kept for the tsfresh cells further down.
full_train_lc_list, full_test_lc_list = [], []

for split_name in (f"split_{i:02d}" for i in range(1, 21)):
    print("Processing", split_name)

    split_dir = os.path.join(DATA_DIR, split_name)
    lc_tr = pd.read_csv(os.path.join(split_dir, "train_full_lightcurves.csv"))
    lc_te = pd.read_csv(os.path.join(split_dir, "test_full_lightcurves.csv"))

    full_train_lc_list.append(lc_tr)
    full_test_lc_list.append(lc_te)

    # Train features first, then test features, for this split.
    train_feat_list.append(_featurize_split(train_log, lc_tr, split_name))
    test_feat_list.append(_featurize_split(test_log, lc_te, split_name))

train_df = pd.concat(train_feat_list, ignore_index=True)
test_df = pd.concat(test_feat_list, ignore_index=True)

print("train_df:", train_df.shape)
print("test_df:", test_df.shape)

full_train_lc = pd.concat(full_train_lc_list, ignore_index=True)
full_test_lc = pd.concat(full_test_lc_list, ignore_index=True)

print("full_train_lc:", full_train_lc.shape)
print("full_test_lc:", full_test_lc.shape)


In [None]:
# Cell 7 – tsfresh features (TRAIN) with NaN cleaning

# === tsfresh feature extraction: TRAIN ===

fc_params = EfficientFCParameters()

# Long-format input for tsfresh: one row per measurement, with the filter as
# the series "kind". Rows with a NaN/inf in id/kind/value/time are dropped
# because tsfresh does not accept them.
train_ts = (
    full_train_lc[["object_id", "Time (MJD)", "Filter", "Flux"]]
    .rename(columns={"Time (MJD)": "time", "Flux": "value"})
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=["object_id", "Filter", "time", "value"])
)

print("train_ts after cleaning:", train_ts.shape)
print("Any NaNs left in value?", train_ts["value"].isna().any())

X_ts = extract_features(
    train_ts,
    column_id="object_id",
    column_sort="time",
    column_kind="Filter",
    column_value="value",
    default_fc_parameters=fc_params,
    n_jobs=4,
    disable_progressbar=False,
)

X_ts = (
    X_ts
    # Feature calculators can emit inf/NaN; neutralise both to 0.
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0.0)
    # Row order must follow train_df; objects that lost every row during
    # cleaning reappear here as all-zero feature rows.
    .reindex(train_df["object_id"])
    .fillna(0.0)
)

# NOTE(review): the relevance filter below sees the target of every training
# row, while cross-validation only happens later (Cell 10), so the OOF
# metrics are computed on features already chosen with knowledge of the
# validation folds — confirm this is acceptable.
y_target = train_df["target"].values
X_ts_selected = select_features(X_ts, y_target)

print("tsfresh train shape (after selection):", X_ts_selected.shape)

ts_cols = X_ts_selected.columns.tolist()

# Attach the selected tsfresh columns to the hand-made features by object_id.
train_df_ts = train_df.merge(
    X_ts_selected,
    left_on="object_id",
    right_index=True,
    how="left",
).fillna(0.0)


In [None]:
# Cell 8 – tsfresh features (TEST) with NaN cleaning

# === tsfresh feature extraction: TEST ===

# Same long-format preparation and cleaning as for the training light curves.
test_ts = (
    full_test_lc[["object_id", "Time (MJD)", "Filter", "Flux"]]
    .rename(columns={"Time (MJD)": "time", "Flux": "value"})
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=["object_id", "Filter", "time", "value"])
)

print("test_ts after cleaning:", test_ts.shape)
print("Any NaNs left in value?", test_ts["value"].isna().any())

X_ts_test = extract_features(
    test_ts,
    column_id="object_id",
    column_sort="time",
    column_kind="Filter",
    column_value="value",
    default_fc_parameters=fc_params,
    n_jobs=4,
    disable_progressbar=False,
)

X_ts_test = X_ts_test.replace([np.inf, -np.inf], np.nan).fillna(0.0)

# Ensure same columns as in train: selected features that tsfresh did not
# produce for the test set are added as all-zero columns, everything else is
# dropped, and the train column order is enforced.
missing_cols = [name for name in ts_cols if name not in X_ts_test.columns]
X_ts_test = X_ts_test.reindex(columns=ts_cols, fill_value=0.0)

test_df_ts = test_df.merge(
    X_ts_test,
    left_on="object_id",
    right_index=True,
    how="left",
).fillna(0.0)

# Replace original dfs with enriched versions
train_df, test_df = train_df_ts, test_df_ts

print("Final train_df:", train_df.shape)
print("Final test_df:", test_df.shape)


In [None]:
# Cell 9 – Build X, y, X_test

# Identifier, label and descriptive columns that are excluded from the model inputs.
drop_cols = ["object_id", "split", "SpecType", "English Translation", "target"]

# Every remaining train column is a feature, in train_df column order.
feature_cols = train_df.columns[~train_df.columns.isin(drop_cols)].tolist()

# Last-line defence: map inf/-inf and NaN to 0 in both frames (in place).
for frame in (train_df, test_df):
    frame[feature_cols] = (
        frame[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0.0)
    )

X = train_df[feature_cols].values
y = train_df["target"].values
X_test = test_df[feature_cols].values

print("Number of features:", len(feature_cols))


In [None]:
# Cell 10 training XGBoost + LightGBM + Ensemble
# 5-fold stratified CV of an XGBoost and a LightGBM binary classifier.
# Produces out-of-fold (OOF) probabilities for every training row and
# fold-averaged probabilities for the test set, plus a 50/50 blend of both.
# StandardScaler is already imported in the notebook's import cell.

# Scale features (for LightGBM; XGBoost does not need it, but scaling does no harm).
# NOTE(review): the scaler is fit on all training rows before the CV split.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Containers for each model
oof_pred_xgb = np.zeros(len(train_df))
test_pred_xgb = np.zeros(len(test_df))

oof_pred_lgb = np.zeros(len(train_df))
test_pred_lgb = np.zeros(len(test_df))

# Handle class imbalance: weight positives by the negative/positive ratio.
n_pos = (y == 1).sum()
n_neg = (y == 0).sum()
scale_pos_weight = n_neg / n_pos
print("Positives:", n_pos, "Negatives:", n_neg, "scale_pos_weight:", scale_pos_weight)

# --- XGBoost parameters (similar in style to your best model) ---
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "aucpr",
    "tree_method": "hist",
    "max_depth": 5,
    "eta": 0.03,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "lambda": 2.0,
    "alpha": 0.0,
    "min_child_weight": 5.0,
    "gamma": 0.1,
    "scale_pos_weight": scale_pos_weight,
}

# --- LightGBM parameters (inspired by the 7th-place solution) ---
lgb_params = {
    "objective": "binary",
    # LightGBM's name for PR-AUC is "average_precision"
    "metric": "average_precision",
    "learning_rate": 0.0361,
    "num_leaves": 120,
    "max_depth": 11,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "min_data_in_leaf": 40,
    "lambda_l1": 0.0,
    "lambda_l2": 2.0,
    "scale_pos_weight": scale_pos_weight,
    "verbosity": -1,  # "verbosity" is the preferred key; "verbose" also works but this one is tidier
}

# The test matrix is identical for every fold, so build it once instead of
# re-creating it inside the loop.
dtest = xgb.DMatrix(X_test)

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y), 1):
    print(f"\n========== Fold {fold} ==========")
    X_tr, X_val = X[tr_idx], X[val_idx]
    y_tr, y_val = y[tr_idx], y[val_idx]

    # ---------- XGBoost ----------
    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dval   = xgb.DMatrix(X_val, label=y_val)

    # The validation set comes last in the evaluation list.
    evals = [(dtrain, "train"), (dval, "valid")]

    model_xgb = xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=2000,
        evals=evals,
        early_stopping_rounds=200,
        verbose_eval=100,
    )

    # Predict with the trees up to (and including) the best iteration when
    # early stopping recorded one; otherwise use the full model.
    best_iter_xgb = model_xgb.best_iteration
    if best_iter_xgb is None:
        oof_pred_xgb[val_idx] = model_xgb.predict(dval)
        test_pred_xgb += model_xgb.predict(dtest) / skf.n_splits
    else:
        oof_pred_xgb[val_idx] = model_xgb.predict(dval, iteration_range=(0, best_iter_xgb + 1))
        test_pred_xgb += model_xgb.predict(dtest, iteration_range=(0, best_iter_xgb + 1)) / skf.n_splits

    # ---------- LightGBM ----------
    X_tr_s = X_scaled[tr_idx]
    X_val_s = X_scaled[val_idx]

    lgb_train = lgb.Dataset(X_tr_s, label=y_tr)
    lgb_valid = lgb.Dataset(X_val_s, label=y_val, reference=lgb_train)

    model_lgb = lgb.train(
        params=lgb_params,
        train_set=lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=["train", "valid"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=300, first_metric_only=True),
            lgb.log_evaluation(period=100),
        ],
    )

    best_iter_lgb = model_lgb.best_iteration
    oof_pred_lgb[val_idx] = model_lgb.predict(X_val_s, num_iteration=best_iter_lgb)
    test_pred_lgb += model_lgb.predict(X_test_scaled, num_iteration=best_iter_lgb) / skf.n_splits

# ---------- Ensemble (simple average) ----------
oof_pred_ens = 0.5 * oof_pred_xgb + 0.5 * oof_pred_lgb
test_pred_ens = 0.5 * test_pred_xgb + 0.5 * test_pred_lgb


# ---------- Evaluate each model and pick the best one by OOF F1 ----------
def find_best_f1(oof_probs, y_true, name):
    """
    Report OOF ROC AUC / log-loss and scan thresholds for the best F1.

    Thresholds 0.01, 0.02, ..., 0.99 are tried against the unclipped
    probabilities; the lowest threshold reaching the highest F1 wins.
    If no threshold yields a positive F1, (0.0, 0.5) is returned.

    Returns
    -------
    (best_f1, best_th)
    """
    # Clipping only protects the log-loss from log(0).
    clipped = np.clip(oof_probs, 1e-6, 1 - 1e-6)
    auc = roc_auc_score(y_true, clipped)
    ll = log_loss(y_true, clipped)
    print(f"\n[{name}] OOF ROC AUC: {auc:.4f}, logloss: {ll:.4f}")

    grid = np.linspace(0.01, 0.99, 99)
    scores = [f1_score(y_true, (oof_probs >= cut).astype(int)) for cut in grid]

    # argmax returns the first maximum, matching a strict ">" running update.
    top = int(np.argmax(scores))
    if scores[top] > 0.0:
        best_f1, best_th = scores[top], grid[top]
    else:
        best_f1, best_th = 0.0, 0.5

    print(f"[{name}] Best OOF F1: {best_f1:.4f} at threshold = {best_th:.3f}")
    return best_f1, best_th

f1_xgb, th_xgb = find_best_f1(oof_pred_xgb, y, "XGBoost")
f1_lgb, th_lgb = find_best_f1(oof_pred_lgb, y, "LightGBM")
f1_ens, th_ens = find_best_f1(oof_pred_ens, y, "Ensemble")

# Candidates in priority order: max() keeps the earliest entry on a tie, so a
# later model only wins with a strictly higher OOF F1.
model_candidates = [
    ("XGBoost", f1_xgb, th_xgb, test_pred_xgb),
    ("LightGBM", f1_lgb, th_lgb, test_pred_lgb),
    ("Ensemble", f1_ens, th_ens, test_pred_ens),
]
best_model_name, best_f1, best_th, best_test_pred = max(
    model_candidates, key=lambda entry: entry[1]
)

print(f"\n>> Using {best_model_name} with OOF F1 = {best_f1:.4f} and threshold = {best_th:.3f}")

In [None]:
# Cell 11 – Build submission.csv
# Binarise the chosen model's test probabilities at its OOF-tuned threshold.
binary_prediction = (best_test_pred >= best_th).astype(int)

pred_df = pd.DataFrame(
    {
        "object_id": test_df["object_id"],
        "prediction": binary_prediction,
    }
)

# Follow the row order of the sample submission; ids without a prediction
# fall back to class 0.
submission = (
    sample_sub[["object_id"]]
    .merge(pred_df, on="object_id", how="left")
    .assign(prediction=lambda frame: frame["prediction"].fillna(0).astype(int))
)

print("Prediction value counts:")
print(submission["prediction"].value_counts())

submission.to_csv("submission.csv", index=False)
print("Saved submission.csv")
submission.head()
