In [3]:
# === Imports & Config ===
import os
import json
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

RANDOM_SEED = 42
N_FOLDS = 5


In [32]:

# === Paths (Kaggle) ===
TRAIN_PATH = "/kaggle/input/playground-series-s5e11/train.csv"
TEST_PATH  = "/kaggle/input/playground-series-s5e11/test.csv"
SAMPLE_SUB_PATH = "/kaggle/input/playground-series-s5e11/sample_submission.csv"
SUBMIT_NAME = "submission.csv"

TARGET_COL = "loan_paid_back"  # competition target name


In [33]:
# === Utilities ===
def set_seed(seed: int = RANDOM_SEED):
    np.random.seed(seed)


def read_data(train_path: str, test_path: str, sample_sub_path: str):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    sample_sub = pd.read_csv(sample_sub_path)
    print(f"Train shape: {train.shape} | Test shape: {test.shape}")
    return train, test, sample_sub


In [6]:
def split_target_and_id(train: pd.DataFrame, test: pd.DataFrame, target_col: str):
    # Capture test ids early and drop from frames
    if "id" in test.columns:
        test_ids = test["id"].copy()
        test = test.drop(columns=["id"])
    else:
        test_ids = pd.Series(np.arange(len(test)), name="id")

    # Drop id from train if present
    if "id" in train.columns:
        train = train.drop(columns=["id"])

    # Extract target and features
    if target_col not in train.columns:
        raise ValueError(f"Target column '{target_col}' not found in train.")
    y = train[target_col].astype(int).copy()
    X = train.drop(columns=[target_col]).copy()

    return X, y, test, test_ids

In [7]:
def basic_impute_and_encode(X: pd.DataFrame, test: pd.DataFrame):
    """
    Combined-factorization approach for stable category mapping.
    Optionally, we’ll also return cat feature names for CatBoost.
    """
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

    combined = pd.concat(
        [X.assign(_is_train=1), test.assign(_is_train=0)],
        ignore_index=True
    )

    # Numeric: median impute
    for c in num_cols:
        combined[c] = combined[c].fillna(combined[c].median())

    # Categorical: stringify + factorize for consistent mapping
    for c in cat_cols:
        combined[c] = combined[c].astype(str).fillna("NA")
        codes, _ = pd.factorize(combined[c], sort=True)
        combined[c] = codes

    X_proc = combined[combined["_is_train"] == 1].drop(columns=["_is_train"]).reset_index(drop=True)
    T_proc = combined[combined["_is_train"] == 0].drop(columns=["_is_train"]).reset_index(drop=True)

    return X_proc, T_proc, num_cols, cat_cols

In [8]:
def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    """Feature engineering with safe column checks."""
    df = df.copy()

    loan_col   = "loan_amount" if "loan_amount" in df.columns else None
    income_col = "annual_income" if "annual_income" in df.columns else None
    emp_col    = "employment_length" if "employment_length" in df.columns else None
    cred_col   = "credit_score" if "credit_score" in df.columns else None
    rate_col   = "interest_rate" if "interest_rate" in df.columns else None
    ratio_col  = "debt_to_income_ratio" if "debt_to_income_ratio" in df.columns else None

    # Derived ratios
    if loan_col and income_col:
        df["loan_income_ratio"] = df[loan_col] / (df[income_col] + 1.0)
    if emp_col and cred_col:
        df["emp_credit_ratio"] = df[emp_col] / (df[cred_col] + 1.0)
    if rate_col and ratio_col:
        df["loan_rate_ratio"] = df[rate_col] * df[ratio_col]

    # Binning examples (robust to duplicates)
    if income_col:
        try:
            df["income_bin"] = pd.qcut(df[income_col], 10, labels=False, duplicates="drop")
        except Exception:
            pass

    if "age" in df.columns:
        df["age_bin"] = pd.cut(df["age"], bins=[18, 25, 35, 45, 60, 100], labels=False)
    elif "person_age" in df.columns:
        df["age_bin"] = pd.cut(df["person_age"], bins=[18, 25, 35, 45, 60, 100], labels=False)

    return df

In [9]:
def train_models_with_cv(X: pd.DataFrame, y: pd.Series, X_test: pd.DataFrame, n_folds: int = N_FOLDS, seed: int = RANDOM_SEED):
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)

    oof_xgb = np.zeros(len(X))
    oof_lgb = np.zeros(len(X))
    oof_cat = np.zeros(len(X))

    pred_xgb = np.zeros(len(X_test))
    pred_lgb = np.zeros(len(X_test))
    pred_cat = np.zeros(len(X_test))

    # Optional: CatBoost native categorical indices (after encoding they’re numeric, so keep empty)
    cat_idx = []  # if you keep factorization, CatBoost sees them as numeric; leave empty or rework to native strings.

    # Class imbalance helper
    pos = y.sum()
    neg = len(y) - pos
    scale_pos_weight = (neg / max(pos, 1.0))

    for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
        print(f"\n=== Fold {fold}/{n_folds} ===")
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

        # Align columns with test
        common = X_tr.columns.intersection(X_test.columns)
        X_tr, X_va, X_te = X_tr[common], X_va[common], X_test[common]

        # XGBoost
        xgb = XGBClassifier(
            n_estimators=3000,
            learning_rate=0.02,
            max_depth=5,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.2,
            random_state=seed,
            eval_metric="auc",
            tree_method="hist",
            scale_pos_weight=scale_pos_weight
        )
        xgb.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], early_stopping_rounds=200, verbose=False)
        oof_xgb[va_idx] = xgb.predict_proba(X_va)[:, 1]
        pred_xgb += xgb.predict_proba(X_te)[:, 1] / n_folds

        # LightGBM
        lgb = LGBMClassifier(
            n_estimators=3000,
            learning_rate=0.02,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            num_leaves=63,
            reg_lambda=1.2,
            random_state=seed,
            class_weight=None  # or "balanced"
        )
        lgb.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric="auc", callbacks=[])
        oof_lgb[va_idx] = lgb.predict_proba(X_va)[:, 1]
        pred_lgb += lgb.predict_proba(X_te)[:, 1] / n_folds

        # CatBoost
        cat = CatBoostClassifier(
            iterations=2000,
            learning_rate=0.03,
            depth=6,
            l2_leaf_reg=3.0,
            subsample=0.8,
            eval_metric="AUC",
            random_seed=seed,
            verbose=False
        )
        # If you decide to keep native categoricals, feed cat_features=cat_idx and use un-factorized data.
        cat.fit(X_tr, y_tr, eval_set=(X_va, y_va), use_best_model=True)
        oof_cat[va_idx] = cat.predict_proba(X_va)[:, 1]
        pred_cat += cat.predict_proba(X_te)[:, 1] / n_folds

        # Per-fold diagnostics
        print(
    f"AUC XGB={roc_auc_score(y_va, oof_xgb[va_idx]):.5f} | "
    f"LGB={roc_auc_score(y_va, oof_lgb[va_idx]):.5f} | "
    f"CAT={roc_auc_score(y_va, oof_cat[va_idx]):.5f}" )
    
    print("\nOOF AUCs:")
    print("XGBoost :", roc_auc_score(y, oof_xgb))
    print("LightGBM:", roc_auc_score(y, oof_lgb))
    print("CatBoost:", roc_auc_score(y, oof_cat))

    return (oof_xgb, oof_lgb, oof_cat), (pred_xgb, pred_lgb, pred_cat)

In [10]:
def optimize_blend(oof_list, y_true, grid_step: int = 21):
    """
    Simple grid search on weights (w1, w2, w3) s.t. w1 + w2 + w3 = 1
    """
    def oof_auc(weights):
        w1, w2, w3 = weights
        blend = w1 * oof_list[0] + w2 * oof_list[1] + w3 * oof_list[2]
        return roc_auc_score(y_true, blend)

    best = {"w": (1/3, 1/3, 1/3), "auc": -1.0}
    grid = np.linspace(0.0, 1.0, grid_step)

    for w1 in grid:
        for w2 in grid:
            w3 = 1.0 - w1 - w2
            if w3 < 0 or w3 > 1:
                continue
            auc = oof_auc((w1, w2, w3))
            if auc > best["auc"]:
                best = {"w": (w1, w2, w3), "auc": auc}

    print(f"\nBest Blend AUC: {best['auc']:.6f} with weights {best['w']}")
    return best["w"]


In [11]:
def make_submission(test_ids: pd.Series, test_preds: np.ndarray, out_path: str = SUBMIT_NAME):
    sub = pd.DataFrame({
        "id": test_ids.astype(int),
        "loan_paid_back": np.clip(test_preds.astype(float), 0.0, 1.0)
    })
    sub.to_csv(out_path, index=False)
    print(f"Saved: {out_path}")
    return sub

In [16]:

# === Main flow ===
def main():
    set_seed(RANDOM_SEED)

    train, test, sample_sub = read_data(TRAIN_PATH, TEST_PATH, SAMPLE_SUB_PATH)

    # Split target and ids
    X, y, X_test, test_ids = split_target_and_id(train, test, TARGET_COL)

    # Preprocess (median-impute numeric + factorize categoricals)
    Xp, Tp, num_cols, cat_cols = basic_impute_and_encode(X, X_test)

    # Optional feature engineering
    Xp = engineer_features(Xp)
    Tp = engineer_features(Tp)

    # Ensure aligned columns
    common = Xp.columns.intersection(Tp.columns)
    Xp, Tp = Xp[common], Tp[common]

    # Train + OOF/Test predictions
    oof_list, test_list = train_models_with_cv(Xp, y, Tp, n_folds=N_FOLDS, seed=RANDOM_SEED)

    # Blend optimization
    w1, w2, w3 = optimize_blend(oof_list, y, grid_step=21)
    test_blend = w1 * test_list[0] + w2 * test_list[1] + w3 * test_list[2]

    # Submission
    sub = make_submission(test_ids, test_blend, SUBMIT_NAME)
    display(sub.head())


if __name__ == "__main__":
    main()


Train shape: (593994, 13) | Test shape: (254569, 12)

=== Fold 1/5 ===
[LightGBM] [Info] Number of positive: 379595, number of negative: 95600
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011244 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1849
[LightGBM] [Info] Number of data points in the train set: 475195, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.798819 -> initscore=1.378932
[LightGBM] [Info] Start training from score 1.378932
AUC XGB=0.92286 | LGB=0.92291 | CAT=0.92276

=== Fold 2/5 ===
[LightGBM] [Info] Number of positive: 379595, number of negative: 95600
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011233 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] 

Unnamed: 0,id,loan_paid_back
0,593994,0.911399
1,593995,0.973542
2,593996,0.432107
3,593997,0.896232
4,593998,0.951887
