In [None]:
# Imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math

from collections import defaultdict
from scipy.stats import ks_2samp
from scipy.spatial.distance import jensenshannon

from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

import lightgbm as lgb

# Load Data

In [None]:
# Read the competition tables in a fixed order: train, test, sample submission.
df_train, df_test, df_sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/playground-series-s5e11/train.csv',
        '/kaggle/input/playground-series-s5e11/test.csv',
        '/kaggle/input/playground-series-s5e11/sample_submission.csv',
    )
)

In [None]:
# Print pandas' built-in summary of the training frame (columns, dtypes, non-null counts).
df_train.info()

# EDA: shapes, target balance, types, missingness, cardinality

In [None]:

# Fallback load: only executes when `df_train` is not yet defined in the kernel.
if "df_train" not in globals():
    df_train = pd.read_csv('/kaggle/input/playground-series-s5e11/train.csv')
    df_test = pd.read_csv('/kaggle/input/playground-series-s5e11/test.csv')
    df_sub = pd.read_csv('/kaggle/input/playground-series-s5e11/sample_submission.csv')

TARGET = "loan_paid_back"
ID_COL = "id"

# Every column except the label and the row identifier is a candidate feature;
# split the candidates by dtype family.
feat_cols = list(filter(lambda name: name not in (TARGET, ID_COL), df_train.columns))
num_cols = df_train[feat_cols].select_dtypes(include="number").columns.tolist()
cat_cols = df_train[feat_cols].select_dtypes(include="object").columns.tolist()

print(f"Train shape: {df_train.shape} | Test shape: {df_test.shape}")
print(f"Numeric cols ({len(num_cols)}): {num_cols}")
print(f"Categorical cols ({len(cat_cols)}): {cat_cols}")

# Target balance: per-class counts plus the mean of the target column.
vc = df_train[TARGET].value_counts().sort_index()
print("\nTarget counts (0/1):")
print(vc.to_frame("count"))
print("Positive rate:", df_train[TARGET].mean())

# Repeated identifiers in either split.
dup_train = df_train[ID_COL].duplicated().sum()
dup_test = df_test[ID_COL].duplicated().sum()
print("\nDuplicate IDs -> train:", dup_train, "| test:", dup_test)

# Per-column missing counts; only columns with at least one missing value are shown.
miss_tr = df_train.isna().sum()
miss_te = df_test.isna().sum()
for header, miss in (("\nMissing (train):", miss_tr), ("\nMissing (test):", miss_te)):
    print(header)
    print(miss[miss > 0].sort_values(ascending=False))

# Number of distinct levels per categorical column, largest first.
card = df_train[cat_cols].nunique().sort_values(ascending=False)
print("\nCategorical cardinality (top-10):")
print(card.head(10))


# Train/test drift diagnostics and category alignment

In [None]:

# Give every string column one shared categorical dtype whose categories are the
# union of the train and test levels, so category codes agree across both frames.
# NOTE: this rewrites df_train / df_test in place; from here on these columns
# have `category` dtype rather than `object`.
for c in df_train.select_dtypes(include=["object"]).columns:
    if c not in df_test.columns:
        # Nothing to align against (column exists only in train).
        continue
    tr_str = df_train[c].astype("string")
    te_str = df_test[c].astype("string")
    # Missing values must not enter the category list: pandas raises
    # "Categorical categories cannot be null". They stay missing in the result.
    cats = pd.Index(tr_str.dropna().unique()).union(te_str.dropna().unique())
    df_train[c] = pd.Categorical(tr_str, categories=cats)
    df_test[c]  = pd.Categorical(te_str, categories=cats)


# Train/test drift per feature:
#   numeric     -> two-sample Kolmogorov-Smirnov statistic
#   categorical -> Jensen-Shannon distance between level frequencies
# scipy is a hard dependency of this notebook (imported unconditionally in the
# imports cell), so no scipy-free fallback path is needed here.
drift_num, drift_cat = [], []

for c in num_cols:
    # Drop missing values first so they cannot turn the statistic into NaN.
    ks = ks_2samp(df_train[c].dropna().values, df_test[c].dropna().values).statistic
    drift_num.append((c, ks))

for c in cat_cols:
    p = df_train[c].value_counts(normalize=True)
    q = df_test[c].value_counts(normalize=True)
    # Put both frequency vectors on the same set of levels; a level missing
    # from one split gets frequency 0 there.
    idx = p.index.union(q.index)
    p = p.reindex(idx, fill_value=0)
    q = q.reindex(idx, fill_value=0)
    js = float(jensenshannon(p.values, q.values))
    drift_cat.append((c, js))


# Keep the five features with the largest drift score in each family.
num_top = sorted(drift_num, key=lambda x: x[1], reverse=True)[:5]
cat_top = sorted(drift_cat, key=lambda x: x[1], reverse=True)[:5]

print("Top-5 numeric drift (KS or proxy):")
for c, v in num_top:
    print(f"{c:22s} -> {v:.4f}")
print("\nTop-5 categorical drift (JS or proxy):")
for c, v in cat_top:
    print(f"{c:22s} -> {v:.4f}")


# Signal checks to guide modeling

In [None]:

y = df_train[TARGET].astype(int).values

# --- Mutual information between each feature and the target -----------------
# Numeric columns are used as-is; categorical columns are replaced by their
# integer category codes and flagged as discrete for the estimator.
X_num = df_train[num_cols].copy()
X_cat = pd.DataFrame(index=df_train.index)
for c in cat_cols:
    X_cat[c] = df_train[c].astype("category").cat.codes  # -1 marks a missing value
X_mi = pd.concat([X_num, X_cat], axis=1)
discrete_mask = np.array([False]*len(num_cols) + [True]*len(cat_cols))
mi = mutual_info_classif(X_mi, y, discrete_features=discrete_mask, random_state=2025)
mi_s = pd.Series(mi, index=X_mi.columns).sort_values(ascending=False)

# --- Absolute Pearson correlation of each numeric feature with the target ---
corr_pears = df_train[num_cols + [TARGET]].corr(method="pearson")[TARGET].drop(TARGET)
corr_abs = corr_pears.abs().sort_values(ascending=False)

# --- Spread of the target rate across the levels of each categorical --------
spread_rows = []
for c in cat_cols:
    # observed=True: group only on levels that actually occur in train. For
    # categorical columns this avoids the pandas FutureWarning about the
    # changing default; the result is unchanged because unobserved levels have
    # count 0 and would be removed by the support filter below anyway.
    tab = df_train.groupby(c, observed=True)[TARGET].agg(["count","mean"])
    tab = tab[tab["count"] >= 100]  # avoid tiny levels
    if len(tab) >= 2:
        spread = tab["mean"].max() - tab["mean"].min()
        spread_rows.append((c, spread, len(tab)))
spread_df = pd.DataFrame(
    spread_rows, columns=["feature","target_rate_spread","n_levels"]
).sort_values("target_rate_spread", ascending=False)


print("Top-10 features by Mutual Information:")
display(mi_s.head(10).to_frame("mutual_info"))

print("Top-10 numeric by |Pearson corr| with target:")
display(corr_abs.head(10).to_frame("|pearson|"))

print("Top-10 categorical by target-rate spread (support >=100):")
display(spread_df.head(10))


# Correlation heatmap between numeric features and the target

In [None]:

TARGET = "loan_paid_back"
ID_COL = "id"

# Numeric predictors only: the label and the row id are dropped before the dtype filter.
num_cols = (
    df_train
    .drop(columns=[TARGET, ID_COL])
    .select_dtypes(include=["number"])
    .columns
    .tolist()
)

# Pearson correlation matrix with the target appended as the last row/column.
corr = df_train[[*num_cols, TARGET]].corr(method="pearson")

plt.figure(figsize=(6, 5))
heatmap_style = dict(annot=True, fmt=".2f", square=True, cbar=True)
sns.heatmap(corr, **heatmap_style)
plt.title("Correlation Matrix (Numeric Features + Target)")
plt.tight_layout()
plt.show()


#  Skewness plot for ALL numeric features

In [None]:

TARGET = "loan_paid_back"
ID_COL = "id"


num_cols = (
    df_train
    .drop(columns=[TARGET, ID_COL])
    .select_dtypes(include=["number"])
    .columns
    .tolist()
)
print("Numeric columns:", num_cols)

# Features listed here are drawn on a log1p axis; empty means everything is raw.
log_scale_features = set()

# Sample skewness of the untransformed columns (shown in each panel title).
raw_skew = df_train[num_cols].skew()

# Grid with three panels per row, one panel per numeric feature.
n = len(num_cols)
cols = 3
rows = math.ceil(n / cols)
fig, axes = plt.subplots(rows, cols, figsize=(cols*5, rows*3.6))
axes = axes.flatten()

for ax, c in zip(axes, num_cols):
    x = df_train[c].astype(float).values

    if c in log_scale_features:
        x_plot, note = np.log1p(x), " (log1p)"
    else:
        x_plot, note = x, ""

    ax.hist(x_plot, bins=50, density=True)

    # Vertical reference lines for the mean and the median of the plotted values.
    for stat_fn, line_style, line_label in (
        (np.mean, "--", "mean"),
        (np.median, "-.", "median"),
    ):
        ax.axvline(stat_fn(x_plot), linestyle=line_style, linewidth=1.1, label=line_label)

    ax.set_title(f"{c}{note}\nskew={raw_skew[c]:.2f}")
    ax.set_xlabel(c + note)
    ax.set_ylabel("density")
    ax.legend()

# Hide the panels left over when n is not a multiple of the column count.
for spare_ax in axes[n:]:
    spare_ax.axis("off")

plt.suptitle("Skewness — distributions of ALL numeric features", y=1.02)
plt.tight_layout()
plt.show()


# Class imbalance check

In [None]:

TARGET = "loan_paid_back"

# Class counts ordered by label, then converted to shares of the total.
vc = df_train[TARGET].value_counts().sort_index()
props = vc.div(vc.sum()).sort_index()

fig, ax = plt.subplots(figsize=(4, 3))
props.plot.bar(ax=ax)

# Percentage label just above each bar.
for bar_pos, share in enumerate(props.to_numpy()):
    ax.text(bar_pos, share + 0.01, f"{share*100:.1f}%", ha="center", va="bottom", fontsize=10)

# Headroom above the tallest bar so its label stays inside the axes.
ax.set_ylim(0, 1.1 * props.max())
ax.set_xticklabels(props.index.astype(str))
ax.set(ylabel="Proportion", title="Target Distribution (Class Imbalance)")
plt.tight_layout()
plt.show()


# Box-plots for ALL numeric features + outlier (IQR)

In [None]:

TARGET = "loan_paid_back"
ID_COL = "id"

num_cols = (
    df_train
    .drop(columns=[TARGET, ID_COL])
    .select_dtypes(include=["number"])
    .columns
    .tolist()
)
print("Numeric columns:", num_cols)

# These features are drawn on a log1p axis in the box-plots below.
log_scale_features = {"annual_income", "loan_amount"}

# Tukey fences (Q1 - 1.5*IQR, Q3 + 1.5*IQR) and the share of rows outside them.
iqr_rows = []
for c in num_cols:
    x = df_train[c].astype(float)
    q1 = x.quantile(0.25)
    q3 = x.quantile(0.75)
    iqr = q3 - q1
    lo = q1 - 1.5 * iqr
    hi = q3 + 1.5 * iqr
    out_frac = (x.lt(lo) | x.gt(hi)).mean()
    iqr_rows.append(tuple([c] + [float(v) for v in (q1, q3, lo, hi, out_frac)]))
iqr_df = pd.DataFrame(
    iqr_rows,
    columns=["feature","Q1","Q3","lower_fence","upper_fence","outlier_fraction"],
).sort_values("outlier_fraction", ascending=False)
print("\nApprox. outlier fractions by feature (IQR rule):")
display(iqr_df)

# One box-plot panel per numeric feature, three panels per row.
n = len(num_cols)
cols = 3
rows = math.ceil(n / cols)
fig, axes = plt.subplots(rows, cols, figsize=(cols*4.2, rows*4.0))
axes = axes.flatten()

for ax, c in zip(axes, num_cols):
    x = df_train[c].astype(float).values
    use_log = c in log_scale_features
    log_tag = " (log1p)" if use_log else ""
    x_plot = np.log1p(x) if use_log else x

    sns.boxplot(y=x_plot, ax=ax, fliersize=2, width=0.4)
    ax.set_title(f"{c}{log_tag}")
    ax.set_xlabel("")
    ax.set_ylabel("value" + log_tag)

    # Annotate the panel with the outlier share computed on the raw (unlogged) values.
    feature_row = iqr_df["feature"] == c
    out_pct = iqr_df.loc[feature_row, "outlier_fraction"].values[0] * 100
    ax.text(0.05, 0.93, f"outliers ≈ {out_pct:.1f}%", transform=ax.transAxes, ha="left", va="top")

# Blank out the unused trailing panels of the grid.
for spare_ax in axes[n:]:
    spare_ax.axis("off")

plt.suptitle("Box-plots for numeric features (IQR-based outlier visualization)", y=1.02)
plt.tight_layout()
plt.show()


# Minimal preprocessing

In [None]:

TARGET = "loan_paid_back"
ID_COL = "id"

# Candidate features = every column except the label and the row identifier.
feat_cols = [c for c in df_train.columns if c not in [TARGET, ID_COL]]
num_cols  = df_train[feat_cols].select_dtypes(include=["number"]).columns.tolist()
# The category-alignment cell earlier in this notebook converts the string
# columns of df_train to `category` dtype in place, so an "object"-only filter
# would come back empty here. Accept both dtypes.
cat_cols  = df_train[feat_cols].select_dtypes(include=["object", "category"]).columns.tolist()

def compute_iqr_fences(df, cols, whisker=1.5, non_negative=("debt_to_income_ratio",)):
    """Compute Tukey-style clipping fences for the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the quartiles are estimated from.
    cols : iterable of str
        Columns of ``df`` to compute fences for; each is cast to float.
    whisker : float, default 1.5
        Multiple of the inter-quartile range added beyond Q1 / Q3.
    non_negative : collection of str, default ("debt_to_income_ratio",)
        Columns whose lower fence is floored at 0.0.

    Returns
    -------
    dict
        ``{column: (lower_fence, upper_fence)}`` as plain floats. Columns whose
        IQR is zero or undefined are omitted, so they are never clipped.
    """
    fences = {}
    for c in cols:
        x = df[c].astype(float)
        q1, q3 = x.quantile([0.25, 0.75])
        iqr = q3 - q1
        # With a zero IQR both fences collapse onto Q1 and clipping would flatten
        # the whole column to a constant; a NaN IQR (all-missing column) gives
        # no usable fence either. `not iqr > 0` covers both cases.
        if not iqr > 0:
            continue
        lo, hi = q1 - whisker * iqr, q3 + whisker * iqr
        if c in non_negative:
            lo = max(lo, 0.0)
        fences[c] = (float(lo), float(hi))
    return fences

# Fences are estimated from the training frame only, over the numeric feature columns.
iqr_fences = compute_iqr_fences(df_train, num_cols)

def apply_clip(df, fences):
    """Return a copy of ``df`` with each fenced column cast to float and clipped.

    ``fences`` maps a column name to its ``(lower, upper)`` bounds; columns
    without an entry are copied through unchanged. The input frame is not modified.
    """
    clipped = df.copy()
    for col in fences:
        lower, upper = fences[col]
        as_float = clipped[col].astype(float)
        clipped[col] = as_float.clip(lower=lower, upper=upper)
    return clipped

# Clip both splits with the same fences; apply_clip works on copies, so
# df_train / df_test themselves are left as they were.
tr = apply_clip(df_train, iqr_fences)
te = apply_clip(df_test,  iqr_fences)

def add_features(df):
    """Return a copy of ``df`` with engineered columns added.

    Each feature is added only when its source columns are present:

    - ``grade_letter``: first character of ``grade_subgrade`` (missing stays missing).
    - ``subgrade_num``: remainder of ``grade_subgrade`` parsed as a number
      (NaN when missing or not numeric).
    - ``log_annual_income`` / ``log_loan_amount``: ``log1p`` of the raw amounts.
    - ``income_to_loan``: ``annual_income / (1 + loan_amount)``.
    - ``interest_burden``: ``interest_rate * loan_amount / (1 + annual_income)``.

    The input frame is not modified.
    """
    out = df.copy()
    if "grade_subgrade" in out.columns:
        raw = out["grade_subgrade"]
        # astype(str) on its own turns a missing value into the text "nan"/"None",
        # which would produce a bogus grade letter; mask those rows back to missing.
        s = raw.astype(str).where(raw.notna())
        out["grade_letter"] = s.str[0]
        # Parsed to a number so the column matches its name and keeps the
        # ordering within a grade; unparseable text becomes NaN.
        out["subgrade_num"] = pd.to_numeric(s.str[1:], errors="coerce")
    if "annual_income" in out.columns:
        out["log_annual_income"] = np.log1p(out["annual_income"])
    if "loan_amount" in out.columns:
        out["log_loan_amount"] = np.log1p(out["loan_amount"])
    # The "+ 1.0" in the denominators below guards against division by zero.
    if set(["annual_income","loan_amount"]).issubset(out.columns):
        out["income_to_loan"] = out["annual_income"] / (1.0 + out["loan_amount"])
    if set(["interest_rate","loan_amount","annual_income"]).issubset(out.columns):
        out["interest_burden"] = (out["interest_rate"] * out["loan_amount"]) / (1.0 + out["annual_income"])
    return out

tr = add_features(tr)
te = add_features(te)

all_feats = [c for c in tr.columns if c not in [TARGET, ID_COL]]
num_final = tr[all_feats].select_dtypes(include=["number"]).columns.tolist()
# Include `category` as well as `object`: the original string columns were
# already converted to `category` by the alignment cell earlier in the notebook.
# With an "object"-only filter they would be missing from `categorical_feature`,
# and LightGBM would then treat their integer codes as ordinary numeric values.
cat_final = tr[all_feats].select_dtypes(include=["object", "category"]).columns.tolist()

# Re-align every categorical feature on the union of train/test levels so the
# category codes mean the same thing in both matrices.
for c in cat_final:
    tr_str = tr[c].astype("string")
    te_str = te[c].astype("string")
    # Missing values cannot be categories (pandas rejects null categories).
    cats = pd.Index(tr_str.dropna().unique()).union(te_str.dropna().unique())
    tr[c] = pd.Categorical(tr_str, categories=cats)
    te[c] = pd.Categorical(te_str, categories=cats)

print(f"Final numeric features: {len(num_final)}")
print(f"Final categorical features: {len(cat_final)}")

# Model inputs: identical column order for train and test.
X_train = tr[all_feats].copy()
y_train = tr[TARGET].astype(int).values
X_test  = te[all_feats].copy()
categorical_feature = cat_final


In [None]:

SEED  = 2025
FOLDS = 5

# Out-of-fold predictions for train, fold-averaged predictions for test.
oof = np.zeros(len(X_train))
pred_test = np.zeros(len(X_test))
fold_aucs = []

kf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

for fold, (tr_idx, va_idx) in enumerate(kf.split(X_train, y_train), 1):
    X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
    y_tr, y_va = y_train[tr_idx], y_train[va_idx]

    model = lgb.LGBMClassifier(
        n_estimators=4000,
        learning_rate=0.03,
        num_leaves=64,
        subsample=0.9,
        # LightGBM only applies `subsample` (bagging_fraction) when the bagging
        # frequency is non-zero; with the default of 0 the 0.9 above is ignored.
        subsample_freq=1,
        colsample_bytree=0.8,
        min_child_samples=50,
        reg_lambda=5.0,
        reg_alpha=0.0,
        objective="binary",
        random_state=SEED + fold,  # different seed per fold
        n_jobs=-1,
    )

    # n_estimators is an upper bound: training stops once validation AUC has
    # not improved for 250 rounds.
    model.fit(
        X_tr, y_tr,
        eval_set=[(X_va, y_va)],
        eval_metric="auc",
        categorical_feature=categorical_feature,
        callbacks=[lgb.early_stopping(stopping_rounds=250, verbose=False)]
    )

    # Score the held-out fold with the best iteration found by early stopping.
    val_pred = model.predict_proba(X_va, num_iteration=model.best_iteration_)[:,1]
    oof[va_idx] = val_pred
    fold_auc = roc_auc_score(y_va, val_pred)
    fold_aucs.append(fold_auc)

    # Each fold contributes 1/FOLDS of the final test prediction.
    pred_test += model.predict_proba(X_test, num_iteration=model.best_iteration_)[:,1] / FOLDS

    print(f"Fold {fold}: AUC = {fold_auc:.5f} | best_iter = {model.best_iteration_}")

print("\nCV AUCs:", [round(a,5) for a in fold_aucs], 
      "| Mean:", round(np.mean(fold_aucs),5), 
      "±", round(np.std(fold_aucs),5))
print("OOF AUC:", round(roc_auc_score(y_train, oof),5))

# Submission frame in test-row order: ids next to fold-averaged probabilities,
# clipped into [0, 1].
sub = df_test[["id"]].copy()
sub["loan_paid_back"] = np.clip(pred_test, 0, 1)
sub.to_csv("submission_lgb_preproc.csv", index=False)
print("Saved -> submission_lgb_preproc.csv")


In [None]:

# Probabilities keyed by test id, so they can be looked up in sample-submission order.
clipped_pred = np.clip(pred_test, 0, 1)
pred_series = pd.Series(clipped_pred, index=df_test["id"])  # id -> prob

sub = df_sub.copy()
assert {"id", "loan_paid_back"}.issubset(sub.columns), "Unexpected df_sub format!"
assert len(df_test) == len(sub), "df_sub and df_test should have same length!"

# Look up each sample-submission id in the id-indexed predictions.
aligned = pred_series.loc[sub["id"]]
sub["loan_paid_back"] = aligned.to_numpy()

assert sub["loan_paid_back"].between(0,1).all(), "Predictions must be in [0,1]!"
assert sub["loan_paid_back"].notna().all(), "Found NaNs in predictions after alignment!"

sub.to_csv("submission.csv", index=False)
print("Saved: submission.csv")
print(sub.head())
