In [1]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import roc_auc_score

pd.set_option("display.max_columns", 200)

DATA_DIR = ""  # set path if needed
TRAIN_PATH = os.path.join(DATA_DIR, "application_train.csv")
TEST_PATH  = os.path.join(DATA_DIR, "application_test.csv")
BUREAU_PATH = os.path.join(DATA_DIR, "bureau.csv")
BBAL_PATH   = os.path.join(DATA_DIR, "bureau_balance.csv")
PREV_PATH   = os.path.join(DATA_DIR, "previous_application.csv")

TARGET = "TARGET"
ID_COL = "SK_ID_CURR"

# Load core app tables (re-load to be self-contained)
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)

# Known sentinel fix
for df in (train, test):
    if "DAYS_EMPLOYED" in df.columns:
        df.loc[df["DAYS_EMPLOYED"] == 365243, "DAYS_EMPLOYED"] = np.nan

print("Train:", train.shape, " Test:", test.shape)
print("Target distribution:", train[TARGET].value_counts(normalize=True).rename("proportion"))

Train: (307511, 122)  Test: (48744, 121)
Target distribution: TARGET
0    0.919271
1    0.080729
Name: proportion, dtype: float64


In [2]:
# =========================================================
# 1) LOAD SECONDARY TABLES & BASIC SANITY
# =========================================================
bureau = pd.read_csv(BUREAU_PATH)
bbal  = pd.read_csv(BBAL_PATH)
prev  = pd.read_csv(PREV_PATH)

print("bureau:", bureau.shape, " | bureau_balance:", bbal.shape, " | previous_application:", prev.shape)
display(bureau.head(3))
display(bbal.head(3))
display(prev.head(3))

bureau: (1716428, 17)  | bureau_balance: (27299925, 3)  | previous_application: (1670214, 37)


Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,CREDIT_ACTIVE,CREDIT_CURRENCY,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,DAYS_ENDDATE_FACT,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,AMT_CREDIT_SUM_OVERDUE,CREDIT_TYPE,DAYS_CREDIT_UPDATE,AMT_ANNUITY
0,215354,5714462,Closed,currency 1,-497,0,-153.0,-153.0,,0,91323.0,0.0,,0.0,Consumer credit,-131,
1,215354,5714463,Active,currency 1,-208,0,1075.0,,,0,225000.0,171342.0,,0.0,Credit card,-20,
2,215354,5714464,Active,currency 1,-203,0,528.0,,,0,464323.5,,,0.0,Consumer credit,-16,


Unnamed: 0,SK_ID_BUREAU,MONTHS_BALANCE,STATUS
0,5715448,0,C
1,5715448,-1,C
2,5715448,-2,C


Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,1,0.0,0.182832,0.867336,XAP,Approved,-73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,Country-wide,35,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,Y,1,,,,XNA,Approved,-164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,-1,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,Y,1,,,,XNA,Approved,-301,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0


In [3]:
# =========================================================
# 2) FEATURE ENGINEERING — BUREAU
#    External credit history per client
# =========================================================
# Safety: coerce some known numeric fields (dataset can have mixed dtypes)
num_cols_hint = ["AMT_CREDIT_SUM", "AMT_CREDIT_SUM_DEBT", "AMT_CREDIT_SUM_LIMIT",
                 "CREDIT_DAY_OVERDUE", "DAYS_CREDIT", "DAYS_CREDIT_ENDDATE", "DAYS_ENDDATE_FACT"]
for c in num_cols_hint:
    if c in bureau.columns:
        bureau[c] = pd.to_numeric(bureau[c], errors="coerce")

# Basic aggregations per client
agg = bureau.groupby("SK_ID_CURR").agg({
    "SK_ID_BUREAU": "count",
    "AMT_CREDIT_SUM": ["mean", "sum", "max"],
    "AMT_CREDIT_SUM_DEBT": ["mean", "sum", "max"],
    "AMT_CREDIT_SUM_LIMIT": ["mean", "sum"],
    "CREDIT_DAY_OVERDUE": ["max", "mean"],
    "DAYS_CREDIT": ["mean", "min", "max"]
})

# Flatten names
agg.columns = ["BUREAU_" + "_".join(col).upper() for col in agg.columns]
agg = agg.reset_index()

# Active vs closed counts
if "CREDIT_ACTIVE" in bureau.columns:
    active_pivot = pd.crosstab(bureau["SK_ID_CURR"], bureau["CREDIT_ACTIVE"]).add_prefix("BUREAU_ACTIVE_").reset_index()
    agg = agg.merge(active_pivot, on="SK_ID_CURR", how="left")

# Type distribution (top 6 types to limit width)
if "CREDIT_TYPE" in bureau.columns:
    top_types = bureau["CREDIT_TYPE"].value_counts().index[:6]
    type_pivot = pd.crosstab(bureau["SK_ID_CURR"], bureau["CREDIT_TYPE"].where(bureau["CREDIT_TYPE"].isin(top_types))).add_prefix("BUREAU_TYPE_").reset_index()
    agg = agg.merge(type_pivot, on="SK_ID_CURR", how="left")

# Debt ratio
if {"AMT_CREDIT_SUM_DEBT_SUM","AMT_CREDIT_SUM_SUM"}.issubset(set(agg.columns)):
    agg["BUREAU_DEBT_RATIO"] = agg["AMT_CREDIT_SUM_DEBT_SUM"] / (agg["AMT_CREDIT_SUM_SUM"] + 1e-6)

print("bureau agg shape:", agg.shape)
display(agg.head(3))

bureau agg shape: (305811, 25)


Unnamed: 0,SK_ID_CURR,BUREAU_SK_ID_BUREAU_COUNT,BUREAU_AMT_CREDIT_SUM_MEAN,BUREAU_AMT_CREDIT_SUM_SUM,BUREAU_AMT_CREDIT_SUM_MAX,BUREAU_AMT_CREDIT_SUM_DEBT_MEAN,BUREAU_AMT_CREDIT_SUM_DEBT_SUM,BUREAU_AMT_CREDIT_SUM_DEBT_MAX,BUREAU_AMT_CREDIT_SUM_LIMIT_MEAN,BUREAU_AMT_CREDIT_SUM_LIMIT_SUM,BUREAU_CREDIT_DAY_OVERDUE_MAX,BUREAU_CREDIT_DAY_OVERDUE_MEAN,BUREAU_DAYS_CREDIT_MEAN,BUREAU_DAYS_CREDIT_MIN,BUREAU_DAYS_CREDIT_MAX,BUREAU_ACTIVE_Active,BUREAU_ACTIVE_Bad debt,BUREAU_ACTIVE_Closed,BUREAU_ACTIVE_Sold,BUREAU_TYPE_Car loan,BUREAU_TYPE_Consumer credit,BUREAU_TYPE_Credit card,BUREAU_TYPE_Loan for business development,BUREAU_TYPE_Microloan,BUREAU_TYPE_Mortgage
0,100001,7,207623.571429,1453365.0,378000.0,85240.928571,596686.5,373239.0,0.0,0.0,0,0.0,-735.0,-1572,-49,3,0,4,0,0.0,7.0,0.0,0.0,0.0,0.0
1,100002,8,108131.945625,865055.565,450000.0,49156.2,245781.0,245781.0,7997.14125,31988.565,0,0.0,-874.0,-1437,-103,2,0,6,0,0.0,4.0,4.0,0.0,0.0,0.0
2,100003,4,254350.125,1017400.5,810000.0,0.0,0.0,0.0,202500.0,810000.0,0,0.0,-1400.75,-2586,-606,1,0,3,0,0.0,2.0,2.0,0.0,0.0,0.0


In [4]:
# =========================================================
# 3) FEATURE ENGINEERING — BUREAU_BALANCE
#    Monthly delinquency per external loan, rolled to client
# =========================================================
# Map STATUS to numeric delinquency (C/X -> 0; '0'..'5' to ints)
bbal_map = {**{str(i): i for i in range(6)}, "C": 0, "X": 0}
bbal["STATUS_NUM"] = bbal["STATUS"].map(lambda v: bbal_map.get(str(v), 0)).astype(int)

# Per SK_ID_BUREAU monthly stats
bb_by_loan = bbal.groupby("SK_ID_BUREAU").agg(
    BBAL_MONTHS=("MONTHS_BALANCE", "count"),
    BBAL_MAX_STAT=("STATUS_NUM", "max"),
    BBAL_MEAN_STAT=("STATUS_NUM", "mean"),
    BBAL_PCT_DELINQ=("STATUS_NUM", lambda x: (x >= 1).mean())
).reset_index()

# Join onto bureau to get SK_ID_CURR, then roll up to client level
bb_client = bureau[["SK_ID_BUREAU", "SK_ID_CURR"]].merge(bb_by_loan, on="SK_ID_BUREAU", how="left")
bb_client_agg = bb_client.groupby("SK_ID_CURR").agg(
    BBAL_LOANS_WITH_HISTORY=("SK_ID_BUREAU", "nunique"),
    BBAL_MONTHS_SUM=("BBAL_MONTHS", "sum"),
    BBAL_MAX_STAT_MAX=("BBAL_MAX_STAT", "max"),
    BBAL_MEAN_STAT_MEAN=("BBAL_MEAN_STAT", "mean"),
    BBAL_PCT_DELINQ_MEAN=("BBAL_PCT_DELINQ", "mean")
).reset_index()

print("bbal agg shape:", bb_client_agg.shape)
display(bb_client_agg.head(3))

bbal agg shape: (305811, 6)


Unnamed: 0,SK_ID_CURR,BBAL_LOANS_WITH_HISTORY,BBAL_MONTHS_SUM,BBAL_MAX_STAT_MAX,BBAL_MEAN_STAT_MEAN,BBAL_PCT_DELINQ_MEAN
0,100001,7,172.0,1.0,0.007519,0.007519
1,100002,8,110.0,1.0,0.255682,0.255682
2,100003,4,0.0,,,


In [5]:
# =========================================================
# 4) FEATURE ENGINEERING — PREVIOUS_APPLICATION
#    Past Home Credit applications per client
# =========================================================
prev = prev.copy()
# Basic coercions
for c in ["AMT_APPLICATION", "AMT_CREDIT", "AMT_GOODS_PRICE", "DAYS_DECISION"]:
    if c in prev.columns:
        prev[c] = pd.to_numeric(prev[c], errors="coerce")

# Flags for approval/refusal
approved = {"Approved"}
refused  = {"Refused"}

prev["IS_APPROVED"] = prev["NAME_CONTRACT_STATUS"].isin(approved).astype(int)
prev["IS_REFUSED"]  = prev["NAME_CONTRACT_STATUS"].isin(refused).astype(int)

# Ratios
prev["APPROVAL_RATIO"] = np.where(prev["AMT_APPLICATION"].notna() & prev["AMT_APPLICATION"].ne(0),
                                  prev["AMT_CREDIT"] / prev["AMT_APPLICATION"], np.nan)

# Aggregate per client
prev_agg = prev.groupby("SK_ID_CURR").agg(
    PREV_CNT=("SK_ID_PREV", "count"),
    PREV_APPROVAL_RATE=("IS_APPROVED", "mean"),
    PREV_REFUSAL_RATE=("IS_REFUSED", "mean"),
    PREV_AMT_APP_MEAN=("AMT_APPLICATION", "mean"),
    PREV_AMT_CREDIT_MEAN=("AMT_CREDIT", "mean"),
    PREV_AMT_GOODS_MEAN=("AMT_GOODS_PRICE", "mean"),
    PREV_APPROVAL_RATIO_MEAN=("APPROVAL_RATIO", "mean"),
    PREV_DAYS_DECISION_MIN=("DAYS_DECISION", "min"),   # most recent (closest to 0)
    PREV_DAYS_DECISION_MEAN=("DAYS_DECISION", "mean")
).reset_index()

print("previous_application agg shape:", prev_agg.shape)
display(prev_agg.head(3))

previous_application agg shape: (338857, 10)


Unnamed: 0,SK_ID_CURR,PREV_CNT,PREV_APPROVAL_RATE,PREV_REFUSAL_RATE,PREV_AMT_APP_MEAN,PREV_AMT_CREDIT_MEAN,PREV_AMT_GOODS_MEAN,PREV_APPROVAL_RATIO_MEAN,PREV_DAYS_DECISION_MIN,PREV_DAYS_DECISION_MEAN
0,100001,1,1.0,0.0,24835.5,23787.0,24835.5,0.957782,-1740,-1740.0
1,100002,1,1.0,0.0,179055.0,179055.0,179055.0,1.0,-606,-606.0
2,100003,3,1.0,0.0,435436.5,484191.0,435436.5,1.057664,-2341,-1305.0


In [6]:
# =========================================================
# 5) MERGE ALL AGGREGATES INTO TRAIN/TEST
# =========================================================
def merge_all(base_df):
    out = base_df.merge(agg, on="SK_ID_CURR", how="left")
    out = out.merge(bb_client_agg, on="SK_ID_CURR", how="left")
    out = out.merge(prev_agg, on="SK_ID_CURR", how="left")
    return out

train_en = merge_all(train)
test_en  = merge_all(test)

print("Enriched shapes -> train:", train_en.shape, " test:", test_en.shape)

Enriched shapes -> train: (307511, 160)  test: (48744, 159)


In [7]:
# =========================================================
# 6) QUICK EDA OF NEW FEATURES
#    - Missing rates
#    - Top correlations with TARGET
# =========================================================
new_cols = [c for c in train_en.columns if c not in train.columns or c in [
    # include when names exist after flatten
    *list(agg.columns), *list(bb_client_agg.columns), *list(prev_agg.columns)
]]
# Ensure unique
new_cols = [c for c in new_cols if c not in (TARGET,)]

# Missing %
miss = train_en[new_cols].isna().mean().sort_values(ascending=False)
print("New feature missing % (top 15):")
display(miss.head(15))

# Correlation with TARGET (numeric only)
num_new = [c for c in new_cols if pd.api.types.is_numeric_dtype(train_en[c])]
corrs = train_en[num_new + [TARGET]].corr(numeric_only=True)[TARGET].drop(TARGET).sort_values(ascending=False)
print("Top positively correlated new features with TARGET:")
display(corrs.head(15))
print("Top negatively correlated new features with TARGET:")
display(corrs.tail(15))

New feature missing % (top 15):


BBAL_MAX_STAT_MAX                            0.700073
BBAL_PCT_DELINQ_MEAN                         0.700073
BBAL_MEAN_STAT_MEAN                          0.700073
BUREAU_AMT_CREDIT_SUM_LIMIT_MEAN             0.211599
BUREAU_AMT_CREDIT_SUM_DEBT_MEAN              0.167083
BUREAU_AMT_CREDIT_SUM_DEBT_MAX               0.167083
BUREAU_TYPE_Mortgage                         0.143351
BUREAU_TYPE_Consumer credit                  0.143351
BUREAU_TYPE_Credit card                      0.143351
BUREAU_TYPE_Loan for business development    0.143351
BUREAU_TYPE_Microloan                        0.143351
BUREAU_TYPE_Car loan                         0.143351
BUREAU_AMT_CREDIT_SUM_MAX                    0.143153
BUREAU_AMT_CREDIT_SUM_MEAN                   0.143153
BBAL_LOANS_WITH_HISTORY                      0.143149
dtype: float64

Top positively correlated new features with TARGET:


BUREAU_DAYS_CREDIT_MEAN     0.089729
PREV_REFUSAL_RATE           0.077671
BUREAU_DAYS_CREDIT_MIN      0.075248
BUREAU_ACTIVE_Active        0.067128
PREV_APPROVAL_RATIO_MEAN    0.065003
BBAL_PCT_DELINQ_MEAN        0.059677
PREV_DAYS_DECISION_MIN      0.053434
BUREAU_DAYS_CREDIT_MAX      0.049782
PREV_DAYS_DECISION_MEAN     0.046864
BBAL_MEAN_STAT_MEAN         0.036774
BBAL_MAX_STAT_MAX           0.035989
BUREAU_TYPE_Credit card     0.034801
BUREAU_TYPE_Microloan       0.034111
PREV_CNT                    0.019762
BUREAU_ACTIVE_Sold          0.012058
Name: TARGET, dtype: float64

Top negatively correlated new features with TARGET:


BUREAU_TYPE_Loan for business development   -0.003785
BUREAU_AMT_CREDIT_SUM_LIMIT_SUM             -0.009419
BUREAU_TYPE_Consumer credit                 -0.010739
BUREAU_AMT_CREDIT_SUM_LIMIT_MEAN            -0.011446
BBAL_MONTHS_SUM                             -0.013638
BUREAU_AMT_CREDIT_SUM_SUM                   -0.014057
PREV_AMT_GOODS_MEAN                         -0.015847
PREV_AMT_CREDIT_MEAN                        -0.016114
BUREAU_AMT_CREDIT_SUM_MAX                   -0.019737
BUREAU_AMT_CREDIT_SUM_MEAN                  -0.019957
BUREAU_TYPE_Car loan                        -0.020825
PREV_AMT_APP_MEAN                           -0.021803
BUREAU_TYPE_Mortgage                        -0.023314
BUREAU_ACTIVE_Closed                        -0.030812
PREV_APPROVAL_RATE                          -0.063521
Name: TARGET, dtype: float64

In [8]:
# =========================================================
# 7) (OPTIONAL) FAST MODEL CHECK WITH ENRICHED FEATURES
#    HistGradientBoosting on numeric features only
# =========================================================
ID_COL = "SK_ID_CURR"
X = train_en.drop(columns=[TARGET, ID_COL])
y = train_en[TARGET].astype(int)

# numeric-only for HGB (it ignores NaNs fine)
num_cols = X.select_dtypes(include=["number"]).columns.tolist()
X_num = X[num_cols]

X_tr, X_va, y_tr, y_va = train_test_split(X_num, y, test_size=0.2, stratify=y, random_state=42)

hgb = HistGradientBoostingClassifier(
    learning_rate=0.08, max_leaf_nodes=31, random_state=42
)
hgb.fit(X_tr, y_tr)
va_proba = hgb.predict_proba(X_va)[:, 1]
auc = roc_auc_score(y_va, va_proba)
print(f"Fast check AUC with enriched numeric features: {auc:.4f}")

Fast check AUC with enriched numeric features: 0.7628


In [9]:
# =========================================================
# 8) SAVE ENRICHED DATASETS FOR TABLEAU / MODELING
# =========================================================
train_en.to_csv("application_train_enriched.csv", index=False)
test_en.to_csv("application_test_enriched.csv", index=False)
print("Saved: application_train_enriched.csv, application_test_enriched.csv")

Saved: application_train_enriched.csv, application_test_enriched.csv
