## Dataset Introduction

We are using the **Home Credit Default Risk dataset** (`application_train.csv`) from the Home Credit Group challenge.  
This dataset contains information about loan applicants and whether they defaulted on their loan.

### Dataset Overview
- **Rows (records):** 307,511  
- **Columns:** 122 (including the `SK_ID_CURR` identifier and the `TARGET` label)  
- **Target Variable:** `TARGET`
  - `0` → Loan repaid (no default)  
  - `1` → Loan not repaid (default)  

### Key Feature Groups
- **Demographics** – gender, age, family status, education.  
- **Financial Information** – income, employment type, credit amount.  
- **Social & Housing** – housing type, region, family members.  
- **Credit Behavior** – previous loan applications, payment history.  

### Data Challenges
- **Imbalanced Target** – only ~8% defaults.  
- **Missing Values** – several categorical and numerical columns contain missing entries.  
- **High Dimensionality** – dataset has many categorical variables that need special handling.  

This makes the dataset an **excellent case study for credit risk modeling and cost-based optimization.**


In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, accuracy_score, confusion_matrix, classification_report,
    roc_curve, precision_recall_curve
)
from catboost import CatBoostClassifier, Pool
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the application-level training table and show its size plus the
# first 15 column names as a quick sanity check.
csv_path = "/kaggle/input/home-credit-default-risk-dataset/application_train.csv"
train = pd.read_csv(csv_path)
leading_columns = train.columns[:15]
print("Train shape:", train.shape)
print("Columns:", leading_columns, "...")

Train shape: (307511, 122)
Columns: Index(['SK_ID_CURR', 'TARGET', 'NAME_CONTRACT_TYPE', 'CODE_GENDER',
       'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL',
       'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE', 'NAME_TYPE_SUITE',
       'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS'],
      dtype='object') ...


In [4]:
# Split the raw table into predictors (X) and label (y).
TARGET = "TARGET"
# The row identifier carries no predictive signal, so it is removed together
# with the label itself.
non_feature_cols = [TARGET, "SK_ID_CURR"]
X = train.drop(columns=non_feature_cols)
y = train[TARGET]

In [5]:
# Handle missing values
# NOTE(review): the medians below are computed on the full feature table,
# i.e. before the train/validation split performed later in the notebook, so
# validation rows influence the imputation values — confirm this is acceptable.

# Numeric columns: replace NaN with the per-column median
num_cols = X.select_dtypes(include=[np.number]).columns
col_medians = X[num_cols].median()
X[num_cols] = X[num_cols].fillna(col_medians)

# Categorical (object) columns: replace NaN with an explicit "MISSING" level
cat_cols = X.select_dtypes(include=["object"]).columns
X[cat_cols] = X[cat_cols].fillna("MISSING")

# Cast every categorical column to the pandas category dtype in one pass
X = X.astype({name: "category" for name in cat_cols})

In [None]:
# Train/validation split: 80/20, stratified on the label so both parts keep
# the same class balance; fixed seed for reproducibility.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = split_parts
print("Train size:", X_train.shape, "Validation size:", X_val.shape)


In [None]:
# Logistic Regression baseline (numeric features only; the categorical
# columns are not passed to this model).
numeric_features = X_train.select_dtypes(include=[np.number]).columns

# Fit the scaler on the training rows only and reuse it for validation, so no
# validation statistics enter the scaling step.
scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train[numeric_features])
X_val_num = scaler.transform(X_val[numeric_features])

# class_weight="balanced" reweights the classes to offset the label imbalance.
lr = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)
lr.fit(X_train_num, y_train)
probs_lr = lr.predict_proba(X_val_num)[:, 1]  # probability of class 1
preds_lr = (probs_lr >= 0.5).astype(int)      # hard labels at the default 0.5 cut-off

print("\n--- Logistic Regression ---")
print("AUC:", roc_auc_score(y_val, probs_lr))
print("Accuracy:", accuracy_score(y_val, preds_lr))
# Print the label on its own line: the report is a multi-line table, and
# starting it mid-line shifts its header row out of alignment.
print("Classification Report:")
print(classification_report(y_val, preds_lr))


In [None]:
# CatBoost: gradient boosting that consumes the categorical columns directly.
# Positional indices of the categorical columns inside the feature table.
cat_col_names = set(cat_cols)
cat_features_idx = [
    position for position, name in enumerate(X.columns) if name in cat_col_names
]

train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features_idx)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features_idx)

catboost_params = {
    "iterations": 1000,
    "learning_rate": 0.05,
    "depth": 6,
    "eval_metric": "AUC",
    "random_seed": 42,
    "early_stopping_rounds": 50,  # stop once validation AUC stalls for 50 rounds
    "verbose": 200,               # log every 200 iterations
}
cat_model = CatBoostClassifier(**catboost_params)

# The validation pool is used as the evaluation set during training.
cat_model.fit(train_pool, eval_set=val_pool)
probs_cb = cat_model.predict_proba(X_val)[:, 1]
preds_cb = (probs_cb >= 0.5).astype(int)

print("\n--- CatBoost ---")
print("AUC:", roc_auc_score(y_val, probs_cb))
print("Accuracy:", accuracy_score(y_val, preds_cb))
print(classification_report(y_val, preds_cb))


In [None]:
# ===== Utility: Cost-based threshold optimization =====
from sklearn.metrics import confusion_matrix

def find_best_threshold(y_true, probs, cost_fp=100, cost_fn=10, step=0.01):
    """Grid-search the decision threshold that minimises total misclassification cost.

    For each threshold ``t`` in ``np.arange(0, 1, step)`` a sample is predicted
    positive when ``probs >= t``; the cost of that threshold is
    ``fp * cost_fp + fn * cost_fn``.

    Parameters
    ----------
    y_true : array-like of 0/1 labels (1 is the positive class).
    probs : array-like of predicted positive-class scores, same length as ``y_true``.
    cost_fp : cost charged per false positive.
    cost_fn : cost charged per false negative.
        NOTE(review): the defaults charge a false positive 10x a false
        negative — confirm this matches the intended business costs.
    step : spacing of the threshold grid; must be positive.

    Returns
    -------
    (best_t, best_cost) : the lowest-cost threshold (the smallest one on ties)
        and the cost it achieves.

    Raises
    ------
    ValueError
        If ``step`` is not positive or the inputs differ in length.
    """
    if step <= 0:
        raise ValueError("step must be positive")

    # Accept lists / pandas Series as well as arrays.
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(probs).ravel()
    if labels.shape != scores.shape:
        raise ValueError("y_true and probs must have the same length")

    is_positive = labels == 1
    is_negative = ~is_positive

    best_cost = np.inf
    best_t = 0.5
    for t in np.arange(0, 1, step):
        flagged = scores >= t
        # Count the two error types directly. Unpacking a confusion matrix
        # into four cells breaks when only one class is present, because the
        # matrix is then 1x1.
        fp = int(np.count_nonzero(flagged & is_negative))
        fn = int(np.count_nonzero(~flagged & is_positive))
        cost = fp * cost_fp + fn * cost_fn
        if cost < best_cost:  # strict '<' keeps the first (lowest) threshold on ties
            best_cost = cost
            best_t = t
    return best_t, best_cost

In [None]:
# ===== Step 6b: XGBoost =====
# XGBClassifier is imported in the notebook's import cell.

# enable_categorical=True requires the categorical columns to carry the pandas
# "category" dtype. Cast through a dtype mapping and rebind the frames rather
# than assigning column by column into the split frames, which can raise
# SettingWithCopyWarning.
category_dtypes = {col: "category" for col in cat_cols}
X_train = X_train.astype(category_dtypes)
X_val = X_val.astype(category_dtypes)

# Ratio of negatives to positives in the training labels, used to up-weight
# the positive class.
neg_count = (y_train == 0).sum()
pos_count = (y_train == 1).sum()

xgb = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=float(neg_count / pos_count),  # balance classes
    eval_metric="auc",
    # Early stopping is configured on the estimator: XGBoost 2.0 removed the
    # `early_stopping_rounds` argument from fit().
    early_stopping_rounds=50,
    random_state=42,
    enable_categorical=True,
    tree_method="hist"
)

# The validation split serves as the evaluation set that drives early stopping.
xgb.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=100
)

probs_xgb = xgb.predict_proba(X_val)[:, 1]
preds_xgb = (probs_xgb >= 0.5).astype(int)

print("\n--- XGBoost ---")
print("AUC:", roc_auc_score(y_val, probs_xgb))
print("Accuracy:", accuracy_score(y_val, preds_xgb))
print(classification_report(y_val, preds_xgb))

# Pick the cut-off that minimises the business cost instead of the default 0.5.
best_t, best_cost = find_best_threshold(y_val, probs_xgb, cost_fp=100, cost_fn=10)
print(f"\n[XGBoost] Best threshold: {best_t:.2f}, Business cost: {best_cost:.2f}")

best_preds_xgb = (probs_xgb >= best_t).astype(int)
print("Accuracy at best threshold:", accuracy_score(y_val, best_preds_xgb))
print(confusion_matrix(y_val, best_preds_xgb))


In [None]:
# Cost-based threshold optimization (CatBoost)
# find_best_threshold is defined once, in the utility cell earlier in the
# notebook; it is called here rather than redefined, so there is a single
# implementation to maintain and nothing silently shadows it.
best_t, best_cost = find_best_threshold(y_val, probs_cb)
print(f"\nBest threshold: {best_t:.2f}, Business cost: {best_cost:.2f}")

# Re-score the validation set at the cost-optimal cut-off.
best_preds_cb = (probs_cb >= best_t).astype(int)
print("Accuracy at best threshold:", accuracy_score(y_val, best_preds_cb))
print(confusion_matrix(y_val, best_preds_cb))

In [None]:
# Feature importance (CatBoost): the 20 highest-scoring features.
cb_importance_table = pd.DataFrame(
    {"feature": X.columns, "importance": cat_model.get_feature_importance()}
)
fi = cb_importance_table.sort_values(by="importance", ascending=False).iloc[:20]

fig_cb, ax_cb = plt.subplots(figsize=(8, 6))
sns.barplot(data=fi, x="importance", y="feature", ax=ax_cb)
ax_cb.set_title("Top 20 CatBoost Features")
plt.show()


In [None]:
# Feature importance (Logistic Regression): features ranked by the absolute
# value of their fitted coefficient, top 20 shown.
coef_magnitude = np.abs(lr.coef_[0])
lr_importance_table = pd.DataFrame(
    {"feature": numeric_features, "importance": coef_magnitude}
)
fi_lr = lr_importance_table.sort_values(by="importance", ascending=False).iloc[:20]

fig_lr, ax_lr = plt.subplots(figsize=(8, 6))
sns.barplot(data=fi_lr, x="importance", y="feature", ax=ax_lr)
ax_lr.set_title("Top 20 Logistic Regression Features")
plt.show()


In [None]:
# Feature importance (XGBoost): the 20 highest-scoring features.
xgb_importance = xgb.feature_importances_

xgb_importance_table = pd.DataFrame(
    {"feature": X.columns, "importance": xgb_importance}
)
fi_xgb = xgb_importance_table.sort_values(by="importance", ascending=False).iloc[:20]

fig_xgb, ax_xgb = plt.subplots(figsize=(8, 6))
sns.barplot(data=fi_xgb, x="importance", y="feature", ax=ax_xgb)
ax_xgb.set_title("Top 20 XGBoost Features")
plt.show()

In [None]:
# ROC curves for the three models on the validation split, with each model's
# AUC shown in the legend.
fpr_lr, tpr_lr, _ = roc_curve(y_val, probs_lr)
fpr_xgb, tpr_xgb, _ = roc_curve(y_val, probs_xgb)
fpr_cb, tpr_cb, _ = roc_curve(y_val, probs_cb)

roc_series = (
    ("Logistic", fpr_lr, tpr_lr, probs_lr),
    ("XGBoost", fpr_xgb, tpr_xgb, probs_xgb),
    ("CatBoost", fpr_cb, tpr_cb, probs_cb),
)

roc_fig, roc_ax = plt.subplots(figsize=(7, 6))
for model_name, false_pos_rate, true_pos_rate, model_scores in roc_series:
    auc_value = roc_auc_score(y_val, model_scores)
    roc_ax.plot(false_pos_rate, true_pos_rate, label=f"{model_name} AUC={auc_value:.3f}")
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curves")
roc_ax.legend()
plt.show()

# --- Precision-Recall Curves ---
# Precision against recall for each model on the validation split.
prec_lr, rec_lr, _ = precision_recall_curve(y_val, probs_lr)
prec_xgb, rec_xgb, _ = precision_recall_curve(y_val, probs_xgb)
prec_cb, rec_cb, _ = precision_recall_curve(y_val, probs_cb)

pr_series = (
    ("Logistic PR", rec_lr, prec_lr),
    ("XGBoost PR", rec_xgb, prec_xgb),
    ("CatBoost PR", rec_cb, prec_cb),
)

pr_fig, pr_ax = plt.subplots(figsize=(7, 6))
for curve_label, recall_values, precision_values in pr_series:
    pr_ax.plot(recall_values, precision_values, label=curve_label)
pr_ax.set_xlabel("Recall")
pr_ax.set_ylabel("Precision")
pr_ax.set_title("Precision-Recall Curves")
pr_ax.legend()
plt.show()