In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import classification_report, roc_auc_score, RocCurveDisplay, precision_recall_curve
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier, Pool

In [2]:
data_dir = Path("../../data/processed")
figures_dir = Path("../../figures")
figures_dir.mkdir(parents=True, exist_ok=True)
models_dir = Path("../../trained_models")
models_dir.mkdir(parents=True, exist_ok=True)

In [3]:
df = pd.read_csv(data_dir / "imputed_dataset_with_synthetic.csv")

# Features
features = [
    'credit_limit_used(%)',
    'credit_score',
    'prev_defaults',
    'default_in_last_6months',
    'no_of_children',
    'owns_car',
    'no_of_days_employed',
    'yearly_debt_payments',
    'migrant_worker',
    'total_family_members',
    'credit_score_squared',
    'credit_limit_used_squared',
    'credit_score_x_credit_limit_used',
    'credit_ratio_limit'
]

target = 'credit_card_default'

In [4]:
X = df[features]
y = df[target]

# Train/validation split (keep validation purely real)
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Training set shape: {X_train.shape}")
print(f"Validation set shape: {X_val.shape}")

Training set shape: (52171, 14)
Validation set shape: (13043, 14)


In [5]:
classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights_dict = {c: w for c, w in zip(classes, class_weights)}
print("Class weights:", class_weights_dict)

Class weights: {0.0: 0.7794860301807859, 1.0: 1.3944990912006843}


In [6]:
# CatBoost Pool for train/validation
train_pool = Pool(X_train, y_train, weight=y_train.map(class_weights_dict))
val_pool = Pool(X_val, y_val)

# Base CatBoost model
cat_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    eval_metric='F1',
    random_seed=42,
    early_stopping_rounds=50,
    verbose=100,
    class_weights=class_weights_dict
)

In [7]:
param_grid = {
    'depth': [4, 6, 8, 10],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'iterations': [500, 1000, 1500],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'border_count': [32, 64, 128]
}

cat_random_search = RandomizedSearchCV(
    estimator=cat_model,
    param_distributions=param_grid,
    n_iter=30,
    scoring='f1',
    cv=3,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

cat_random_search.fit(X_train, y_train)

print("Best parameters:", cat_random_search.best_params_)
print("Best F1 score on CV:", cat_random_search.best_score_)

best_cat = cat_random_search.best_estimator_

Fitting 3 folds for each of 30 candidates, totalling 90 fits
0:	learn: 0.9747182	total: 140ms	remaining: 2m 19s
100:	learn: 0.9846184	total: 794ms	remaining: 7.07s
200:	learn: 0.9915402	total: 1.43s	remaining: 5.69s
300:	learn: 0.9953057	total: 1.97s	remaining: 4.58s
400:	learn: 0.9971931	total: 2.58s	remaining: 3.85s
500:	learn: 0.9982103	total: 3.15s	remaining: 3.14s
600:	learn: 0.9990447	total: 3.68s	remaining: 2.44s
700:	learn: 0.9993729	total: 4.21s	remaining: 1.8s
800:	learn: 0.9995072	total: 4.74s	remaining: 1.18s
900:	learn: 0.9996714	total: 5.24s	remaining: 576ms
999:	learn: 0.9997013	total: 5.69s	remaining: 0us
Best parameters: {'learning_rate': 0.1, 'l2_leaf_reg': 3, 'iterations': 1000, 'depth': 8, 'border_count': 128}
Best F1 score on CV: 0.9747810460829848


In [8]:
best_cat.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    use_best_model=True,
    verbose=100
)

0:	learn: 0.9747182	test: 0.9767120	best: 0.9767120 (0)	total: 6.94ms	remaining: 6.94s
100:	learn: 0.9846184	test: 0.9797650	best: 0.9798069 (97)	total: 573ms	remaining: 5.1s
200:	learn: 0.9915402	test: 0.9821171	best: 0.9822336 (195)	total: 1.18s	remaining: 4.7s
300:	learn: 0.9953057	test: 0.9830266	best: 0.9832516 (287)	total: 1.76s	remaining: 4.09s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9832515753
bestIteration = 287

Shrink model to first 288 iterations.


<catboost.core.CatBoostClassifier at 0x2120d1fe810>

In [9]:
y_pred = best_cat.predict(X_val)
y_prob = best_cat.predict_proba(X_val)[:,1]

print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         0.0       0.99      0.98      0.99      8366
         1.0       0.96      0.99      0.97      4677

    accuracy                           0.98     13043
   macro avg       0.98      0.98      0.98     13043
weighted avg       0.98      0.98      0.98     13043



In [10]:
RocCurveDisplay.from_predictions(y_val, y_prob)
plt.title("CatBoost ROC Curve")
plt.savefig(figures_dir / "catboost_roc_curve.png")
plt.close()

# Precision-Recall Curve
precision, recall, thresholds = precision_recall_curve(y_val, y_prob)
plt.figure()
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('CatBoost Precision-Recall Curve')
plt.savefig(figures_dir / "catboost_precision_recall_curve.png")
plt.close()

In [11]:
importances = best_cat.get_feature_importance(prettified=True)
print(importances)

# Optional: plot
plt.figure(figsize=(10,6))
sns.barplot(x='Importances', y='Feature Id', data=importances)
plt.title("CatBoost Feature Importance")
plt.tight_layout()
plt.savefig(figures_dir / "catboost_feature_importance.png")
plt.close()

                          Feature Id  Importances
0                       credit_score    14.501480
1               total_family_members    14.455846
2               credit_score_squared    14.415757
3                     migrant_worker     7.791366
4                           owns_car     7.490728
5                     no_of_children     6.903608
6                 credit_ratio_limit     6.272294
7            default_in_last_6months     6.139444
8               yearly_debt_payments     5.412267
9                no_of_days_employed     4.965576
10  credit_score_x_credit_limit_used     4.782549
11         credit_limit_used_squared     4.324373
12              credit_limit_used(%)     2.344997
13                     prev_defaults     0.199716


In [12]:
best_cat.save_model(models_dir / "catboost_credit_default.cbm")