In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    RocCurveDisplay, roc_auc_score, precision_recall_curve, classification_report
)

In [3]:
train = pd.read_csv('../../data/processed/imputed_dataset.csv')
train.head()

Unnamed: 0,age,owns_car,owns_house,no_of_children,net_yearly_income,no_of_days_employed,occupation_type,total_family_members,migrant_worker,yearly_debt_payments,credit_limit,credit_limit_used(%),credit_score,prev_defaults,default_in_last_6months,credit_card_default
0,46.0,0.0,1.0,0.0,107934.04,612.0,17.0,1.0,1.0,33070.28,18690.93,73.0,544.0,2.0,1.0,1.0
1,29.0,0.0,1.0,0.0,109862.62,2771.0,8.0,2.0,0.0,15329.53,37745.19,52.0,857.0,0.0,0.0,0.0
2,37.0,0.0,1.0,0.0,230153.17,204.0,8.0,2.0,0.0,48416.6,41598.36,43.0,650.0,0.0,0.0,0.0
3,39.0,0.0,1.0,0.0,122325.82,11941.0,3.0,2.0,0.0,22574.36,32627.76,20.0,754.0,0.0,0.0,0.0
4,46.0,1.0,1.0,0.0,387286.0,1459.0,3.0,1.0,0.0,38282.95,52950.64,75.0,927.0,0.0,0.0,0.0


In [5]:
features = ['credit_limit_used(%)', 'credit_score', 'prev_defaults', 'default_in_last_6months']
target = 'credit_card_default'

X = train[features]
y = train[target]

# Drop any rows with NaNs in either features or target
X = X.dropna()
y = y[X.index]  # keep only corresponding target values

In [7]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [9]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [15]:
y_val_pred = model.predict(X_val)
y_val_prob = model.predict_proba(X_val)[:,1]

# Classification report
print(classification_report(y_val, y_val_pred))

# Confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))

# ROC-AUC
roc_auc = roc_auc_score(y_val, y_val_prob)
print(f"ROC-AUC Score: {roc_auc:.3f}")

              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99      8367
         1.0       0.96      0.78      0.86       739

    accuracy                           0.98      9106
   macro avg       0.97      0.89      0.93      9106
weighted avg       0.98      0.98      0.98      9106

Confusion Matrix:
[[8342   25]
 [ 161  578]]
ROC-AUC Score: 0.994


In [21]:
figures_dir = Path("../../figures")
figures_dir.mkdir(parents=True, exist_ok=True)

In [27]:
# y_true and y_pred from your logistic regression
cm = confusion_matrix(y_val, y_val_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title("Logistic Regression Confusion Matrix")
plt.savefig(figures_dir / "logreg_confusion_matrix.png")
plt.close()

In [31]:
roc_disp = RocCurveDisplay.from_predictions(y_val, y_val_prob)
plt.title("ROC Curve of Logistic Regression")
plt.savefig(figures_dir / "logreg_roc_curve.png")
plt.close()

In [33]:
precision, recall, thresholds = precision_recall_curve(y_val, y_val_prob)
plt.figure()
plt.plot(recall, precision, marker='.')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.savefig(figures_dir / "logreg_precision_recall_curve.png")
plt.close()

In [35]:
coefficients = pd.DataFrame({
    "feature": X.columns,
    "coefficient": model.coef_[0]
}).sort_values(by="coefficient", key=abs, ascending=False)

plt.figure(figsize=(8,6))
sns.barplot(data=coefficients, x="coefficient", y="feature", palette="viridis")
plt.title("Feature Importance (Logistic Regression Coefficients)")
plt.tight_layout()
plt.savefig(figures_dir / "feature_coefficients.png")
plt.close()


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=coefficients, x="coefficient", y="feature", palette="viridis")
