In [None]:
#############################################################
# Artificial Intelligence Workshop RUG
# -----------------------------------------------------------
# R.M. (Rolando) Gonzales Martinez
# -----------------------------------------------------------
# ~~~~~~~ Credit scoring model with Machine Learning ~~~~~~~~
#       confusion matrix, elastic net regularization
#############################################################
import pandas as pd

# Load the bank-loan dataset from the Excel workbook (path is relative to the
# notebook's working directory).
df = pd.read_excel("bankloans.xlsx")

# Preview the first rows. A bare trailing expression is rendered with the
# notebook's rich table display instead of the plain-text dump that print() gives.
df.head()
# age: Age in years
# education: Level of education, (1) did not complete high school, (2) high school degree, (3) some college, (4) college degree, (5) postundergraduate degree
# employears: Years with current employer
# address: Years at current address
# salary: salary in thousands
# creddebt: Credit card debt in thousands
# othdebt: Other debt in thousands
# default: credit default

In [None]:
# ---------------------------------------------------
#  Machine learning logic: train and test partition
# ---------------------------------------------------
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

# One-hot encode the education level; drop_first=True leaves the first level
# out so it acts as the reference category
df_encoded = pd.get_dummies(df, columns=['education'], drop_first=True)

# Target is the 'default' column; every other column is used as a predictor
y = df_encoded['default']
X = df_encoded.drop(columns=['default'])

# Hold out 30% of the rows for testing (seed fixed so the partition is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Estimation frame = training predictors plus the target, and a formula of the
# form "default ~ x1 + x2 + ..." built from the predictor column names
train_data = X_train.assign(default=y_train)
formula = "default ~ " + " + ".join(X_train.columns)

# Fit the logit model on the training rows only
logit_model = smf.logit(formula=formula, data=train_data).fit()

# Score the hold-out rows: predicted probabilities and a 0.5 cut-off classification
y_pred_prob = logit_model.predict(X_test)
y_pred_class = (y_pred_prob > 0.5).astype(int)

# Discrimination on the test set: AUC and the points of the ROC curve
auc_score = roc_auc_score(y_test, y_pred_prob)
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# ROC curve, with the 45-degree line as the no-discrimination reference
roc_fig, roc_ax = plt.subplots()
roc_ax.plot(fpr, tpr, label=f"ROC Curve (AUC = {auc_score:.2f})")
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.set_xlabel("False Positive Rate")
roc_ax.set_ylabel("True Positive Rate")
roc_ax.set_title("ROC Curve for Credit Default Prediction")
roc_ax.legend(loc="lower right")
roc_ax.grid(True)
plt.show()

# Report AUC and Gini (Gini = 2 * AUC - 1), both rounded to 4 decimals
for metric_label, metric_value in (('AUC: ', auc_score), ('Gini: ', 2 * auc_score - 1)):
    print([metric_label, round(metric_value, 4)])

In [None]:
# ---------------------------------------------------
#   confusion matrix of the ML credit scoring model
# ---------------------------------------------------
import numpy as np
from sklearn.metrics import confusion_matrix

# Youden's J (TPR - FPR) for every candidate cut-off on the ROC curve;
# the cut-off with the largest J is used as the classification threshold
youden_j = tpr - fpr
optimal_index = youden_j.argmax()
optimal_threshold, optimal_tpr, optimal_fpr = (
    roc_values[optimal_index] for roc_values in (thresholds, tpr, fpr)
)

# Test-set frame: predictors, observed default, predicted probability and the
# class obtained by applying the Youden-optimal threshold
test_df = X_test.assign(
    default=y_test,
    probab_score=logit_model.predict(X_test),
)
test_df["predicted_default"] = test_df["probab_score"].ge(optimal_threshold).astype(int)

# Confusion matrix computed on the test set only (observed vs. predicted class)
conf_matrix_test = confusion_matrix(test_df["default"], test_df["predicted_default"])

# Wrap the raw counts in a labelled DataFrame for display
actual_labels = ["Actual Non-Default", "Actual Default"]
predicted_labels = ["Predicted Non-Default", "Predicted Default"]
conf_matrix_test_df = pd.DataFrame(
    conf_matrix_test, index=actual_labels, columns=predicted_labels
)

conf_matrix_test_df

In [None]:
# ---------------------------------------------------
#   ML credit scoring model based on Elastic Nets
# ---------------------------------------------------
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Split the raw (unscaled) predictors first. Using the same test_size and
# random_state as the plain logit model keeps both models on the same
# train/test partition.
X_train_EN, X_test_EN, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize features. The scaler is fitted on the training rows only, so no
# test-set information (means / variances) leaks into the fitted model; the
# test rows are then transformed with the training statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_EN)
X_test_scaled = scaler.transform(X_test_EN)

# Full standardized matrix (training statistics), kept under its original name
# in case other cells rely on it.
X_scaled = scaler.transform(X)

# Fit logistic regression with an elastic-net penalty: l1_ratio=0.5 mixes the
# L1 (lasso) and L2 (ridge) penalties in equal parts.
logit_EN_model = LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, random_state=666, max_iter=10000)
logit_EN_model.fit(X_train_scaled, y_train)

# Predict probabilities (column 1 = positive class) and 0.5 cut-off classes
y_pred_prob_EN = logit_EN_model.predict_proba(X_test_scaled)[:, 1]
y_pred_class_EN = (y_pred_prob_EN > 0.5).astype(int)

# Evaluate model performance
auc_score_EN = roc_auc_score(y_test, y_pred_prob_EN)

# ROC Curve
fpr_EN, tpr_EN, thresholds_EN = roc_curve(y_test, y_pred_prob_EN)
plt.figure()
plt.plot(fpr_EN, tpr_EN, label=f"ROC Curve (AUC = {auc_score_EN:.2f})")
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
# Title names the penalty actually used (elastic net, not pure L1)
plt.title("ROC Curve for Elastic-Net Logistic Regression")
plt.legend(loc="lower right")
plt.grid(True)
plt.show()

# Print AUC and Gini (Gini = 2 * AUC - 1), rounded to 4 decimals
print(['AUC: ', round(auc_score_EN,4)])
print(['Gini: ', round(2 * auc_score_EN - 1,4)])

# Compute Youden's J statistic for each threshold of THIS model's ROC curve.
# All three look-ups must use the elastic-net arrays and the elastic-net index;
# mixing in the plain-logit `optimal_index` / `tpr` / `fpr` picks a wrong
# cut-off (or raises IndexError when the two ROC curves differ in length).
youden_j_EN = tpr_EN - fpr_EN
optimal_index_EN = np.argmax(youden_j_EN)
optimal_threshold_EN = thresholds_EN[optimal_index_EN]
optimal_tpr_EN = tpr_EN[optimal_index_EN]
optimal_fpr_EN = fpr_EN[optimal_index_EN]

# Reconstruct X_test_scaled (a NumPy array) into a DataFrame with the predictor names
test_df_EN = pd.DataFrame(X_test_scaled, columns=X.columns)

# The frame has a fresh positional index, so assign the target by position
# (.values) rather than by index label
test_df_EN["default"] = y_test.values
test_df_EN["probab_score"] = y_pred_prob_EN
test_df_EN["predicted_default"] = (test_df_EN["probab_score"] >= optimal_threshold_EN).astype(int)

# Create a confusion matrix based only on the test set
conf_matrix_test_EN = confusion_matrix(test_df_EN["default"], test_df_EN["predicted_default"])

# Format confusion matrix as DataFrame
conf_matrix_test_df_EN = pd.DataFrame(conf_matrix_test_EN,
                                   index=["Actual Non-Default", "Actual Default"],
                                   columns=["Predicted Non-Default", "Predicted Default"])

conf_matrix_test_df_EN