In [None]:
'''
 -----------------------------------------------------------
          Artificial Intelligence Workshop RUG
 -----------------------------------------------------------
            R.M. (Rolando) Gonzales Martinez
 -----------------------------------------------------------
 ~~~~~~~ Credit scoring model with Machine Learning ~~~~~~~~
         Comparing models to predict credit default
      XGBoost, random forests, SVM, logistic regression
 -----------------------------------------------------------
'''
import pandas as pd
# Columns of the bank-loan data set:
#   age        - age in years
#   education  - level of education:
#                (1) did not complete high school, (2) high school degree,
#                (3) some college, (4) college degree,
#                (5) post-undergraduate degree
#   employears - years with current employer
#   address    - years at current address
#   salary     - salary in thousands
#   creddebt   - credit card debt in thousands
#   othdebt    - other debt in thousands
#   default    - credit default (the target variable)

# Read the workbook from the working directory and preview the first rows.
df = pd.read_excel(io="bankloans.xlsx")
print(df.head(n=5))

In [None]:
# Comparing models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_curve, auc
import matplotlib.pyplot as plt

# Feature selection: numeric predictors are used as-is, while the education
# level is treated as a category and expanded into dummy columns.
num_cols = ['age', 'employears', 'salary', 'creddebt']
cat_cols = ['education']

# Dummy-code the categorical column; drop='first' omits the first category's
# column so the remaining dummies are measured against it.
encoder = OneHotEncoder(drop='first', sparse_output=False)
edu_matrix = encoder.fit_transform(df[cat_cols])
edu_names = encoder.get_feature_names_out(cat_cols)
edu_encoded = pd.DataFrame(edu_matrix, columns=edu_names)

# Reset the numeric part to a 0..n-1 index so it lines up row-by-row with the
# freshly built dummy frame before the two are placed side by side.
numeric_part = df[num_cols].reset_index(drop=True)
X = pd.concat([numeric_part, edu_encoded], axis=1)
y = df['default']

# Hold out 30% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Define models
# Candidate classifiers, keyed by the display name used in the metrics table,
# the ROC legend and the confusion-matrix titles.
# Each stochastic estimator is seeded with 42 -- the same seed as the
# train/test split -- so repeated runs give the same comparison.
# NOTE(review): the features are not standardized before fitting; logistic
# regression and the RBF SVM are scale-sensitive -- confirm whether scaling
# is intended for this exercise.
models = {
    'Logistic Regression': LogisticRegression(solver='liblinear'),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    # probability=True is needed because the evaluation loops call
    # predict_proba on every model.
    'SVM (RBF kernel)': SVC(probability=True, random_state=42),
    # verbosity=0 keeps XGBoost silent during training.
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42, verbosity=0),
}

# Fit every candidate on the training partition and score it on the
# held-out partition; one summary row is collected per model.
metrics_list = []
for name, model in models.items():
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the score of the positive class and feeds
    # the AUC; the hard labels from predict feed the remaining metrics.
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = model.predict(X_test)

    row = {
        'Model': name,
        'AUC': roc_auc_score(y_test, y_prob),
        'Accuracy': accuracy_score(y_test, y_pred),
    }
    # zero_division=0 reports 0 (instead of warning) when a model predicts
    # no positives at all.
    for label, scorer in (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 Score', f1_score),
    ):
        row[label] = scorer(y_test, y_pred, zero_division=0)
    metrics_list.append(row)

# One row per model, one column per metric.
metrics_df = pd.DataFrame(data=metrics_list)

print(metrics_df)

# ROC curves
# The estimators were already trained in the metrics loop above, so they are
# reused here instead of being fitted a second time. This avoids doubling the
# training cost and guarantees the curves describe exactly the same fitted
# models as the metrics table.
plt.figure(figsize=(8, 6))
for name, model in models.items():
    # Positive-class scores drive the ROC curve.
    probs = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, probs)
    model_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=f'{name} (AUC = {model_auc:.2f})')

# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], linestyle='--', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Credit Default Models')
plt.legend()
plt.show()

# Per-model diagnostics: a confusion-matrix heat map followed by the text
# classification report, both computed on the held-out partition.
tick_positions = [0, 1]
class_names = ['No Default (0)', 'Default (1)']

for name, model in models.items():
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)

    # Draw the matrix as an image: rows are true labels, columns predictions.
    plt.figure()
    plt.imshow(cm, interpolation='nearest')
    plt.title(f'Confusion Matrix: {name}')
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.xticks(tick_positions, class_names)
    plt.yticks(tick_positions, class_names)

    # Write each cell's count at its centre (x = column, y = row).
    for i, row in enumerate(cm):
        for j, count in enumerate(row):
            plt.text(j, i, count, ha='center', va='center')
    plt.tight_layout()
    plt.show()

    # Precision / recall / F1 per class, printed under the figure.
    print(f'Classification Report for {name}:\n')
    print(classification_report(y_test, y_pred, zero_division=0))

