In [None]:
'''
 -----------------------------------------------------------
          Artificial Intelligence Workshop RUG
 -----------------------------------------------------------
            R.M. (Rolando) Gonzales Martinez
 -----------------------------------------------------------
 ~~~~~~~ Credit scoring model with Machine Learning ~~~~~~~~
 support vector machines (with RBF kernel) vs. logistic model
'''
import pandas as pd
# Data dictionary for the columns described for bankloans.xlsx:
#   age        - Age in years
#   education  - Level of education:
#                (1) did not complete high school, (2) high school degree,
#                (3) some college, (4) college degree,
#                (5) postundergraduate degree
#   employears - Years with current employer
#   address    - Years at current address
#   salary     - Salary in thousands
#   creddebt   - Credit card debt in thousands
#   othdebt    - Other debt in thousands
#   default    - Credit default

# Read the workbook from the notebook's working directory and show the
# first rows as a quick sanity check of the columns listed above.
df = pd.read_excel("bankloans.xlsx")
print(df.head())

In [None]:
# Credit scoring models vs logit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score

# 1. Data preparation
# The target is cast to int; the metrics step further down looks its classes
# up under the string keys '0' and '1'.
y = df['default'].astype(int)
X = df[['age', 'employears', 'salary', 'creddebt', 'education']]

# 2. Train-test split
# The template left test_size and random_state blank, which is a SyntaxError
# and prevents the whole cell from running. They are now editable settings
# with starting values, so the notebook runs top to bottom.
TEST_SIZE = 0.3  # <-------------------- fill here (fraction of rows held out for testing)
SEED = 42        # <-------------------- fill here (any integer; makes the split repeatable)
# stratify=y keeps the class proportions of the target the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED, stratify=y)

# 3. Preprocessing
# Numeric columns are standardized; education is one-hot encoded with its
# first level dropped.
num_cols = ['age', 'employears', 'salary', 'creddebt']
cat_cols = ['education']
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(drop='first'), cat_cols)
])
# NOTE(review): with the encoder's default settings, an education level that
# appears only in the test split makes transform raise. Confirm every level
# is present in X_train, or consider handle_unknown='ignore'.

# 4. Pipelines
'''
In SVM:
    gamma controls the "spread" of the RBF kernel and influences how much a single training example affects the decision boundary.
    * A small gamma means a larger similarity radius, leading to smoother decision boundaries (may underfit).
    * A large gamma makes the model focus more on individual data points, leading to more complex (possibly overfit) boundaries.
    C is the regularization parameter of the SVM; as in logit, the strength of the regularization is inversely proportional to C (C > 0).
    Options: kernel='poly', kernel='sigmoid', 'linear'
In logit:
    C is the inverse of regularization strength in logistic regression to prevent overfitting.
    Smaller C → Stronger regularization (more penalty on large coefficients, simpler model).
    Larger C → Weaker regularization (model can fit training data more closely, risk of overfitting).
    Default: C=1.0 (balanced regularization).
    Range: C > 0 (typically between 0.001 and 100 in practice).
'''
# These two were also blank in the template (a SyntaxError). The defaults
# follow the notebook header (RBF kernel) and the C=1.0 used for the logit.
SVM_KERNEL = 'rbf'  # <-------------------- fill here ('rbf', 'poly', 'sigmoid' or 'linear')
SVM_C = 1.0         # <-------------------- fill here (C > 0)
# Both pipelines are handed the same preprocessor object and are later fit on
# the same training data.
svm_pipeline = Pipeline([
    ('preproc', preprocessor),
    # probability=True is needed for predict_proba; random_state makes those
    # probability estimates repeatable between runs.
    ('svm', SVC(kernel=SVM_KERNEL, C=SVM_C, gamma='scale', probability=True,
                random_state=SEED))
])
logit_pipeline = Pipeline([
    ('preproc', preprocessor),
    ('logit', LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000))
])

# 5. Fit models
# SVM first, then the logistic model, both on the training split.
for pipe in (svm_pipeline, logit_pipeline):
    pipe.fit(X_train, y_train)

# 6. Predictions & probabilities
# Hard class labels for the test split ...
y_pred_svm = svm_pipeline.predict(X_test)
y_pred_logit = logit_pipeline.predict(X_test)
# ... and the second column of predict_proba as the score for the ROC/AUC.
y_prob_svm = svm_pipeline.predict_proba(X_test)[:, 1]
y_prob_logit = logit_pipeline.predict_proba(X_test)[:, 1]


# 7. Metrics
def _metrics_row(model_name, predicted, scores):
    """Return one row of the comparison table for a single model.

    model_name: label shown in the 'Model' column.
    predicted:  predicted class labels for the test split.
    scores:     predicted probabilities used for the AUC.
    """
    report = classification_report(y_test, predicted, output_dict=True)
    class0, class1 = report['0'], report['1']
    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, predicted),
        'AUC': roc_auc_score(y_test, scores),
        'Precision(0)': class0['precision'],
        'Recall(0)': class0['recall'],
        'F1(0)': class0['f1-score'],
        'Precision(1)': class1['precision'],
        'Recall(1)': class1['recall'],
        'F1(1)': class1['f1-score'],
    }


models = [
    ('SVM', y_pred_svm, y_prob_svm),
    ('Logistic', y_pred_logit, y_prob_logit),
]
results = [_metrics_row(*entry) for entry in models]

# One row per model, columns in the order the keys are listed above.
df_results = pd.DataFrame(results)

df_results

In [None]:
# 8. Plot ROC curves
# matplotlib was used here without ever being imported in the notebook, so
# this cell raised NameError on a fresh kernel. Import it where it is used.
import matplotlib.pyplot as plt

# ROC points and AUC for each model, computed from the test-split probabilities.
fpr_svm, tpr_svm, _ = roc_curve(y_test, y_prob_svm)
fpr_logit, tpr_logit, _ = roc_curve(y_test, y_prob_logit)
auc_svm = roc_auc_score(y_test, y_prob_svm)
auc_logit = roc_auc_score(y_test, y_prob_logit)

# Explicit figure/axes pair instead of the implicit pyplot state.
fig, ax = plt.subplots()
ax.plot(fpr_svm, tpr_svm, label=f'SVM (AUC = {auc_svm:.3f})')
ax.plot(fpr_logit, tpr_logit, label=f'Logistic (AUC = {auc_logit:.3f})')
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], linestyle='--', label='Baseline')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves: SVM vs Logistic Regression')
ax.legend(loc='lower right')
fig.tight_layout()
plt.show()