In [None]:
'''
 -----------------------------------------------------------
          Artificial Intelligence Workshop RUG
 -----------------------------------------------------------
            R.M. (Rolando) Gonzales Martinez
 -----------------------------------------------------------
 ~~~~~~~ Credit scoring model with Machine Learning ~~~~~~~~
    support vector machines with k-fold cross-validation
'''
import pandas as pd

# 1. Load data
# Path of the Excel workbook holding the credit data
DATA_FILE = "b.xlsx"  # <-------------------- fill here
df = pd.read_excel(DATA_FILE)
# Quick look at the first rows to check that the file was read as expected
print(df.head())
# Data dictionary:
#   age        - Age in years
#   education  - Level of education: (1) did not complete high school,
#                (2) high school degree, (3) some college, (4) college degree,
#                (5) postundergraduate degree
#   employears - Years with current employer
#   address    - Years at current address
#   salary     - Salary in thousands
#   creddebt   - Credit card debt in thousands
#   othdebt    - Other debt in thousands
#   default    - Credit default

In [None]:
# Support vector machines with cross-validation
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC

# 2. Preprocessing
# Target: the 'default' column cast to int
y = df['default'].astype(int)
# Features made available to both models
X = df[['age', 'employears', 'salary', 'creddebt', 'education']]
# Continuous features: standardized
num_cols = ['age', 'employears', 'salary', 'creddebt']
# 'education' is a level code, so it is one-hot encoded instead of scaled.
# Leaving this list empty makes the ColumnTransformer silently drop the column
# (remainder='drop' is its default) even though it is selected in X above.
cat_cols = ['education']
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    # handle_unknown='ignore' keeps cross-validation from failing when a rare
    # education level is missing from a training fold; combining it with
    # drop='first' requires scikit-learn >= 1.0.
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_cols)
])

# 3. Define pipelines
# In SVM:
#     C is the regularization parameter that controls the trade-off between
#     maximizing the margin and minimizing classification errors.
#     Small C → Wider margin, allows more misclassifications
#               (stronger regularization, simpler model).
#     Large C → Narrower margin, penalizes misclassifications more
#               (weaker regularization, fits training data tightly, risk of overfitting).
#     C interacts with gamma (for RBF/poly/sigmoid kernels):
#         High C + High gamma → Very complex boundaries (overfitting risk).
#         Low C + Low gamma → Smoother boundaries (underfitting risk).
#     If kernel='linear', C is the main tunable parameter.
#
# Each pipeline receives its own clone of the preprocessor: sharing a single
# instance would let fitting one pipeline overwrite the fitted scaler/encoder
# state that the other pipeline relies on.
svm_pipeline = Pipeline([
    ('preproc', clone(preprocessor)),
    # kernel='rbf' and C=1.0 are the scikit-learn defaults; they replace the
    # empty workshop placeholders so the cell runs as-is.
    ('svm', SVC(kernel='rbf', C=1.0, gamma='scale', probability=True))  # <-------------------- tune kernel / C here
])
logit_pipeline = Pipeline([
    ('preproc', clone(preprocessor)),
    ('logit', LogisticRegression(solver='lbfgs', C=1.0, max_iter=1000))
])

# (label, estimator) pairs; the label is used in result tables and plot legends
models = [
    ('SVM', svm_pipeline),
    ('Logistic', logit_pipeline)
]

# 4. Cross-validation setup
# Stratified folds keep the class proportions of y in every split. With
# shuffle=True, a fixed integer random_state makes every call to cv.split()
# return the same folds, so results are reproducible.
# n_splits=5 and random_state=42 replace the empty workshop placeholders so the
# cell runs as-is.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)  # <-------------------- adjust here
# Metric name -> scikit-learn scorer string; cross_validate reports each one
# under the key 'test_<name>'.
scoring = {
    'accuracy': 'accuracy',
    'roc_auc': 'roc_auc',
    'precision_macro': 'precision_macro',
    'recall_macro': 'recall_macro',
    'f1_macro': 'f1_macro'
}

# 5. Run cross-validation
# Table column label -> key under which cross_validate stores the per-fold
# test scores for that metric. Insertion order defines the column order.
metric_columns = {
    'Acc (mean ± std)': 'test_accuracy',
    'AUC (mean ± std)': 'test_roc_auc',
    'Precision (mean ± std)': 'test_precision_macro',
    'Recall (mean ± std)': 'test_recall_macro',
    'F1 (mean ± std)': 'test_f1_macro',
}

results = []
for name, pipeline in models:
    cv_res = cross_validate(pipeline, X, y, cv=cv, scoring=scoring, return_train_score=False)
    # One summary row per model: every metric rendered as "mean ± std" over the folds
    row = {'Model': name}
    for column, score_key in metric_columns.items():
        fold_scores = cv_res[score_key]
        row[column] = f"{fold_scores.mean():.3f} ± {fold_scores.std():.3f}"
    results.append(row)

df_cv_comparison = pd.DataFrame(results)

# Bare expression: displayed as the cell output in the notebook
df_cv_comparison

In [None]:
# Figure
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score
# Common false-positive-rate grid: every fold's ROC curve is interpolated onto
# it so the curves can be averaged point by point.
mean_fpr = np.linspace(0, 1, 100)

plt.figure()
for name, pipeline in models:
    tprs, aucs = [], []
    for train_idx, test_idx in cv.split(X, y):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]
        # Refit on the training fold, then take column 1 of predict_proba for
        # the held-out fold (presumably the probability of default == 1)
        y_prob = pipeline.fit(X_train, y_train).predict_proba(X_test)[:, 1]
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0  # force the curve to start at the origin
        tprs.append(interp_tpr)
        aucs.append(roc_auc_score(y_test, y_prob))

    # Point-wise mean and spread of the fold curves
    tpr_by_fold = np.vstack(tprs)
    mean_tpr = tpr_by_fold.mean(axis=0)
    mean_tpr[-1] = 1.0  # force the mean curve to end at (1, 1)
    std_tpr = tpr_by_fold.std(axis=0)
    mean_auc, std_auc = np.mean(aucs), np.std(aucs)

    # Mean ROC curve with a ±1 standard deviation band clipped to [0, 1]
    lower_band = np.clip(mean_tpr - std_tpr, 0, 1)
    upper_band = np.clip(mean_tpr + std_tpr, 0, 1)
    plt.plot(mean_fpr, mean_tpr, label=f'{name} (AUC = {mean_auc:.3f} ± {std_auc:.3f})')
    plt.fill_between(mean_fpr, lower_band, upper_band, alpha=0.2)

# Diagonal reference line
plt.plot([0, 1], [0, 1], linestyle='--', label='Baseline')

plt.title('Cross-Validated ROC Curves with ±1 STD')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()