In [1]:
## Support Vector Machine Models (SVMs) attempts to maximally separate classes by finding support vector w/ lowest error rate or maximum separation.
  # Can use many types of kernal functions, but most common are linear & radial basis function (rbf).
     # Linear basis function (lbf) attempts to separate classes by finding hyperplanes in feature space that maximally separate classes.
     # Radial basis function (rbf) uses set of local Guassian shaped basis kernels to find nonlinear separation of classes.
  # SVMs are widely use & powerful category of ML algorithms w/ many variations.

#Import the packages to be used

from sklearn import svm, preprocessing
#from statsmodels.api import datasets
from sklearn import datasets ## Get dataset from sklearn
import sklearn.model_selection as ms
import sklearn.metrics as sklm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import numpy.random as nr

%matplotlib inline

In [2]:
# Load data previously prepared in data preparation step.

Features = np.array(pd.read_csv('Credit_Features.csv'))
Labels = np.array(pd.read_csv('Credit_Labels.csv'))
Labels = Labels.reshape(Labels.shape[0],)
print(Features.shape)
print(Labels.shape)

(1000, 35)
(1000,)


In [3]:
#Above- There are 1000 cases w/ 35 features & 1 label. Numeric features wer Zscore scaled so they are 0 centered (mean removed) & unit variance (divide by std dev).

#Below- Perform model selection w/ nested cross validation (ncv) for optimal hyperparameters & model selection.
 # Compute inner loop to find optimal learning rate parameter w/ 5 fold cv since training SVM is computationally intensive.
   # Additional folds would give better estimates but at cost of greater computation time.

# Define inside & outer folds

nr.seed(123)
inside = ms.KFold(n_splits=5, shuffle = True)
nr.seed(321)
outside = ms.KFold(n_splits=5, shuffle = True)

In [4]:
# Estimate optimal hyperparameters using 5 fold cv.
 # In interest of computational efficiency, values for only 2 parameters will be searched.
  #1) Grid of 2 hyperparameters:
  #   A) C- is inverse of lamda of l2 regularization hyperparameter
  #   B) gamma- span of RBF kernel
  #2) Class weights are used due to class imbalance & difference in cost to bank of misclassification of bad credit risk customers
  #3) Model is fit on grid & best estimated hyperparameters are displayed

nr.seed(3456)

param_grid = {"C": [1, 10, 100, 1000], "gamma":[1.0/50.0, 1.0/200.0, 1.0/500.0, 1.0/1000.0]}   # Define dictionary for grid search & model object to search on
svc_clf = svm.SVC(class_weight = {0:0.33, 1:0.67})    # Define SVM model

clf = ms.GridSearchCV(estimator = svc_clf, param_grid = param_grid,    # Perform grid search over parameters
                      cv = inside,                       # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)
clf.fit(Features, Labels)
print(clf.best_estimator_.C)
print(clf.best_estimator_.gamma)

10
0.02


In [5]:
#Above- Estimated optimal learning rate parameters are C=10, & gamma=0.02.

#Below- Perform outer cv of model.

nr.seed(498)
cv_estimate = ms.cross_val_score(clf, Features, Labels, 
                                 cv = outside)       # Use the outside folds

print('Mean Performance Metric = %4.3f' % np.mean(cv_estimate))
print('SDT of the Metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by CV Fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d    %4.3f' % (i+1, x))

Mean Performance Metric = 0.774
SDT of the Metric       = 0.016
Outcomes by CV Fold
Fold  1    0.761
Fold  2    0.758
Fold  3    0.800
Fold  4    0.768
Fold  5    0.782


In [6]:
#Above- Std dev of mean of AUC is more than an order of magnitude smaller than mean itself, indicating that this model is likely to generalize well, but level of performance is unclear.

#Below- Build, train & evaluate model w/ estimated optimal hyperparameters.
  # Create Bernoulli sampled test & training subsets

nr.seed(1115)
indx = range(Features.shape[0])    # Randomly sample cases to create independent training & test data
indx = ms.train_test_split(indx, test_size = 300)
x_train = Features[indx[0],:]
y_train = np.ravel(Labels[indx[0]])
x_test = Features[indx[1],:]
y_test = np.ravel(Labels[indx[1]])

In [7]:
# Define rbf SVM model object w/ optimal hyperparmeters & fit model to training data subset

nr.seed(1115)
svm_mod = svm.SVC(C = clf.best_estimator_.C,
                  gamma = clf.best_estimator_.gamma,
                  class_weight = {0:0.33, 1:0.67},
                  probability=True) 
svm_mod.fit(x_train, y_train)

SVC(C=10, cache_size=200, class_weight={0: 0.33, 1: 0.67}, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.02, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [8]:
#Above- Hyperparameters of rbf SVM model object are as expected. 

#Below- Score & evaluate the test model

def score_model(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:,1]])

def print_metrics(labels, probs, threshold):
    scores = score_model(probs, threshold)
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion Matrix')
    print('                 Score Positive    Score Negative')
    print('Actual Positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual Negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy        %0.2f' % sklm.accuracy_score(labels, scores))
    print('AUC             %0.2f' % sklm.roc_auc_score(labels, probs[:,1]))
    print('Macro Precision %0.2f' % float((float(metrics[0][0]) + float(metrics[0][1]))/2.0))
    print('Macro Recall    %0.2f' % float((float(metrics[1][0]) + float(metrics[1][1]))/2.0))
    print(' ')
    print('           Positive      Negative')
    print('Num Case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])
    
probabilities = svm_mod.predict_proba(x_test)
print_metrics(y_test, probabilities, 0.5)    

                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive       187                25
Actual Negative        48                40

Accuracy        0.76
AUC             0.80
Macro Precision 0.71
Macro Recall    0.67
 
           Positive      Negative
Num Case      212            88
Precision    0.80          0.62
Recall       0.88          0.45
F1           0.84          0.52


In [None]:
#Above- Performance of the rbf SVM above is acceptable.
  # Large majority of high risk customers are identified, but at cost of large # of FP & low precision for negative cases.
  # AUC= 0.80 is better than mean achieved w/ 5 fold cv.

## Summary- 
  # Used 5 fold to find estimated optimal hyperparameters for nonlinear SVM model to classify credit risk cases.
    # Model appears to generalize well.