In [1]:
## Classification using Neural Networks
 # Neural Networks usse 1+ hidden layer of multiple units to perform complex function approximation, which is said to have high model capacity.
  # B/c of large # of hidden units, neural networks (nn) have many weights or parameters, which often leads to over-fitting & limits generalization.
  # Finding optimal hyperparameter when fitting nn is essential for good performance.
  # Another issue is computational complexity, requiring many optimization iterations, w/ each optimization iteration requiring update of a large # of parameters.

# Import the packages to be used

from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
# from statsmodels.api import datasets
from sklearn import datasets      # Get dataset from sklearn
import sklearn.model_selection as ms
import sklearn.metrics as sklm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import numpy.random as nr

%matplotlib inline

In [2]:
# Load data previously prepared in data preparation step.

Features = np.array(pd.read_csv('Credit_Features.csv'))
Labels = np.array(pd.read_csv('Credit_Labels.csv'))
Labels = Labels.reshape(Labels.shape[0],)
print(Features.shape)
print(Labels.shape)

(1000, 35)
(1000,)


In [3]:
#Above- There are 1000 cases w/ 35 features & 1 label. Numeric features wer Zscore scaled so they are 0 centered (mean removed) & unit variance (divide by std dev).

#Below- Neural network is known to be problematic when there is a significant class imbalance
 # Unfortunately, nn have no method for weighting classes. Alternatives:
 # 1) Impute new values using statistical alogrithm.
 # 2) Undersample majority of cases. For this method, a # of cases = to minority case are Bernoulli sampled from majority case.
 # 3) Oversample minority cases. For this method, a # of minority cases are resampled until = # of majority cases.
    
# Oversample the minority cases (bad credit customers).
 # Create data set w/ balanced cases.
    
temp_Labels = Labels[Labels == 1] 
temp_Features = Features[Labels == 1,:]
temp_Features = np.concatenate((Features, temp_Features), axis = 0)
temp_Labels = np.concatenate((Labels, temp_Labels), axis = 0) 

print(temp_Features.shape)
print(temp_Labels.shape)

(1300, 35)
(1300,)


In [4]:
#Above- Now w/ 1300 cases overall.

#Below- Perform model selection w/ nested cross validation (ncv) for optimal hyperparameters & model selection.
 # Compute inner loop to find optimal learning rate parameter w/ 3 fold cv since training nn is computationally intensive.
   # Additional folds would give better estimates but at cost of greater computation time.

#Define inside & outer folds
nr.seed(123)
inside = ms.KFold(n_splits=3, shuffle = True)
nr.seed(321)
outside = ms.KFold(n_splits=3, shuffle = True)

In [5]:
# Estimate optimal hyperparameters using 3 fold cv.
 # In interest of computational efficiency, values for only 4 parameters will be searched.
  #1) Grid of 4 hyperparameters:
  #   A) alpha- is l2 regularization hyperparameter
  #   B) early_stopping- determine when training metric becomes worse following an iteration of optimization algorithm stop training at previous iteration.
  #       This is powerful method to prevent over-fitting or ML models in general & nn in particular.
  #   C) beta_1 & beta_2- hyperparameters that control adaptive learning rate used by Adam optimizer.
  #2) Model is fit on grid
  #3) Best estimated hyperparameters are displayed

# Code below searches over a 3x3x3x2 or 54 element grid using 3 fold cv, requiring model to be trained 162 times.
 # Execution will take some time.
    

param_grid = {"alpha":[0.0000001,0.000001,0.00001],   # Define dictionary for grid search & model object to search on
              "early_stopping":[True, False], 
              "beta_1":[0.95,0.90,0.80], 
              "beta_2":[0.999,0.9,0.8]}

nn_clf = MLPClassifier(hidden_layer_sizes = (100,100),    # Define Neural Network model
                       max_iter=300)

nr.seed(3456)
nn_clf = ms.GridSearchCV(estimator = nn_clf, param_grid = param_grid,   # Perform grid search over parameters
                      cv = inside,                            # Use inside folds
                      scoring = 'recall',
                      return_train_score = True)

nr.seed(6677)
nn_clf.fit(temp_Features, temp_Labels)
print(nn_clf.best_estimator_.alpha)
print(nn_clf.best_estimator_.early_stopping)
print(nn_clf.best_estimator_.beta_1)
print(nn_clf.best_estimator_.beta_2)

1e-05
False
0.8
0.999


In [6]:
#Above- Estimated optimal learning rate parameters are alpha=1e-05, early_stopping=False, beta_1=0.8, beta_2=0.999.

#Below- Perform outer cv of mode, which will take time.

nr.seed(498)
cv_estimate = ms.cross_val_score(nn_clf, temp_Features, temp_Labels, cv = outside)     # Use outside folds
                                 

print('Mean Performance Metric = %4.3f' % np.mean(cv_estimate))
print('SDT of the Metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by CV Fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d    %4.3f' % (i+1, x))

Mean Performance Metric = 0.866
SDT of the Metric       = 0.006
Outcomes by CV Fold
Fold  1    0.865
Fold  2    0.874
Fold  3    0.859


In [7]:
#Above- Std dev of mean of Recall is an order of magnitude less than mean itself, indicating that this model is likely to generalize well, but level of performance is unclear.

#Below- Build, train & evaluate model w/ balanced cases & estimated optimal hyperparameters.
  # Create Bernoulli sampled test & training subsets
  # Oversample minority case for training data subset

nr.seed(1115)
indx = range(Features.shape[0])    # Randomly sample cases to create independent training & test data
indx = ms.train_test_split(indx, test_size = 300)
x_train = Features[indx[0],:]
y_train = np.ravel(Labels[indx[0]])
x_test = Features[indx[1],:]
y_test = np.ravel(Labels[indx[1]])

y_temp = y_train[y_train == 1]     # Oversample minority case for training data
x_temp = x_train[y_train == 1,:]
x_train = np.concatenate((x_train, x_temp), axis = 0)
y_train = np.concatenate((y_train, y_temp), axis = 0) 

In [8]:
# Define neural network model object w/ optimal hyperparmeters & fits model to training data subset

nr.seed(1115)
nn_mod = MLPClassifier(hidden_layer_sizes = (100,100), 
                       alpha = nn_clf.best_estimator_.alpha, 
                       early_stopping = nn_clf.best_estimator_.early_stopping, 
                       beta_1 = nn_clf.best_estimator_.beta_1, 
                       beta_2 = nn_clf.best_estimator_.beta_2,
                       max_iter = 300)
nn_mod.fit(x_train, y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.8,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=300, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [9]:
#Above- Hyperparameters of nn model object are as expected. 

#Below- Score & evaluate the test model

def score_model(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:,1]])

def print_metrics(labels, probs, threshold):
    scores = score_model(probs, threshold)
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion Matrix')
    print('                 Score Positive    Score Negative')
    print('Actual Positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual Negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy        %0.2f' % sklm.accuracy_score(labels, scores))
    print('AUC             %0.2f' % sklm.roc_auc_score(labels, probs[:,1]))
    print('Macro Precision %0.2f' % float((float(metrics[0][0]) + float(metrics[0][1]))/2.0))
    print('Macro Recall    %0.2f' % float((float(metrics[1][0]) + float(metrics[1][1]))/2.0))
    print(' ')
    print('           Positive      Negative')
    print('Num Case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])
    
probabilities = nn_mod.predict_proba(x_test)
print_metrics(y_test, probabilities, 0.5)     

                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive       169                43
Actual Negative        43                45

Accuracy        0.71
AUC             0.72
Macro Precision 0.65
Macro Recall    0.65
 
           Positive      Negative
Num Case      212            88
Precision    0.80          0.51
Recall       0.80          0.51
F1           0.80          0.51


In [None]:
#Above- Performance of the nn above is less than ideal and is worse than if only beta_1 & beta_2 were searched.
  # Negative (bad credit) case recall is adequate, but precision is poor.
  # Perhaps oversampling does not help.

## Summary- 
  # Used 3 fold to find estimated optimal hyperparameters for neural network (nn) model to classify credit risk cases.
    # Oversampling of minority case for training data was required to deal w/ class imbalance, & result achieved were marginal at best.
    # Perhaps model w/ greater capacity would achieve better results, or different approach to deal w/ class imbalance would be more successful.