In [1]:
## Bootstrap Aggregation (Bagging) is a widely used ensemble method
    # Ensemble methods achieve better performance of weak machine learning algorithms (weak learners) by aggregating results of many statisically independent models.
    # This process averages out errors & produces a final better prediction
    # Simple procedure consists of:
      # 1) N learners (machine learing models) are defined
      # 2) N subsamples of the training data are created by Bernoulli sampling w/ replacement
      # 3) N learners are trained on subsamples of training data
      # 4) Ensemble is scored by averaging, or taking amjority vote, of predictions from N learners
    # Claissification & regression tree models typically used w/ bagging, such as Random Forest.
     # Random forest (rf) is highly scalable & generally produces good results, even for complex problems
     # Tend to be robust to noise or outliers in training data, & true for rf as well.

    
#Import the packages to be used

from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
#from statsmodels.api import datasets
from sklearn import datasets ## Get dataset from sklearn
import sklearn.model_selection as ms
import sklearn.metrics as sklm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import numpy.random as nr

%matplotlib inline

In [2]:
# Load data previously prepared in data preparation step.

Features = np.array(pd.read_csv('Credit_Features.csv'))
Labels = np.array(pd.read_csv('Credit_Labels.csv'))
Labels = Labels.reshape(Labels.shape[0],)
print(Features.shape)
print(Labels.shape)

(1000, 35)
(1000,)


In [3]:
#Above- There are 1000 cases w/ 35 features & 1 label. Numeric features wer Zscore scaled so they are 0 centered (mean removed) & unit variance (divide by std dev).

#Below- Nested cross validation (cv) used to estimate optimal hyperparameters & perform model selection for rf model using 10 folds.

nr.seed(123)
inside = ms.KFold(n_splits=10, shuffle = True)
nr.seed(321)
outside = ms.KFold(n_splits=10, shuffle = True)

In [4]:
# Estimate best hyperparameters using 10 fold cv
 # 1) Grid of 2 hyperparameters are searched (intended to optimize level of regularization)
 #   A) max_Features- determines max # of features used to determine splits. 
 #         Minimizing # of features can prevent model over-fitting by induced bias.
 #   B) min_samples_leaf- determines min # of smpls/leaves which must be on terminal node of tree.
 #         Maintaining min # of smpls per terminal node (a regularization method) allows model training to memorize data if too few.
 #         Biased predictions can result if too many smpls on terminal nodes.
 # 2) "balanced" argument is used since class imbalance & differencein cost to bank for misclassification of bad credit risk customers.
 #      Each tree will have balanced case subsets for training
 # 3) Model fit on each set of hyperparameters from grid
 # 4) Best estimated hyperparameters are displayed


param_grid = {"max_features": [2, 3, 5, 10, 15], "min_samples_leaf":[3, 5, 10, 20]}    # Define dictionary for grid search & model object to search on

nr.seed(3456)
rf_clf = RandomForestClassifier(class_weight = "balanced")   # Define random forest model w/ class_weight = {0:0.33, 1:0.67}) 

nr.seed(4455)
rf_clf = ms.GridSearchCV(estimator = rf_clf, param_grid = param_grid,    # Perform grid search over parameters
                      cv = inside,                                       # Use inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)
rf_clf.fit(Features, Labels)
print(rf_clf.best_estimator_.max_features)
print(rf_clf.best_estimator_.min_samples_leaf)

3
20


In [6]:
# Perform outer cv of model

nr.seed(498)
cv_estimate = ms.cross_val_score(rf_clf, Features, Labels, 
                                 cv = outside)              # Use outside folds

print('Mean Performance Metric = %4.3f' % np.mean(cv_estimate))
print('SDT of the Metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by cv Fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d    %4.3f' % (i+1, x))

Mean Performance Metric = 0.762
SDT of the Metric       = 0.038
Outcomes by cv Fold
Fold  1    0.754
Fold  2    0.717
Fold  3    0.733
Fold  4    0.724
Fold  5    0.787
Fold  6    0.825
Fold  7    0.743
Fold  8    0.828
Fold  9    0.776
Fold 10    0.731


In [7]:
#Above- Std dev of mean of AUC is more than an order of manitude smaller than mean, indicating this model will generalize well.

#Below- Build & test model using estimated optimal hyperparameters

nr.seed(1115)   # Randomly sample cases to create independent training & test data
indx = range(Features.shape[0])
indx = ms.train_test_split(indx, test_size = 300)
x_train = Features[indx[0],:]
y_train = np.ravel(Labels[indx[0]])
x_test = Features[indx[1],:]
y_test = np.ravel(Labels[indx[1]])

In [8]:
# Define rf model object using estimated optimal hyperparameter & fit model to training data

nr.seed(1115)
rf_mod = RandomForestClassifier(class_weight = "balanced", 
                                max_features = rf_clf.best_estimator_.max_features, 
                                min_samples_leaf = rf_clf.best_estimator_.min_samples_leaf) 
rf_mod.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features=3,
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=20,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [10]:
# Score & display performance metrics for test dataset model

def score_model(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:,1]])

def print_metrics(labels, probs, threshold):
    scores = score_model(probs, threshold)
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion Matrix')
    print('                 Score Positive    Score Negative')
    print('Actual Positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual Negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy        %0.2f' % sklm.accuracy_score(labels, scores))
    print('AUC             %0.2f' % sklm.roc_auc_score(labels, probs[:,1]))
    print('Macro Precision %0.2f' % float((float(metrics[0][0]) + float(metrics[0][1]))/2.0))
    print('Macro Recall    %0.2f' % float((float(metrics[1][0]) + float(metrics[1][1]))/2.0))
    print(' ')
    print('           Positive      Negative')
    print('Num Case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])
    
probabilities = rf_mod.predict_proba(x_test)
print_metrics(y_test, probabilities, 0.5)     

                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive       147                65
Actual Negative        23                65

Accuracy        0.71
AUC             0.78
Macro Precision 0.68
Macro Recall    0.72
 
           Positive      Negative
Num Case      212            88
Precision    0.86          0.50
Recall       0.69          0.74
F1           0.77          0.60


In [None]:
#Above- Performance metrics look good.Large majority of negative (bad credit) cases are identified at expense of significant fp.
  # Reported AUC is w/in a std dev of AUC obtained w/ cv, indicating model is generalizing well.
    
## Summary
  # 1) Used random forest model to classify cases of iris data. 
  #    A model w/ more trees had marginally lower error rates, but likely no significant different.
  # 2) Applied feature importance  was used ofr feature selection w/ iris data.
  #    Model created & evaluated w/ reduced feature set has essentially the same performance as model w/ more features.
  # 3) Used 10 fold to find estimated optimal hyperparameters for random forest model to classify credit cases. Model appears to generalizing well.
