In [1]:
##Boosting is a meta-alogrithm since method can be applied to many types of ML alogrithms.
  #Boosting iteratively improves learning of N models by giving greater weight to training cases w/ larger errors.
    # Ensemble methods achieve better performance of weak machine learning algorithms (weak learners) by aggregating results of many statisically independent models.
    # This process averages out errors & produces a final better prediction
    # Simple Boosting procedure consists of:
      # 1) N learners (machine learing models) are defined
      # 2) Each of i training data cases is given an initial equal weight of 1/i
      # 3) N learners are trained on weighted training data
      # 4) Prediction is computed based on aggregatation of learners; averaging over hypthosis of N learners
      # 5) Weights for training data cases are updated based on aggregated error made by learners. Cases w/ larger errors are given larger weights.
      # 6) Steps 3, 4, 5 are reapeated until a convergence criteria is met.  
    # Classification & regression tree models typically weak learners used w/ boosting.
     # Adaptive boosting/AdaBoost is one of most successful boosted methods that uses some large #, N, tree models.
       # The rate at which weights are updated is adaptive with the errors.
    # Boosted ML is NOT robust to significant noise or outliers in training data. 
     # Reweighting process gives greater weight to large errors, & therefore can give undue weight to outliers & errors.
     # When traing data cases are noisy, the random forest algorithm may prove to be more robust.
    
    
#Import the packages to be used

from sklearn.ensemble import AdaBoostClassifier
from sklearn import preprocessing
#from statsmodels.api import datasets
from sklearn import datasets ## Get dataset from sklearn
import sklearn.model_selection as ms
import sklearn.metrics as sklm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import numpy.random as nr

%matplotlib inline

In [2]:
# Load data previously prepared in data preparation step.

Features = np.array(pd.read_csv('Credit_Features.csv'))
Labels = np.array(pd.read_csv('Credit_Labels.csv'))
Labels = Labels.reshape(Labels.shape[0],)
print(Features.shape)
print(Labels.shape)

(1000, 35)
(1000,)


In [3]:
#Above- There are 1000 cases w/ 35 features & 1 label. Numeric features wer Zscore scaled so they are 0 centered (mean removed) & unit variance (divide by std dev).

#Below- Nested cross validation (Ncv) used to estimate optimal hyperparameters & perform model selection for AdaBoost tree model using 10 folds.

nr.seed(123)
inside = ms.KFold(n_splits=10, shuffle = True)
nr.seed(321)
outside = ms.KFold(n_splits=10, shuffle = True)

In [4]:
# Estimate best hyperparameters using 10 fold cv
 # 1) Grid of 1 hyperparameter is searched (intended to optimize level of regularization)
 #   # Learning_rate shrinks contribution of each classifer. THere is a trade-off b/w learning-rate & n_estimators.
 # 2) Class imbalance is true & difference in cost to bank for misclassification of bad credit risk customers
     # will be addressed later.
 # 3) Model fit on each set of hyperparameters from grid
 # 4) Best estimated hyperparameters are displayed


param_grid = {"learning_rate": [0.1, 1, 10]}    ## Define dictionary for grid search & model object to search on

nr.seed(3456)
ab_clf = AdaBoostClassifier()  ## Define AdaBoosted tree model


nr.seed(4455)
ab_clf = ms.GridSearchCV(estimator = ab_clf, param_grid = param_grid,    ## Perform grid search over parameters
                      cv = inside,          # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)
ab_clf.fit(Features, Labels)
print(ab_clf.best_estimator_.learning_rate)

1


In [5]:
# Perform outer cv of model

nr.seed(498)
cv_estimate = ms.cross_val_score(ab_clf, Features, Labels, 
                                 cv = outside) # Use the outside folds

print('Mean Performance Metric = %4.3f' % np.mean(cv_estimate))
print('SDT of the Metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by CV Fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d    %4.3f' % (i+1, x))

Mean Performance Metric = 0.758
SDT of the Metric       = 0.034
Outcomes by CV Fold
Fold  1    0.765
Fold  2    0.721
Fold  3    0.704
Fold  4    0.736
Fold  5    0.788
Fold  6    0.772
Fold  7    0.728
Fold  8    0.827
Fold  9    0.765
Fold 10    0.771


In [6]:
#Above- Std dev of mean of AUC is more than an order of manitude smaller than mean, indicating this model will generalize well.

#Below- Build & test model using estimated optimal hyperparameters

nr.seed(1115)   # Randomly sample cases to create independent training & test data
indx = range(Features.shape[0])
indx = ms.train_test_split(indx, test_size = 300)
x_train = Features[indx[0],:]
y_train = np.ravel(Labels[indx[0]])
x_test = Features[indx[1],:]
y_test = np.ravel(Labels[indx[1]])

In [7]:
# Define AdaBoosted tree model object using estimated optimal hyperparameter & fit model to training data

nr.seed(1115)
ab_mod = AdaBoostClassifier(learning_rate = ab_clf.best_estimator_.learning_rate) 
ab_mod.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=50, random_state=None)

In [8]:
# Score & display performance metrics for test dataset model

def score_model(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:,1]])

def print_metrics(labels, probs, threshold):
    scores = score_model(probs, threshold)
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion Matrix')
    print('                 Score Positive    Score Negative')
    print('Actual Positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual Negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy        %0.2f' % sklm.accuracy_score(labels, scores))
    print('AUC             %0.2f' % sklm.roc_auc_score(labels, probs[:,1]))
    print('Macro Precision %0.2f' % float((float(metrics[0][0]) + float(metrics[0][1]))/2.0))
    print('Macro Recall    %0.2f' % float((float(metrics[1][0]) + float(metrics[1][1]))/2.0))
    print(' ')
    print('           Positive      Negative')
    print('Num Case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])
    
probabilities = ab_mod.predict_proba(x_test)
print_metrics(y_test, probabilities, 0.5)    

                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive       176                36
Actual Negative        45                43

Accuracy        0.73
AUC             0.76
Macro Precision 0.67
Macro Recall    0.66
 
           Positive      Negative
Num Case      212            88
Precision    0.80          0.54
Recall       0.83          0.49
F1           0.81          0.51


In [9]:
#Above- Performance metrics look poor. Large majority of negative (bad credit) cases are misclassified as positive.

#Above- Performance metrics look good.Large majority of negative (bad credit) cases are identified at expense of significant fp.
  # This shows AdaBoosted method are sensitive to class imbalance.
    
#Below- Poor performance is more than likely due to class imbalance & unable to reweigh classes w/ boosting methods. Alternatives:
 # 1) Impute new values using statistical alogrithm
 # 2) Undersample majority of cases. For this method, a # of cases = to minority case are Bernoulli sampled from majority case.
 # 3) Oversampl minority cases. For this method, a # of minority cases are resampled until = # of majority cases.
    
# Undersample the majority cases (good credit customers), using choice funtion from Numpy.random package to randomize undersampling.
 # Count of unique label values & shape of resulting arrays is displayed.
 # Create data set w/ balanced cases.

temp_Labels_1 = Labels[Labels == 1]         # Save these
temp_Features_1 = Features[Labels == 1,:]    # Save these
temp_Labels_0 = Labels[Labels == 0]        # Undersample these
temp_Features_0 = Features[Labels == 0,:]    # Undersample these

indx = nr.choice(temp_Features_0.shape[0], temp_Features_1.shape[0], replace=True)

temp_Features = np.concatenate((temp_Features_1, temp_Features_0[indx,:]), axis = 0)
temp_Labels = np.concatenate((temp_Labels_1, temp_Labels_0[indx,]), axis = 0) 

print(np.bincount(temp_Labels))
print(temp_Features.shape)
print(temp_Labels.shape)

[300 300]
(600, 35)
(600,)


In [10]:
#Above- Now 300 of each label case w/ 600 cases overall.

#Below- Perform model selection again w/ ncv
 #Compute inner loop to find optimal learning rate parameter
    
nr.seed(1234)
inside = ms.KFold(n_splits=10, shuffle = True)
nr.seed(3214)
outside = ms.KFold(n_splits=10, shuffle = True)

ab_clf = AdaBoostClassifier()    # Define AdaBoosted tree model
nr.seed(3456)

nr.seed(4455)
ab_clf = ms.GridSearchCV(estimator = ab_clf, param_grid = param_grid,    # Perform grid search over parameters
                      cv = inside,                     # Use the inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)
ab_clf.fit(temp_Features, temp_Labels)
print(ab_clf.best_estimator_.learning_rate)

0.1


In [11]:
#Above- Estimated optimal learning rate parameter is small (0.1) than before (1).

#Below- Perform outer cv of model

nr.seed(498)
cv_estimate = ms.cross_val_score(ab_clf, Features, Labels, 
                                 cv = outside) # Use the outside folds

print('Mean Performance Metric = %4.3f' % np.mean(cv_estimate))
print('SDT of the Metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by CV Fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d    %4.3f' % (i+1, x))

Mean Performance Metric = 0.758
SDT of the Metric       = 0.034
Outcomes by CV Fold
Fold  1    0.765
Fold  2    0.721
Fold  3    0.704
Fold  4    0.736
Fold  5    0.788
Fold  6    0.772
Fold  7    0.728
Fold  8    0.827
Fold  9    0.765
Fold 10    0.771


In [12]:
#Above- Average AUC is improved compared to imbalanced training cases. Differences are w/in 1 std dev. Still reasonable chance represent improvement.

#Below- Train & evaluate model w/ balanced cases & updated hyperparameter.
  # Create Bernoulli sampled test & training subsets
  # Define AdaBoosted model
  # Train AdaBoosted model
    

nr.seed(1115)
indx = range(Features.shape[0])   # Randomly sample cases to create independent training & test data
indx = ms.train_test_split(indx, test_size = 300)
x_train = Features[indx[0],:]
y_train = np.ravel(Labels[indx[0]])
x_test = Features[indx[1],:]
y_test = np.ravel(Labels[indx[1]])

# Undersample majority case for training data
temp_Labels_1 = y_train[y_train == 1]      # Save these
temp_Features_1 = x_train[y_train == 1,:]      # Save these
temp_Labels_0 = y_train[y_train == 0]       # Undersample these
temp_Features_0 = x_train[y_train == 0,:]     # Undersample these

indx = nr.choice(temp_Features_0.shape[0], temp_Features_1.shape[0], replace=True)

x_train = np.concatenate((temp_Features_1, temp_Features_0[indx,:]), axis = 0)
y_train = np.concatenate((temp_Labels_1, temp_Labels_0[indx,]), axis = 0) 

print(np.bincount(y_train))
print(x_train.shape)
print(y_train.shape)

[212 212]
(424, 35)
(424,)


In [13]:
# Define & fit the model
nr.seed(1115)
ab_mod = AdaBoostClassifier(learning_rate = ab_clf.best_estimator_.learning_rate) 
ab_mod.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.1, n_estimators=50, random_state=None)

In [14]:
# Score & evaluate the model

probabilities = ab_mod.predict_proba(x_test)
print_metrics(y_test, probabilities, 0.5)    

                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive       146                66
Actual Negative        19                69

Accuracy        0.72
AUC             0.79
Macro Precision 0.70
Macro Recall    0.74
 
           Positive      Negative
Num Case      212            88
Precision    0.88          0.51
Recall       0.69          0.78
F1           0.77          0.62


In [None]:
#Above- Results are significantly better than previously obtained imbalanced training data in classifying negative cases.
  # AUC is more than 1 std dev away from ncv AUC.
    
## Summary
  # 1) Used 10 fold to find estimated optimal hyperparameters for AdaBoosted tree model to classify credit risk cases. 
      # Model did not generalize well due to class imbalance.
  # 2) Applied undersampling of majority cases to create balanced training dataset & retrained & evaluated model.
      # Model created w/ balanced training data was significantly better.
