In [1]:
## Naive Bayes (nb) Models make the naive assumption of statisitcal independence of features
  # Surprisingly effective despite violating assumption of indenpendence. A useful & effective simplification of general Bayesian models.
  # Use empirical distributions of features to compute probabilities of labels & can use most any family of distributions for features.
  # Important to select correct distribution family for data you are working with.
    # Gaussian- for continuous or numerical features
    # Bernoulli- features w/ binary features
    # Multinomial- feature w/ more than 2 categories
  # Pitfall- model fails if 0 probability is encountered, which occurs when there's a 'hole' in smpl space where there are no smpls.
    # Simple smoothing hyperparameter, alpha, can deal w/ this problem & is one of few required for nb models. 
  # Computational complexity is linear in # of parameters/features, making nb highly scalable. There are out or core approaches suitable for masive datasets.
  # Requires minimal data to produce models that generalizes well.
    # If only a few cases per category to train model, a nb model can be a good choice.
  # Have simple & inherent regularization
  # NB used in:
    # document classification
    # SPAM detection
    # Image classification
    
#Import the packages to be used

from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB, BernoulliNB
#from statsmodels.api import datasets
from sklearn import datasets ## Get dataset from sklearn
import sklearn.model_selection as ms
import sklearn.metrics as sklm
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import numpy.random as nr

%matplotlib inline

In [2]:
# Load data previously prepared in data preparation step.

Features = np.array(pd.read_csv('Credit_Features.csv'))
Labels = np.array(pd.read_csv('Credit_Labels.csv'))
Labels = Labels.reshape(Labels.shape[0],)
print(Features.shape)
print(Labels.shape)

(1000, 35)
(1000,)


In [3]:
#Above- There are 1000 cases w/ 35 features & 1 label. Numeric features wer Zscore scaled so they are 0 centered (mean removed) & unit variance (divide by std dev).

#Below- Features array has both numeric & binary features (dummy variables for categorical features)
  # Therefore, Gaussian model must be used, however it's not ideal since numeric features mixed w/ features exhibiting Bernoulli distributions (binary features)
  

nr.seed(321)
cv_folds = ms.KFold(n_splits=10, shuffle = True)  #Define 10 fold cv object
    
nr.seed(498)
NB_credit = GaussianNB()      #Define Gaussian naive Bayes model
cv_estimate = ms.cross_val_score(NB_credit, Features, Labels,     #Performs 10 fold cv
                                 cv = cv_folds)       # Use the outside folds

print('Mean Performance Metric = %4.3f' % np.mean(cv_estimate))
print('SDT of the Metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by CV Fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d    %4.3f' % (i+1, x))    # Display cv results

Mean Performance Metric = 0.691
SDT of the Metric       = 0.093
Outcomes by CV Fold
Fold  1    0.720
Fold  2    0.660
Fold  3    0.770
Fold  4    0.690
Fold  5    0.750
Fold  6    0.430
Fold  7    0.690
Fold  8    0.760
Fold  9    0.710
Fold 10    0.730


In [4]:
#Above- Std dev of mean of AUC is more than an order of magnitude smaller than mean itself, indicating that this model is likely to generalize well, but level of performance is unclear.

#Below- Build, train & evaluate model w/ estimated optimal hyperparameters.
  # Create Bernoulli sampled test & training subsets

nr.seed(1115)
indx = range(Features.shape[0])
indx = ms.train_test_split(indx, test_size = 300)
x_train = Features[indx[0],:]
y_train = np.ravel(Labels[indx[0]])
x_test = Features[indx[1],:]
y_test = np.ravel(Labels[indx[1]])    

In [5]:
# Define G-nb model object & fit model to training data subset

NB_credit_mod = GaussianNB() 
NB_credit_mod.fit(x_train, y_train)

GaussianNB(priors=None)

In [7]:
#Below- Score & evaluate the test model

def score_model(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:,1]])

def print_metrics(labels, probs, threshold):
    scores = score_model(probs, threshold)
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion Matrix')
    print('                 Score Positive    Score Negative')
    print('Actual Positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual Negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy        %0.2f' % sklm.accuracy_score(labels, scores))
    print('AUC             %0.2f' % sklm.roc_auc_score(labels, probs[:,1]))
    print('Macro Precision %0.2f' % float((float(metrics[0][0]) + float(metrics[0][1]))/2.0))
    print('Macro Recall    %0.2f' % float((float(metrics[1][0]) + float(metrics[1][1]))/2.0))
    print(' ')
    print('           Positive      Negative')
    print('Num Case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])
    
probabilities = NB_credit_mod.predict_proba(x_test)
print_metrics(y_test, probabilities, 0.5)    

                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive        30               182
Actual Negative         6                82

Accuracy        0.37
AUC             0.69
Macro Precision 0.57
Macro Recall    0.54
 
           Positive      Negative
Num Case      212            88
Precision    0.83          0.31
Recall       0.14          0.93
F1           0.24          0.47


In [8]:
#Above- Performance of the G-nb above is poor.
  # Barely half bad credit risk customer correctly identified.
  # AUC= 0.69 is quite a bit better than mean achieved w/ 5 fold cv. Likely these figures are optimistic.

#Below- Check if Bernoulli naive Bayes (B-nb) model is better, since less sensitive to quantity of training data.
  # First remove numeric features from array & examine results.
    
Features = Features[:,4:]
Features[:3,:]

array([[0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
        0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0.]])

In [9]:
# Perform model selection w/ nested cross validation (ncv) for optimal hyperparameters & model selection.
 # Compute inner loop to find optimal learning rate parameter w/ 10 fold cv.
   # Additional folds would give better estimates but at cost of greater computation time.

nr.seed(123)
inside = ms.KFold(n_splits=10, shuffle = True)
nr.seed(321)
outside = ms.KFold(n_splits=10, shuffle = True)

In [10]:
# Estimate optimal hyperparameters using 10 fold cv.
  #1) Grid of 1 hyperparameter:
  #   A) alpha- smoothing parameter to avoid 0 possibilities.
  #2) Model is fit on grid & best estimated hyperparameters are displayed

nr.seed(3456)
param_grid = {"alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10]}    # Define dictionary for grid search & model object to search on
NB_clf = BernoulliNB()   # Define B-NB regression model

clf = ms.GridSearchCV(estimator = NB_clf, param_grid = param_grid,   # Perform grid search over 1 parameter
                      cv = inside,                 # Use inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)
clf.fit(Features, Labels)
print(clf.best_estimator_.alpha)

1


In [11]:
#Above- Estimated optimal learning rate parameters are alpha= 1.
  # Indicates, there is very little problem w/ 0 probabilities in this problem, resulting from the fact that probability space sampld is dense.

#Below- Perform outer cv of model to estimate model performance w/ optimal hyperparameters.

#NB_credit = BernoulliNB(alpha = clf.best_estimator_.alpha)
nr.seed(498)
cv_estimate = ms.cross_val_score(clf, Features, Labels, 
                                 cv = outside)              # Use the outside folds

print('Mean Performance Metric = %4.3f' % np.mean(cv_estimate))
print('SDT of the Metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by CV Fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d    %4.3f' % (i+1, x))

Mean Performance Metric = 0.754
SDT of the Metric       = 0.040
Outcomes by CV Fold
Fold  1    0.735
Fold  2    0.701
Fold  3    0.733
Fold  4    0.745
Fold  5    0.771
Fold  6    0.757
Fold  7    0.762
Fold  8    0.857
Fold  9    0.765
Fold 10    0.713


In [12]:
#Above- Std dev of mean of AUC (acceptable) is an order of magnitude less than mean itself.

#Below- Build, train & evaluate model w/ estimated optimal hyperparameters.
  # Create Bernoulli sampled test & training subsets

nr.seed(1115)
indx = range(Features.shape[0])      # Randomly sample cases to create independent training & test data
indx = ms.train_test_split(indx, test_size = 300)
x_train = Features[indx[0],:]
y_train = np.ravel(Labels[indx[0]])
x_test = Features[indx[1],:]
y_test = np.ravel(Labels[indx[1]])

In [13]:
# Define B-nb model object w/ optimal hyperparmeters & fit model to training data subset

NB_credit_mod = BernoulliNB(alpha = clf.best_estimator_.alpha) 
NB_credit_mod.fit(x_train, y_train)
probabilities = NB_credit_mod.predict_proba(x_test)
print_metrics(y_test, probabilities, 0.5)    

                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive       178                34
Actual Negative        34                54

Accuracy        0.77
AUC             0.78
Macro Precision 0.73
Macro Recall    0.73
 
           Positive      Negative
Num Case      212            88
Precision    0.84          0.61
Recall       0.84          0.61
F1           0.84          0.61


In [14]:
#Above- Performance of the B-nb is much better than G-nb, but still could be better.
  # Current model uses empirical distribution of label values for prior value of p of Bernoulli distribution.
   # This probability is invariably skewed toward majority case, setting this distribution to a fixed prior value can help overcome class imbalance.

#Below- Redefine model object w/ prior probability of 0.6 for minority case.

NB_credit_mod = BernoulliNB(alpha = clf.best_estimator_.alpha,
                            class_prior = [0.4,0.6]) 
NB_credit_mod.fit(x_train, y_train)
probabilities = NB_credit_mod.predict_proba(x_test)
print_metrics(y_test, probabilities, 0.5)    

                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive       116                96
Actual Negative        17                71

Accuracy        0.62
AUC             0.78
Macro Precision 0.65
Macro Recall    0.68
 
           Positive      Negative
Num Case      212            88
Precision    0.87          0.43
Recall       0.55          0.81
F1           0.67          0.56


In [None]:
#Above- 
  # Large majority of high risk (bad credit) customers are identified, but at cost of large # of FP error rate.
  # False negative is 5* the cost of FP, this may be good solution.
  # Inifinte # of other models are possible by changing the prior distribution.

## Summary- 
  # 1) Fit Gaussian naive Bayes model on bank credit data.
      # Performance of this model was poor as result of many Bernoulli distributed dummy features.
  # 2) Used Bernoulli naive Bayes model for bank credit data by eliminating numeric values. 
      # Overall, this model was much better.
  # 3) A model skewed toward detecting bad credit cases was created using prior distribution rather than empirical distribution of labels.
      # This model correctly classified significant # of bad credit cases. Selecting other prior distributions will give other models.