In [1]:
## Goal: Explore data w/ visualizations for Adventure Works dataset 
      #    for purpose of Classification Supervised ML w/ label= BikeBuyer

# Import Python pkgs pandas, numpy, matplotlib.pyplot, & seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy.random as nr
import math
from sklearn import preprocessing
import sklearn.model_selection as ms
from sklearn import linear_model
import sklearn.metrics as sklm
from sklearn import cross_validation
from sklearn import feature_selection as fs
from sklearn import metrics, cross_validation
import scipy.stats as ss
import sklearn.decomposition as skde
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB

%matplotlib inline  
# Start of magic command which configures execution environment, to display graphics w/in notebook



In [2]:
# Load already prepared dataset, display shape, & explore first 10 rows of Pandas data frame

AW_Custs_C = pd.read_csv('AdvWorksCusts_Preped.csv', header=0)
print(AW_Custs_C.shape)
AW_Custs_C.head()

(16404, 21)


Unnamed: 0,FirstName,LastName,AddressLine1,City,StateProvinceName,CountryRegionName,PostalCode,PhoneNumber,BirthDate,Education,...,Gender,MaritalStatus,HomeOwnerFlag,NumberCarsOwned,NumberChildrenAtHome,TotalChildren,YearlyIncome,Age,AveMonthSpend,BikeBuyer
0,Jon,Yang,3761 N. 14th St,Rockhampton,Queensland,Australia,4700,1 (11) 500 555-0162,4/8/1966,Bachelors,...,M,M,1,0,0,2,137947,31,89,0
1,Eugene,Huang,2243 W St.,Seaford,Victoria,Australia,3198,1 (11) 500 555-0110,5/14/1965,Bachelors,...,M,S,0,1,3,3,101141,32,117,1
2,Ruben,Torres,5844 Linden Land,Hobart,Tasmania,Australia,7001,1 (11) 500 555-0184,8/12/1965,Bachelors,...,M,M,1,1,3,3,91945,32,123,0
3,Christy,Zhu,1825 Village Pl.,North Ryde,New South Wales,Australia,2113,1 (11) 500 555-0162,2/15/1968,Bachelors,...,F,S,0,1,0,0,86688,29,50,0
4,Elizabeth,Johnson,7553 Harness Circle,Wollongong,New South Wales,Australia,2500,1 (11) 500 555-0131,8/8/1968,Bachelors,...,F,S,1,4,5,5,92771,29,95,1


In [3]:
# Testing for Class Imbalance by Examining Classes where label= BikeBuyer
 # Unequal numbers of cases for the categories of labels, which can seriously bias the training of classifier alogrithms 
 #  higher error rate for the minority class. This should be tested for before training any model.   

AW_Custs_C_counts =  AW_Custs_C['BikeBuyer'].value_counts()
print(AW_Custs_C_counts) 

0    10949
1     5455
Name: BikeBuyer, dtype: int64


In [4]:
#Above- Knowing imbalance exists, the best accuracy we can get w/out creating a ML model is 70%.
 # This is achieved by guessing all customers will buy a bike
    
#Below- Create a numpy array of label values

labels = np.array(AW_Custs_C['BikeBuyer'])

In [5]:
#Create a numpy array with all of the features (Model Matrix)
 # Encode categorical string variables into integers. 
 # Transform integer coded variables to dummy variables.
 # Append each dummy coded categorical variable to model matrix.
    
def encode_string(cat_features):
    enc = preprocessing.LabelEncoder()  # Encode strings to numeric categories
    enc.fit(cat_features)
    enc_cat_features = enc.transform(cat_features)
    
    ohe = preprocessing.OneHotEncoder()  #Apply One Hot Encoder
    encoded = ohe.fit(enc_cat_features.reshape(-1,1))
    return encoded.transform(enc_cat_features.reshape(-1,1)).toarray()

categorical_columns = ['Education', 'Gender', 'MaritalStatus', 'NumberCarsOwned', 'NumberChildrenAtHome', 'TotalChildren']

Features = encode_string(AW_Custs_C['Occupation'])
for col in categorical_columns:
    temp = encode_string(AW_Custs_C[col])
    Features = np.concatenate([Features, temp], axis = 1)
    
print(Features.shape)
print(Features[:2, :])

(16404, 31)
[[0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0.
  0. 0. 0. 0. 1. 0. 0.]]


In [6]:
# Append numeric features to model matrix

Features = np.concatenate([Features, np.array(AW_Custs_C[['YearlyIncome', 'Age']])], axis = 1)

print(Features.shape)
print(Features[:2, :])

(16404, 33)
[[0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  1.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 1.37947e+05 3.10000e+01]
 [0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00
  0.00000e+00 1.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00
  0.00000e+00 1.01141e+05 3.20000e+01]]


In [7]:
# 6 categorical variables were converted into 31 dummy variables. 

#Below- How many dummy variables came from checking_account_status? -5-
print(AW_Custs_C['Occupation'].unique())

['Professional' 'Management' 'Skilled Manual' 'Clerical' 'Manual']


In [8]:
#Below- Features array has both numeric & binary features (dummy variables for categorical features)
  # Therefore, Gaussian model must be used, however it's not ideal since numeric features mixed w/ features exhibiting Bernoulli distributions (binary features)
  

nr.seed(321)
cv_folds = ms.KFold(n_splits=10, shuffle = True)  #Define 10 fold cv object
    
nr.seed(498)
NB_credit = GaussianNB()      #Define Gaussian naive Bayes model
cv_estimate = ms.cross_val_score(NB_credit, Features, labels,     #Performs 10 fold cv
                                 cv = cv_folds)       # Use the outside folds

print('Mean Performance Metric = %4.3f' % np.mean(cv_estimate))
print('SDT of the Metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by CV Fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d    %4.3f' % (i+1, x))    # Display cv results

Mean Performance Metric = 0.742
SDT of the Metric       = 0.009
Outcomes by CV Fold
Fold  1    0.763
Fold  2    0.728
Fold  3    0.745
Fold  4    0.747
Fold  5    0.735
Fold  6    0.733
Fold  7    0.738
Fold  8    0.748
Fold  9    0.741
Fold 10    0.738


In [12]:
#Above- Std dev of mean of AUC is more than an order of magnitude smaller than mean itself, indicating that this model is likely to generalize well, but level of performance is unclear.

#Below- Build, train & evaluate model w/ estimated optimal hyperparameters.
  # Create Bernoulli sampled test & training subsets

nr.seed(1115)
indx = range(Features.shape[0])
indx = ms.train_test_split(indx, test_size = 5000)
x_train = Features[indx[0],:]
y_train = np.ravel(labels[indx[0]])
x_test = Features[indx[1],:]
y_test = np.ravel(labels[indx[1]])    

In [13]:
# Define G-nb model object & fit model to training data subset

NB_credit_mod = GaussianNB() 
NB_credit_mod.fit(x_train, y_train)

GaussianNB(priors=None)

In [14]:
#Below- Score & evaluate the test model

def score_model(probs, threshold):
    return np.array([1 if x > threshold else 0 for x in probs[:,1]])

def print_metrics(labels, probs, threshold):
    scores = score_model(probs, threshold)
    metrics = sklm.precision_recall_fscore_support(labels, scores)
    conf = sklm.confusion_matrix(labels, scores)
    print('                 Confusion Matrix')
    print('                 Score Positive    Score Negative')
    print('Actual Positive    %6d' % conf[0,0] + '             %5d' % conf[0,1])
    print('Actual Negative    %6d' % conf[1,0] + '             %5d' % conf[1,1])
    print('')
    print('Accuracy        %0.2f' % sklm.accuracy_score(labels, scores))
    print('AUC             %0.2f' % sklm.roc_auc_score(labels, probs[:,1]))
    print('Macro Precision %0.2f' % float((float(metrics[0][0]) + float(metrics[0][1]))/2.0))
    print('Macro Recall    %0.2f' % float((float(metrics[1][0]) + float(metrics[1][1]))/2.0))
    print(' ')
    print('           Positive      Negative')
    print('Num Case   %6d' % metrics[3][0] + '        %6d' % metrics[3][1])
    print('Precision  %6.2f' % metrics[0][0] + '        %6.2f' % metrics[0][1])
    print('Recall     %6.2f' % metrics[1][0] + '        %6.2f' % metrics[1][1])
    print('F1         %6.2f' % metrics[2][0] + '        %6.2f' % metrics[2][1])
    
probabilities = NB_credit_mod.predict_proba(x_test)
print_metrics(y_test, probabilities, 0.5)    

                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive      3044               273
Actual Negative      1054               629

Accuracy        0.73
AUC             0.80
Macro Precision 0.72
Macro Recall    0.65
 
           Positive      Negative
Num Case     3317          1683
Precision    0.74          0.70
Recall       0.92          0.37
F1           0.82          0.49


In [15]:
#Above- Performance of the G-nb above is ideal.
  # Barely half bad credit risk customer correctly identified.
  # AUC= 0.80 is quite a bit better than mean achieved w/ 5 fold cv. Likely these figures are optimistic.

#Below- Check if Bernoulli naive Bayes (B-nb) model is better, since less sensitive to quantity of training data.
  # First remove numeric features from array & examine results.
    
Features = Features[:,4:]
Features[:3,:]

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.37947e+05, 3.10000e+01],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 1.01141e+05, 3.20000e+01],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 1.00000e+00, 0.00000e+00,
  

In [16]:
# Perform model selection w/ nested cross validation (ncv) for optimal hyperparameters & model selection.
 # Compute inner loop to find optimal learning rate parameter w/ 10 fold cv.
   # Additional folds would give better estimates but at cost of greater computation time.

nr.seed(123)
inside = ms.KFold(n_splits=10, shuffle = True)
nr.seed(321)
outside = ms.KFold(n_splits=10, shuffle = True)

In [17]:
# Estimate optimal hyperparameters using 10 fold cv.
  #1) Grid of 1 hyperparameter:
  #   A) alpha- smoothing parameter to avoid 0 possibilities.
  #2) Model is fit on grid & best estimated hyperparameters are displayed

nr.seed(3456)
param_grid = {"alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10]}    # Define dictionary for grid search & model object to search on
NB_clf = BernoulliNB()   # Define B-NB regression model

clf = ms.GridSearchCV(estimator = NB_clf, param_grid = param_grid,   # Perform grid search over 1 parameter
                      cv = inside,                 # Use inside folds
                      scoring = 'roc_auc',
                      return_train_score = True)
clf.fit(Features, labels)
print(clf.best_estimator_.alpha)

10


In [19]:
#Above- Estimated optimal learning rate parameters are alpha= 10.
  # Indicates, there is very little problem w/ 0 probabilities in this problem, resulting from the fact that probability space sampld is dense.

#Below- Perform outer cv of model to estimate model performance w/ optimal hyperparameters.

#NB_credit = BernoulliNB(alpha = clf.best_estimator_.alpha)
nr.seed(498)
cv_estimate = ms.cross_val_score(clf, Features, labels, 
                                 cv = outside)              # Use the outside folds

print('Mean Performance Metric = %4.3f' % np.mean(cv_estimate))
print('SDT of the Metric       = %4.3f' % np.std(cv_estimate))
print('Outcomes by CV Fold')
for i, x in enumerate(cv_estimate):
    print('Fold %2d    %4.3f' % (i+1, x))

Mean Performance Metric = 0.796
SDT of the Metric       = 0.007
Outcomes by CV Fold
Fold  1    0.807
Fold  2    0.790
Fold  3    0.796
Fold  4    0.800
Fold  5    0.788
Fold  6    0.796
Fold  7    0.798
Fold  8    0.803
Fold  9    0.784
Fold 10    0.798


In [20]:
#Above- Std dev of mean of AUC (acceptable) is an order of magnitude less than mean itself.

#Below- Build, train & evaluate model w/ estimated optimal hyperparameters.
  # Create Bernoulli sampled test & training subsets

nr.seed(1115)
indx = range(Features.shape[0])      # Randomly sample cases to create independent training & test data
indx = ms.train_test_split(indx, test_size = 5000)
x_train = Features[indx[0],:]
y_train = np.ravel(labels[indx[0]])
x_test = Features[indx[1],:]
y_test = np.ravel(labels[indx[1]])

In [21]:
# Define B-nb model object w/ optimal hyperparmeters & fit model to training data subset

NB_credit_mod = BernoulliNB(alpha = clf.best_estimator_.alpha) 
NB_credit_mod.fit(x_train, y_train)
probabilities = NB_credit_mod.predict_proba(x_test)
print_metrics(y_test, probabilities, 0.5)    

                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive      2847               470
Actual Negative       696               987

Accuracy        0.77
AUC             0.80
Macro Precision 0.74
Macro Recall    0.72
 
           Positive      Negative
Num Case     3317          1683
Precision    0.80          0.68
Recall       0.86          0.59
F1           0.83          0.63


In [22]:
#Above- Performance of the B-nb is much better than G-nb, but still could be better.
  # Current model uses empirical distribution of label values for prior value of p of Bernoulli distribution.
   # This probability is invariably skewed toward majority case, setting this distribution to a fixed prior value can help overcome class imbalance.

#Below- Redefine model object w/ prior probability of 0.6 for minority case.

NB_credit_mod = BernoulliNB(alpha = clf.best_estimator_.alpha,
                            class_prior = [0.4,0.6]) 
NB_credit_mod.fit(x_train, y_train)
probabilities = NB_credit_mod.predict_proba(x_test)
print_metrics(y_test, probabilities, 0.5)    

                 Confusion Matrix
                 Score Positive    Score Negative
Actual Positive      2461               856
Actual Negative       499              1184

Accuracy        0.73
AUC             0.80
Macro Precision 0.71
Macro Recall    0.72
 
           Positive      Negative
Num Case     3317          1683
Precision    0.83          0.58
Recall       0.74          0.70
F1           0.78          0.64


In [None]:
#Above- 
  # Large majority of non bike buyer/customers are identified, but at cost of large # of FP error rate.
  # Inifinte # of other models are possible by changing the prior distribution.
