Creative Commons CC BY 4.0 Lynd Bacon & Associates, Ltd. Not warranted to be suitable for any particular purpose. (You're on your own!)

# Ensembles of Voters

Ensembles, or collections, of voters can be used to improve predictive accuracy.  In the following we're going to aggregate the outputs of a couple of different classifiers to predict positive biopsy of cervical cancer.

In [39]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import display, Markdown
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import scikitplot as skplt
from sklearn import linear_model
from sklearn.model_selection import KFold, StratifiedShuffleSplit
from sklearn.model_selection import cross_val_predict, cross_validate
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score
from sklearn.metrics import roc_curve, precision_score, recall_score, classification_report
from sklearn.naive_bayes import GaussianNB

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
import os
import shelve
%matplotlib inline

In [7]:
os.getcwd()

'/Users/lyndbacon/DeCART2019'

# Get the Data

In [11]:
with shelve.open('cervical.sdb','rb') as inFile:
    cervical2=inFile['cervical2']


In [12]:
cervical2.columns

Index(['Biopsy', 'Age', 'Number of sexual partners',
       'First sexual intercourse', 'Num of pregnancies', 'Smokes',
       'Smokes (years)', 'Smokes (packs/year)', 'Hormonal Contraceptives',
       'Hormonal Contraceptives (years)'],
      dtype='object')

In [13]:
cervical.Biopsy.value_counts()

0    803
1     55
Name: Biopsy, dtype: int64

In [26]:
y=cervical2.Biopsy.to_numpy()
X=cervical2.loc[:,'Age':].to_numpy()
y.shape
X.shape

(676,)

(676, 9)

# Classifiers

In [29]:
clf1 = linear_model.LogisticRegression(solver='lbfgs',random_state=99,
                                      max_iter=1000)
clf2 = RandomForestClassifier(n_estimators=50, random_state=99)
clf3 = AdaBoostClassifier(base_estimator=None, n_estimators=50, 
                          learning_rate=1.0, algorithm='SAMME.R',
                          random_state=99)
clf4 = GaussianNB()


In [30]:
vclf1 = VotingClassifier(estimators=[
...         ('lr', clf1), ('rf', clf2), ('adab', clf3),('nb',clf4)],
...         voting='soft')

# Training Test Split

In [27]:
skf1=StratifiedShuffleSplit(n_splits=1,random_state=99) # Note, just a single random split here
for trainNdx,testNdx in skf1.split(X,y):
    Xtrain=X[trainNdx]
    ytrain=y[trainNdx]
    Xtest=X[testNdx]
    ytest=y[testNdx]

# Training

In [31]:
vclf1 = vclf1.fit(Xtrain, ytrain)

In [32]:
vclf1

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=1000, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=99, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False)), ('rf', RandomFo..._rate=1.0, n_estimators=50, random_state=99)), ('nb', GaussianNB(priors=None, var_smoothing=1e-09))],
         flatten_transform=None, n_jobs=None, voting='soft', weights=None)

# Evaluation

In [34]:
yPredTrain=vclf1.predict(Xtrain)
yPredTest=vclf1.predict(Xtest)

In [46]:
yProbtrain=vclf1.predict_proba(Xtrain)
yProbtest= vclf1.predict_proba(Xtest)

In [38]:
accuracy_score(ytrain,yPredTrain)
accuracy_score(ytest,yPredTest)

0.9375

0.9264705882352942

In [41]:
print(classification_report(ytrain,yPredTrain))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97       567
           1       1.00      0.07      0.14        41

   micro avg       0.94      0.94      0.94       608
   macro avg       0.97      0.54      0.55       608
weighted avg       0.94      0.94      0.91       608



In [42]:
print(classification_report(ytest,yPredTest))

              precision    recall  f1-score   support

           0       0.93      1.00      0.96        63
           1       0.00      0.00      0.00         5

   micro avg       0.93      0.93      0.93        68
   macro avg       0.46      0.50      0.48        68
weighted avg       0.86      0.93      0.89        68



  'precision', 'predicted', average, warn_for)


In [44]:
roc_auc_score(ytrain,yProbtrain[:,1])

0.973050286058416

In [47]:
roc_auc_score(ytest,yProbtest[:,1])

0.5365079365079366