In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data", header=None)

X = df.iloc[:,0:56]
y = df.iloc[:,57]

le = LabelEncoder()
y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [3]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,46,47,48,49,50,51,52,53,54,55
1507,0.0,0.17,0.0,0.0,0.0,0.0,0.17,0.52,0.0,0.17,...,0.0,0.0,0.029,0.147,0.029,0.117,0.058,0.235,3.521,39
1652,0.7,0.0,1.06,0.0,0.0,0.0,0.0,1.41,0.35,0.35,...,0.0,0.0,0.0,0.117,0.0,0.353,0.0,0.0,1.209,13
2279,0.0,0.0,1.58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.149,0.0,0.149,0.0,0.0,1.482,10
2106,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.56,...,0.0,0.0,0.0,0.194,0.194,0.0,0.0,0.0,3.631,17
3688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1


In [5]:
y_train[:5]

array([1, 1, 0, 0, 0], dtype=int64)

In [8]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

clf = LogisticRegression(solver='lbfgs')
tree = DecisionTreeClassifier()
svm = svm.SVC(probability=True, gamma='auto')

lr = Pipeline([['sc', StandardScaler()],
                  ['clf', clf]])

clf_labels = ['Logistic Regression', 'Decision Tree', 'Naive Bayesian']

def cv(all_clf, clf_labels):
    for clf, label in zip(all_clf, clf_labels):
        scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='roc_auc')
        print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
        
cv([lr, tree, svm], clf_labels)

ROC AUC: 0.97 (+/- 0.01) [Logistic Regression]
ROC AUC: 0.90 (+/- 0.02) [Decision Tree]
ROC AUC: 0.97 (+/- 0.01) [Naive Bayesian]


In [9]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
import operator

class MajorityVoteClassifier(BaseEstimator, 
                             ClassifierMixin):
    def __init__(self, classifiers={}, vote='classlabel', weights=None):

        self.classifiers = classifiers
        self.named_classifiers = {key: value for key, value
                                  in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights

    def fit(self, X, y):
        if self.vote not in ('probability', 'classlabel'):
            raise ValueError("vote must be 'probability' or 'classlabel'"
                             "; got (vote=%r)"
                             % self.vote)

        if self.weights and len(self.weights) != len(self.classifiers):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d classifiers'
                             % (len(self.weights), len(self.classifiers)))

        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_ = self.lablenc_.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y))
            self.classifiers_.append(fitted_clf)
        return self

    def predict(self, X):
        if self.vote == 'probability':
            maj_vote = np.argmax(self.predict_proba(X), axis=1)
        else:  # 'classlabel' vote

            #  Collect results from clf.predict calls
            predictions = np.asarray([clf.predict(X)
                                      for clf in self.classifiers_]).T

            maj_vote = np.apply_along_axis(
                                      lambda x:
                                      np.argmax(np.bincount(x,
                                                weights=self.weights)),
                                      axis=1,
                                      arr=predictions)
        maj_vote = self.lablenc_.inverse_transform(maj_vote)
        return maj_vote

    def predict_proba(self, X):
        probas = np.asarray([clf.predict_proba(X)
                             for clf in self.classifiers_])
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba



In [None]:
mv_clf = MajorityVoteClassifier(classifiers=[lr, tree, svm])

clf_labels += ['Majority Voting']
all_clf = [lr, tree, svm, mv_clf]

cv(all_clf, clf_labels)

ROC AUC: 0.97 (+/- 0.01) [Logistic Regression]
ROC AUC: 0.90 (+/- 0.02) [Decision Tree]
ROC AUC: 0.97 (+/- 0.01) [Naive Bayesian]


In [None]:
mv_clf = MajorityVoteClassifier(classifiers=[lr, tree, svm], vote='probability')
cv(all_clf, clf_labels)