## MajorityVoteClassifier is used

### Works in a similar fashion as ensemble.VoteClassifier.

In [7]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np
import matplotlib as plt
data,target = load_breast_cancer(return_X_y = True)
Data = pd.DataFrame(data)
Target = pd.DataFrame(target)

In [15]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
from sklearn.preprocessing import LabelEncoder
import operator
%matplotlib inline

In [41]:
class MajorityVoteClassifier(BaseEstimator, ClassifierMixin):
    #check pg:-227 of packt box for more info of this class.
    def __init__(self, classifiers, vote = 'classlabel', weights = None):
        self.classifiers = classifiers
        self.named_classifiers = {key:value for key, value in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights
    
    def fit(self, X, y):
        #Use LabelEncoder to ensure class labels start with 0, which is important for np.argmax.
        self.lablenc_ = LabelEncoder()
        self.lablenc_.fit(y)
        self.classes_ = self.lablenc_.classes_
        self.classifiers_ = []
        for clf in self.classifiers:
            fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y))
            self.classifiers_.append(fitted_clf)
        return self
    
    def predict(self, X):
        if self.vote == "probability":
            maj_vote = np.argmax(self.predict_proba(X), axis = 1)
        else:
            #collect results from clf.predict calls
            predictions = np.asarray([clf.predict(X) for clf in self.classifiers_]).T
            maj_vote = np.apply_along_axis(lambda x: np.argmax(np.bitcount(x, weights = self.weights)), axis = 1, arr = predictions)
            
        maj_vote = self.lablenc_.inverse_transform(maj_vote)
        return maj_vote
    
    def predict_proba(self, X):
        probas = np.asarray([clf.predict_proba(X) for clf in self.classifiers_])
        avg_proba = np.average(probas, axis = 0, weights = self.weights)
        return avg_proba
    
    def get_params(self, deep = True):
        """
        Get classifier parameter names for GridSearch
        """
        if not deep:
            return super(MajorityVoteClassifier, self).get_params(deep = False)
        else:
            out = self.named_classifiers.copy()
            #six is used for python 2.6
            for name, step in six.iteritems(self.named_classifiers):
                for key, value in six.iteritems(step.get_params(deep = True)):
                    out['%s__%s' % (name, key)] = value
            return out
        

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
le = LabelEncoder()
y = le.fit_transform(Target)
y.shape
#not really needed as the classes were already defined as 0 or 1.

  y = column_or_1d(y, warn=True)


(569,)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(Data, y, test_size = 0.3, random_state = 1)

In [24]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [27]:
clf1 = LogisticRegression(penalty = 'l2', C = 0.001, random_state = 1)
clf2 = DecisionTreeClassifier(max_depth = 1, criterion = 'entropy', random_state = 0)
clf3 = KNeighborsClassifier(n_neighbors = 1, p = 2, metric = 'minkowski')

In [30]:
pipe1 = Pipeline([['sc', StandardScaler()], ['clf', clf1]])
pipe2 = Pipeline([['sc', StandardScaler()], ['clf', clf3]])
clf_labels = ['Logisitc Regression', 'Decision Tree', 'KNN']
print('10-fold cross validation:\n')
for clf, label in zip([pipe1, clf2, pipe2], clf_labels):
    score = cross_val_score(estimator = clf, X = X_train, y = y_train, cv = 10, scoring = 'roc_auc')
    print("ROC_AUC:  %0.2f - [%s]" % (score.mean(), label))

10-fold cross validation:

ROC_AUC:  0.99 - [Logisitc Regression]
ROC_AUC:  0.94 - [Decision Tree]




ROC_AUC:  0.95 - [KNN]


In [43]:
mv_clf = MajorityVoteClassifier(classifiers = [pipe1, clf2, pipe2])
clf_labels = clf_labels + ['Majority Voting']
all_clf = [pipe1, clf2, pipe2, mv_clf]
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator = clf, X = X_train, y = y_train, cv = 10, scoring = 'roc_auc')
    print("Accuracy : %0.3f - [%s]" % (score.mean(),label))



Accuracy : 0.950 - [Logisitc Regression]
Accuracy : 0.950 - [Decision Tree]
Accuracy : 0.950 - [KNN]
Accuracy : 0.950 - [Majority Voting]




In [44]:
mv_clf.get_params()

{'pipeline-1': Pipeline(memory=None,
          steps=[['sc',
                  StandardScaler(copy=True, with_mean=True, with_std=True)],
                 ['clf',
                  LogisticRegression(C=0.001, class_weight=None, dual=False,
                                     fit_intercept=True, intercept_scaling=1,
                                     l1_ratio=None, max_iter=100,
                                     multi_class='warn', n_jobs=None,
                                     penalty='l2', random_state=1, solver='warn',
                                     tol=0.0001, verbose=0, warm_start=False)]],
          verbose=False),
 'decisiontreeclassifier': DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
                        max_features=None, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=2,
                        min_weight_fraction_leaf=

In [46]:
from sklearn.model_selection import GridSearchCV
params = {'decisiontreeclassifier__max_depth': [1,2,4,6], 'pipeline-1__clf__C': [0.0001, 0.01, 1.0]}
grid = GridSearchCV(estimator = mv_clf, param_grid = params, cv = 10, scoring = 'roc_auc')
grid.fit(X_train, y_train)







GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=MajorityVoteClassifier(classifiers=[Pipeline(memory=None,
                                                                    steps=[['sc',
                                                                            StandardScaler(copy=True,
                                                                                           with_mean=True,
                                                                                           with_std=True)],
                                                                           ['clf',
                                                                            LogisticRegression(C=0.001,
                                                                                               class_weight=None,
                                                                                               dual=False,
                                                           

In [53]:
print("Best parameters: %s" % grid.best_params_)

Best parameters: {'decisiontreeclassifier__max_depth': 2, 'pipeline-1__clf__C': 1.0}


In [54]:
print("Accuracy: %0.3f" % grid.best_score_)

Accuracy: 0.993
