In [1]:
#Chapter focus:
#1. Make predictions based on majority voting
#2. Use bagging the reduce overfitting by drawing random combinations of the training set with repetition
#3. Apply boosting to build powerful models from weak learners that learn from their mistakes

#Ensemble methods:
#-Combine different classifiers into a meta-classifier that has better generalization performance
#-Most popular: majority voting
# -> select the class label that has been predicted by the majority of classifiers
# -> Binary class settings only
# -> Multi-class settings? Plurality voting
#-Techniques:
# -> build from different classification algorithms
# -> use the same base classification algorithm, fitting different subsets of the training set
#-Why ensemble methods can work better than individual classifiers?
# -> the error probability of an ensemble of base classifiers is a probability mass function of a binomial distribution
# -> if many people say yes, then probably it is more true than a single person says yes.

#Compare an idealistic ensemble classifier to a base classifier over a range of different base error rates

from scipy.special import comb
import math

#implement the ensemble_error function
def ensemble_error(n_classifier,error):
    k_start = int(math.ceil(n_classifier/2.))
    probs = [comb(n_classifier,k)*error**k*(1-error)**(n_classifier-k)
             for k in range(k_start,n_classifier+1)]
    return sum(probs)

#Test
ensemble_error(n_classifier=11,error=0.25)

0.03432750701904297

In [2]:
#Visualize the relationship between ensemble and base errors in a line graph
import numpy as np
import matplotlib.pyplot as plt

error_range = np.arange(0.0,1.01,0.01)
ens_errors = [ensemble_error(n_classifier=11,error=error)
              for error in error_range]

#Plot the line of ensemble errors
plt.plot(error_range,ens_errors,label='Ensemble error',linewidth=2)

#Plot the line of base error
plt.plot(error_range,error_range,linestyle='--',label='Base error',linewidth=2)

#Legend stuff
plt.xlabel('Base error')
plt.ylabel('Base/Ensemble error')
plt.legend(loc='upper left')
plt.grid(alpha=0.5)
plt.show()

#As long as the base classifiers perform better than random guessing (error rate < 0.5)
#Ensemble always performs better than individual base classifier

<Figure size 640x480 with 1 Axes>

In [3]:
#Combining classifiers via majority vote

#Implementing a simple majority vote classifier
#Combine different classification algorithms associated with individual weights for confidence
#Goal: build a stronger meta-classifier that balances out the individual classifiers' weaknesses on a particular dataset

#Illustration of the weighted majority vote:
np.argmax(np.bincount([0,0,1],weights=[0.2,0.2,0.6]))



1

In [4]:
#Illustration of the weighted majority vote based on class probability

ex = np.array([[0.9,0.1],
               [0.8,0.2],
               [0.4,0.6]])
p = np.average(ex,axis=0,weights=[.2,.2,.6])
p

array([0.58, 0.42])

In [5]:
np.argmax(p)

0

In [6]:
from sklearn.base import BaseEstimator #inherit some base functionalities
from sklearn.base import ClassifierMixin #inherit some base functionalities
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six #make the class compatible with Python 2.6
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
import operator

class MajorityVoteClassifier(BaseEstimator,ClassifierMixin): #inheritance
    """A Majority Vote Ensemble Classifier
    
    Parameters
    ------------
    classifiers: array-like, shape = [n_classifiers]
        Different classifiers for the ensemble
        
    vote: str, {'classlabel','probability'}(default='label')
        If 'classlabel' the prediction is based on the armax of class labels.
        Else if 'probability', the argmax of sum of probabilities is used to predict the class label
        (recommended for calibrated classifiers).
        
    weights: array-like, shape = [n_classifiers], optional (default = None)
        If a list of `int` or `float` values are provided, the classifiers
        are weighted by importance; Uses uniform weights if `weights=None
    """
    
    def __init__(self,classifiers,vote='classlabel',weights=None):
        self.classifiers = classifiers
        self.named_classifiers={key:value 
                                for key, value in _name_estimators(classifiers)}
        self.vote = vote
        self.weights = weights
        
    def fit(self,X,y):
        """Fit classifiers.
        
        Parameters
        ------------------
        X:{array-like, sparse matrix}, shape=[n_samples,n_features]
            Matrix of training samples
            
        y: array-like, shape=[n_samples]
            Vector of target class labels
            
        Returns
        ----------------
        self: object
        """
        #Exception handling
        if self.vote not in ('probability','classlabel'):
            raise ValueError("vote must be 'probability' or 'classlabel'"
                             "; got (vote=%r)" % self.vote)
        if self.weights and len(self.weights) != len(self.classifiers):
            raise ValueError('NUmber of classifiers and weight must be equal;'
                             'got %d weights, %d classifiers'
                             % (len(self.weights),len(self.classifiers)))
        
        #Use LabelEncoder to ensure class labels start with 0, which
        #is important for np.argmax call in self.predict
        self.labelenc_=LabelEncoder()
        self.labelenc_.fit(y)
        self.classes_ = self.labelenc_.classes_
        self.classifiers_ = []
        
        #fit the training sets with different classifiers
        for clf in self.classifiers:
            fitted_clf = clone(clf).fit(X,self.labelenc_.transform(y)) #without clone, maybe only return data not a classifier???
            self.classifiers_.append(fitted_clf)
            
        return self
    
    def predict(self,X):
        """Predict class labels for X.
        
        Parameters
        -----------
        X: {array-like, sparse matrix}, shape = [n_samples, n_features]
        
        Returns
        -----------
        maj_vote: array-like, shape = [n_samples]
        
        """
        
        if self == 'probability':
            maj_vote = np.argmax(self.predict_proba(X),axis=1)
        else: # 'classlabel' vote
            # Collect results from clf.predict calls
            predictions = np.asarray([clf.predict(X) #asarray convert structured data into an ndarray
                                      for clf in self.classifiers_]).T #noted that it is transposed
            #Apply the lambda function along the column
            #each column stands for each sample, 
            #lambda function find the weighted mode of the prediction
            #as a result, each sample has a major_vote result
            
            maj_vote = np.apply_along_axis(lambda x: np.argmax(np.bincount(x,weights=self.weights)),
                                          axis=1,arr=predictions)
        #convert the labelencoded code back to labels
        maj_vote = self.labelenc_.inverse_transform(maj_vote)
        
        return maj_vote
        
    def predict_proba(self,X):
        """Predict class probabilities for X
        
        Parameters
        -----------
        X: {array-like, sparse matrix}, shape=[n_samples, n_features]
            Training vectors, where n_samples is the number of samples and n_features i the number of features.
            
        Returns
        -----------
        avg.proba: array-like, shape=[n_samples, n_classes]
            Weighted averge probability for each class per sample.
        """
        #Calculate the proba for each label in each sample
        probas = np.asarray([clf.predict_proba(X)
                             for clf in self.classifiers_]) #no transpose
        #column: prob each label, row: prob
        #apply to row, get
        avg_proba = np.average(probas, axis=0, weights=self.weights)
        return avg_proba
    
    def get_params(self,deep=True):
        """Get classifier parameter names for GridSearch"""
        if not deep:
            return super(MajorityVoteClassifier,self).get_params(get_params=False)
        else:
            out = self.named_classifiers.copy()
            for name, step in six.iteritems(self.named_classifiers):
                for key, value in six.iteriterms(step.get_params(deep=True)):
                    out['%s__%s' % (name,key)] = value
            return out
            
        

In [8]:
#Using the majority voting principle to make predictions

#load the Iris dataset from scikit-learn's dataset module
#only select two features, sepal width and petal length 
#to make the classification task more challenging for illustration purpose

#Although MajorityVoteClassifier generalises to multiclass problems,
#we will only classify flower samples from the Iris-versicolor and Iris-virginica classes

from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

#load the data
iris = datasets.load_iris()
#only classify flower samples from the Iris-versicolor and Iris-virginica classes
#only select two features, sepal width and petal length 
X,y = iris.data[50:,[1,2]],iris.target[50:]
#encode the label
le = LabelEncoder()
y = le.fit_transform(y)

#split the Iris samples into 50% training and 50% test data
X_train, X_test, y_train, y_test = \
    train_test_split(X,y,test_size=0.5,random_state=1,stratify=y)


In [9]:
#Train 3 different classifiers:
#1. Logistic regression classifier
#2. Decision tree classifier
#3. k-nearest neighbors classifier

#evaluate the performance of each classifier via 10-fold cross-validation on the training dataset
#before combining them in to an ensemble classifier

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

import numpy as np
clf1 = LogisticRegression(penalty='l2',C=0.001,random_state=1)
clf2 = DecisionTreeClassifier(max_depth=1,criterion='entropy',random_state=0)
clf3 = KNeighborsClassifier(n_neighbors=1,p=2,metric='minkowski')

pipe1 = Pipeline([['sc',StandardScaler()],['clf',clf1]])
pipe3 = Pipeline([['sc',StandardScaler()],['clf',clf3]])

clf_labels = ['Logistic regression','Decision tree','KNN']

print('10-fold cross validation: \n')
for clf, label in zip([pipe1,clf2,pipe3],clf_labels):

SyntaxError: unexpected EOF while parsing (<ipython-input-9-cae61fdfd4be>, line 26)