In [1]:
#
import sklearn
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

Run next cell to make 500 moon data at noise=0.30, random_state=42. Then split them to training and testing data

In [2]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
X, y = make_moons(n_samples=5000, noise=0.30)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

Build three classifiers using default settings: 
1. logistic regression
2. random forest
3. support vector machine

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [4]:
clf_lr = LogisticRegression()
clf_forest = RandomForestClassifier()
clf_svm = SVC()

Now build a hard-voting classifier by the three models above.

In [5]:
from sklearn.ensemble import VotingClassifier

In [6]:
clf_voting = VotingClassifier([('lr', clf_lr), ('forest', clf_forest), ('svc', clf_svm)], voting='hard')

Compare the test accuracy of the four model built above.

In [7]:
from sklearn.metrics import accuracy_score

In [8]:
for clf in (clf_lr, clf_forest, clf_svm, clf_voting):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy of {}: {:.4f}".format(clf.__class__.__name__, accuracy))

Accuracy of LogisticRegression: 0.8528
Accuracy of RandomForestClassifier: 0.8872
Accuracy of SVC: 0.9040
Accuracy of VotingClassifier: 0.9008


  if diff:


Now change vote_clf to soft voting and compare the accuracy again. Make sure that all compents can give probability prediction.

In [9]:
clf_voting_soft = VotingClassifier([('lr', clf_lr), ('forest', clf_forest), ('svc', clf_svm)], voting='soft')
clf_voting_soft.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('forest', Ran...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [10]:
clf_svm_prob = SVC(probability=True)

In [11]:
clf_voting_soft = VotingClassifier([('lr', clf_lr), ('forest', clf_forest), ('svc', clf_svm_prob)], voting='soft')

In [12]:
clf_voting_soft.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('forest', Ran...',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=1, voting='soft', weights=None)

In [13]:
y_pred = clf_voting_soft.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of {}: {:.4f}".format(clf_voting_soft.__class__.__name__, accuracy))

Accuracy of VotingClassifier: 0.9104


  if diff:


# Bagging and pasting

First, build a single decision tree with no restribtion. Fit it by X_train, y_train, and print out the test accuary.

In [14]:
from sklearn.tree import DecisionTreeClassifier

In [15]:
clf_tree = DecisionTreeClassifier()
clf_tree.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [16]:
print(accuracy_score(y_test, clf_tree.predict(X_test)))

0.8704


Second, build an ensemble classifier of 500 decision trees using bootstraping. Print out the test accuracy.

In [17]:
from sklearn.ensemble import BaggingClassifier

In [18]:
clf_bagging = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, n_jobs=-1, bootstrap=True)

In [19]:
clf_bagging.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=500, n_jobs=-1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

Get the out-of-bag accuracy. Fit the model again if necessary.

In [20]:
clf_bagging = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, \
                                n_jobs=-1, bootstrap=True, oob_score=True)

In [21]:
clf_bagging.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=500, n_jobs=-1, oob_score=True,
         random_state=None, verbose=0, warm_start=False)

In [22]:
clf_bagging.oob_score_

0.8989333333333334

# Random Forest

Create a random forest classifier with 500 trees, max node number 16. Print out the test accuracy.

In [23]:
from sklearn.ensemble import RandomForestClassifier

In [24]:
clf_forest = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
clf_forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [25]:
print(accuracy_score(y_test, clf_forest.predict(X_test)))

0.9136


Show the feature importance.

In [26]:
clf_forest.feature_importances_

array([0.42925266, 0.57074734])

Now create an equivalent classifier by BaggingClassifier. Print out the test accuracy.

In [27]:
clf_bagging = BaggingClassifier(DecisionTreeClassifier(max_leaf_nodes=16), n_jobs=-1, n_estimators=500)
clf_bagging.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=500, n_jobs=-1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [28]:
print(accuracy_score(y_test, clf_bagging.predict(X_test)))

0.9128


Now try the ExtraTreeClassifier, which introduced even more randomness than random forest. Print out the accuracy.

In [29]:
from sklearn.ensemble import ExtraTreesClassifier

In [30]:
clf_extra = ExtraTreesClassifier()
clf_extra.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [31]:
print(accuracy_score(y_test, clf_extra.predict(X_test)))

0.8784
