In [38]:
# Ensembles of same algorithm, different (random) training sets.

# Bagging: sampling with replacement (bootstrap).
# Pasting: sampling without replacement.
# Both are amenable to parallelization.

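# With bagging, the same training instance can appear more than once in a
# given predictor's training set (sampling is done with replacement).
# Each individual predictor has a higher bias than one trained on the full
# training set (it sees a smaller, resampled subset).
# Since the bags differ, the predictors are less correlated; aggregating
# them reduces variance, which acts as a form of regularization.
# Once the predictions are aggregated, the ensemble's bias comes back down.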

# Whereas a single Decision Tree draws axis-aligned (orthogonal) linear boundaries,
# a Random Forest achieves an almost smooth boundary.

from sklearn.metrics import accuracy_score
def compare_accuracy(classifiers):
    """Print the test-set accuracy of each classifier.

    Every classifier is scored against the notebook-level ``X_test`` /
    ``y_test`` split. Nothing is fitted here, so the classifiers are
    expected to have been trained before being passed in.

    Parameters
    ----------
    classifiers : iterable
        Estimators exposing a ``predict`` method.
    """
    for model in classifiers:
        predictions = model.predict(X_test)
        score = accuracy_score(y_test, predictions)
        # One line per model: "<ClassName> <accuracy>".
        print(model.__class__.__name__, score)

from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Fixed seeds throughout so that Restart Kernel -> Run All reproduces the
# same data, the same split and therefore the same accuracy figures.

# Synthetic two-class "moons" data set with noise added to the points.
X, y = make_moons(n_samples=2000, noise=0.20, random_state=42)

# Hold out 30% of the samples as the test set.
# (train_test_split is a convenience wrapper for ShuffleSplit.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Baseline model: a single decision tree with default hyperparameters.
dtc1 = DecisionTreeClassifier(random_state=42)
dtc1.fit(X_train, y_train)

DecisionTreeClassifier()

In [39]:
# Baseline: test-set accuracy of the single decision tree.
compare_accuracy([dtc1])

DecisionTreeClassifier 0.9533333333333334


In [43]:
# Bagging
from sklearn.ensemble import BaggingClassifier

# Stand-alone tree, fitted here so it can be compared with the ensemble.
dtc2 = DecisionTreeClassifier(
    min_samples_split=2,  # default = 2
    random_state=42,      # deterministic tree construction
)
dtc2.fit(X_train, y_train)

bag = BaggingClassifier(
    dtc2,              # the base estimator (a decision tree by default)
    oob_score=True,    # evaluate on out-of-bag instances after fitting
    bootstrap=True,    # True = bagging (with replacement), False = pasting
    n_estimators=500,  # number of trees in the ensemble
    max_samples=100,   # number of instances drawn to train each tree
    n_jobs=-1,         # -1 = use all available CPU cores
    random_state=42,   # reproducible bootstrap samples and OOB score
)
bag.fit(X_train, y_train)
# Estimate generalization using the out-of-bag score.
# Useful if you don't want to look at the test set yet.
bag.oob_score_

0.9564285714285714

In [44]:
# Compare on the test set: accuracy of the single tree vs. the bagged ensemble.
compare_accuracy([dtc2,bag])

DecisionTreeClassifier 0.9533333333333334
BaggingClassifier 0.955


In [None]:
# Feature sampling.

# The BaggingClassifier can also sample features, via its max_features parameter.
# Each Decision Tree is then trained on a random subset of the features.
# This increases bias slightly but also increases diversity among the trees.
# Good for high-dimensional feature spaces.

# Random Patches Method = sample instances and features.
# Random Subspaces Method = sample features only (keep all training instances).