In [1]:
import numpy as np 
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

# Chapter 7: Esemble learing and random forests

In [2]:
# group of predictor is called ensemble, thus, this technique is called 
# ensemble learning, and an ensemble learning algorith is called a ensemble method.

# do read page->191

# For example, you can train a group of Decision Tree classifiers, each on a different
# random subset of the training set. To make predictions, you just obtain the predictions
# of all individual trees, then predict the class that gets the most votes (see the last
# exercise in Chapter 6). Such an ensemble of Decision Trees is called a Random Forest,
# and despite its simplicity, this is one of the most powerful Machine Learning algorithms
# available today.

## voting classifiers

In [3]:
# read upto page->194
# Ensemble methods work best when the predictors are as independent
# from one another as possible. One way to get diverse classifiers
# is to train them using very different algorithms. This increases the
# chance that they will make very different types of errors, improving
# the ensemble’s accuracy.

In [4]:
# data
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
# The following code creates and trains a voting classifier in Scikit-Learn, composed of
# three diverse classifiers

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
    estimators=[("lr", log_clf), ("rf", rnd_clf), ("svc", svm_clf)],
voting="hard")
voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [6]:
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [7]:
# now lets look at each classifier's accuracy on the test set:
from sklearn.metrics import accuracy_score
for clf in (log_clf, rnd_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.904
VotingClassifier 0.896


In [8]:
# do read page->194 really importand... 
# tells voting="hard", voting="soft"

## Bagging and pasting

In [9]:
# One way to get a diverse set of classifiers is to use very different training algorithms,
# as just discussed. Another approach is to use the same training algorithm for every
# predictor, but to train them on different random subsets of the training set. When
# sampling is performed with replacement, this method is called bagging1 (short for
# bootstrap aggregating2). When sampling is performed without replacement, it is called
# pasting.

In [10]:
# n other words, both bagging and pasting allow training instances to be sampled several
# times across multiple predictors, but only bagging allows training instances to be
# sampled several times for the same predictor. This sampling and training process is
# represented in Figure 7-4.
# page->195

## bagging and pasting in scikit-learn

In [11]:
# Scikit-Learn offers a simple API for both bagging and pasting with the BaggingClas
# sifier class (or BaggingRegressor for regression). The following code trains an
# ensemble of 500 Decision Tree classifiers,5 each trained on 100 training instances randomly
# sampled from the training set with replacement (this is an example of bagging,
# but if you want to use pasting instead, just set bootstrap=False). The n_jobs parameter
# tells Scikit-Learn the number of CPU cores to use for training and predictions
# (–1 tells Scikit-Learn to use all available cores):

In [12]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
DecisionTreeClassifier(), n_estimators=500,
max_samples=100, bootstrap=True, n_jobs=-1)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

## out-of-bag evaluation

In [13]:
# With bagging, some instances may be sampled several times for any given predictor,
# while others may not be sampled at all. By default a BaggingClassifier samples m
# training instances with replacement (bootstrap=True), where m is the size of the
# training set. This means that only about 63% of the training instances are sampled on
# average for each predictor.6 The remaining 37% of the training instances that are not
# sampled are called out-of-bag (oob) instances. Note that they are not the same 37%
# for all predictors.

In [14]:
# Since a predictor never sees the oob instances during training, it can be evaluated on
# these instances, without the need for a separate validation set. You can evaluate the
# ensemble itself by averaging out the oob evaluations of each predictor.
# In Scikit-Learn, you can set oob_score=True when creating a BaggingClassifier to
# request an automatic oob evaluation after training. The following code demonstrates
# this. The resulting evaluation score is available through the oob_score_ variable:

bag_clf = BaggingClassifier(
DecisionTreeClassifier(), n_estimators=500,
bootstrap=True, n_jobs=-1, oob_score=True)

bag_clf.fit(X_train, y_train)


BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=500,
                  n_jobs=-1, oob_score=True)

In [15]:
bag_clf.oob_score_
# According to this oob evaluation, this BaggingClassifier is likely to achieve about
# 89% accuracy on the test set. Let’s verify this:

0.8986666666666666

In [16]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.912

In [17]:
# The oob decision function for each training instance is also available through the
# oob_decision_function_ variable. In this case (since the base estimator has a pre
# dict_proba() method) the decision function returns the class probabilities for each
# training instance. For example, the oob evaluation estimates that the first training
# instance has a 63.5% probability of belonging to the positive class (and 36.5% of
# belonging to the negative class):

bag_clf.oob_decision_function_[:5]

array([[0.31937173, 0.68062827],
       [0.31460674, 0.68539326],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ]])

## random patches and random subspaces

In [18]:
# just read the chapter from book cause there is a lot of theory involved

## random forest

In [19]:
# As we have discussed, a Random Forest9 is an ensemble of Decision Trees, generally
# trained via the bagging method

# The following code trains a
# Random Forest classifier with 500 trees (each limited to maximum 16 nodes), using
# all available CPU cores:

from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16,
                                n_jobs=-1)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)


In [20]:
# The following BaggingClassifier is
# roughly equivalent to the previous RandomForestClassifier:
bag_clf = BaggingClassifier(
DecisionTreeClassifier(splitter="random", max_leaf_nodes=16),
n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1)

## Extra trees

In [21]:
# It is hard to tell in advance whether a RandomForestClassifier
# will perform better or worse than an ExtraTreesClassifier. Generally,
# the only way to know is to try both and compare them using
# cross-validation (and tuning the hyperparameters using grid
# search).

# page->200

## feature importance

In [22]:
from sklearn.datasets import load_iris
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.08995085653727478
sepal width (cm) 0.02304238094998636
petal length (cm) 0.42411520552570325
petal width (cm) 0.4628915569870356


In [23]:
# Random Forests are very handy to get a quick understanding of what features
# actually matter, in particular if you need to perform feature selection.

## boosting

In [24]:
# Boosting (originally called hypothesis boosting) refers to any Ensemble method that
# can combine several weak learners into a strong learner.
# page->201

In [25]:
# The most popular one are AdaBoost and Gradient Boosting

## AdaBoost

In [26]:
# generally skipped the math part behind it.. 
# from page->203 to 205...

In [27]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
DecisionTreeClassifier(max_depth=1), n_estimators=200,
algorithm="SAMME.R", learning_rate=0.5)
ada_clf.fit(X_train, y_train)

# If your AdaBoost ensemble is overfitting the training set, you can
# try reducing the number of estimators or more strongly regularizing
# the base estimator.


AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

## Gradient boosting