### VOTING CLASSIFIERS

In [1]:
from sklearn.datasets import make_moons
# Toy dataset used by most cells below: 100 samples generated with noise=0.15.
# NOTE(review): no random_state is passed, so the data (and every score further
# down) will differ between runs unless the RNG is seeded elsewhere.
X, y = make_moons(n_samples = 100, noise = 0.15)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [3]:
# Three different base models that are later combined in a voting ensemble.
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
# probability=True makes SVC expose predict_proba(), which the soft-voting
# ensemble built below needs in order to average class probabilities.
svm_clf = SVC(probability=True)

In [4]:
from sklearn.model_selection import train_test_split
# Split the moons data into train and test parts. No test_size is given, so
# scikit-learn's default split is used; no random_state is given either, so
# the split is different on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [5]:
# Soft-voting ensemble over the three base models defined above: the predicted
# class is the one with the highest averaged class probability.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf),
    ],
    voting='soft',
)
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('svc', SVC(probability=True))],
                 voting='soft')

In [6]:
from sklearn.metrics import accuracy_score

# Fit every individual model and the voting ensemble on the same training
# data, then print each one's accuracy on the held-out test set.
candidates = (log_clf, rnd_clf, svm_clf, voting_clf)
for model in candidates:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    print(type(model).__name__, test_accuracy)

LogisticRegression 0.8
RandomForestClassifier 1.0
SVC 0.96
VotingClassifier 0.96


### BAGGING AND PASTING

In [7]:
# Training the same algorithm on different random subsets of training set

In [8]:
# When sampling is performed with replacement, this method is called bagging
# (short for bootstrap aggregating). When sampling is performed without
# replacement, it is called pasting.

In [9]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [10]:
# Bagging ensemble of 500 decision trees. Each tree is trained on a bootstrap
# sample (bootstrap=True) whose size is 10% of the training set
# (max_samples=0.1). n_jobs=-1 tells scikit-learn to use all available cores.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=0.1,
    bootstrap=True,
    n_jobs=-1,
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [11]:
# max_samples is set above to a float between 0.0 and 1.0, in which case the max number of instances
# to sample is equal to the size of the training set times max_samples. It can alternatively be set to
# an int, giving the absolute number of instances to sample for each predictor.

### OUT OF BAG EVALUATION

In [12]:
# With bagging, some instances may be sampled several times for any given predictor,
# while others may not be sampled at all. By default a BaggingClassifier samples m
# training instances with replacement (bootstrap=True), where m is the size of the
# training set. This means that only about 63% of the training instances are sampled on
# average for each predictor. The remaining 37% of the training instances that are not
# sampled are called out-of-bag (oob) instances. Note that they are not the same 37%
# for all predictors.

In [13]:
# Since a predictor never sees the oob instances during training, it can be evaluated on
# these instances, without the need for a separate validation set. You can evaluate the
# ensemble itself by averaging out the oob evaluations of each predictor.

In [14]:
# In Scikit-Learn, you can set oob_score=True when creating a BaggingClassifier to
# request an automatic oob evaluation after training. The following code demonstrates
# this. The resulting evaluation score is available through the oob_score_ variable:

In [15]:
# Same bagging setup, but with oob_score=True so that scikit-learn evaluates
# the ensemble on the out-of-bag instances right after training.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
)
bag_clf.fit(X_train, y_train)

# Out-of-bag accuracy estimate; displayed as the cell output.
bag_clf.oob_score_

0.8933333333333333

In [16]:
from sklearn.metrics import accuracy_score
# Compare the out-of-bag estimate above with the accuracy actually obtained
# on the held-out test set.
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.96

In [17]:
# The oob decision function for each training instance is also available through the
# oob_decision_function_ variable. In this case (since the base estimator has a
# predict_proba() method), the decision function returns the class probabilities for each
# training instance: one row per instance, one column per class. For example, in the
# saved output below the first row is [1., 0.], i.e. the oob evaluation estimates that the
# first training instance has a 100% probability of belonging to class 0 (first column)
# and 0% of belonging to class 1. Exact values vary from run to run.

bag_clf.oob_decision_function_

array([[1.        , 0.        ],
       [0.00540541, 0.99459459],
       [1.        , 0.        ],
       [0.2849162 , 0.7150838 ],
       [0.97109827, 0.02890173],
       [0.92473118, 0.07526882],
       [0.50549451, 0.49450549],
       [0.14367816, 0.85632184],
       [0.95      , 0.05      ],
       [0.94857143, 0.05142857],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.01058201, 0.98941799],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.00507614, 0.99492386],
       [0.17751479, 0.82248521],
       [0.22105263, 0.77894737],
       [0.85454545, 0.14545455],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.98888889, 0.01111111],
       [0.07428571, 0.92571429],
       [0.51256281, 0.48743719],
       [1.        , 0.        ],
       [0.98870056, 0.01129944],
       [0.65968586, 0.34031414],
       [1.        , 0.        ],
       [0.22631579, 0.77368421],
       [0.

### RANDOM PATCHES AND RANDOM SUBSPACES

In [18]:
# The BaggingClassifier class supports sampling the features as well. Sampling is
# controlled by two hyperparameters: max_features and bootstrap_features. They
# work the same way as max_samples and bootstrap, but for feature sampling instead
# of instance sampling. Thus, each predictor will be trained on a random subset of the
# input features.

In [19]:
# This technique is particularly useful when you are dealing with high-dimensional
# inputs (such as images). Sampling both training instances and features is called the
# Random Patches method. Keeping all training instances (by setting bootstrap=False
# and max_samples=1.0) but sampling features (by setting bootstrap_features to
# True and/or max_features to a value smaller than 1.0) is called the Random Subspaces
# method.

### RANDOM FORESTS

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees, each limited to at most 16 leaf nodes and
# trained using all available cores.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
)
rnd_clf.fit(X_train, y_train)

# Test-set predictions of the forest.
y_pred_rf = rnd_clf.predict(X_test)

In [21]:
# Report the random forest's test accuracy. Use y_pred_rf (the forest's own
# predictions); y_pred at this point still holds the BaggingClassifier
# predictions from an earlier cell. accuracy_score takes (y_true, y_pred).
print("Testing Score", accuracy_score(y_test, y_pred_rf))

Testing Score 0.96


In [22]:
# Accuracy of the forest on the data it was trained on (not a generalization estimate).
rnd_clf.score(X_train, y_train)

1.0

In [23]:
# With a few exceptions, a RandomForestClassifier has all the hyperparameters of a
# DecisionTreeClassifier (to control how trees are grown), plus all the hyperparameters
# of a BaggingClassifier to control the ensemble itself.

### FEATURE IMPORTANCE

In [24]:
from sklearn.datasets import load_iris

# Train a forest on the full iris dataset and list how much each input
# feature contributes according to feature_importances_.
iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris.data, iris.target)

importances = rnd_clf.feature_importances_
for feature_name, importance in zip(iris.feature_names, importances):
    print(feature_name, importance)

sepal length (cm) 0.09389981023043377
sepal width (cm) 0.024171971629572433
petal length (cm) 0.4423607975498418
petal width (cm) 0.43956742059015197


### BOOSTING

In [25]:
# Boosting (originally called hypothesis boosting) refers to any Ensemble method that
# can combine several weak learners into a strong learner. The general idea of most
# boosting methods is to train predictors sequentially, each trying to correct its predecessor.

In [26]:
# The most popular are AdaBoost (short for Adaptive Boosting) and Gradient Boosting.

#### AdaBoost

In [27]:
# One way for a new predictor to correct its predecessor is to pay a bit more attention
# to the training instances that the predecessor underfitted. This results in new predictors
# focusing more and more on the hard cases. This is the technique used by
# AdaBoost.

In [28]:
# There is one important drawback to this sequential learning technique:
# it cannot be parallelized (or only partially), since each predictor
# can only be trained after the previous predictor has been
# trained and evaluated. As a result, it does not scale as well as bagging
# or pasting.

In [29]:
# Scikit-Learn uses a multiclass version of AdaBoost called SAMME (which stands for
# Stagewise Additive Modeling using a Multiclass Exponential loss function). When there
# are just two classes, SAMME is equivalent to AdaBoost. If the predictors can estimate
# class probabilities (i.e., if they have a predict_proba() method), Scikit-Learn can use
# a variant of SAMME called SAMME.R (the R stands for “Real”), which relies on class
# probabilities rather than predictions and generally performs better.

In [30]:
# The following code trains an AdaBoost classifier based on 200 Decision Stumps using
# Scikit-Learn’s AdaBoostClassifier class (as you might expect, there is also an
# AdaBoostRegressor class). A Decision Stump is a Decision Tree with max_depth=1—in
# other words, a tree composed of a single decision node plus two leaf nodes. This is
# the default base estimator for the AdaBoostClassifier class:

In [31]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost ensemble of 200 decision stumps (max_depth=1 trees) with a
# learning rate of 0.5, trained on the current training split.
# NOTE(review): algorithm='SAMME.R' is deprecated in newer scikit-learn
# releases — verify against the installed version before re-running.
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                            n_estimators=200, algorithm='SAMME.R',
                            learning_rate=0.5)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

In [32]:
# Predict test-set labels with the fitted AdaBoost ensemble.
y_pred = ada_clf.predict(X_test)

In [33]:
# Fraction of test instances that AdaBoost labels correctly.
accuracy_score(y_test, y_pred)

1.0

In [34]:
# Accuracy on the training set itself; compare with the test score to gauge overfitting.
ada_clf.score(X_train, y_train)

1.0

In [35]:
# Test-set accuracy via the estimator's own score(); same quantity as the
# accuracy_score(y_test, y_pred) computed two cells earlier.
ada_clf.score(X_test, y_test)

1.0

### GRADIENT BOOSTING

In [36]:
# this method tries to fit the new predictor to the residual
# errors made by the previous predictor.

In [37]:
# Let’s go through a simple regression example, using Decision Trees as the base predictors
# (of course, Gradient Boosting also works great with regression tasks). This is
# called Gradient Tree Boosting, or Gradient Boosted Regression Trees (GBRT). First, let’s
# fit a DecisionTreeRegressor to the training set (for example, a noisy quadratic training
# set):

In [38]:
from sklearn.tree import DecisionTreeRegressor

# First stage of the hand-built boosted ensemble: a depth-2 regression tree.
# NOTE(review): X and y are still the make_moons data from the first cell
# (y holds class labels), not the noisy quadratic set the text mentions —
# confirm this is intended.
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

DecisionTreeRegressor(max_depth=2)

In [39]:
# Now we'll train a second DTR on the residual errors made by tree_reg1

In [40]:
# Residuals left by the first tree become the targets of the second tree.
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(max_depth=2)

In [41]:
# Then we train a third regressor on the residual errors made by tree_reg2

In [42]:
# Residuals still left after the second tree become the targets of the third tree.
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(max_depth=2)

In [43]:
# Now we have an ensemble containing three trees. It can make predictions
# on a new instance simply by adding up the predictions of all three trees.

In [44]:
from sklearn.ensemble import GradientBoostingRegressor

# The same kind of ensemble built with scikit-learn's GradientBoostingRegressor:
# three depth-2 trees, with learning_rate=1.0 so each tree's contribution is
# not scaled down.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

In [45]:
# The learning_rate hyperparameter scales the contribution of each tree. If you set it
# to a low value, such as 0.1, you will need more trees in the ensemble to fit the training
# set, but the predictions will usually generalize better. This is a regularization technique
# called shrinkage.

In [46]:
# In order to find the optimal number of trees, you can use early stopping.
# A simple way to implement this is to use the staged_predict() method: it
# returns an iterator over the predictions made by the ensemble at each stage of training
# (with one tree, two trees, etc.). The following code trains a GBRT ensemble with
# 120 trees, then measures the validation error at each stage of training to find the optimal
# number of trees, and finally trains another GBRT ensemble using the optimal
# number of trees:

In [48]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [49]:
# Fresh train/validation split for the early-stopping experiments.
# Note: this overwrites the X_train / y_train created by the earlier
# train_test_split cell, and again uses no random_state.
X_train, X_val, y_train, y_val = train_test_split(X, y)

# Deliberately large ensemble (120 trees); the best size is picked afterwards.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=120)

In [71]:
# Validation MSE after each boosting stage: entry i is the error of the
# ensemble truncated to its first i + 1 trees.
errors = [
    mean_squared_error(y_val, stage_pred)
    for stage_pred in gbrt.staged_predict(X_val)
]


In [72]:
# argmin gives the 0-based stage index with the lowest validation error;
# add 1 to turn it into a number of trees.
bst_n_estimators = np.argmin(errors) + 1
bst_n_estimators

35

In [73]:
# Retrain from scratch using only the number of trees that minimised the validation error.
gbrt_best = GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=35)

In [74]:
# Validation-set predictions of the retrained model.
y_best_pred = gbrt_best.predict(X_val)

In [75]:
# score() of a scikit-learn regressor returns R^2 (coefficient of determination), here on the validation set.
gbrt_best.score(X_val, y_val)

0.7176576630357447

In [76]:
# It is also possible to implement early stopping by actually stopping training early
# (instead of training a large number of trees first and then looking back to find the
# optimal number). You can do so by setting warm_start=True, which makes
# Scikit-Learn keep existing trees when the fit() method is called, allowing incremental
# training. The following code stops training when the validation error does not
# improve for five iterations in a row:

In [77]:
# Early stopping with warm_start: grow the ensemble one tree at a time and
# stop as soon as the validation MSE has failed to improve for five
# consecutive iterations.
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True)

min_val_error = float("inf")  # best validation MSE seen so far
error_going_up = 0            # consecutive iterations without improvement
for n_estimators in range(1, 120):
    # With warm_start=True, raising n_estimators and calling fit() again keeps
    # the trees already trained and only adds the missing ones.
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        # Record the new best error. The original cell had a bare
        # `min_val_error` expression here, so the minimum stayed at infinity
        # and the stopping condition below could never be reached.
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break  # early stopping

In [78]:
# The GradientBoostingRegressor class also supports a subsample hyperparameter,
# which specifies the fraction of training instances to be used for training each tree. For
# example, if subsample=0.25, then each tree is trained on 25% of the training instances,
# selected randomly. As you can probably guess by now, this technique trades a
# higher bias for a lower variance. It also speeds up training considerably. This is called
# Stochastic Gradient Boosting.

In [79]:
# It is possible to use Gradient Boosting with other cost functions.
# This is controlled by the loss hyperparameter.

In [91]:
# It is worth noting that an optimized implementation of Gradient Boosting is available
# in the popular Python library XGBoost, which stands for Extreme Gradient Boosting.
# This package was initially developed by Tianqi Chen as part of the Distributed (Deep)
# Machine Learning Community (DMLC), and it aims to be extremely fast, scalable,
# and portable. In fact, XGBoost is often an important component of the winning
# entries in ML competitions. XGBoost’s API is quite similar to Scikit-Learn’s:

### XGBOOST

In [83]:
import xgboost

# Train an XGBoost regressor with default settings on the current training
# split and predict on the validation set.
xgb_reg = xgboost.XGBRegressor()
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)



In [85]:
# Validation-set score of the default XGBoost model (presumably R^2, as for scikit-learn regressors — confirm).
xgb_reg.score(X_val, y_val)

0.7278227693386341

In [86]:
# XGBoost also offers several nice features, such as automatically taking care of early
# stopping:

In [87]:
# Refit while monitoring the metric on the validation set; training stops once
# it has not improved for 2 consecutive rounds.
# NOTE(review): recent XGBoost releases expect early_stopping_rounds to be
# passed to the XGBRegressor constructor rather than to fit() — verify against
# the installed version.
xgb_reg.fit(X_train, y_train,
           eval_set=[(X_val, y_val)], early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_val)

[0]	validation_0-rmse:0.479884
Will train until validation_0-rmse hasn't improved in 2 rounds.
[1]	validation_0-rmse:0.451401
[2]	validation_0-rmse:0.421084
[3]	validation_0-rmse:0.399774
[4]	validation_0-rmse:0.376603
[5]	validation_0-rmse:0.359273
[6]	validation_0-rmse:0.343927
[7]	validation_0-rmse:0.326412
[8]	validation_0-rmse:0.319348
[9]	validation_0-rmse:0.318835
[10]	validation_0-rmse:0.315544
[11]	validation_0-rmse:0.315144
[12]	validation_0-rmse:0.314637
[13]	validation_0-rmse:0.307304
[14]	validation_0-rmse:0.30832
[15]	validation_0-rmse:0.304065
[16]	validation_0-rmse:0.298247
[17]	validation_0-rmse:0.293409
[18]	validation_0-rmse:0.2894
[19]	validation_0-rmse:0.286086
[20]	validation_0-rmse:0.278837
[21]	validation_0-rmse:0.275791
[22]	validation_0-rmse:0.27106
[23]	validation_0-rmse:0.267135
[24]	validation_0-rmse:0.266516
[25]	validation_0-rmse:0.260934
[26]	validation_0-rmse:0.260543
[27]	validation_0-rmse:0.261063
[28]	validation_0-rmse:0.258415
[29]	validation_0-rmse

In [90]:
# Validation-set score after refitting with early stopping.
xgb_reg.score(X_val, y_val)

0.7416505144490212