## Voting Classifiers

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [3]:
# composed of 3 diverse classifiers
# hard voting 

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.ensemble import VotingClassifier

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
    estimators = [('ln', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting = 'hard')

voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('ln', LogisticRegression()),
                             ('rf', RandomForestClassifier()), ('svc', SVC())])

In [4]:
# accuracy test

from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))
    
# voting had the most accurate result

LogisticRegression 0.864
RandomForestClassifier 0.904
SVC 0.896
VotingClassifier 0.904


In [5]:
# Soft voting
log_clf = LogisticRegression(solver="lbfgs", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=100, random_state=42)
svm_clf = SVC(gamma="scale", probability=True, random_state=42)

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')
voting_clf.fit(X_train, y_train)

from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.896
SVC 0.896
VotingClassifier 0.92


## Bagging and Pasting

Bagging: sampling is performed with replacement \
Pasting: sampling is performed without replacement

In [6]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators = 500, # ensemble of 500 Decision Tree classifiers
    max_samples = 100, bootstrap = True, n_jobs = -1) # each is trained on 100 training instances, with replacement
# n_jobs = -1 : use all available cores

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [7]:
# Out-of-Bag (oob) Evaluation = remaining instances
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators = 500,
    bootstrap = True, n_jobs = -1, oob_score = True) # request automatic oob evaluation after training

bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

# according to oob evaluation, BaggingClassifier is likely to achieve about 89.6% accuracy

0.896

In [8]:
from sklearn.metrics import accuracy_score
y_predict = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.912

In [10]:
# oob decision function for each training instance

bag_clf.oob_decision_function_
#[negative class, positive class]

array([[0.38121547, 0.61878453],
       [0.31666667, 0.68333333],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.00552486, 0.99447514],
       [0.0631068 , 0.9368932 ],
       [0.45505618, 0.54494382],
       [0.01612903, 0.98387097],
       [1.        , 0.        ],
       [0.98863636, 0.01136364],
       [0.79532164, 0.20467836],
       [0.01086957, 0.98913043],
       [0.775     , 0.225     ],
       [0.85628743, 0.14371257],
       [0.96666667, 0.03333333],
       [0.05487805, 0.94512195],
       [0.        , 1.        ],
       [0.99408284, 0.00591716],
       [0.95      , 0.05      ],
       [1.        , 0.        ],
       [0.02116402, 0.97883598],
       [0.27225131, 0.72774869],
       [0.93181818, 0.06818182],
       [1.        , 0.        ],
       [0.97727273, 0.02272727],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.70899471, 0.29100529],
       [0.

## Random Forests

In [13]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators = 500, max_leaf_nodes = 16, n_jobs = -1)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [15]:
# can be written in this way:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter = "random", max_leaf_nodes = 16),
    n_estimators = 500, max_samples = 1.0, bootstrap = True, n_jobs = -1)

## Feature Importance

In random forest, feature's importance is measured by looking at how much the tree nodes that use that feature reduce impurity on average.

In [17]:
# using iris data
from sklearn.datasets import load_iris

iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators = 500, n_jobs = -1)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)
    
# petal length (44%) and petal width (42%) are the most important

sepal length (cm) 0.10683373284303369
sepal width (cm) 0.026560042227229505
petal length (cm) 0.40142169192197735
petal width (cm) 0.4651845330077594


## Boosting

Ensemble method that combine weak learners into a strong learner.

### AdaBoost

Pay more attention to the training instances that the precedessor underfitted --> new predictors focusing more and more on the hard cases
- Adaboost adds predictors to the ensemble gradually making it better. Tweak the instance weight at every iteration.

In [18]:
from sklearn.ensemble import AdaBoostClassifier

ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth = 1), n_estimators = 200,
    algorithm = "SAMME.R", learning_rate = 0.5) # SAMME = AdaBoost when there are only 2 classes, R = "real"
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

### Gradient Boosting

Gradient Boosting adds predictors sequentially to an ensemble, each one correcting its precessor. 
- Difference from the AdaBoost is that the method tries to fit the new predictor to the residual errors made by the previous predictor.

In [25]:
# Gradient Tree Boosting
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

import numpy as np

from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth = 2)
tree_reg1.fit(X, y)

y2 = y - tree_reg1.predict(X) # residual error made by the first predictor
tree_reg2 = DecisionTreeRegressor(max_depth = 2)
tree_reg2.fit(X, y2)

y3 = y2 - tree_reg2.predict(X) # residual error made by the second predictor
tree_reg3 = DecisionTreeRegressor(max_depth = 2)
tree_reg3.fit(X, y3)

X_new = np.array([[0.8]])
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))
y_pred

array([0.75026781])

In [28]:
from sklearn.ensemble import GradientBoostingRegressor # simpler way

gbrt = GradientBoostingRegressor(max_depth = 2, n_estimators = 3, learning_rate = 1.0)
gbrt.fit(X, y)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

In [30]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y)

gbrt = GradientBoostingRegressor(max_depth = 2, n_estimators = 120)
gbrt.fit(X, y)

errors = [mean_squared_error(y_val, y_pred)
         for y_pred in gbrt.staged_predict(X_val)] 
# staged_predict = returns an iterator over the predictions made by the ensemble at each stage of training
bst_n_estimators = np.argmin(errors) + 1

gbrt_best = GradientBoostingRegressor(max_depth = 2, n_estimators = bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(max_depth=2, n_estimators=120)

In [31]:
# early stopping

grbt = GradientBoostingRegressor(max_depth = 2, warm_start = True)
# warm_start = keep existing trees when fit() method is called

min_val_error = float("inf")
error_going_up = 0
for n_estimators in range(1, 120):
    grbt.n_estimators = n_estimators
    grbt.fit(X_train, y_train)
    y_pred = grbt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:  # does not improve 5 iterations in a row
            break # early stopping

In [33]:
print(gbrt.n_estimators)

120


In [34]:
print("Minimum validation MSE:", min_val_error)

Minimum validation MSE: 0.002750279033345716
