In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import numpy as np

np.random.seed(42)

log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(random_state=42)

voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf),
                                          ('svc', svm_clf)], voting='hard')
voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=42, solver='warn',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',
                                             

In [3]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))



LogisticRegression 0.864
RandomForestClassifier 0.872
SVC 0.888
VotingClassifier 0.896




In [4]:
log_clf = LogisticRegression(random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)
svm_clf = SVC(probability=True, random_state=42)

voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('rf', rnd_clf),
                                          ('svc', svm_clf)], voting='soft')
voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=42, solver='warn',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False)),
                             ('rf',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',
                                             

In [5]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))



LogisticRegression 0.864
RandomForestClassifier 0.872
SVC 0.888
VotingClassifier 0.912


In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

bag_clf = BaggingClassifier(DecisionTreeClassifier(random_state=42),
                           n_estimators=500, max_samples=100,
                            bootstrap=True, random_state=42)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [7]:
print(accuracy_score(y_test, y_pred))

0.904


In [8]:
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)
y_pred = tree_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.856


In [9]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(random_state=42),
                           n_estimators=500, bootstrap=True, 
                           oob_score=True, random_state=42)
bag_clf.fit(X_train, y_train)
print(bag_clf.oob_score_)
y_pred = bag_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8986666666666666
0.912


In [10]:
bag_clf.oob_decision_function_[:5]

array([[0.32352941, 0.67647059],
       [0.35625   , 0.64375   ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ]])

In [11]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(splitter="random", max_leaf_nodes=16, random_state=42),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1, random_state=42)
bag_clf.fit(X_train, y_train)
y_pred_bag = bag_clf.predict(X_test)

In [12]:
from sklearn.ensemble import RandomForestClassifier

rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)
rnd_clf.fit(X_train, y_train)
y_pred_rnd = rnd_clf.predict(X_test)

In [13]:
print(accuracy_score(y_test, y_pred_bag))
print(accuracy_score(y_test, y_pred_rnd))

0.92
0.912


In [14]:
np.sum(y_pred_bag == y_pred_rnd) / len(y_pred)

0.976

In [15]:
from sklearn.datasets import load_iris

iris = load_iris()
rnd_clf = RandomForestClassifier(n_estimators=500, random_state=42)
rnd_clf.fit(iris['data'], iris['target'])
for name, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(name, round(score, 5))

sepal length (cm) 0.11249
sepal width (cm) 0.02312
petal length (cm) 0.44103
petal width (cm) 0.42336


In [16]:
from sklearn.ensemble import AdaBoostClassifier

tree_clf = DecisionTreeClassifier(max_depth=1, random_state=42)
ada_clf = AdaBoostClassifier(tree_clf, n_estimators=200, algorithm='SAMME.R',
                            learning_rate=0.3)
_ = ada_clf.fit(X_train, y_train)

In [17]:
y_pred = ada_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.904


In [18]:
np.random.seed(42)
X = np.random.rand(100, 1) - 0.5
y = 3*X[:, 0]**2 + 0.05 * np.random.randn(100)

In [19]:
from sklearn.tree import DecisionTreeRegressor

tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg1.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=42, splitter='best')

In [20]:
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg2.fit(X, y2)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=42, splitter='best')

In [21]:
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=42)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=42, splitter='best')

In [22]:
X_new = np.array([[0.8]])
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))
print(y_pred)

[0.75026781]


In [23]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3,
                                learning_rate=1.0, random_state=42)
gbrt.fit(X, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=1.0, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=3,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X_train, X_val, y_train, y_val = train_test_split(X, y)
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=120,
                                 random_state=42)
gbrt.fit(X_train, y_train)
errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]
best_n_estimators = np.argmin(errors)
gbrt_best = GradientBoostingRegressor(max_depth=2,
                                     n_estimators=best_n_estimators,
                                     random_state=42)
gbrt.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=120,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [25]:
gbrt = GradientBoostingRegressor(max_depth=2, warm_start=True,
                                random_state=42)
min_val_error = float('inf')
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_val)
    val_error = mean_squared_error(y_val, y_pred)
    if val_error < min_val_error:
        min_val_error = val_error
        error_going_up = 0
        best_n_estimators = n_estimators
    else:
        error_going_up += 1
        if error_going_up == 5:
            break # early stopping
# best n_estimators
gbrt.n_estimators -= 5

In [26]:
try:
    import xgboost
except ImportError as ex:
    print("Error: the xgboost library is not installed.")
    xgboost = None

In [27]:
xgb_reg = xgboost.XGBRegressor(random_state=42)
xgb_reg.fit(X_train, y_train)
y_pred = xgb_reg.predict(X_val)
val_error = mean_squared_error(y_val, y_pred)
print("Validation MSE:", val_error)

Validation MSE: 0.0027716804724210293


In [28]:
xgb_reg.fit(X_train, y_train,
            eval_set=[(X_val, y_val)], early_stopping_rounds=2)
y_pred = xgb_reg.predict(X_val)
val_error = mean_squared_error(y_val, y_pred)
print("Validation MSE:", val_error)

[0]	validation_0-rmse:0.265537
Will train until validation_0-rmse hasn't improved in 2 rounds.
[1]	validation_0-rmse:0.240383
[2]	validation_0-rmse:0.217953
[3]	validation_0-rmse:0.197928
[4]	validation_0-rmse:0.17992
[5]	validation_0-rmse:0.164447
[6]	validation_0-rmse:0.151183
[7]	validation_0-rmse:0.139264
[8]	validation_0-rmse:0.128934
[9]	validation_0-rmse:0.119841
[10]	validation_0-rmse:0.111846
[11]	validation_0-rmse:0.104896
[12]	validation_0-rmse:0.098699
[13]	validation_0-rmse:0.093029
[14]	validation_0-rmse:0.088443
[15]	validation_0-rmse:0.083631
[16]	validation_0-rmse:0.080011
[17]	validation_0-rmse:0.076111
[18]	validation_0-rmse:0.073629
[19]	validation_0-rmse:0.070929
[20]	validation_0-rmse:0.068775
[21]	validation_0-rmse:0.066377
[22]	validation_0-rmse:0.064613
[23]	validation_0-rmse:0.06317
[24]	validation_0-rmse:0.061713
[25]	validation_0-rmse:0.06016
[26]	validation_0-rmse:0.059017
[27]	validation_0-rmse:0.058073
[28]	validation_0-rmse:0.057446
[29]	validation_0-rms

In [29]:
try:
    import lightgbm
except ImportError as ex:
    print("Error: the lightgbm library is not installed.")

In [30]:
dtrain = lightgbm.Dataset(X_train, label=y_train)
dvalid = lightgbm.Dataset(X_val, label=y_val)
param = {'num_leaves':2, 'objective':'binary',
        'metric':'auc'}
num_round = 50
lgbm_clf = lightgbm.train(param, dtrain, num_round,
                         valid_sets=[dvalid],
                         early_stopping_rounds=5,
                         verbose_eval=False)

In [31]:
y_pred = lgbm_clf.predict(X_val)
print(mean_squared_error(y_val, y_pred))
print(accuracy_score(y_val, y_pred))

0.31091347426836835


ValueError: continuous is not supported

# Exercises

1) Yes, if they make different kinds of errors, but not really if they make similar types of errors. Because they trained on the exact same data, they could be too correlated to make an improvement in an ensemble.

2) Hard voting sums the predicted classes while soft voting sums the predicted probabilities.

3) We can parallelise bagging and pasting ensembles. This includes Random Forests. In Boosting, each predictor is dependent on the results of the previous predictor, so parallelising will not work. In stacking, we can parallelise between models in one layer, but a layer will depend on the results of the previous layer.

4) This means we do not need a seperate validation set to estimate the generalisation error since the models were not trained on the oob data.

5) In Random Forests, a random subset of the features is considered for splitting at each node. In Extra-Trees, it uses random thresholds for each feature rather than searching for best threshholds for each feature, making it more random (trading higher bias for lower variance) and much faster to train.

6) AdaBoost works by taking the misclassified instances from a predictor and increasing their weights to then train another model on the updated weights and so on. When underfitting, we can try increasing the number of estimators or regularise the models used less (make the decision stump a decision tree by increasing max_depth etc).

7) Decrease the learning rate and use early stopping to find the optimal number of estimators.



## 8)

In [32]:
try:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1)
    mnist.target = mnist.target.astype(np.int64)
except ImportError:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')

In [33]:
X_train_val, X_test, y_train_val, y_test = train_test_split(
    mnist.data, mnist.target, test_size=10000, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=10000, random_state=42)

In [34]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier

In [35]:
forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)
extra_trees_clf = ExtraTreesClassifier(n_estimators=10, random_state=42)
lin_svc_clf = LinearSVC(random_state=42)
poly_svc_clf = SVC(kernel='poly', degree=3, random_state=42)
mlp_clf = MLPClassifier(random_state=42)

In [36]:
estimators = [forest_clf, extra_trees_clf, lin_svc_clf, mlp_clf]
for estimator in estimators:
    print('Training:', estimator)
    estimator.fit(X_train, y_train)
print('Scores: Random Forest, Extra Trees, LinearSVC, Polynomial SVC, MLP')


Training: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
Training: ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=None, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                     oob_score=False, random_state=42, verbose=0,
                     warm_start=False)
Training: LinearS



Training: MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=42, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)
Scores: Random Forest, Extra Trees, LinearSVC, Polynomial SVC, MLP


[0.9469, 0.9492, 0.8626, 0.9617]

In [None]:
[estimator.score(X_val, y_val) for estimator in estimators]

In [37]:
from sklearn.ensemble import VotingClassifier

estimators = [('random_forest', forest_clf), ('extra_trees', extra_trees_clf),
             ('mlp', mlp_clf)]
voting_clf = VotingClassifier(estimators, voting='soft')
voting_clf.fit(X_train, y_train)
print(voting_clf.score(X_val, y_val))

0.9698


In [38]:
# test individual classifiers on test set
[estimator.score(X_test, y_test) for estimator in voting_clf.estimators_]

[0.9437, 0.9474, 0.9599]

In [39]:
print(voting_clf.score(X_test, y_test))

0.9665


## 9)

In [41]:
# create 4 training instances: each instance is a vector of predictions for each classifier (target is y_val)
estimators = [forest_clf, extra_trees_clf, lin_svc_clf, mlp_clf]
X_val_pred = np.empty((len(X_val), len(estimators)), dtype=np.float32)
for index, estimator in enumerate(estimators):
    X_val_pred[:, index] = estimator.predict(X_val)

In [42]:
rnd_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
rnd_forest_blender.fit(X_val_pred, y_val)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=True, random_state=42, verbose=0,
                       warm_start=False)

In [43]:
X_test_pred = np.empty((len(X_test), len(estimators)), dtype=np.float32)
for index, estimator in enumerate(estimators):
    X_test_pred[:, index] = estimator.predict(X_test)
#y_pred = rnd_forest_blender.predict(X_test_pred)
print(rnd_forest_blender.score(X_test_pred, y_test))

0.9625


In [44]:
estimators = [forest_clf, extra_trees_clf, mlp_clf]
X_val_pred = np.empty((len(X_val), len(estimators)), dtype=np.float32)
for index, estimator in enumerate(estimators):
    X_val_pred[:, index] = estimator.predict(X_val)
rnd_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=42)
rnd_forest_blender.fit(X_val_pred, y_val)
X_test_pred = np.empty((len(X_test), len(estimators)), dtype=np.float32)
for index, estimator in enumerate(estimators):
    X_test_pred[:, index] = estimator.predict(X_test)
#y_pred = rnd_forest_blender.predict(X_test_pred)
print(rnd_forest_blender.score(X_test_pred, y_test))

0.9627
