In [45]:
import numpy as np
import sklearn.ensemble as ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVC, LinearSVC
from sklearn.datasets import load_iris, make_moons, make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_mldata
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
X, y = make_regression(n_samples=1000, n_features=10)
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [5]:
log_clf = LogisticRegression()
rnd_clf = ensemble.RandomForestClassifier()
svm_clf = SVC()

voting_clf = ensemble.VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('rf', rnd_clf),
        ('svc', svm_clf)
    ],
    voting='hard'
)

In [6]:
bag_clf = ensemble.BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500, max_samples=100, bootstrap=True, n_jobs=-1, oob_score=True
)

In [7]:
iris = load_iris()
rnd_clf = ensemble.RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])
for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.11282333911948725
sepal width (cm) 0.02531533273520786
petal length (cm) 0.4348743204666668
petal width (cm) 0.4269870076786382


In [8]:
ada_clf = ensemble.AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200, algorithm="SAMME.R", learning_rate=0.5
)

In [9]:
tree_reg1 = DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X, y)

y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X, y2)

y3 = y - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X, y3)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [10]:
y_pred = sum(tree.predict([X[0]]) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [12]:
gbrt = ensemble.GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=1.0, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=3, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [13]:
gbrt = ensemble.GradientBoostingRegressor(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=120, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [14]:
errors = [mean_squared_error(y_test, y_pred) for y_pred in gbrt.staged_predict(X_test)]
bst_n_estimators = np.argmin(errors)

In [15]:
gbrt_best = ensemble.GradientBoostingRegressor(max_depth=2, n_estimators=bst_n_estimators)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=119, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False)

In [16]:
gbrt = ensemble.GradientBoostingRegressor(max_depth=2, warm_start=True)

min_test_error = float('inf')
error_going_up = 0
for n_estimators in range(1, 120):
    gbrt.n_estimators = n_estimators
    gbrt.fit(X_train, y_train)
    y_pred = gbrt.predict(X_test)
    test_error = mean_squared_error(y_test, y_pred)
    if test_error < min_test_error:
        min_val_error = test_error
        error_going_up = 0
    else:
        error_going_up += 1
        if error_going_up == 5:
            break

### Excercise

 Load the MNIST data (introduced in Chapter 3), and split it into a training set, a
validation set, and a test set (e.g., use 40,000 instances for training, 10,000 for
validation, and 10,000 for testing). Then train various classifiers, such as a Random
Forest classifier, an Extra-Trees classifier, and an SVM. Next, try to combine them
into an ensemble that outperforms them all on the validation set, using a soft or hard
voting classifier. Once you have found one, try it on the test setLoad the MNIST data (introduced in Chapter 3), and split it into a training set, a
validation set, and a test set (e.g., use 40,000 instances for training, 10,000 for
validation, and 10,000 for testing). Then train various classifiers, such as a Random
Forest classifier, an Extra-Trees classifier, and an SVM. Next, try to combine them
into an ensemble that outperforms them all on the validation set, using a soft or hard
voting classifier. Once you have found one, try it on the test set. How much better
does it perform compared to the individual classifiers?

In [17]:
mnist = fetch_mldata('MNIST original')
mnist

{'DESCR': 'mldata.org dataset: mnist-original',
 'COL_NAMES': ['label', 'data'],
 'target': array([0., 0., 0., ..., 9., 9., 9.]),
 'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}

In [18]:
print(mnist['data'].shape, mnist['target'].shape)

(70000, 784) (70000,)


In [54]:
def triple_split(X, y, ind1, ind2, ind3):
    shuffled_indices = np.random.permutation(len(X))
    
    X_train, y_train = X[shuffled_indices][:ind1],      y[shuffled_indices][:ind1]
    X_test,  y_test  = X[shuffled_indices][ind1:ind2], y[shuffled_indices][ind1:ind2]
    X_val,   y_val   = X[shuffled_indices][ind2:ind3], y[shuffled_indices][ind2:ind3]

    return X_train, X_test, X_val, y_train, y_test, y_val
  
X_train, X_test, X_val, y_train, y_test, y_val = triple_split(mnist['data'], mnist['target'], 1600, 2000, 2400)    

In [55]:
svm = SVC(kernel='linear', probability=True)
rf = ensemble.RandomForestClassifier()
et = ensemble.ExtraTreesClassifier()

svm.fit(X_train, y_train)
rf.fit(X_train, y_train)
et.fit(X_train, y_train)

print("svm: %f" % accuracy_score(y_test, svm.predict(X_test)))
print("rf: %f" % accuracy_score(y_test, rf.predict(X_test)))
print("et: %f" % accuracy_score(y_test, et.predict(X_test)))

svm: 0.915000
rf: 0.820000
et: 0.847500


In [56]:
voting_clf = ensemble.VotingClassifier(
    estimators=[
#         ('svm', LinearSVC()),
#         ('rf', ensemble.RandomForestClassifier()),
#         ('et', ensemble.ExtraTreesClassifier())
        ('svm', svm),
        ('rf', rf),
        ('et', et)
    ],
    voting='hard')

voting_clf.fit(X_train, y_train)
print("svm+rf+et hard: %f" % accuracy_score(y_test, voting_clf.predict(X_test)))

voting_clf.voting='soft'
voting_clf.fit(X_train, y_train)
print("svm+rf+et soft: %f" % accuracy_score(y_test, voting_clf.predict(X_test)))

svm+rf+et hard: 0.897500
svm+rf+et soft: 0.935000


In [57]:
print("svm: %f" % accuracy_score(y_val, svm.predict(X_val)))
print("rf: %f" % accuracy_score(y_val, rf.predict(X_val)))
print("et: %f" % accuracy_score(y_val, et.predict(X_val)))
print("svm+rf+et soft: %f" % accuracy_score(y_val, voting_clf.predict(X_val)))

svm: 0.887500
rf: 0.830000
et: 0.867500
svm+rf+et soft: 0.910000


Run the individual classifiers from the previous exercise to make predictions on the
validation set, and create a new training set with the resulting predictions: each
training instance is a vector containing the set of predictions from all your classifiers
for an image, and the target is the image’s class. Congratulations, you have just
trained a blender, and together with the classifiers they form a stacking ensemble!
Now let’s evaluate the ensemble on the test set. For each image in the test set, make
predictions with all your classifiers, then feed the predictions to the blender to get the
ensemble’s predictions. How does it compare to the voting classifier you trained
earlier?

In [67]:
# X_test_stack = np.zeros((X_test.shape[0], len(voting_clf.estimators_)))
X_test_stack = np.empty((X_test.shape[0],))

for e in voting_clf.estimators_:
    X_test_stack = np.hstack([X_test_stack, e.predict_proba(X_test)])

ValueError: all the input arrays must have same number of dimensions