# Exercise 8: Voting classifier on MNIST

In [27]:
import numpy as np
import sklearn
from sklearn.datasets import fetch_openml

# Fetch MNIST from OpenML as plain arrays (not a DataFrame): the feature
# matrix first, the label vector second.
X_mnist, y_mnist = fetch_openml(
    "mnist_784",
    return_X_y=True,
    as_frame=False,
    parser="auto",
)

In [32]:
# Fixed positional split: the first 50,000 rows train the models, the next
# 10,000 are held out for validation, and everything from row 60,000 onward
# is the test set.
train_rows = slice(None, 50_000)
val_rows = slice(50_000, 60_000)
test_rows = slice(60_000, None)

X_train, y_train = X_mnist[train_rows], y_mnist[train_rows]
X_val, y_val = X_mnist[val_rows], y_mnist[val_rows]
X_test, y_test = X_mnist[test_rows], y_mnist[test_rows]

In [68]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

# Four different base classifiers, all seeded with the same random_state so
# the run is repeatable.
random_forest_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
extra_trees_clf = ExtraTreesClassifier(
    n_estimators=100,
    random_state=42,
)
# NOTE(review): tol=20 combined with max_iter=100 is a very loose stopping
# rule -- presumably chosen to keep training fast; confirm it is intentional.
svm_clf = LinearSVC(
    dual=True,
    max_iter=100,
    tol=20,
    random_state=42,
)
mlp_clf = MLPClassifier(random_state=42)

In [69]:
# Fit every base classifier on the training split, announcing each one
# before its (potentially slow) fit starts.
estimators = [
    random_forest_clf,
    extra_trees_clf,
    svm_clf,
    mlp_clf,
]
for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(X_train, y_train)

Training the RandomForestClassifier(random_state=42)
Training the ExtraTreesClassifier(random_state=42)
Training the LinearSVC(dual=True, max_iter=100, random_state=42, tol=20)
Training the MLPClassifier(random_state=42)


In [70]:
# Validation accuracy of each individually trained base classifier, in the
# same order as `estimators`.
[clf.score(X_val, y_val) for clf in estimators]

[0.9736, 0.9743, 0.8662, 0.9622]

In [71]:
from sklearn.ensemble import VotingClassifier

# Pair each base classifier with the name the ensemble will know it by.
named_estimators = [
    ("random_forest_clf", random_forest_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]

# No voting mode is passed, so the classifier's default is used. Fitting
# trains the ensemble's own clones (exposed as `estimators_`).
voting_clf = VotingClassifier(estimators=named_estimators)
voting_clf.fit(X_train, y_train)

In [72]:
# Validation accuracy of the voting ensemble with all four members voting.
voting_clf.score(X_val, y_val)

0.975

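Cloning: `VotingClassifier` fits clones of the estimators on label-encoded targets (class indices), so the fitted clones in `estimators_` must be scored against encoded labels: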

In [73]:
from sklearn.preprocessing import LabelEncoder

# VotingClassifier trains its clones on label-encoded targets, so scoring
# those clones directly needs the same encoding. Reuse the encoder the
# ensemble fitted on the training labels (`le_`) instead of fitting a fresh
# LabelEncoder on the validation labels: a fresh fit only reproduces the
# training mapping when the validation split happens to contain every class.
encoder = voting_clf.le_
y_val_encoded = encoder.transform(y_val)

In [74]:
# Rebinds y_val_encoded with a plain integer cast of the labels.
# NOTE(review): this assumes the labels are digit strings whose integer value
# equals the class index (true for MNIST's '0'..'9'), making the cast
# equivalent to label-encoding -- confirm before reusing on other datasets.
y_val_encoded = y_val.astype(np.int64)

In [75]:
# Validation accuracy of each fitted clone held by the ensemble, scored
# against the integer-encoded labels.
[fitted_clf.score(X_val, y_val_encoded) for fitted_clf in voting_clf.estimators_]

[0.9736, 0.9743, 0.8662, 0.9622]

In [76]:
# Mark the SVM slot as dropped in the ensemble's configuration. This updates
# the `estimators` parameter only; the already-fitted clone is removed from
# the fitted attributes separately further down.
voting_clf.set_params(svm_clf="drop")

In [77]:
# Inspect the configured estimator list; the SVM entry should now read 'drop'.
voting_clf.estimators

[('random_forest_clf', RandomForestClassifier(random_state=42)),
 ('extra_trees_clf', ExtraTreesClassifier(random_state=42)),
 ('svm_clf', 'drop'),
 ('mlp_clf', MLPClassifier(random_state=42))]

In [78]:
# Remove the already-fitted SVM clone from the ensemble's fitted state so that
# predictions stop using it without refitting the other members.
# Guarded on membership so the cell is idempotent: re-running it after the
# clone is gone is a no-op instead of a KeyError, and `svm_clf_trained` keeps
# pointing at the removed clone.
if "svm_clf" in voting_clf.named_estimators_:
    svm_clf_trained = voting_clf.named_estimators_.pop("svm_clf")
    voting_clf.estimators_.remove(svm_clf_trained)

In [79]:
# Validation accuracy again, now with the SVM removed from the fitted ensemble.
voting_clf.score(X_val, y_val)

0.977

In [80]:
# Switch the ensemble to soft voting in place, without refitting.
# NOTE(review): soft voting needs class probabilities from every remaining
# member; presumably this is why the LinearSVC was dropped first -- confirm.
voting_clf.voting = "soft"

In [81]:
# Validation accuracy under soft voting, for comparison with hard voting.
voting_clf.score(X_val, y_val)

0.9712

In [82]:
# Go back to hard voting (it scored higher on validation in the recorded
# outputs: 0.977 vs 0.9712) and evaluate the ensemble on the test split.
voting_clf.voting = "hard"
voting_clf.score(X_test, y_test)

0.9728

In [83]:
# The fitted clones were trained on integer class indices, so the test labels
# must be cast to integers before scoring them. Do the cast once up front
# (mirroring `y_val_encoded`) rather than once per estimator inside the
# comprehension.
y_test_encoded = y_test.astype(np.int64)
[fitted_clf.score(X_test, y_test_encoded) for fitted_clf in voting_clf.estimators_]

[0.968, 0.9703, 0.9616]

# Exercise 9: Stacking ensemble (blender) on MNIST

In [85]:
# Build the blender's training matrix: one row per validation sample and one
# column per base estimator, each cell holding that estimator's predicted label.
prediction_shape = (len(X_val), len(estimators))
X_valid_predictions = np.empty(prediction_shape, dtype=object)

for index, estimator in enumerate(estimators):
    column = estimator.predict(X_val)
    X_valid_predictions[:, index] = column

In [86]:
# Peek at the prediction matrix (rows: validation samples, columns: base
# estimators in the order of `estimators`).
X_valid_predictions

array([['3', '3', '3', '3'],
       ['8', '8', '8', '8'],
       ['6', '6', '6', '6'],
       ...,
       ['5', '5', '5', '5'],
       ['6', '6', '6', '6'],
       ['8', '8', '8', '8']], dtype=object)

In [87]:
# Blender: a random forest trained on the base estimators' validation
# predictions, with out-of-bag scoring switched on.
rnd_forest_blender = RandomForestClassifier(
    n_estimators=200,
    oob_score=True,
    random_state=42,
)
rnd_forest_blender.fit(X_valid_predictions, y_val)

In [88]:
# Out-of-bag score of the blender (available because oob_score=True was set).
rnd_forest_blender.oob_score_

0.9722

In [89]:
# Same construction as the validation prediction matrix, but for the test
# split: one column of predicted labels per base estimator.
prediction_shape = (len(X_test), len(estimators))
X_test_predictions = np.empty(prediction_shape, dtype=object)

for index, estimator in enumerate(estimators):
    column = estimator.predict(X_test)
    X_test_predictions[:, index] = column

In [90]:
# Final blended predictions: the blender maps the base estimators' test-set
# predictions to a single label per sample.
y_pred = rnd_forest_blender.predict(X_test_predictions)

In [92]:
from sklearn.metrics import accuracy_score

# Test accuracy of the hand-built blending ensemble.
accuracy_score(y_test, y_pred)

0.9685

In [93]:
# Same metric via the blender's own score method; the recorded output matches
# the accuracy_score result (0.9685).
rnd_forest_blender.score(X_test_predictions, y_test)

0.9685

In [94]:
# Training and validation rows combined (the first 60,000 samples).
# NOTE(review): presumably no separate hold-out is needed here because
# StackingClassifier builds the blender's inputs by internal cross-validation
# -- confirm.
X_train_full, y_train_full = X_mnist[:60_000], y_mnist[:60_000]

In [96]:
from sklearn.ensemble import StackingClassifier

# Stacking ensemble over the same named base classifiers, with the random
# forest blender as the final estimator; fitted on train + validation rows.
stack_clf = StackingClassifier(
    estimators=named_estimators,
    final_estimator=rnd_forest_blender,
)
stack_clf.fit(X_train_full, y_train_full)

In [97]:
# Test accuracy of the StackingClassifier ensemble.
stack_clf.score(X_test, y_test)

0.9793