# MNIST Dataset w/ Voting Classifier

In [1]:
from sklearn.datasets import fetch_openml

X_mnist, y_mnist = fetch_openml('mnist_784', return_X_y=True, as_frame=False)

  warn(


In [2]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(X_mnist, y_mnist, test_size=0.2, stratify=y_mnist)

In [3]:
X_train_f, X_valid, y_train_f, y_valid = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train)

In [4]:
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

In [5]:
rs = 42

rf_clf = RandomForestClassifier(n_estimators=100, random_state=rs)
extra_trees_clf = ExtraTreesClassifier(n_estimators=100, random_state=rs)
svm_clf = LinearSVC(max_iter=100, tol=20, random_state=rs)
mlp_clf = MLPClassifier(random_state=rs)

In [6]:
estimators = [rf_clf, extra_trees_clf, svm_clf, mlp_clf]
for estimator in estimators:
    print("Training the", estimator)
    estimator.fit(X_train_f, y_train_f)

Training the RandomForestClassifier(random_state=42)
Training the ExtraTreesClassifier(random_state=42)
Training the LinearSVC(max_iter=100, random_state=42, tol=20)




Training the MLPClassifier(random_state=42)


In [7]:
ind_scores = [round(estimator.score(X_valid, y_valid), 3) for estimator in estimators]

In [8]:
ind_scores

[0.965, 0.968, 0.809, 0.96]

In [9]:
import numpy as np
np.mean(ind_scores)

0.9255

In [10]:
from sklearn.ensemble import VotingClassifier

named_estimators = [
    ("random_forest_clf", rf_clf),
    ("extra_trees_clf", extra_trees_clf),
    ("svm_clf", svm_clf),
    ("mlp_clf", mlp_clf),
]

voting_clf = VotingClassifier(named_estimators)
voting_clf.fit(X_train_f, y_train_f)



In [11]:
voting_clf.score(X_valid, y_valid)

0.9673214285714286

In [12]:
# Convert the classes to class indices, to evaluate the clones made by VotingClassifier
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y_valid_encoded = encoder.fit_transform(y_valid)

In [13]:
#or

y_valid_encoded = y_valid.astype(np.int64)

In [14]:
encoded_scores = [estimator.score(X_valid, y_valid_encoded) for estimator in voting_clf.estimators_]
encoded_scores

[0.9652678571428571, 0.9680357142857143, 0.809375, 0.9601785714285714]

In [15]:
np.mean(encoded_scores)

0.9257142857142858

In [16]:
# Remove SVM
voting_clf.set_params(svm_clf='drop')

In [17]:
voting_clf.estimators_

[RandomForestClassifier(random_state=42),
 ExtraTreesClassifier(random_state=42),
 LinearSVC(max_iter=100, random_state=42, tol=20),
 MLPClassifier(random_state=42)]

In [18]:
# Remove SVM from the list of trained estimators
svm_clf_trained = voting_clf.named_estimators_.pop("svm_clf")
voting_clf.estimators_.remove(svm_clf_trained)

In [19]:
voting_clf.score(X_valid, y_valid)

0.9694642857142857

In [20]:
# Try Soft Voting
voting_clf.voting = "soft"
voting_clf.score(X_valid, y_valid)

0.9659821428571429

In [21]:
# Hard Voting is better
voting_clf.voting = "hard"
voting_clf.score(X_test, y_test)

0.9720714285714286

In [22]:
[estimator.score(X_test, y_test.astype(np.int64)) for estimator in voting_clf.estimators_]

[0.9665714285714285, 0.9693571428571428, 0.9578571428571429]

In [24]:
# Manual Implementation of Stacking Classifier
X_valid_predictions = np.empty((len(X_valid), len(estimators)), dtype=object)

for index, estimator in enumerate(estimators):
    X_valid_predictions[:, index] = estimator.predict(X_valid)

In [25]:
X_valid_predictions.shape

(11200, 4)

In [26]:
# Create random forest blender
rnd_forest_blender = RandomForestClassifier(n_estimators=200, oob_score=True, random_state=rs)
rnd_forest_blender.fit(X_valid_predictions, y_valid)

In [28]:
# Out-of-bag evaluation Score
rnd_forest_blender.oob_score_

0.9650892857142858

In [29]:
X_test_predictions = np.empty((len(X_test), len(estimators)), dtype=object)

for index, estimator in enumerate(estimators):
    X_test_predictions[:, index] = estimator.predict(X_test)

In [30]:
y_pred = rnd_forest_blender.predict(X_test_predictions)

In [32]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.9689285714285715

In [33]:
# Using sklearn's stacking classifier API

from sklearn.ensemble import StackingClassifier

stack_clf = StackingClassifier(
    named_estimators,
    final_estimator=rnd_forest_blender
)

stack_clf.fit(X_train, y_train)



In [34]:
stack_clf.score(X_test, y_test)

0.9786428571428571