In [60]:
# Fetch MNIST from OpenML as NumPy arrays and carve out train / validation / test splits
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split

mnist = fetch_openml("mnist_784", as_frame=False)
X = mnist.data
y = mnist.target
print(f"Shape of the full dataset: {X.shape}")
print(f"Shape of the full label set: {y.shape}")

# Hold out 15% of all samples as the test set, then 15% of the remainder as the
# validation set. The fixed random_state keeps both splits reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=42
)

print(f"Shape of the training data: {X_train.shape}")
print(f"Shape of the training labels: {y_train.shape}")

print(f"Shape of the validation data: {X_val.shape}")
print(f"Shape of the validation labels: {y_val.shape}")

print(f"Shape of the test data: {X_test.shape}")
print(f"Shape of the test labels: {y_test.shape}")

Shape of the full dataset: (70000, 784)
Shape of the full label set: (70000,)
Shape of the training data: (50575, 784)
Shape of the training labels: (50575,)
Shape of the validation data: (8925, 784)
Shape of the validation labels: (8925,)
Shape of the test data: (10500, 784)
Shape of the test labels: (10500,)


In [61]:
# Load in the trained models
# The pickle holds an already-fitted voting ensemble; nothing is re-trained in this notebook.
# NOTE(review): joblib.load unpickles the file, and unpickling can execute arbitrary code,
# so only load artifacts from a trusted source. Presumably this file was produced by an
# earlier notebook in the same project - confirm its provenance.
import joblib

voting_clf_with_mlp = joblib.load('exercise_models/voting_clf_with_mlp.pkl')

In [62]:
###### How to evaluate each model ######
# Internally, the voting classifier converts the labels to integer indices,
# e.g. ['dog', 'cat', 'bird'] become [2, 1, 0] (classes are sorted: bird=0, cat=1, dog=2).
# So when evaluating each sub-model directly, we need to apply the same conversion.
#
# A throwaway encoder is built right here so that this cell works on a fresh kernel;
# it must not rely on an `encoder` variable that is only created in a later cell.
from sklearn.preprocessing import LabelEncoder

LabelEncoder().fit_transform(['dog', 'cat', 'bird'])

array([2, 1, 0])

In [63]:
# Peek at the raw validation labels before encoding (the output shows digit strings, dtype=object)
y_val

array(['3', '8', '4', ..., '7', '6', '0'], dtype=object)

In [64]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Reuse the label encoder that the voting classifier fitted during training (`le_`)
# instead of fitting a new one on the validation labels. This guarantees that the
# integer codes match what the sub-estimators were trained on, and it raises an error
# if the validation set contains a label that was never seen in training.
encoder = voting_clf_with_mlp.le_
y_val_encoded = encoder.transform(y_val)  # digit strings -> integer class indices

In [65]:
# Sanity-check the encoded validation labels (the output shows integers instead of strings)
y_val_encoded

array([3, 8, 4, ..., 7, 6, 0])

In [66]:
# Score every fitted sub-estimator on the validation split, then the ensemble itself.
# The sub-estimators are scored against the integer-encoded labels, whereas the
# voting classifier is scored against the original string labels.
# NOTE(review): the recorded ensemble score (0.90) is lower than three of its four members
# and a `prob /= prob.sum(...)` warning was emitted; presumably one member returns
# degenerate probabilities under soft voting - confirm.
for name, fitted_est in voting_clf_with_mlp.named_estimators_.items():
    member_score = fitted_est.score(X_val, y_val_encoded)
    print(f"{name} validation score : {member_score}")

ensemble_score = voting_clf_with_mlp.score(X_val, y_val)
print(f"voting_clf_with_mlp validation score : {ensemble_score}")

forest_clf validation score : 0.9687394957983193
extra_clf validation score : 0.9718767507002801
sgd_clf validation score : 0.8677871148459384
mlp_clf validation score : 0.9611204481792717
voting_clf_with_mlp validation score : 0.9019607843137255


  prob /= prob.sum(axis=1).reshape((prob.shape[0], -1))


In [67]:
# List the fitted sub-estimators held by the voting classifier, keyed by their names
voting_clf_with_mlp.named_estimators_

{'forest_clf': RandomForestClassifier(random_state=42),
 'extra_clf': ExtraTreesClassifier(random_state=42),
 'sgd_clf': SGDClassifier(loss='log_loss', random_state=42),
 'mlp_clf': MLPClassifier(random_state=42)}

In [68]:
# Switch the already-fitted ensemble to hard voting and score it again on the
# validation split (no re-training happens here).
voting_clf_with_mlp.set_params(voting="hard")
voting_clf_with_mlp.score(X_val, y_val)

0.9710924369747899

In [69]:
# Hard voting still trails the extra-trees classifier on its own.
# Perhaps removing the weakest member (the SGD classifier) will lift the accuracy.

# Mark the SGD classifier as dropped in the ensemble's estimator specification,
# then show the specification next to the fitted estimators for comparison.
voting_clf_with_mlp.set_params(sgd_clf="drop")
print(
    voting_clf_with_mlp.estimators,
    voting_clf_with_mlp.estimators_,
    voting_clf_with_mlp.named_estimators_,
    sep="\n",
)

[('forest_clf', RandomForestClassifier(random_state=42)), ('extra_clf', ExtraTreesClassifier(random_state=42)), ('sgd_clf', 'drop'), ('mlp_clf', MLPClassifier(random_state=42))]
[RandomForestClassifier(random_state=42), ExtraTreesClassifier(random_state=42), SGDClassifier(loss='log_loss', random_state=42), MLPClassifier(random_state=42)]
{'forest_clf': RandomForestClassifier(random_state=42), 'extra_clf': ExtraTreesClassifier(random_state=42), 'sgd_clf': SGDClassifier(loss='log_loss', random_state=42), 'mlp_clf': MLPClassifier(random_state=42)}


In [70]:
# set_params(sgd_clf="drop") only changed the estimator specification (`estimators`);
# the fitted `estimators_` list and the `named_estimators_` mapping still contain the
# trained SGD classifier, so it has to be removed from both by hand.
#
# The membership check makes the cell safe to re-run: once the SGD classifier is gone,
# running it again is a no-op instead of raising KeyError on the pop.
if 'sgd_clf' in voting_clf_with_mlp.named_estimators_:
    sgd_clf_trained = voting_clf_with_mlp.named_estimators_.pop('sgd_clf')
    voting_clf_with_mlp.estimators_.remove(sgd_clf_trained)

print(voting_clf_with_mlp.estimators)
print(voting_clf_with_mlp.estimators_)

[('forest_clf', RandomForestClassifier(random_state=42)), ('extra_clf', ExtraTreesClassifier(random_state=42)), ('sgd_clf', 'drop'), ('mlp_clf', MLPClassifier(random_state=42))]
[RandomForestClassifier(random_state=42), ExtraTreesClassifier(random_state=42), MLPClassifier(random_state=42)]


In [71]:
# Now we re-evaluate the voting classifier on the validation split,
# this time with the SGD classifier removed from its fitted estimators
voting_clf_with_mlp.score(X_val, y_val)

0.9732212885154061

In [None]:
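# Boom shakalaka: without the SGD classifier, hard voting reaches 0.9732 on the validation set,
# up from 0.9711 and now ahead of the best single model (extra trees, 0.9719).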

In [72]:
# Persist the modified ensemble under a new file name so the original pickle is left untouched.
# The saved object carries the in-place changes made above (voting set to "hard",
# SGD classifier removed from the fitted estimators).
joblib.dump(voting_clf_with_mlp, "exercise_models/voting_clf_without_sgd.pkl")

['exercise_models/voting_clf_without_sgd.pkl']