In [17]:
import sklearn

from sklearn.datasets import fetch_openml



# Keep the full Bunch instead of only the (X, y) pair: a later cell prints
# `mnist.DESCR`, and with `return_X_y=True` the name `mnist` was never bound,
# so that cell raised NameError on a fresh kernel.
mnist = fetch_openml('mnist_784', as_frame=False, parser='auto')

# Feature matrix and label vector as NumPy arrays (as_frame=False) -- the same
# two objects that `return_X_y=True` used to return.
X_mnist, y_mnist = mnist.data, mnist.target

In [18]:
# Show the dataset's own description text (the DESCR attribute).
# NOTE(review): requires `mnist` to be the full object returned by
# fetch_openml, not the (X, y) pair -- confirm it is defined before this cell.
print(mnist.DESCR)

**Author**: Yann LeCun, Corinna Cortes, Christopher J.C. Burges  
**Source**: [MNIST Website](http://yann.lecun.com/exdb/mnist/) - Date unknown  
**Please cite**:  

The MNIST database of handwritten digits with 784 features, raw data available at: http://yann.lecun.com/exdb/mnist/. It can be split in a training set of the first 60,000 examples, and a test set of 10,000 examples  

It is a subset of a larger set available from NIST. The digits have been size-normalized and centered in a fixed-size image. It is a good database for people who want to try learning techniques and pattern recognition methods on real-world data while spending minimal efforts on preprocessing and formatting. The original black and white (bilevel) images from NIST were size normalized to fit in a 20x20 pixel box while preserving their aspect ratio. The resulting images contain grey levels as a result of the anti-aliasing technique used by the normalization algorithm. the images were centered in a 28x28 image b

In [22]:
# Split into train, validation, and test sets

from sklearn.model_selection import train_test_split

# Positional split: the first 50,000 samples are for training, the next
# 10,000 for validation, and everything from index 60,000 on is the test set.
train_idx = slice(None, 50_000)
valid_idx = slice(50_000, 60_000)
test_idx = slice(60_000, None)

X_train, X_valid, X_test = X_mnist[train_idx], X_mnist[valid_idx], X_mnist[test_idx]
y_train, y_valid, y_test = y_mnist[train_idx], y_mnist[valid_idx], y_mnist[test_idx]



In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import LinearSVC

# Two 100-tree ensembles and one linear SVM, all seeded with 42 so that
# repeated runs build the same models.
rndf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
ext_clf = ExtraTreesClassifier(random_state=42, n_estimators=100)
# The SVM uses the dual formulation and is capped at 100 solver iterations.
svm_clf = LinearSVC(random_state=42, dual=True, max_iter=100)
                                  


In [30]:
estimators = [rndf_clf, ext_clf, svm_clf]

# Fit every base model on the training split, announcing each one first.
for clf in estimators:
    print("Training the", clf)
    clf.fit(X_train, y_train)

Training the RandomForestClassifier(random_state=42)
Training the ExtraTreesClassifier(random_state=42)
Training the LinearSVC(dual=True, max_iter=100, random_state=42)




In [32]:
# Validation accuracy of each individually trained model, in list order.
[clf.score(X_valid, y_valid) for clf in estimators]

[0.9736, 0.9743, 0.8685]

In [38]:
from sklearn.ensemble import VotingClassifier

# (name, estimator) pairs passed to the voting ensemble in the next cell.
named_estimators = [("rndf_clf", rndf_clf), ("ext_clf", ext_clf), ("svm_clf", svm_clf)]

In [40]:
# Build the ensemble from the named models; no voting mode is given, so the
# library default applies.
voting_clf = VotingClassifier(estimators=named_estimators)

In [42]:
# Fit the voting ensemble on the training split.
voting_clf.fit(X_train, y_train)



In [44]:
# Validation accuracy of the full three-model ensemble.
voting_clf.score(X_valid, y_valid)

0.9743

In [46]:
from sklearn.preprocessing import LabelEncoder

# The fitted sub-estimators in `voting_clf.estimators_` were trained on labels
# encoded by the ensemble's own LabelEncoder, exposed as `voting_clf.le_`.
# Reuse that encoder instead of fitting a fresh one on the validation labels:
# a new fit would silently produce a different mapping if any class happened
# to be missing from `y_valid`.
encoder = voting_clf.le_
y_valid_encoded = encoder.transform(y_valid)

In [50]:
import numpy as np
# Overwrites the `y_valid_encoded` computed in the previous cell with a plain
# integer cast of the validation labels.
# NOTE(review): presumably the labels are digit strings, so the cast yields the
# same 0-9 codes as the encoder -- confirm, and consider keeping only one of
# the two encodings.
y_valid_encoded = y_valid.astype(np.int64)

In [52]:
# Score the models fitted inside the ensemble; they are evaluated against the
# integer-encoded labels rather than the raw targets.
[fitted.score(X_valid, y_valid_encoded) for fitted in voting_clf.estimators_]

[0.9736, 0.9743, 0.8685]

In [54]:
# Mark the SVM entry as "drop" in the ensemble's parameters.
voting_clf.set_params(svm_clf="drop")

In [56]:
# Inspect the configured (name, estimator) list: the SVM slot now holds "drop".
voting_clf.estimators

[('rndf_clf', RandomForestClassifier(random_state=42)),
 ('ext_clf', ExtraTreesClassifier(random_state=42)),
 ('svm_clf', 'drop')]

In [58]:
# Inspect the fitted list: set_params did not touch it, so the trained
# LinearSVC is still present.
voting_clf.estimators_

[RandomForestClassifier(random_state=42),
 ExtraTreesClassifier(random_state=42),
 LinearSVC(dual=True, max_iter=100, random_state=42)]

In [60]:
# Same check on the name -> fitted estimator mapping: "svm_clf" is still there.
voting_clf.named_estimators_

{'rndf_clf': RandomForestClassifier(random_state=42),
 'ext_clf': ExtraTreesClassifier(random_state=42),
 'svm_clf': LinearSVC(dual=True, max_iter=100, random_state=42)}

In [62]:
# Detach the trained SVM from the fitted ensemble by hand so that it no longer
# takes part in the vote; the other fitted models are left as they are.
# Not re-runnable: a second execution raises KeyError because the key is gone.
svm_clf_trained = voting_clf.named_estimators_["svm_clf"]
del voting_clf.named_estimators_["svm_clf"]
voting_clf.estimators_.remove(svm_clf_trained)

In [64]:
# Validation accuracy of the ensemble after the SVM was removed.
voting_clf.score(X_valid, y_valid)

0.9735

In [None]:
# The SVM slightly improves the ensemble's validation score (0.9743 with it vs. 0.9735 without), so it is best to add it back.

In [128]:
from sklearn.calibration import CalibratedClassifierCV

# A fresh SVM with the same settings as `svm_clf`, wrapped in a 5-fold
# calibrator so the wrapped model can output class probabilities.
base_lsvc = LinearSVC(random_state=42, dual=True, max_iter=100)
lsvc_calibrated = CalibratedClassifierCV(base_lsvc, cv=5)

In [130]:
from sklearn.ensemble import VotingClassifier

# Rebind `named_estimators`: same two tree ensembles, with the calibrated SVM
# taking the place of the plain one.
named_estimators = [("rndf_clf", rndf_clf), ("ext_clf", ext_clf), ("lsvc_calibrated", lsvc_calibrated)]

In [132]:
# Replace the earlier ensemble with one built from the new estimator list.
voting_clf = VotingClassifier(estimators=named_estimators)

In [134]:
# Fit the new ensemble (including the calibrated SVM) on the training split.
voting_clf.fit(X_train, y_train)



In [136]:
# Validation accuracy of the ensemble that includes the calibrated SVM.
voting_clf.score(X_valid, y_valid)

0.974

In [138]:
# Switch the already-fitted ensemble to soft voting by setting the attribute
# directly; no refit is performed in this cell.
voting_clf.voting = "soft"

In [141]:
# Validation accuracy with soft voting enabled.
voting_clf.score(X_valid, y_valid)

0.9657

In [143]:
# Return to hard voting, then report accuracy on the held-out test split.
voting_clf.voting = "hard"
voting_clf.score(X_test, y_test)

0.9682

In [145]:
# Test accuracy of each model fitted inside the ensemble, scored against the
# test labels cast to 64-bit integers.
[fitted.score(X_test, y_test.astype(np.int64)) for fitted in voting_clf.estimators_]

[0.968, 0.9703, 0.9114]