In [1]:
# importing the needed packages
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.ensemble import StackingClassifier

In [2]:
# reading the data
X_mnist, y_mnist = fetch_openml('mnist_784', return_X_y=True, as_frame=False, parser="auto")

In [3]:
# splitting the data into training and test sets
X_train, y_train = X_mnist[:60_000]/255., y_mnist[:60_000]
X_test, y_test = X_mnist[60_000:]/255., y_mnist[60_000:]

In [4]:
# transforming the data using PCA
pca = PCA(0.90)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [5]:
# creating the individual classifiers
dec_trees_clf = DecisionTreeClassifier(max_depth = 10, random_state = 42)
random_forest_clf = RandomForestClassifier(n_estimators = 50, random_state = 42)
ada_clf = AdaBoostClassifier(n_estimators = 50, algorithm = "SAMME", random_state = 42)
svm_clf = LinearSVC(max_iter = 500, dual = "auto", random_state = 42)
log_reg = LogisticRegression(max_iter = 500, random_state = 42)

In [6]:
# combining them in a list of tuples with the name and the estimator
named_estimators = [("random_forest", random_forest_clf),
                    ("ada_clf", ada_clf),
                    ("svm", svm_clf),
                    ("dec_trees", dec_trees_clf),
                    ("log_reg", log_reg)]

In [7]:
# training and calculating the score of each one of the estimators on the test set
scores = {}
for named_estimator in named_estimators:
    name, estimator = named_estimator
    print("Training a", estimator)
    estimator.fit(X_train_pca, y_train)
    score = estimator.score(X_test_pca, y_test)
    scores[name] = score

Training a RandomForestClassifier(n_estimators=50, random_state=42)
Training a AdaBoostClassifier(algorithm='SAMME', random_state=42)
Training a LinearSVC(max_iter=500, random_state=42)
Training a DecisionTreeClassifier(max_depth=10, random_state=42)
Training a LogisticRegression(max_iter=500, random_state=42)


In [8]:
print("Scores:")
for clf_name, clf_score in scores.items():
    print (f"{clf_name}: {clf_score:.4f}")

Scores:
random_forest: 0.9471
ada_clf: 0.6259
svm: 0.9115
dec_trees: 0.7980
log_reg: 0.9194


In [9]:
# creating the stacking classifier with 5-fold cross validation and 
# a Random Forest Classifier as the final estimator and train it.
stacking_clf = StackingClassifier(
    estimators = named_estimators,
    final_estimator = RandomForestClassifier(random_state = 42),
    cv = 3)

stacking_clf.fit(X_train_pca, y_train)

In [10]:
stacking_score = stacking_clf.score(X_test_pca, y_test)
print (f"Stacking score: {stacking_score}")

Stacking score: 0.9555


In [11]:
print("The stacking classifier performs better by:")
for clf_name, clf_score in scores.items():
    print (f"{100*(stacking_score-clf_score)/stacking_score:.1f}% than the {clf_name}")

The stacking classifier performs better by:
0.9% than the random_forest
34.5% than the ada_clf
4.6% than the svm
16.5% than the dec_trees
3.8% than the log_reg
