# HANDWRITTEN DIGITS PREDICTION USING VOTING CLASSIFICATION
_Builds an ensemble by implementing a voting classifier over multiple classifiers to experiment if it can outperform each individual classifier in classifying handwritten digits from MNIST dataset._


In [67]:
# Imports required modules and methods

from sklearn.datasets import fetch_openml

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier

## Retrieving the Data

In [16]:
# Retrieves the dataset
X_mnist, y_mnist = fetch_openml("mnist_784", return_X_y=True, as_frame=False)

In [17]:
# Checks the shape of the datazz
X_mnist.shape

(70000, 784)

## Data Preparation

In [18]:
# Checks the label of the dataset
print(y_mnist)

['5' '0' '4' ... '4' '5' '6']


In [54]:
# Converts type of the targets into integer
y_mnist = y_mnist.astype("int")

In [55]:
# Splits the dataset into train, validation and test set

X_train, X_test, y_train, y_test = train_test_split(X_mnist, y_mnist, test_size=10_000, random_state=42, stratify=y_mnist)

In [56]:
# Further, seperates validation set from the train set

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=10_000, random_state=42, stratify=y_train)

In [57]:
# prints the shape of all the datasets
print("Train set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)
print("Test set shape:", X_test.shape)

Train set shape: (50000, 784)
Validation set shape: (10000, 784)
Test set shape: (10000, 784)


## Modeling

In [81]:
# Initializes the individual classifiers

lin_svc_clf = LinearSVC(tol=0.001, max_iter=1000, random_state=42)
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
et_clf = ExtraTreesClassifier(n_estimators=100, random_state=42)

In [82]:
# Initializes the voting classifier
voting_clf = VotingClassifier(estimators=
    [("lin_svc_clf", lin_svc_clf), ("rf_clf", rf_clf), ("et_clf", et_clf)], 
    voting="hard", n_jobs=-1, verbose=True)

# Fits the estimators
# NOTE: The following step may take several minutes to complete
voting_clf.fit(X_train, y_train)

[Voting] ................... (3 of 3) Processing et_clf, total=  27.2s
[Voting] ................... (2 of 3) Processing rf_clf, total=  36.4s
[Voting] .............. (1 of 3) Processing lin_svc_clf, total= 5.0min


In [88]:
# Evaluates the individual classifier's performance on validation data
print("Linear SVC Validation Score:", voting_clf.estimators_[0].score(X_val, y_val))
print("Random Forest Classifier's Validation Score:", voting_clf.estimators_[1].score(X_val, y_val))
print("Extra Trees Classifier's Validation Score:", voting_clf.estimators_[2].score(X_val, y_val))

Linear SVC Validation Score: 0.9154
Random Forest Classifier's Validation Score: 0.9709
Extra Trees Classifier's Validation Score: 0.9731


It is evident that Linear SVC was outperformed by the other two classifiers.

In [89]:
# Now, evaluates the performance of the voting classifier
voting_clf.score(X_val, y_val)

0.9714

The performance of the ensemble was found to be lesser than the performance of one of the individual classifiers.

In [90]:
# So, let's now remove the low performing Linear SVC from the ensemble
voting_clf = voting_clf.set_params(lin_svc_clf="drop")

# Retrains the voting classifier
voting_clf.fit(X_train, y_train)

[('lin_svc_clf', 'drop'),
 ('rf_clf', RandomForestClassifier(random_state=42)),
 ('et_clf', ExtraTreesClassifier(random_state=42))]

In [99]:
# Now, re-evaluates the performance of the voting classifier
voting_clf.score(X_val, y_val)

0.9714

No performance improvement of the voting classifier after the removal of the low performing linear SVC was observed though.

**Soft Voting**

In [101]:
# Switches to soft voting to check of the performance improves

voting_clf.voting = "soft"

In [102]:
# Evaluates the performance of the "soft" voting classifier
voting_clf.score(X_val, y_val)

0.9736

In this case, _soft_ voting classifier performed little better than the _hard_ voting classifier.

**Evaluating Test Set Performance**

In [None]:
# Switches back to hard voting to check to compare performance of the ensemble
# with the performance of the individual classifier on the test set

voting_clf.voting = "hard"

# Finds the "hard" voting classifier on the test set
voting_clf.score(X_test, y_test)

In [104]:
print("Hard Voting Classifier Performance on Test Set:", voting_clf.score(X_val, y_val))

Hard Voting Classifier Performance on Test Set: 0.9736


In [106]:
# Evaluates the individual classifier's performance on test set
print("Random Forest Classifier Performance on Test Set:", voting_clf.estimators_[0].score(X_test, y_test))
print("Extra Trees Classifier Performance on Test Set:", voting_clf.estimators_[1].score(X_test, y_test))

Random Forest Classifier Performance on Test Set: 0.9657
Extra Trees Classifier Performance on Test Set: 0.9706


The ensembling implemented over voting classifier observed to performed better than the individual classifiers on the test set.