# HANDWRITTEN DIGITS PREDICTION USING VOTING CLASSIFICATION
_Builds an ensemble by implementing a voting classifier over multiple classifiers to test whether it can outperform each individual classifier at classifying handwritten digits from the MNIST dataset._


In [None]:
# Imports required modules, classes and methods

from sklearn.base import clone
from sklearn.datasets import fetch_openml

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier

## Retrieving the Data

In [None]:
# Downloads the MNIST dataset from OpenML as a (features, targets) pair of arrays
X_mnist, y_mnist = fetch_openml(
    name="mnist_784",
    as_frame=False,     # Plain arrays instead of a pandas DataFrame / Series
    return_X_y=True,    # Returns (data, target) rather than a Bunch object
)

In [None]:
# Checks the shape of the data: (number of instances, number of features)
X_mnist.shape

(70000, 784)

## Data Preparation

In [None]:
# Inspects the labels of the dataset to see how the targets are encoded
print(y_mnist)

['5' '0' '4' ... '4' '5' '6']


In [None]:
# The targets are loaded as strings ('5', '0', ...); casts them to integers
y_mnist = y_mnist.astype(int)

In [None]:
# Splits the dataset into train, validation and test sets.
#
# Both splits live in a single cell and always start from the full dataset, so the
# cell is idempotent: re-running it can never shrink the train set further (which is
# what happens when a cell re-splits `X_train` into `X_train` and `X_val`).

# Step 1: separates the test set from the rest of the data
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X_mnist, y_mnist,   # Feature values with target
    test_size=10_000,   # Number of instances to separate for testing
    random_state=42,    # For reproducibility of operations that rely on randomness
    stratify=y_mnist    # To ensure (near) equal class distribution across splits
    )

# Step 2: separates the validation set from the remaining data
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val,
    test_size=10_000,       # Number of instances to separate for validation
    random_state=42,
    stratify=y_train_val
    )

# The intermediate train+validation pool is no longer needed; frees its memory
del X_train_val, y_train_val

In [None]:
# Prints the shape of each split and verifies that the splits cover the whole dataset

print("Train set shape:", X_train.shape)
print("Validation set shape:", X_val.shape)
print("Test set shape:", X_test.shape)

# Sanity check against hidden notebook state: if a splitting cell has been executed
# more than once, the train set silently shrinks and the sizes no longer add up.
assert len(X_train) + len(X_val) + len(X_test) == len(X_mnist), (
    "Train/validation/test sizes do not add up to the full dataset; "
    "restart the kernel and run all cells in order."
)

Train set shape: (30000, 784)
Validation set shape: (10000, 784)
Test set shape: (10000, 784)


In [None]:
# Checks the maximum feature value in the data to decide whether scaling is needed
X_mnist.max()

255

In [None]:
# Standardizes the inputs.
# The scaler is fitted on the train set only; the same fitted scaler is then applied
# to the train, validation and test sets so that no information leaks from the
# held-out data into the preprocessing step.

std_scaler = StandardScaler()
std_scaler.fit(X_train)

X_train, X_val, X_test = (
    std_scaler.transform(split) for split in (X_train, X_val, X_test)
)

## Modeling

### Hard Voting

In [None]:
# (name, estimator) pairs that make up the ensemble; every estimator uses the same
# seed so that the results are reproducible.
estimators = [
    # Linear support vector classifier
    ("svc_clf", LinearSVC(random_state=42, tol=0.001)),
    # Random forest
    ("rf_clf", RandomForestClassifier(random_state=42)),
    # Extremely randomized trees
    ("ex-tree_clf", ExtraTreesClassifier(random_state=42)),
    # Multi-layer perceptron
    ("mlp_clf", MLPClassifier(random_state=42)),
]

In [None]:
# Builds the (still unfitted) hard voting classifier on top of the estimators above:
#   - voting="hard": the ensemble predicts the class label chosen by the majority of estimators
#   - n_jobs=-1:     uses all processors to fit the estimators in parallel
#   - verbose=True:  prints the time elapsed for each estimator as its fitting completes
voting_clf = VotingClassifier(
    estimators=estimators,
    voting="hard",
    verbose=True,
    n_jobs=-1,
)


In [None]:
# Fits the voting classifier, which in turn fits every individual estimator it contains

# NOTE: The following step may take several minutes to complete
# (in the recorded run below, Linear SVC alone took about 5 minutes)

voting_clf.fit(X_train, y_train)

[Voting] .............. (3 of 4) Processing ex-tree_clf, total=  14.3s
[Voting] ................... (2 of 4) Processing rf_clf, total=  20.6s
[Voting] .................. (4 of 4) Processing mlp_clf, total=  23.7s
[Voting] .................. (1 of 4) Processing svc_clf, total= 5.2min


**Performance Analysis of <u>Hard</u> Voting Classifier**

Performance Analysis on Validation Set:

In [None]:
# Reports the validation accuracy of every fitted estimator inside the ensemble
print("Validation Score of individual estimator in the voting classifier:")

for fitted_clf in voting_clf.estimators_:
    val_accuracy = fitted_clf.score(X_val, y_val)
    print("\t{}: {}".format(fitted_clf, val_accuracy))

Validation Score of individual estimator in the voting classifier:
	LinearSVC(random_state=42, tol=0.001): 0.9018
	RandomForestClassifier(random_state=42): 0.9616
	ExtraTreesClassifier(random_state=42): 0.9657
	MLPClassifier(random_state=42): 0.9654


In [None]:
# Now, evaluates the validation accuracy of the hard voting classifier as a whole

voting_clf.score(X_val, y_val)

0.9641

Performance Analysis on Test Set:

In [None]:
# Reports the test accuracy of every fitted estimator inside the ensemble
print("Test Score of individual estimator in the voting classifier:")

for fitted_clf in voting_clf.estimators_:
    test_accuracy = fitted_clf.score(X_test, y_test)
    print("\t{}: {}".format(fitted_clf, test_accuracy))

Test Score of individual estimator in the voting classifier:
	LinearSVC(random_state=42, tol=0.001): 0.8987
	RandomForestClassifier(random_state=42): 0.9605
	ExtraTreesClassifier(random_state=42): 0.964
	MLPClassifier(random_state=42): 0.966


In [None]:
# Now, evaluates the test accuracy of the hard voting classifier as a whole

voting_clf.score(X_test, y_test)

0.9649

**Observations:**

1) The individual performance of the Linear SVM classifier was noticeably lower than that of each of the other estimators in the ensemble.

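2) The overall performance of the voting classifier is nearly the same as that of the three remaining estimators (Random Forest, Extra-Trees and MLP), so the ensemble's performance is dependable even though it includes the weaker Linear SVM. It does not, however, exceed the best individual estimator on either the validation or the test set.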

### Soft Voting

In [None]:
# As Linear SVC does not have the predict_proba() method that is required for soft voting, it
# gets dropped from the list of estimators in the voting classifier by flagging it as "drop".
#
# NOTE: set_params() returns the very estimator it is called on. The hard voting classifier is
# therefore cloned first (clone() yields an unfitted copy with the same parameters); otherwise
# `voting_clf_2` would merely be a second name for `voting_clf`, and the already fitted hard
# voting classifier would be modified, and later refitted, in place.

voting_clf_2 = clone(voting_clf).set_params(svc_clf="drop")

In [None]:
# Switches the classifier to "soft" voting, i.e. the predicted class is the one with
# the highest summed predicted probability across the estimators.
# (The trailing semicolon suppresses the estimator repr that set_params() returns.)
voting_clf_2.set_params(voting="soft");

In [None]:
# Refits the voting classifier after the removal of the Linear SVC estimator
voting_clf_2.fit(X_train, y_train)

[Voting] .............. (3 of 4) Processing ex-tree_clf, total=  13.3s
[Voting] ................... (2 of 4) Processing rf_clf, total=  19.8s
[Voting] .................. (4 of 4) Processing mlp_clf, total=  22.5s


In [None]:
# Lists the fitted estimators to confirm that Linear SVC is no longer among them
voting_clf_2.estimators_

[RandomForestClassifier(random_state=42),
 ExtraTreesClassifier(random_state=42),
 MLPClassifier(random_state=42)]

**Performance Analysis of <u>Soft</u> Voting Classifier**

In [None]:
# Evaluates the accuracy of the soft voting classifier on the validation set
voting_clf_2.score(X_val, y_val)

0.9708

In [None]:
# Evaluates the accuracy of the soft voting classifier on the test set
voting_clf_2.score(X_test, y_test)

0.9706

**Observations:**

- Soft voting accuracy on the validation set [**0.9708**] was <u>higher</u> than that of its hard counterpart [**0.9641**].

- Soft voting accuracy on the test set [**0.9706**] was <u>higher</u> than that of its hard counterpart [**0.9649**].

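- Overall, the soft voting classifier performed relatively <u>better</u> than the hard voting classifier. Note that the two ensembles are not identical: the soft voting classifier excludes the weakest estimator (Linear SVC), so part of the improvement may come from dropping that estimator rather than from soft voting itself.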