In [None]:
%matplotlib inline

# Fetching the MNIST dataset
# `fetch_mldata` has been removed from scikit-learn (the mldata.org service it
# downloaded from is offline); `fetch_openml` is the supported replacement.
from sklearn.datasets import fetch_openml

# mnist.data refers to the MNIST images
# mnist.target refers to the MNIST labels
# as_frame=False requests plain NumPy arrays instead of a pandas DataFrame,
# because the following cells slice and reshape the rows positionally.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)

# OpenML delivers the labels as strings ('0'..'9'); cast them to integers so
# that numeric formatting of labels (e.g. '%i' % label) keeps working.
mnist.target = mnist.target.astype(int)

In [None]:
# Sanity check: report the dimensions of the image matrix (mnist.data)
# and of the label vector (mnist.target).

for caption, array in (
    ("MNIST Images - Shape:", mnist.data),
    ("MNIST Labels - Shape:", mnist.target),
):
    print(caption, array.shape)

In [None]:
# Splitting the dataset with scikit-learn's train_test_split
# Training dataset size: 60000 images
# Test dataset size: 10000 images (1/7 of the original dataset)
# random_state is fixed so the shuffle, and therefore the split, is repeatable.

from sklearn.model_selection import train_test_split

train_img, test_img, train_lbl, test_lbl = train_test_split(
    mnist.data,
    mnist.target,
    test_size=1 / 7.0,
    random_state=0,
)

In [None]:
# Sanity check: report the dimensions of each of the four arrays
# produced by the train/test split.

for caption, array in (
    ("Training Data - Shape:", train_img),
    ("Training Labels - Shape:", train_lbl),
    ("Test Data - Shape:", test_img),
    ("Test Labels - Shape:", test_lbl),
):
    print(caption, array.shape)

In [None]:
# Showing the first 5 training images side by side, each titled with its label

import numpy as np
import matplotlib.pyplot as plt

fig = plt.figure(figsize=(20, 4))
for position, (pixels, digit) in enumerate(zip(train_img[:5], train_lbl[:5]), start=1):
    # One row of five panels; `position` is the 1-based panel number.
    panel = fig.add_subplot(1, 5, position)
    # Each sample is a flat vector; reshape it to 28x28 for display.
    panel.imshow(np.reshape(pixels, (28, 28)), cmap=plt.cm.gray)
    panel.set_title('Label: %i\n' % digit, fontsize=15)

In [None]:
# Using the Logistic Regression classifier from Scikit-Learn
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

# Default parameters relied on here - penalty: 'l2', dual: False, tol (stopping tolerance): 1e-4, class_weight: None, max_iter (maximum number of iterations): 100

from sklearn.linear_model import LogisticRegression

In [None]:
# Building the classifier with the 'lbfgs' solver (limited-memory BFGS)
# Reference: https://en.wikipedia.org/wiki/Limited-memory_BFGS
#
# Properties: L2 Regularization with Primal Formulation
# Uses: Good for Multi-class problems & Handles Multinomial loss
#
# NOTE(review): every other hyper-parameter is left at its default; on raw,
# unscaled pixel values the solver may stop before converging - check the
# fit cell for a ConvergenceWarning.

logisticRegr = LogisticRegression(
    solver='lbfgs',
)

In [None]:
# Training the model on the training split
# X: training vectors (images), y: target vector (labels)
# fit() returns the estimator itself, which the notebook displays as the cell output

logisticRegr.fit(X=train_img, y=train_lbl)

In [None]:
# Predicting the class label of the first image in the test split
# predict() expects a 2-D input, so a one-row slice is passed instead of a flat vector
# Returns a NumPy array holding one predicted label per sample

logisticRegr.predict(test_img[:1])

In [None]:
# Predicting the class labels of the first 10 images in the test split
# Returns a NumPy array holding one predicted label per sample

logisticRegr.predict(X=test_img[:10])

In [None]:
# Predicting the class labels of every image in the test split
# The resulting NumPy array of labels is kept in `predictions` for the
# evaluation cells further down

predictions = logisticRegr.predict(X=test_img)

In [None]:
# Mean accuracy of the fitted model on the test images and their labels

score = logisticRegr.score(X=test_img, y=test_lbl)
print(score)

In [None]:
# Building the confusion matrix from the expected test labels and the model's predictions
# Reference: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html#sklearn.metrics.confusion_matrix

import seaborn as sns
from sklearn import metrics

cm = metrics.confusion_matrix(y_true=test_lbl, y_pred=predictions)

In [None]:
# Plotting the confusion matrix
# Seaborn.heatmap used for plotting rectangular data as a color-encoded matrix
# Reference: https://seaborn.pydata.org/generated/seaborn.heatmap.html

fig, ax = plt.subplots(figsize=(9, 9))
# The matrix cells are integer sample counts, so annotate them with the
# integer format "d" rather than ".3f" (which rendered e.g. "979.000").
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='Blues_r', ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(score)
# Trailing semicolon suppresses the Text(...) repr in the cell output.
ax.set_title(all_sample_title, size=15);

In [None]:
# Collecting the positions in the test split where the expected label
# (test_lbl) and the model's prediction (predictions) disagree.
# enumerate() supplies the position, replacing the hand-maintained counter.

misclassifiedIndexes = [
    position
    for position, (label, predict) in enumerate(zip(test_lbl, predictions))
    if label != predict
]

In [None]:
# Showing up to five misclassified test images, each titled with the
# predicted and the actual label

fig = plt.figure(figsize=(20, 4))
for slot, wrong_at in enumerate(misclassifiedIndexes[:5], start=1):
    # One row of five panels; `slot` is the 1-based panel number.
    panel = fig.add_subplot(1, 5, slot)
    # Each sample is a flat vector; reshape it to 28x28 for display.
    panel.imshow(np.reshape(test_img[wrong_at], (28, 28)), cmap=plt.cm.gray)
    caption = 'Predicted: {}, Actual: {}'.format(predictions[wrong_at], test_lbl[wrong_at])
    panel.set_title(caption, fontsize=15)