Classify handwritten digits using the MNIST dataset.

In [1]:
import sys
import os

import numpy as np
import pandas as pd

# 1. Import the MNIST dataset

In [2]:
from sklearn.datasets import fetch_openml
# Fetch the 784-feature MNIST dataset from OpenML; version=1 pins the dataset version.
mnist = fetch_openml('mnist_784', version=1)
# Show which fields the returned object exposes.
mnist.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'details', 'categories', 'url'])

In [3]:
# Convert to plain NumPy arrays so the notebook works regardless of the
# scikit-learn version: newer releases can return a pandas DataFrame/Series
# here, and the positional indexing used later (X[16], y[16], reshape) needs
# ndarrays. On an ndarray input np.asarray is a no-op.
X, y = np.asarray(mnist["data"]), np.asarray(mnist["target"])
X.shape

(70000, 784)

In [4]:
# There should be one label per image, i.e. the same leading dimension as X.
y.shape

(70000,)

Look at a sample image:

In [5]:
import matplotlib.pyplot as plt

# Each row of X is a flattened image; restore the 28x28 grid for display.
sample = X[16]
sample_image = sample.reshape(28, 28)

# Explicit figure/axes interface: draw the digit on a small, axis-free canvas.
fig, ax = plt.subplots(figsize=(3, 3))
ax.imshow(sample_image, cmap="binary")
ax.axis("off")
plt.show()

<Figure size 300x300 with 1 Axes>

In [6]:
# Label that belongs to the image at index 16 displayed above.
y[16]

'2'

Note that the label is a string. Cast `y` to integers.

In [7]:
# Convert the string labels to small unsigned integers so numeric comparisons such as `y == 2` work.
y = y.astype(np.uint8)

## Split into training set and test set

In [8]:
# The first 60,000 rows form the training set; everything after is held out for testing.
train_size = 60000
X_train, y_train = X[:train_size], y[:train_size]
X_test, y_test = X[train_size:], y[train_size:]

# 2. Training a Binary Classifier
An example binary classifier that identifies whether an image is a 2 or not.

In [9]:
# Boolean targets: True where the label is 2, False for every other label.
y_train_2 = (y_train == 2)
y_test_2 = (y_test == 2)
# Sanity check: index 16 is the sample inspected earlier, whose label is 2.
y_train_2[16]

True

## Stochastic Gradient Descent 
Using `SGDClassifier`

In [10]:
from sklearn.linear_model import SGDClassifier

# random_state is fixed so repeated runs produce the same model;
# max_iter and tol bound how long training runs.
sgd = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
sgd.fit(X_train, y_train_2)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=1000,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=42, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [11]:
# NOTE: `sample` is X[16], which is part of X_train, so this only checks the
# fit on an instance the model has already seen.
sgd.predict([sample])

array([ True])

## Evaluating using Cross-Validation

In [12]:
from sklearn.model_selection import cross_val_score
# 3-fold cross-validation on the training set; yields one accuracy value per fold.
cross_val_score(sgd, X_train, y_train_2, cv=3, scoring="accuracy")

array([0.97255, 0.90635, 0.97075])

## Confusion Matrix

In [13]:
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions: each training instance is predicted by a model
# that was fitted on the other folds.
y_train_pred = cross_val_predict(sgd, X_train, y_train_2, cv=3)

In [14]:
from sklearn.metrics import confusion_matrix
# Rows are the actual classes, columns are the predicted classes.
confusion_matrix(y_train_2, y_train_pred)

array([[51645,  2397],
       [  610,  5348]])

Top row: true negatives, false positives

Bottom row: false negatives, true positives

## Precision, Recall, F1 Score
Because the classes are imbalanced (far fewer 2s than other digits), accuracy alone is misleading, so we also check the precision, recall, and F1 score.

In [15]:
from sklearn.metrics import precision_score, recall_score

# Precision = TP / (TP + FP): the share of predicted 2s that really are 2s.
precision_score(y_train_2, y_train_pred)

0.6905100064557779

In [16]:
# Recall = TP / (TP + FN): the share of actual 2s that the classifier detected.
recall_score(y_train_2, y_train_pred)

0.8976166498825109

In [17]:
from sklearn.metrics import f1_score

# F1 is the harmonic mean of precision and recall.
f1_score(y_train_2, y_train_pred)

0.7805590016784646

In [18]:
# Ask for raw decision-function scores instead of class labels so that
# different decision thresholds can be evaluated afterwards.
y_scores = cross_val_predict(sgd, X_train, y_train_2, cv=3,
                             method="decision_function")
y_scores

array([-29080.97115191, -31641.49972934, -22539.69760534, ...,
       -28402.71973754, -17271.55779269, -16958.72642768])

## ROC Curve (Receiver Operating Characteristic)

The ROC curve plots the true positive rate (another name for recall) against the false positive rate (FPR). The FPR is the ratio of negative instances that are incorrectly classified as positive. It is equal to one minus the true negative rate (TNR), which is the ratio of negative instances that are correctly classified as negative. The TNR is also called specificity. Hence the ROC curve plots recall against $1 - \text{specificity}$.

In [19]:
from sklearn.metrics import roc_curve

# roc_curve returns the false positive rate and true positive rate at each score threshold.
fpr, tpr, thresholds = roc_curve(y_train_2, y_scores)