In [2]:
# Multiclass classification.

# The previous notebook compared two binary classifiers: SGD and RF.
# Multiclass (or multinomial): the dataset has more than 2 possible labels.
# Multilabel: each sample carries 1 or more labels.
# Multioutput: multilabel where every label is itself multiclass
# (e.g. one label per pixel, each label being one of several intensity levels).

# Setup repeated from the previous notebook.
# NOTE(review): ssl and tensorflow are imported but not referenced in this
# cell; presumably carried over from the previous notebook.
import ssl
import tensorflow
from tensorflow.keras.datasets import mnist
from sklearn.linear_model import SGDClassifier

(X_train, y_train), (X_test, y_test) = mnist.load_data()
sgd_clf = SGDClassifier(random_state=42)

# Flatten every 28 * 28 pixel image into one row of 784 pixel values.
height, width = X_train.shape[1:]
num_pixels = height * width
X_train1D = X_train.reshape(len(X_train), num_pixels)
X_test1D = X_test.reshape(len(X_test), num_pixels)

# First training image kept as a single-row 2-D array, the shape predict() expects.
sample_0 = X_train1D[:1]

# Flatten the labels to one dimension (original note: SVC otherwise complains
# that the shape is (1,60K)).
# NOTE(review): the displayed result is already 1-D; confirm whether y_train
# actually needs flattening.
y_train1D = y_train.reshape(-1)
y_train1D

array([5, 0, 4, ..., 5, 6, 8], dtype=uint8)

In [3]:
# A binary classifier can be stretched to multiclass in two ways.
# One-vs-Rest (OvR): train N classifiers (is-0, is-1, is-2, ...) and keep the
#   class whose classifier gives the best score. Usually preferred.
# One-vs-One (OvO): train N*(N-1)/2 pairwise classifiers and keep the class
#   with the most pairwise wins.
# OvO suits SVMs because they scale poorly with training-set size, and each
# pairwise classifier only has to train on the samples of its two classes.

# SKLearn has a class called OneVsRestClassifier.
# SKLearn SVC automatically uses OvO for multiclass SVM.
from sklearn.svm import SVC

# Slow (10 min?): this trains 10*9/2 = 45 pairwise classifiers.
# fit() returns the estimator itself, so construction and training are chained.
svm_clf = SVC().fit(X_train1D, y_train1D)
svm_clf.predict(sample_0)

array([5], dtype=uint8)

In [4]:
# decision_function yields 10 scores for the instance, one per class
# (that class vs the others).
sample_0_scores = svm_clf.decision_function(sample_0)
sample_0_scores

array([[ 1.72501977,  2.72809088,  7.2510018 ,  8.3076379 , -0.31087254,
         9.3132482 ,  1.70975103,  2.76765202,  6.23049537,  4.84771048]])

In [7]:
# SGDClassifier notices the multiclass labels and switches to one-vs-rest.
# StandardScaler turns the raw pixel values into z-scores; 5% improvement on this data.
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# NOTE(review): the scaler is fitted on the whole training set before
# cross-validation, so each fold's scaling statistics include its held-out
# samples; wrapping scaler + classifier in a Pipeline would avoid that.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train1D.astype(np.float64))

# Step 1: accuracy per fold. Cross-validation is slow (15 min?).
accuracy = cross_val_score(
    sgd_clf,
    X_train_scaled,
    y_train,
    cv=3,
    scoring="accuracy",
)
accuracy

array([0.8983, 0.891 , 0.9018])

In [13]:
# Step 2: the out-of-fold prediction for every training sample.
from sklearn.model_selection import cross_val_predict

predictions = cross_val_predict(
    sgd_clf,
    X_train_scaled,
    y_train,
    cv=3,
)
predictions

array([3, 0, 4, ..., 5, 6, 8], dtype=uint8)

In [14]:
# Step 3: the 10x10 confusion matrix.
# Rows are the true digits, columns the predicted digits.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_train, predictions)
cm

array([[5577,    0,   22,    5,    8,   43,   36,    6,  225,    1],
       [   0, 6400,   37,   24,    4,   44,    4,    7,  212,   10],
       [  27,   27, 5220,   92,   73,   27,   67,   36,  378,   11],
       [  22,   17,  117, 5227,    2,  203,   27,   40,  403,   73],
       [  12,   14,   41,    9, 5182,   12,   34,   27,  347,  164],
       [  27,   15,   30,  168,   53, 4444,   75,   14,  535,   60],
       [  30,   15,   42,    3,   44,   97, 5552,    3,  131,    1],
       [  21,   10,   51,   30,   49,   12,    3, 5684,  195,  210],
       [  17,   63,   48,   86,    3,  126,   25,   10, 5429,   44],
       [  25,   18,   30,   64,  118,   36,    1,  179,  371, 5107]])

In [None]:
# Book uses KNN with SKLearn KNeighborsClassifier for multilabel & multioutput.
# One form of multi-label: label each number as small/large and even/odd.
# Label every pixel white/grey/black (multiclass).
# Label every image by all of its pixel labels (multilabel).

# The exercises show how to enhance image training with augmentation.
# Take every image and translate it up, down, left & right by 1 pixel each way.
# Add those to training set with the same labels as before.
# A further possibility is to rotate or flip each image.
# Example of why this works: 3 vs 5 is hard to learn, need more examples.
# A further possibility is adding topology features like the number of closed loops (8 has 2).
# Another possibility is data cleaning / dimensionality reduction:
# book shows reducing greyscale to fewer values using KNN.