In [1]:
from sklearn.datasets import fetch_openml
import numpy as np

In [3]:
# Download MNIST from OpenML. as_frame=False asks for NumPy arrays instead of
# a pandas DataFrame, so the positional indexing (X[0]) and .reshape(28, 28)
# calls in later cells work regardless of the library's default container.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.keys()

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
# Pull the feature matrix and the label vector out of the fetched dataset.
X, y = mnist["data"], mnist["target"]

In [None]:
# Dimensions of the feature matrix.
X.shape

In [None]:
# Dimensions of the label vector.
y.shape

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Take the first sample and view its values as a 28x28 grid.
some_digit = X[0]
some_digit_image = np.reshape(some_digit, (28, 28))

In [None]:
# Draw the sample with the "binary" colormap and hide the axes.
plt.imshow(some_digit_image, cmap="binary")
plt.axis("off")
plt.show()

In [None]:
# Label of the first sample (the one plotted above).
y[0]

In [None]:
# Cast the labels to uint8; later cells compare them with integers
# (e.g. y_train == 5, y_train >= 7).
y = y.astype(np.uint8)

In [None]:
# First 60000 samples form the training set; everything after is the test set.
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

# Binary Classifier

In [None]:
# Boolean targets for the 5-detector: True where the label is 5, False otherwise.
y_train_5 = y_train == 5
y_test_5 = y_test == 5

In [None]:
from sklearn.linear_model import SGDClassifier
# SGD-trained linear classifier; random_state is fixed so runs are repeatable.
sgd_clf = SGDClassifier(random_state=42, n_jobs=-1)

In [None]:
# Train the classifier on the training images with the boolean is-5 targets.
sgd_clf.fit(X_train,y_train_5)

In [None]:
# Predict for the sample digit; it is wrapped in a list to form a batch of one.
sgd_clf.predict([some_digit])

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Accuracy of the 5-detector under 10-fold cross-validation.
cross_val_score(
    sgd_clf,
    X_train,
    y_train_5,
    cv=10,
    n_jobs=-1,
    scoring="accuracy",
)

In [None]:
# Same accuracy measurement with only 3 folds.
cross_val_score(
    sgd_clf,
    X_train,
    y_train_5,
    cv=3,
    n_jobs=-1,
    scoring="accuracy",
)

In [None]:
from sklearn.base import BaseEstimator

class Never5Classifier(BaseEstimator):
    """Baseline estimator that predicts False ("not a 5") for every sample."""

    def fit(self, X, y=None):
        """Learn nothing and return the estimator itself."""
        return self

    def predict(self, X):
        """Return an all-False boolean array with one row per sample in X."""
        # NOTE(review): the result is 2-D with shape (len(X), 1); confirm that
        # callers do not need a flat (len(X),) vector.
        n_samples = len(X)
        return np.zeros(shape=(n_samples, 1), dtype=bool)

In [None]:
# Instantiate the always-False baseline classifier.
n5c = Never5Classifier()

In [None]:
# Accuracy of the always-False baseline, for comparison with the SGD scores.
cross_val_score(
    n5c,
    X_train,
    y_train_5,
    cv=3,
    scoring="accuracy",
)

In [None]:
# Confusion Matrix

In [None]:
from sklearn.model_selection import cross_val_predict

In [None]:
# Cross-validated predictions of the 5-detector for every training sample.
y_train_pred = cross_val_predict(
    sgd_clf,
    X_train,
    y_train_5,
    n_jobs=-1,
    cv=3,
)

In [None]:
# Display the cross-validated predictions computed above.
y_train_pred

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
# Confusion matrix of the true is-5 targets against the cross-validated predictions.
confusion_matrix(y_train_5,y_train_pred)

In [None]:
from sklearn.metrics import precision_score, recall_score

In [None]:
# Precision of the cross-validated predictions.
precision_score(y_train_5, y_train_pred)

In [None]:
# Recall of the cross-validated predictions.
recall_score(y_train_5, y_train_pred)

In [None]:
from sklearn.metrics import f1_score

In [None]:
# F1 score of the cross-validated predictions.
f1_score(y_train_5,y_train_pred)

## Precision/Recall Trade-off

In [None]:
# Raw decision score of the classifier for the sample digit, then display it.
y_scores = sgd_clf.decision_function([some_digit])
y_scores

In [None]:
# Cut-off that decision scores are compared against in the next cell.
threshold = 0

In [None]:
# The sample counts as positive when its score exceeds the threshold.
y_some_digit_pred = y_scores > threshold

In [None]:
# Display the thresholded decision for the sample digit.
y_some_digit_pred

In [None]:
# Raise the threshold and recompute the decision for the sample digit.
threshold = 8000
y_some_digit_pred = y_scores > threshold
y_some_digit_pred

In [None]:
# Cross-validated decision scores (instead of class predictions) for every
# training sample. This rebinds y_scores, which previously held the score of
# the single sample digit.
y_scores = cross_val_predict(
    sgd_clf,
    X_train,
    y_train_5,
    cv=3,
    method="decision_function",
)

In [None]:
# Display the cross-validated decision scores.
y_scores

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision/recall pairs and the thresholds they correspond to, computed from
# the cross-validated scores.
precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

In [None]:
# Display the recall values.
recalls

In [None]:
# Display the thresholds returned by precision_recall_curve.
thresholds

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    Opens a new 16x9 figure and draws on it; the caller is responsible for
    calling ``plt.show()``.

    Parameters
    ----------
    precisions, recalls : array-like
        Curves to plot. Their last element is dropped before plotting, so
        each must be one element longer than ``thresholds``.
    thresholds : array-like
        Threshold values used for the x-axis.
    """
    plt.figure(figsize=(16, 9))
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    # Label the axes so the figure can be read on its own.
    plt.xlabel("Threshold")
    plt.ylabel("Precision / Recall")
    plt.grid(True)
    plt.legend()

In [None]:
# Draw the precision and recall curves against the threshold and show the figure.
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
plt.show()

In [None]:
# Threshold at the first position where precision is at least 0.90
# (np.argmax on a boolean mask returns the index of the first True).
# NOTE(review): if the mask were all False, argmax would return 0 and the
# first threshold would be picked silently — confirm this cannot happen here.
threshold_90_precision = thresholds[np.argmax(precisions >= 0.90)]

In [None]:
# Classify as positive every sample whose score reaches the 90%-precision threshold.
y_train_pred_90 = y_scores >= threshold_90_precision

In [None]:
# Display the selected threshold.
threshold_90_precision

In [None]:
# Display the predictions obtained with the selected threshold.
y_train_pred_90

In [None]:
# Precision achieved with the selected threshold.
precision_score(y_train_5, y_train_pred_90)

In [None]:
# Recall achieved with the selected threshold.
recall_score(y_train_5, y_train_pred_90)

# ROC curve

The receiver operating characteristic (ROC) curve is another common tool used with binary classifiers. It is very similar to the precision/recall curve, but instead of plotting precision versus recall, the ROC curve plots the true positive rate (another name for recall) against the false positive rate (FPR). The FPR is the ratio of negative instances that are incorrectly classified as positive. It is equal to 1 – the true negative rate (TNR), which is the ratio of negative instances that are correctly classified as negative. The TNR is also called specificity. Hence, the ROC curve plots sensitivity (recall) versus 1 – specificity.

In [None]:
from sklearn.metrics import roc_curve

# False/true positive rates of the cross-validated scores. The thresholds are
# bound to their own name so the `thresholds` array produced earlier by
# precision_recall_curve is not overwritten; re-running the precision/recall
# cells after this one therefore still uses matching arrays.
fpr, tpr, roc_thresholds = roc_curve(y_train_5, y_scores)

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve (TPR against FPR) on a new 16x9 figure.

    Parameters
    ----------
    fpr, tpr : array-like
        False positive rates (x-axis) and true positive rates (y-axis).
    label : str, optional
        Legend entry for the curve. A legend is drawn only when a label is
        given.

    The caller is responsible for calling ``plt.show()``.
    """
    plt.figure(figsize=(16, 9))
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')  # Dashed diagonal
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate (Recall)")
    plt.grid(True)
    # Without this the label argument would never be displayed.
    if label is not None:
        plt.legend()

# Draw the ROC curve of the 5-detector and show the figure.
plot_roc_curve(fpr, tpr)
plt.show()

One way to compare classifiers is to measure the area under the curve (AUC). A perfect classifier will have a ROC AUC equal to 1, whereas a purely random classifier will have a ROC AUC equal to 0.5. Scikit-Learn provides a function to compute the ROC AUC:

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Area under the ROC curve for the cross-validated scores.
roc_auc_score(y_train_5, y_scores)

# Multiclass Classification

In [None]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with default settings.
svm_clf = SVC()

In [None]:
# Fit on the digit labels themselves (y_train), not the boolean is-5 targets.
svm_clf.fit(X_train, y_train)

In [None]:
# Predicted class for the sample digit.
svm_clf.predict([some_digit])

In [None]:
# Decision scores of the SVM for the sample digit, then display them.
some_digit_score = svm_clf.decision_function([some_digit])
some_digit_score

In [None]:
# Position of the highest decision score.
np.argmax(some_digit_score)

In [None]:
# Class labels stored on the fitted classifier.
svm_clf.classes_

In [None]:
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Wrap an SVC in an explicit one-vs-rest strategy and train it on the digit labels.
ovr_clf = OneVsRestClassifier(
    SVC(),
    n_jobs=-1,
)
ovr_clf.fit(X_train, y_train)

In [None]:
# Predicted class for the sample digit from the one-vs-rest model.
ovr_clf.predict([some_digit])

In [None]:
# Decision scores of the one-vs-rest model for the sample digit.
ovr_clf.decision_function([some_digit])

## Training SGD Classifier

In [None]:
# Refit the same SGDClassifier instance, this time on the digit labels rather
# than the boolean is-5 targets it was trained on earlier.
sgd_clf.fit(X_train, y_train)

In [None]:
# Predicted class for the sample digit after the multiclass refit.
sgd_clf.predict([some_digit])

In [None]:
# Decision scores for the sample digit after the multiclass refit.
sgd_clf.decision_function([some_digit])

In [None]:
# 3-fold cross-validated accuracy on the multiclass task (unscaled inputs).
cross_val_score(
    sgd_clf,
    X_train,
    y_train,
    cv=3,
    scoring="accuracy",
)

### Pre-processing and increasing accuracy

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# Scaler applied to the training features in the next cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features (cast to float64) and transform them.
X_train_scaled = scaler.fit_transform(
    X_train.astype(np.float64),
)

In [None]:
# Cross-validated multiclass predictions on the scaled features. This rebinds
# y_train_pred, which earlier held the binary 5-detector predictions.
y_train_pred = cross_val_predict(
    sgd_clf,
    X_train_scaled,
    y_train,
    cv=3,
)

In [None]:
# Shape of the multiclass prediction array.
y_train_pred.shape

In [None]:
# Confusion matrix of the true digit labels against the cross-validated predictions.
conf_mx = confusion_matrix(y_train, y_train_pred)

In [None]:
# Show the confusion matrix as an image.
plt.matshow(conf_mx)
plt.show()

## Errors

Let’s focus the plot on the errors. First, you need to divide each value in the confusion matrix by the number of images in the corresponding class so that you can compare error rates instead of absolute numbers of errors (which would make abundant classes look unfairly bad):

In [None]:
# Divide every row of the confusion matrix by its total so entries become
# per-row rates instead of absolute counts.
row_sums = np.sum(conf_mx, axis=1, keepdims=True)
norm_conf_mx = np.divide(conf_mx, row_sums)

In [None]:
# Zero the diagonal so only the off-diagonal (error) rates remain visible.
# np.fill_diagonal modifies norm_conf_mx in place.
np.fill_diagonal(norm_conf_mx, 0)
plt.matshow(norm_conf_mx)
# Render explicitly, as the previous confusion-matrix cell does, instead of
# leaving the image object as the cell's displayed value.
plt.show()

In [None]:
import matplotlib

In [None]:
def plot_digits(instances, images_per_row=10, **options):
    """Draw a collection of digit images as one tiled grid on the current axes.

    Parameters
    ----------
    instances : sequence of array-like
        Images to draw. Each item must contain exactly 28 * 28 values (for
        example a flat vector of 784 values or a 28x28 array).
    images_per_row : int, default 10
        Maximum number of images per grid row; capped at ``len(instances)``.
    **options
        Extra keyword arguments forwarded to ``plt.imshow``. ``cmap``
        defaults to ``"binary"`` when not supplied.

    An empty ``instances`` draws nothing and only hides the axes.
    """
    size = 28
    n_images = len(instances)
    if n_images == 0:
        # Nothing to tile; the row computation below would divide by zero.
        plt.axis("off")
        return

    images_per_row = min(n_images, images_per_row)
    images = [np.reshape(instance, (size, size)) for instance in instances]
    n_rows = (n_images - 1) // images_per_row + 1

    # Pad the last row with a single blank strip as wide as the missing images
    # so every row has the same width and the rows can be stacked.
    n_empty = n_rows * images_per_row - n_images
    images.append(np.zeros((size, size * n_empty)))

    row_images = [
        np.concatenate(images[row * images_per_row:(row + 1) * images_per_row], axis=1)
        for row in range(n_rows)
    ]
    image = np.concatenate(row_images, axis=0)

    # Let callers override the colormap instead of hitting a duplicate-keyword error.
    options.setdefault("cmap", "binary")
    plt.imshow(image, **options)
    plt.axis("off")

In [None]:
# Compare two classes: samples of each class that were predicted as
# themselves (X_aa, X_bb) and samples predicted as the other class (X_ab, X_ba).
cl_a = 3
cl_b = 5
X_aa = X_train[(y_train == cl_a) & (y_train_pred == cl_a)]
X_ab = X_train[(y_train == cl_a) & (y_train_pred == cl_b)]
X_ba = X_train[(y_train == cl_b) & (y_train_pred == cl_a)]
X_bb = X_train[(y_train == cl_b) & (y_train_pred == cl_b)]

# 2x2 layout; each panel shows at most 25 digits, five per row.
plt.figure(figsize=(8, 8))
plt.subplot(2, 2, 1)
plot_digits(X_aa[:25], images_per_row=5)
plt.subplot(2, 2, 2)
plot_digits(X_ab[:25], images_per_row=5)
plt.subplot(2, 2, 3)
plot_digits(X_ba[:25], images_per_row=5)
plt.subplot(2, 2, 4)
plot_digits(X_bb[:25], images_per_row=5)
plt.show()

# Multilabel Classification

Until now each instance has always been assigned to just one class. In some cases you may want your classifier to output multiple classes for each instance. Consider a face-recognition classifier: what should it do if it recognizes several people in the same picture? It should attach one tag per person it recognizes. Say the classifier has been trained to recognize three faces, Alice, Bob, and Charlie. Then when the classifier is shown a picture of Alice and Charlie, it should output [1, 0, 1] (meaning “Alice yes, Bob no, Charlie yes”). Such a classification system that outputs multiple binary tags is called a multilabel classification system.

We won’t go into face recognition just yet, but let’s look at a simpler example, just for illustration purposes:

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# First label: is the digit 7 or greater?
y_train_large = y_train >= 7
y_train_large

In [None]:
# Shape of the training label vector.
y_train.shape

In [None]:
# Second label: is the digit odd?
y_train_odd = y_train % 2 == 1

In [None]:
# Two-column target: column 0 is the "large" flag, column 1 the "odd" flag.
y_multilabel = np.column_stack((y_train_large, y_train_odd))

In [None]:
# k-nearest-neighbours classifier trained on the two-column multilabel target.
knn_clf = KNeighborsClassifier(n_jobs=-1)
knn_clf.fit(X_train, y_multilabel)

In [None]:
# Multilabel prediction for the sample digit.
knn_clf.predict([some_digit])

In [None]:
# Cross-validated multilabel predictions, scored with the macro-averaged F1.
y_train_knn_pred = cross_val_predict(
    knn_clf,
    X_train,
    y_multilabel,
    cv=3,
)
f1_score(y_multilabel, y_train_knn_pred, average="macro")

# Multioutput–Multiclass classification (or simply Multioutput classification)

In [None]:
# Build a denoising task: the inputs are the images with random integer noise
# in [0, 100) added to every pixel, and the targets are the original images.
# A seeded generator makes the noise, and everything derived from it,
# reproducible across kernel restarts.
rng = np.random.default_rng(42)
noise = rng.integers(0, 100, X_train.shape)
X_train_mod = X_train + noise
noise = rng.integers(0, 100, X_test.shape)
X_test_mod = X_test + noise
y_train_mod = X_train
y_test_mod = X_test

In [None]:
# Index of the sample inspected in the following cells.
some_index = 10

In [None]:
# Show the noisy version of the chosen training image.
# (The name noice_digit_image is kept as spelled because a later cell reads it.)
noise_digit = X_train_mod[some_index]
noice_digit_image = np.reshape(noise_digit, (28, 28))
plt.imshow(noice_digit_image, cmap="binary")
plt.axis("off")
plt.show()

In [None]:
# Show the original (noise-free) version of the same training image.
clean_digit = X_train[some_index]
clean_digit_image = np.reshape(clean_digit, (28, 28))
plt.imshow(clean_digit_image, cmap="binary")
plt.axis("off")
plt.show()

In [None]:
# Refit the k-NN model to map noisy images to their clean originals, then
# predict a clean image for one noisy test image and plot it.
# NOTE: this rebinds clean_digit, which previously held the clean training
# image, to the model's prediction.
knn_clf.fit(X_train_mod, y_train_mod)
clean_digit = knn_clf.predict([X_test_mod[some_index]])
plot_digits(clean_digit)

In [None]:
# Side by side: the noisy training image and its clean original.
plot_digits([noice_digit_image, clean_digit_image])

In [None]:
# Side by side: the noisy test image and the model's prediction for it.
plot_digits([X_test_mod[some_index],clean_digit])