Import the appropriate libraries to start with.

In [1]:
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate,cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix



We use fetch_openml to fetch the MNIST dataset from OpenML.org


In [2]:
# Download the MNIST dataset from OpenML; as_frame=False requests plain
# NumPy arrays instead of pandas objects.
mnist = fetch_openml(name="mnist_784", as_frame=False)

We explore the loaded data a bit, just to confirm that they are NumPy arrays and to check their shape and the type of their contents.

In [4]:
# Feature matrix (one row of pixel values per image) and label vector.
X = mnist.data
y = mnist.target

In [5]:
# Shapes of both arrays, plus the type of one feature row and of one label.
(
    X.shape,
    y.shape,
    type(X[0]),
    type(y[0]),
)

((70000, 784), (70000,), numpy.ndarray, str)

In [6]:
# Hold out 15% of the instances as a test set; the fixed random_state keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [7]:
# Shapes of the four arrays produced by the split.
(
    X_train.shape,
    y_train.shape,
    X_test.shape,
    y_test.shape,
)

((59500, 784), (59500,), (10500, 784), (10500,))

In [8]:
# Expected number of instances for an 85% / 15% split of the full dataset.
(len(X) * 0.85, len(X) * 0.15)

(59500.0, 10500.0)

We can confirm that the training and test sets have the expected number of instances.

We define the function that will display the feature vector of an instance using Matplotlib’s imshow() function. It reshapes the vector into a 28 × 28 array and displays it. We use cmap="binary" to get a
grayscale color map where 0 is white and 255 is black:

In [None]:
def plot_digit(image_data, ax=None):
    """Draw one digit as a 28 x 28 image with the axes hidden.

    Parameters
    ----------
    image_data : array with a ``reshape`` method
        Flat pixel vector that can be reshaped to (28, 28).
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, the current pyplot axes are used, so
        existing calls such as ``plot_digit(X_train[i])`` behave as before.
    """
    image = image_data.reshape(28, 28)
    if ax is None:
        ax = plt.gca()
    # "binary" is the colormap that renders low values white and high values black.
    ax.imshow(image, cmap="binary")
    ax.axis("off")  # hide ticks and frame so only the digit is visible


Then we plot the first 8 images of the Training and Test set.

In [None]:

# Show the first eight training images in a 2 x 4 grid, each titled with its label.
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(10, 5))
fig.suptitle("First 8 Images from the Training Set")

# Iterate over the axes that plt.subplots already created instead of
# requesting each panel again with plt.subplot.
for idx, ax in enumerate(axes.flat):
    plt.sca(ax)  # plot_digit draws on the current axes, so make this panel current
    plot_digit(X_train[idx])
    ax.set_title(y_train[idx])


In [None]:
# Show the first eight test images in a 2 x 4 grid, each titled with its label.
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(10, 5))
fig.suptitle("First 8 Images from the Test Set")

# Iterate over the axes that plt.subplots already created instead of
# requesting each panel again with plt.subplot.
for idx, ax in enumerate(axes.flat):
    plt.sca(ax)  # plot_digit draws on the current axes, so make this panel current
    plot_digit(X_test[idx])
    ax.set_title(y_test[idx])



Since this is a binary classification problem that distinguishes between two classes, even and odd digits, we first create the target vectors for this task. To achieve this, we convert y_train and y_test from strings to integers so that we can do the computations.


In [None]:
# The labels are strings, so cast them to integers before testing parity.
y_train_int = y_train.astype(int)
y_test_int = y_test.astype(int)

# Boolean targets: True for even digits, False for all other digits.
y_train_even = y_train_int % 2 == 0
y_test_even = y_test_int % 2 == 0

# Sanity check: type, shape and first ten values of each target vector.
for target in (y_train_even, y_test_even):
    print(type(target), target.shape, target[:10])

We create a Stochastic Gradient Descent binary classifier (SGDClassifier) and train it on the whole training set.


In [None]:
# Binary SGD classifier trained directly on the raw (unscaled) pixel values;
# random_state is fixed so repeated runs give the same model.
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X=X_train, y=y_train_even)


We check that the classifier works by predicting a single test instance.


In [None]:
# Spot check on one test image: print its true digit next to the predicted
# "is even" flag. predict expects a 2-D input, hence the single-row reshape.
some_digit = X_test[3]
check = sgd_clf.predict(some_digit.reshape(1, -1))
print(y_test[3], check)


We will use the MinMaxScaler as a normalization technique.


In [None]:
# Min-max scaler chosen as the normalization technique.
min_max_scaler = MinMaxScaler()
# Alternative left disabled: a StandardScaler instance (stdScaler).


We introduce our pipeline 


In [None]:
# Two-step pipeline: scale the features with MinMaxScaler, then fit the SGD
# classifier on the scaled values.
sgdc_pipeline = Pipeline(
    steps=[
        ("min_max_scaler", MinMaxScaler()),
        ("sgd_clf", SGDClassifier(random_state=42)),
    ]
)

# Alternative left disabled: the same pipeline with a StandardScaler step
# named "std_scaler" in place of the MinMaxScaler step.


We use the pipeline to fit the training data.


In [None]:
# NOTE: fit() returns the fitted pipeline object, so despite its name
# X_train_prepared refers to the fitted pipeline, not to transformed data.
X_train_prepared = sgdc_pipeline.fit(X=X_train, y=y_train_even)


In [None]:

# Display the object returned by sgdc_pipeline.fit.
X_train_prepared


And use the pipeline to predict on test data


In [None]:
# Predict the "is even" flag for every test image, then preview the first eight.
sgd_preds = sgdc_pipeline.predict(X=X_test)
sgd_preds[0:8]


We observe that only the second value is predicted incorrectly.



We use 3-fold cross-validation to evaluate our classification pipeline by calculating the following metrics: accuracy, recall, and precision.


In [None]:
# 3-fold cross-validated accuracy of the pipeline on the training set.
cv_sgdc_accuracy = cross_val_score(
    estimator=sgdc_pipeline,
    X=X_train,
    y=y_train_even,
    scoring="accuracy",
    cv=3,
)
print("Cross validation accuracy scores:", cv_sgdc_accuracy)


In [None]:
# 3-fold cross-validated precision of the pipeline on the training set.
cv_sgdc_precision = cross_val_score(
    estimator=sgdc_pipeline,
    X=X_train,
    y=y_train_even,
    scoring="precision",
    cv=3,
)
print("Cross validation precision scores:", cv_sgdc_precision)


In [None]:
# 3-fold cross-validated recall of the pipeline on the training set.
cv_sgdc_recall = cross_val_score(
    estimator=sgdc_pipeline,
    X=X_train,
    y=y_train_even,
    scoring="recall",
    cv=3,
)
print("Cross validation recall scores:", cv_sgdc_recall)


We introduce a dummy model that always guesses that an image belongs to the even category, to compare it with our model.


In [None]:
# Baseline that ignores the pixels and answers True ("even") for every image.
dummy_clf = DummyClassifier(constant=True, strategy="constant")
dummy_clf.fit(X=X_train, y=y_train_even)

# Preview the first twenty baseline predictions on the test set.
dummy_clf_predictions = dummy_clf.predict(X=X_test)
print(dummy_clf_predictions[0:20])



And we evaluate it with the same metrics as above (accuracy, precision, recall).


In [None]:
# 3-fold cross-validated accuracy of the constant baseline.
dummy_clf_accuracy = cross_val_score(
    estimator=dummy_clf,
    X=X_train,
    y=y_train_even,
    scoring="accuracy",
    cv=3,
)
print("dummy_clf cross validation accuracy scores:", dummy_clf_accuracy)


In [None]:
# 3-fold cross-validated precision of the constant baseline.
dummy_clf_precision = cross_val_score(
    estimator=dummy_clf,
    X=X_train,
    y=y_train_even,
    scoring="precision",
    cv=3,
)
print("dummy_clf cross validation precision scores:", dummy_clf_precision)


In [None]:
# 3-fold cross-validated recall of the constant baseline.
dummy_clf_recall = cross_val_score(
    estimator=dummy_clf,
    X=X_train,
    y=y_train_even,
    scoring="recall",
    cv=3,
)
print("dummy_clf cross validation recall scores:", dummy_clf_recall)


We can observe that the accuracy and precision metrics are better on our model. The dummy model's recall of 1 alone doesn't say anything: it is the value we expected, since a model that always predicts "even" identifies every even number and never produces a False Negative.



We calculate the confusion matrix for the training set, following the same 3-fold cross validation protocol.


In [None]:
# Out-of-fold prediction for every training instance (3-fold cross-validation).
y_train_cv_predictions = cross_val_predict(
    estimator=sgdc_pipeline,
    X=X_train,
    y=y_train_even,
    cv=3,
)

# Confusion matrix of the true targets against those out-of-fold predictions.
comfusion_matrix_cv = confusion_matrix(
    y_true=y_train_even,
    y_pred=y_train_cv_predictions,
)
comfusion_matrix_cv


Observing the confusion matrix we see that we have 27476 True Negative predictions, 2653 False Positive predictions, 3548 False Negative predictions and 25823 True Positive predictions.  


The precision score


In [None]:
# Precision of the cross-validated training predictions.
precision_score(y_true=y_train_even, y_pred=y_train_cv_predictions)


And the recall score


In [None]:
# Recall of the cross-validated training predictions.
recall_score(y_true=y_train_even, y_pred=y_train_cv_predictions)


In [None]:
# A notebook cell only displays the value of its last expression, so the two
# metrics are gathered into one dict; otherwise the accuracy would be computed
# and silently discarded.
{
    "accuracy": accuracy_score(y_train_even, y_train_cv_predictions),
    "f1": f1_score(y_train_even, y_train_cv_predictions),
}



We train the sgdc_pipeline on all the training data and compute the predictions on the test set again. (We had already done both earlier: the results are stored in X_train_prepared and sgd_preds.)


In [None]:
# Refit the pipeline on the full training set (fit returns the fitted
# pipeline, which is what X_train_prepared refers to) ...
X_train_prepared = sgdc_pipeline.fit(X=X_train, y=y_train_even)

# ... and predict the "is even" flag for every test image.
sgd_preds = sgdc_pipeline.predict(X=X_test)


We extract the confusion matrix using these predictions and the test data.


In [None]:
# Confusion matrix of the true test targets against the pipeline's test predictions.
comfusion_matrix_sgd = confusion_matrix(y_true=y_test_even, y_pred=sgd_preds)
comfusion_matrix_sgd


We compute the precision, recall, accuracy and F1 scores.


In [None]:
# A notebook cell only displays the value of its last expression, so all four
# test-set metrics are gathered into one dict; otherwise only the F1 score
# would be shown and the other three results would be discarded.
{
    "precision": precision_score(y_test_even, sgd_preds),
    "recall": recall_score(y_test_even, sgd_preds),
    "accuracy": accuracy_score(y_test_even, sgd_preds),
    "f1": f1_score(y_test_even, sgd_preds),
}


Comparing the metrics on the training data (cross-validated) and on the test data respectively, we see that precision is lower on the test set and recall is higher. Accuracy and F1 metrics are almost the same.



We compute the False Positives (FP) and False Negatives (FN) for the predictions and the test set. 


In [None]:
# Masks over the test set: predicted even but actually odd, and vice versa.
predicted_even_but_odd = (y_test_even == 0) & (sgd_preds == 1)
predicted_odd_but_even = (y_test_even == 1) & (sgd_preds == 0)

# Positions (indices into the test set) of each kind of error.
false_positives = np.where(predicted_even_but_odd)[0]
false_negatives = np.where(predicted_odd_but_even)[0]



We pick a random instance for each one of them


In [None]:
# Draw one false-positive index and one false-negative index at random.
# A seeded generator is used so the same two examples are selected on every
# run, in line with the fixed random_state=42 used for the split and models.
rng = np.random.default_rng(42)
rnm_fp_instance = rng.choice(false_positives)
rnm_fn_instance = rng.choice(false_negatives)


We plot their original images in separate figures.


In [None]:
# Draw the randomly selected false positive in a figure of its own.
plt.figure(figsize=(5, 5))
some_digit = X_test[rnm_fp_instance]  # pixel values of the chosen FP instance
plot_digit(some_digit)
plt.show()


In [None]:
# Draw the randomly selected false negative in a figure of its own.
plt.figure(figsize=(5, 5))
some_digit = X_test[rnm_fn_instance]  # pixel values of the chosen FN instance
plot_digit(some_digit)
plt.show()