# Hands-on ML Chapter 3 - Exercises

## Exercise 1 - MNIST Classifier with 97%+ Accuracy

In [1]:
# Confirm Python version >= 3.5
import sys
assert sys.version_info >= (3, 5)

# Confirm sklearn version >= 0.20
import sklearn
assert sklearn.__version__ >= "0.20"

# helper libraries
import numpy as np
import pandas as pd 

# importing and configuring matplotlib
import matplotlib.pyplot as plt 
%matplotlib inline 
import matplotlib as mpl
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

### Importing Data

In [2]:
from sklearn.datasets import fetch_openml

# Download (or load from the local scikit-learn cache) version 1 of the
# 'mnist_784' dataset from OpenML
mnist = fetch_openml(name='mnist_784', version=1)

In [3]:
# Extract target and features as plain NumPy arrays.
# Recent scikit-learn releases return a pandas DataFrame / Series from
# fetch_openml by default, which would break the positional indexing
# (e.g. X_train_scaled[1000]) and row-wise iteration used further down.
# np.asarray is a no-op when the data is already an ndarray.
X, y = np.asarray(mnist['data']), np.asarray(mnist['target'])

# Confirm shapes
print("X shape: ", X.shape)
print("y shape: ", y.shape)

X shape:  (70000, 784)
y shape:  (70000,)


In [4]:
# Labels are converted to unsigned 8-bit integers
y = y.astype(dtype=np.uint8)

In [5]:
# Train-test split: the first 60k samples form the training set...
X_train, y_train = X[:60000], y[:60000]
# ...and the remaining 10k samples form the test set
X_test, y_test = X[60000:], y[60000:]

In [6]:
# Feature scaling: centre on the mean of the training features (computed once,
# from the training set only) and divide by the maximum pixel intensity.
# The same training mean is applied to both the training and the test features.
X_train_mean = np.mean(X_train)
X_train_scaled, X_test_scaled = (
    (features - X_train_mean) / 255.0 for features in (X_train, X_test)
)

### Building Classifier

In [7]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

Of all the hyperparameters of the `KNeighborsClassifier`, we are choosing to optimise only 2.
- `n_neighbors`: the number of nearest samples used to make a classification decision.
- `weights`: the function used to compute the weight of the distance between two nearby points. If `uniform`, then all points are weighted equally. If `distance`, then weight decreases the farther away a neighbor is from a given point.
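Both control how the labels of the nearest training samples are combined into a prediction. Note that KNN is not trained by minimising a loss function; the within-cluster sum of squares is the objective of k-means clustering, not of this classifier.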

In [8]:
# Search space: every pairing of a weighting scheme with a neighbour count
param_grid = [
    dict(weights=['uniform', 'distance'], n_neighbors=[3, 4, 5]),
]

In [None]:
# Base estimator whose hyperparameters are being tuned
knn_clf = KNeighborsClassifier()

# 3-fold cross-validation for the 6 hyperparameter combinations in param_grid,
# which means 18 fits are run during the search (across all available cores)
grid_search = GridSearchCV(
    estimator=knn_clf,
    param_grid=param_grid,
    cv=3,
    verbose=4,
    n_jobs=-1,
)

# Fit all candidates and find the optimal combination of hyperparams
grid_search.fit(X_train_scaled, y_train)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [0]:
# What is the best combination of hyperparameters for this problem?
# (the param_grid setting that achieved the highest mean cross-validated score)
grid_search.best_params_

In [0]:
# What is the accuracy for this combination of hyperparameters?
# No `scoring` argument was passed to GridSearchCV, so it falls back to the
# estimator's own `score` method, which for classifiers is mean accuracy.
grid_search.best_score_

In [0]:
# Carrying out predictions on the test set
from sklearn.metrics import accuracy_score, confusion_matrix

# The search was fitted on mean-centred, rescaled pixels (X_train_scaled), so
# the test features must go through the same transformation before prediction;
# feeding the raw 0-255 pixels would put them on a different scale entirely.
y_pred = grid_search.predict(X_test_scaled)  # Will use optimal (refitted) model

# Printed explicitly: only a cell's last expression is displayed automatically
print("Test accuracy: ", accuracy_score(y_test, y_pred))

# Confusion matrix (rows: true labels, columns: predicted labels)
conf_mat_knn = confusion_matrix(y_test, y_pred)
conf_mat_knn

## Exercise 2 - Data Augmentation
Writing a function that shifts an MNIST image in any direction (left/right/up/down) by one pixel. Then using this function to create four copies of each training set image by shifting the image in one of four possible directions. These copies are added to the training set to create an **augmented** training set.

Will then train a new `KNeighborsClassifier` (using the best hyperparameters found in Exercise 1) on this augmented training set and evaluate it on the test set to confirm that data augmentation has improved model performance.

In [0]:
# `scipy.ndimage.interpolation` is a deprecated namespace; `shift` is part of
# the public `scipy.ndimage` API
from scipy.ndimage import shift

In [0]:
def shift_image(image, dx, dy, cval=0):
    """Shift a flattened 28x28 image by the given pixel offsets.

    Parameters
    ----------
    image : array-like with 784 elements
        Flattened image; it is reshaped to (28, 28) before shifting.
    dx : int or float
        Horizontal offset. Positive values move the content to the right,
        negative values to the left.
    dy : int or float
        Vertical offset. Positive values move the content down, negative
        values up.
    cval : scalar, default 0
        Value written into the pixels vacated by the shift. Pass the
        image's background value when it is not 0 (e.g. for mean-centred
        images).

    Returns
    -------
    numpy.ndarray
        The shifted image, flattened back to a 784-element vector.
    """
    # 784-dimensional vector must first be transformed into square image
    square_image = np.asarray(image).reshape((28, 28))

    # scipy expects one offset per axis: rows (vertical) first, then columns
    shifted_image = shift(square_image, [dy, dx], cval=cval, mode='constant')

    # Return the shifted image as a flat vector again
    return shifted_image.reshape([-1])

In [0]:
# Demonstrating the shifting process with a sample 
image = X_train_scaled[1000]
shifted_image_down = shift_image(image, 0, 5)
shifted_image_left = shift_image(image, -5, 0)

# Display original and shifted images side by side
plt.figure(figsize=(12, 3))
plt.subplot(131)
plt.title('Original', fontsize=14))
plt.imshow(image.reshape(28, 28), interpolation='nearest', cmap='Greys')
plt.subplot(132)
plt.title('Shifted Down', fontsize=14)
plt.imshow(shifted_image_down.reshape(28, 28), interpolation='nearest', cmap='Greys')
plt.subplot(133)
plt.title('Shifted Left', fontsize=14)
plt.imshow(shifted_image_left.reshape(28, 28), interpolation='nearest', cmap='Greys')

# Render all three
plt.show()

In [0]:
# Applying up/down/left/right transformations to all training set images.
# Start from Python lists holding the original samples and labels so that the
# shifted copies can be appended to them.
X_train_augmented = list(X_train_scaled)
y_train_augmented = list(y_train)

In [0]:
# Single pixel shifts/offsets specified using dx and dy
for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    # Iterate over the ORIGINAL training set, not the augmented lists: those
    # lists grow inside this loop, so zipping over them would never terminate
    # (and would also shift already-shifted copies).
    # NOTE(review): X_train_scaled is mean-centred, so its background value is
    # -X_train_mean / 255 rather than 0, while shift_image pads vacated pixels
    # with 0 - confirm whether the padded border should match the background.
    for image, label in zip(X_train_scaled, y_train):
        # New image
        X_train_augmented.append(shift_image(image, dx, dy))

        # Label will remain unchanged
        y_train_augmented.append(label)

# Cast to np arrays that can be fed to sklearn models
X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)

Because the shifted versions of the samples are appended to the augmented training set one direction at a time, it is a good idea to shuffle the dataset. This ensures that there is no particular order in which training samples are fed to the model.

In [0]:
# Array of shuffled indices for training set. A dedicated, seeded generator
# makes the shuffle reproducible across kernel restarts.
shuffle_rng = np.random.RandomState(42)
shuffle_idx = shuffle_rng.permutation(len(X_train_augmented))

# Apply the same permutation to features and labels so pairs stay aligned
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]

In [0]:
# Training a new KNN classifier with the hyperparams found previously.
# The fitted attribute is `best_params_` (with a trailing underscore); it is a
# dict, unpacked here into keyword arguments.
knn_clf = KNeighborsClassifier(**grid_search.best_params_)

In [0]:
# Fit the classifier on the augmented training samples and their labels
knn_clf.fit(X=X_train_augmented, y=y_train_augmented)

In [0]:
# Prediction should have improved.
# The classifier was fitted on scaled (and shifted) training images, so it must
# be evaluated on the test set scaled the same way, not on the raw pixels.
y_pred = knn_clf.predict(X_test_scaled)
accuracy_score(y_test, y_pred)