In [1]:
import sklearn
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)
    
# Fetch MNIST
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X, y = mnist["data"], mnist["target"]

def plot_digit(data):
    image = data.reshape(28, 28)
    plt.imshow(image, cmap = mpl.cm.binary,
               interpolation="nearest")
    plt.axis("off")
        
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

  warn(


In [2]:
#1 Try to build a classifier that achieves over 97% accuracy. 

In [3]:
from sklearn.neighbors import KNeighborsClassifier

knn_clf = KNeighborsClassifier()

In [4]:
# Fine-tune the Model

from sklearn.model_selection import GridSearchCV

param_grid = [
    {'weights': ['uniform', 'distance'], 
     'n_neighbors': [3,4,5]
    },
    ]
grid_search = GridSearchCV(knn_clf, param_grid, cv=3, verbose=3)
grid_search.fit(X_train, y_train)
print(grid_search.best_estimator_)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV 1/3] END ....n_neighbors=3, weights=uniform;, score=0.969 total time=   7.1s
[CV 2/3] END ....n_neighbors=3, weights=uniform;, score=0.968 total time=   6.8s
[CV 3/3] END ....n_neighbors=3, weights=uniform;, score=0.968 total time=   7.0s
[CV 1/3] END ...n_neighbors=3, weights=distance;, score=0.970 total time=   6.2s
[CV 2/3] END ...n_neighbors=3, weights=distance;, score=0.969 total time=   6.3s
[CV 3/3] END ...n_neighbors=3, weights=distance;, score=0.969 total time=   6.2s
[CV 1/3] END ....n_neighbors=4, weights=uniform;, score=0.966 total time=   6.6s
[CV 2/3] END ....n_neighbors=4, weights=uniform;, score=0.966 total time=   6.7s
[CV 3/3] END ....n_neighbors=4, weights=uniform;, score=0.967 total time=   6.6s
[CV 1/3] END ...n_neighbors=4, weights=distance;, score=0.971 total time=   6.2s
[CV 2/3] END ...n_neighbors=4, weights=distance;, score=0.970 total time=   6.2s
[CV 3/3] END ...n_neighbors=4, weights=distance;,

In [5]:
# Evaluate on Test Set

final_model = grid_search.best_estimator_

In [6]:
# Compute average F1 score across all labels
from sklearn.metrics import accuracy_score

y_pred = final_model.predict(X_test)
accuracy_score(y_test, y_pred)

0.9714

In [7]:
import joblib

joblib.dump(final_model, "final_model.pkl")

['final_model.pkl']

#2 Write a function that can shift an MNIST image in any direction (left, right, up,
or down) by one pixel. Then, for each image in the training set, create four shifted
copies (one per direction) and add them to the training set. Finally, train your
best model on this expanded training set and measure its accuracy on the test set.

In [9]:
from scipy.ndimage import shift

def set_expand(data, x, y, cval=0):
    shift_up = shift(X_train, (y, 0))
    shift_down = shift(X_train, (-y, 0))
    shift_left = shift(X_train, (0, -x))
    shift_right = shift(X_train, (0, x))
    frames = [data, shift_right, shift_down, shift_left, shift_up]
    expanded_set = np.concatenate(frames, axis=0)
    y_train_mod = np.concatenate([y_train,y_train,y_train,y_train,y_train], axis=0)
    shuffle_idx = np.random.permutation(len(expanded_set))
    expanded_set = expanded_set[shuffle_idx]
    y_train_mod = y_train_mod[shuffle_idx]
    
    return expanded_set, y_train_mod

In [10]:
X_train_mod, y_train_mod = set_expand(X_train, 1, 1)

In [11]:
import joblib
knn_clf_mod = joblib.load('final_model.pkl')

knn_clf_mod.fit(X_train_mod, y_train_mod)

In [12]:
y_pred_mod = knn_clf_mod.predict(X_test) 
accuracy_score(y_test, y_pred_mod)

0.9001