In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math

In [2]:
import keras
from keras.datasets import mnist
from keras.layers import Input, Dense, Dropout, Flatten, MaxPooling2D, MaxPooling1D, Conv2D, BatchNormalization
from keras.models import Model, Sequential
import numpy as np

Keras is a great place to start, it has a relatively simple design, and uses TensorFlow under the hood. And -- it has the mnist data already available.


In [3]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [4]:
x_train.shape #60000 IMAGES AT 28X28 PIXELS

(60000, 28, 28)

We do a tiny bit of data prep -- normalization, meaning we divide by the max value to make all the pixes on the range of 0 - 1, this helps the model learn faster. For fun you can take out the ` / np.max(...)` and see how much longer it takes for the accuracy rise.

And -- we reshape. This is about meeting Keras expectations. The convolution layers are set up for 3D data -- meaning (x, y, color) channel pixels. Since the source mnist data is just (x, y), we have to shape the grey scale into the color channel position in the matrix, adding one additional dimension.

In [5]:
x_train = np.expand_dims(x_train / np.max(x_train), -1)
x_test = np.expand_dims(x_test / np.max(x_test), -1)
x_train.shape #we need to reshape the data to meet keras expectations as it looks for a 4d :colour

(60000, 28, 28, 1)

For the output y labels, we need to convert the digit identifiers 0, 1, ... 8, 9 to one hot encodings where they are 10 slots, with a 0 or one acting as a flag.

In [6]:
y_train[0:10]

array([5, 0, 4, 1, 9, 2, 1, 3, 1, 4], dtype=uint8)

In [7]:
train_labels = keras.utils.to_categorical(y_train, 10)
test_labels = keras.utils.to_categorical(y_test, 10)

train_labels[0:10]

array([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]], dtype=float32)

And now -- an actual deep network, this is a now classic design, using convolution, pooling, and dropout. Finally, the model ends in a dense layer with softmax -- seems familiar, this softmax output is just like our logistic regression. 

The difference here is -- we have created a deep learning model with many layers.

And now, using the Keras, build a model that has convolution, pooling, dropout and a final softmax classification.


One thing to note here is Flatten. Because our images are two dimensional *x,y* pairs, and our output is one dimension -- a class 0-9, Flatten is needed to reduce the dimensions.


In [8]:
input_shape = x_train[0].shape
num_classes = 10
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3), #look at a 3x3 patch
                 activation='relu', #squish into a single pixel
                 input_shape=input_shape))
model.add(Conv2D(64, (3, 3), activation='relu')) #repeat
model.add(MaxPooling2D(pool_size=(2, 2))) #looks for strongest outputs from prior Conv2D and crushes it down to a 2x2 patch
model.add(Dropout(0.25)) #avoid overfitting your model and makes model aware to unseen data
model.add(Flatten()) #2D image but 1D output so flatten lines up shape
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5)) #avoid overfitting
model.add(Dense(num_classes, activation='softmax')) #logistic regression from before, turning output labels to a set of probabilities

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 26, 26, 32)        320       
                                                                 
 conv2d_1 (Conv2D)           (None, 24, 24, 64)        18496     
                                                                 
 max_pooling2d (MaxPooling2D  (None, 12, 12, 64)       0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 12, 12, 64)        0         
                                                                 
 flatten (Flatten)           (None, 9216)              0         
                                                                 
 dense (Dense)               (None, 128)               1179776   
                                                        

With the model assembled, we compile it, which prepares the model for execution with a solver. And then we fit it -- using the training data and labels to learn parameters, and the testing data and labels to check how well the model works.

This is an important point -- holding out part of the data to test. If you use all of you data in training, you can end up with a model that merely memorizes your input data, but cannot make predictions about new, unseen data. This is a phenomena known as *overfitting*.

In [9]:
model.compile(loss='categorical_crossentropy', #generally used for classification problems
              optimizer='adam', #optimiser to update numbers inside model
              metrics=['accuracy'])

history = model.fit(x_train, train_labels,
                    batch_size=64,
                    epochs=8,
                    verbose=1,
                    validation_data=(x_test, test_labels))

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


And now we'll get a report as to how well we're classifying.

In [12]:
import sklearn.metrics


In [17]:
predictions = (model.predict(x_test) > 0.5).astype("int32")
print(sklearn.metrics.classification_report(test_labels, predictions))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99       980
           1       1.00      1.00      1.00      1135
           2       0.99      0.99      0.99      1032
           3       0.99      1.00      0.99      1010
           4       0.99      1.00      0.99       982
           5       1.00      0.98      0.99       892
           6       1.00      0.99      0.99       958
           7       0.99      0.99      0.99      1028
           8       0.99      0.99      0.99       974
           9       1.00      0.98      0.99      1009

   micro avg       0.99      0.99      0.99     10000
   macro avg       0.99      0.99      0.99     10000
weighted avg       0.99      0.99      0.99     10000
 samples avg       0.99      0.99      0.99     10000



  _warn_prf(average, modifier, msg_start, len(result))
