# Convolution Neural Network: MNIST

In [1]:
import numpy as np
import os

import tensorflow as tf

In [2]:
import warnings
warnings.filterwarnings('ignore')

#### to make this notebook's output stable across runs

In [3]:
def reset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

## the baseline model

### set major parameters

In [4]:
# input layer
height = 28
width = 28
channels = 1
n_inputs = height * width

# 1st conv layer
conv1_fmaps = 32
conv1_ksize = 3
conv1_stride = 1
conv1_pad = "SAME"

# 2nd conv layer
conv2_fmaps = 64
conv2_ksize = 3
conv2_stride = 2
conv2_pad = "SAME"

# pool layer
pool3_fmaps = conv2_fmaps

# fc layer
n_fc1 = 64

# output layer
n_outputs = 10

### construct the graph

In [5]:
reset_graph()

# input layer
with tf.name_scope("inputs"):
    X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
    X_reshaped = tf.reshape(X, shape=[-1, height, width, channels])
    y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")
    
# 1st conv layer
conv1 = tf.layers.conv2d(X_reshaped, filters=conv1_fmaps, kernel_size=conv1_ksize, 
                         strides=conv1_stride, padding=conv1_pad, activation=tf.nn.relu, name="conv1")

# 2nd conv layer
conv2 = tf.layers.conv2d(conv1, filters=conv2_fmaps, kernel_size=conv2_ksize, 
                         strides=conv2_stride, padding=conv2_pad, activation=tf.nn.relu, name="conv2")

# pool layer
with tf.name_scope("pool3"):
    pool3 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
    pool3_flat = tf.reshape(pool3, shape=[-1, pool3_fmaps * 7 * 7])
    
# fc layer
fc1 = tf.layers.dense(pool3_flat, n_fc1, activation=tf.nn.relu, name="fc1")
    
# output layer
with tf.name_scope("output"):
    logits = tf.layers.dense(fc1, n_outputs, name="output")
    Y_proba = tf.nn.softmax(logits, name="Y_proba")

### design the training process

In [6]:
with tf.name_scope("train"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)
    loss = tf.reduce_mean(xentropy)
    optimizer = tf.train.AdamOptimizer()
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
with tf.name_scope("init_and_save"):
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

### load MNIST data

In [7]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/")

Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [8]:
print mnist.train.num_examples
print mnist.test.num_examples

55000
10000


### train model

In [10]:
%%time
n_epochs = 10
batch_size = 100

with tf.Session() as sess:
    with tf.device("/gpu:0"):
        init.run()
        for epoch in range(n_epochs):
            for iteration in range(mnist.train.num_examples // batch_size):
                X_batch, y_batch = mnist.train.next_batch(batch_size)
                sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
            acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
            acc_valid = accuracy.eval(feed_dict={X: mnist.validation.images, y: mnist.validation.labels})
            print(epoch, "Train accuracy", acc_train, "Valid accuracy", acc_valid)

            save_path = saver.save(sess, "./my_mnist_model")

(0, 'Train accuracy', 0.97000003, 'Valid accuracy', 0.96959966)
(1, 'Train accuracy', 1.0, 'Valid accuracy', 0.98299974)
(2, 'Train accuracy', 0.99000001, 'Valid accuracy', 0.98699963)
(3, 'Train accuracy', 1.0, 'Valid accuracy', 0.98759973)
(4, 'Train accuracy', 0.97000003, 'Valid accuracy', 0.98899972)
(5, 'Train accuracy', 1.0, 'Valid accuracy', 0.98719978)
(6, 'Train accuracy', 1.0, 'Valid accuracy', 0.98819971)
(7, 'Train accuracy', 1.0, 'Valid accuracy', 0.98819971)
(8, 'Train accuracy', 1.0, 'Valid accuracy', 0.9891997)
(9, 'Train accuracy', 1.0, 'Valid accuracy', 0.9877997)
CPU times: user 33.2 s, sys: 4.49 s, total: 37.7 s
Wall time: 32.8 s


### evaluate on test set

In [11]:
with tf.Session() as sess:
    saver.restore(sess, "./my_mnist_model")
    acc_test = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels})
    print('Test accuracy', acc_test)

INFO:tensorflow:Restoring parameters from ./my_mnist_model
('Test accuracy', 0.98780012)


## better model with more tricks

### set major parameters

In [12]:
# input layer
height = 28
width = 28
channels = 1
n_inputs = height * width

# 1st conv layer
conv1_fmaps = 32
conv1_ksize = 3
conv1_stride = 1
conv1_pad = "SAME"

# 2nd conv layer
conv2_fmaps = 64
conv2_ksize = 3

# use stride = 1 instead of 2
conv2_stride = 1
conv2_pad = "SAME"

# add dropout for 2nd conv layer
conv2_dropout_rate = 0.25

# pool layer
pool3_fmaps = conv2_fmaps

# fc layer (larger number of units)
n_fc1 = 128

# add dropout for fc layer
fc1_dropout_rate = 0.5

# output layer
n_outputs = 10

### construct the graph

In [13]:
reset_graph()

# input layer
with tf.name_scope("inputs"):
    X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
    X_reshaped = tf.reshape(X, shape=[-1, height, width, channels])
    y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")
    # add training indicator for dropout
    training = tf.placeholder_with_default(False, shape=[], name="training")
    
# 1st conv layer
conv1 = tf.layers.conv2d(X_reshaped, filters=conv1_fmaps, kernel_size=conv1_ksize, 
                         strides=conv1_stride, padding=conv1_pad, activation=tf.nn.relu, name="conv1")

# 2nd conv layer
conv2 = tf.layers.conv2d(conv1, filters=conv2_fmaps, kernel_size=conv2_ksize, 
                         strides=conv2_stride, padding=conv2_pad, activation=tf.nn.relu, name="conv2")

# pool layer
with tf.name_scope("pool3"):
    pool3 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
    pool3_flat = tf.reshape(pool3, shape=[-1, pool3_fmaps * 14 * 14])
    # add dropout for pool3_flat
    pools_flat_drop = tf.layers.dropout(pool3_flat, conv2_dropout_rate, training=training)
    
# fc layer
with tf.name_scope("fc1"):
    fc1 = tf.layers.dense(pool3_flat, n_fc1, activation=tf.nn.relu, name="fc1")
    # add dropout for fc layer
    fc1_drop = tf.layers.dropout(fc1, fc1_dropout_rate, training=training)
    
# output layer
with tf.name_scope("output"):
    logits = tf.layers.dense(fc1, n_outputs, name="output")
    Y_proba = tf.nn.softmax(logits, name="Y_proba")

### design the training process

In [14]:
with tf.name_scope("train"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)
    loss = tf.reduce_mean(xentropy)
    optimizer = tf.train.AdamOptimizer()
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
with tf.name_scope("init_and_save"):
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

### useful functions for early stop!
1. `get_model_params()`: gets the model's state (i.e., the value of all the variables)
2. `restore_model_params()`: restores a previous state. 

This is used to speed up early stopping: instead of storing the best model found so far to disk, we just save it to memory.  
At the end of training, we roll back to the best model found.

In [15]:
def get_model_params():
    gvars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
    return {gvar.op.name: value for gvar, value in zip(gvars, tf.get_default_session().run(gvars))}

def restore_model_params(model_params):
    gvar_names = list(model_params.keys())
    assign_ops = {gvar_name: tf.get_default_graph().get_operation_by_name(gvar_name + "/Assign")
                  for gvar_name in gvar_names}
    init_values = {gvar_name: assign_op.inputs[1] for gvar_name, assign_op in assign_ops.items()}
    feed_dict = {init_values[gvar_name]: model_params[gvar_name] for gvar_name in gvar_names}
    tf.get_default_session().run(assign_ops, feed_dict=feed_dict)

### train model with more tricks!
* every 100 training iterations, it evaluates the model on the validation set,
* if the model performs better than the best model found so far, then it saves the model to RAM,
* if there is no progress for 100 evaluations in a row, then training is interrupted,
* after training, the code restores the best model found.

In [17]:
# more number of epochs and smaller batch size
n_epochs = 1000
batch_size = 50

# for early stop
best_loss_val = np.infty
check_interval = 500
checks_since_last_progress = 0
max_checks_without_progress = 20
best_model_params = None 

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch, training: True})
            if iteration % check_interval == 0:
                loss_val = loss.eval(feed_dict={X: mnist.validation.images,
                                                y: mnist.validation.labels})
                if loss_val < best_loss_val:
                    best_loss_val = loss_val
                    checks_since_last_progress = 0
                    best_model_params = get_model_params()
                else:
                    checks_since_last_progress += 1
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_val = accuracy.eval(feed_dict={X: mnist.validation.images,
                                           y: mnist.validation.labels})
        print("Epoch {}, train accuracy: {:.4f}%, valid. accuracy: {:.4f}%, valid. best loss: {:.6f}".format(
                  epoch, acc_train * 100, acc_val * 100, best_loss_val))
        if checks_since_last_progress > max_checks_without_progress:
            print("Early stopping!")
            break

Epoch 0, train accuracy: 98.0000%, valid. accuracy: 98.6200%, valid. best loss: 0.055988
Epoch 1, train accuracy: 98.0000%, valid. accuracy: 98.7400%, valid. best loss: 0.042223
Epoch 2, train accuracy: 98.0000%, valid. accuracy: 98.7000%, valid. best loss: 0.041539
Epoch 3, train accuracy: 100.0000%, valid. accuracy: 98.5800%, valid. best loss: 0.041539
Epoch 4, train accuracy: 100.0000%, valid. accuracy: 99.1000%, valid. best loss: 0.041539
Epoch 5, train accuracy: 100.0000%, valid. accuracy: 98.8600%, valid. best loss: 0.039102
Epoch 6, train accuracy: 100.0000%, valid. accuracy: 98.9400%, valid. best loss: 0.039102
Epoch 7, train accuracy: 100.0000%, valid. accuracy: 98.8600%, valid. best loss: 0.039102
Epoch 8, train accuracy: 100.0000%, valid. accuracy: 99.0600%, valid. best loss: 0.039102
Epoch 9, train accuracy: 100.0000%, valid. accuracy: 99.1400%, valid. best loss: 0.039102
Epoch 10, train accuracy: 100.0000%, valid. accuracy: 99.0600%, valid. best loss: 0.039102
Epoch 11, tr

### evaluate on test set

In [18]:
with tf.Session() as sess:
    if best_model_params:
        restore_model_params(best_model_params)
    acc_test = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels})
    print("Final accuracy on test set:", acc_test)
    save_path = saver.save(sess, "./my_mnist_model")

('Final accuracy on test set:', 0.98840016)


## try more! :/

### others are the same

In [38]:
# input layer
height = 28
width = 28
channels = 1
n_inputs = height * width

# 1st conv layer
conv1_fmaps = 32
conv1_ksize = 3
conv1_stride = 1
conv1_pad = "SAME"

# 2nd conv layer
conv2_fmaps = 64
conv2_ksize = 3

# use stride = 1 instead of 2
conv2_stride = 1
conv2_pad = "SAME"

# add dropout for 2nd conv layer
conv2_dropout_rate = 0.5

# pool layer
pool3_fmaps = conv2_fmaps

# fc layer (larger number of units)
n_fc1 = 128

# add dropout for fc layer
fc1_dropout_rate = 0.5

# output layer
n_outputs = 10

reset_graph()

# input layer
with tf.name_scope("inputs"):
    X = tf.placeholder(dtype=tf.float32, shape=(None, n_inputs), name="X")
    X_reshaped = tf.reshape(X, shape=[-1, height, width, channels])
    y = tf.placeholder(dtype=tf.int32, shape=(None), name="y")
    # add training indicator for dropout
    training = tf.placeholder_with_default(False, shape=[], name="training")
    
# 1st conv layer
conv1 = tf.layers.conv2d(X_reshaped, filters=conv1_fmaps, kernel_size=conv1_ksize, 
                         strides=conv1_stride, padding=conv1_pad, activation=tf.nn.relu, name="conv1")

# 2nd conv layer
conv2 = tf.layers.conv2d(conv1, filters=conv2_fmaps, kernel_size=conv2_ksize, 
                         strides=conv2_stride, padding=conv2_pad, activation=tf.nn.relu, name="conv2")

# pool layer
with tf.name_scope("pool3"):
    pool3 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="VALID")
    pool3_flat = tf.reshape(pool3, shape=[-1, pool3_fmaps * 14 * 14])
    # add dropout for pool3_flat
    pools_flat_drop = tf.layers.dropout(pool3_flat, conv2_dropout_rate, training=training)
    
# fc layer
with tf.name_scope("fc1"):
    fc1 = tf.layers.dense(pool3_flat, n_fc1, activation=tf.nn.relu, name="fc1")
    # add dropout for fc layer
    fc1_drop = tf.layers.dropout(fc1, fc1_dropout_rate, training=training)
    
# output layer
with tf.name_scope("output"):
    logits = tf.layers.dense(fc1, n_outputs, name="output")
    Y_proba = tf.nn.softmax(logits, name="Y_proba")

### make learning rate changeable

In [39]:
with tf.name_scope("train"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y)
    loss = tf.reduce_mean(xentropy)
    learning_rate = tf.placeholder(tf.float32, [], name="lr")
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
    
with tf.name_scope("init_and_save"):
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

#### at the previous stopping point, we half the learning rate

In [40]:
# more number of epochs and smaller batch size
n_epochs = 20
batch_size = 50

# for early stop
best_loss_val = np.infty
check_interval = 500
checks_since_last_progress = 0
max_checks_without_progress = 20
best_model_params = None 

# for decay learning rate
lr = 0.001

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch, training: True, learning_rate: lr})
            if iteration % check_interval == 0:
                loss_val = loss.eval(feed_dict={X: mnist.validation.images,
                                                y: mnist.validation.labels})
                if loss_val < best_loss_val:
                    best_loss_val = loss_val
                    checks_since_last_progress = 0
                    best_model_params = get_model_params()
                else:
                    checks_since_last_progress += 1
        acc_train = accuracy.eval(feed_dict={X: X_batch, y: y_batch})
        acc_val = accuracy.eval(feed_dict={X: mnist.validation.images,
                                           y: mnist.validation.labels})
        print("Epoch {}, learning rate: {:.6f}, train accuracy: {:.4f}%, valid. accuracy: {:.4f}%, valid. best loss: {:.6f}".format(
                  epoch, lr, acc_train * 100, acc_val * 100, best_loss_val))
        if checks_since_last_progress > max_checks_without_progress:
            checks_since_last_progress = 0
            lr *= 0.5
            print("half the learning rate!")

Epoch 0, learning rate: 0.001000, train accuracy: 100.0000%, valid. accuracy: 98.4000%, valid. best loss: 0.055442
Epoch 1, learning rate: 0.001000, train accuracy: 100.0000%, valid. accuracy: 98.8400%, valid. best loss: 0.042894
Epoch 2, learning rate: 0.001000, train accuracy: 98.0000%, valid. accuracy: 98.4600%, valid. best loss: 0.039560
Epoch 3, learning rate: 0.001000, train accuracy: 100.0000%, valid. accuracy: 99.0800%, valid. best loss: 0.038976
Epoch 4, learning rate: 0.001000, train accuracy: 100.0000%, valid. accuracy: 98.8400%, valid. best loss: 0.037326
Epoch 5, learning rate: 0.001000, train accuracy: 100.0000%, valid. accuracy: 98.6400%, valid. best loss: 0.037326
Epoch 6, learning rate: 0.001000, train accuracy: 100.0000%, valid. accuracy: 98.9600%, valid. best loss: 0.037326
Epoch 7, learning rate: 0.001000, train accuracy: 100.0000%, valid. accuracy: 98.8000%, valid. best loss: 0.037326
Epoch 8, learning rate: 0.001000, train accuracy: 100.0000%, valid. accuracy: 99.

In [37]:
with tf.Session() as sess:
    if best_model_params:
        restore_model_params(best_model_params)
    acc_test = accuracy.eval(feed_dict={X: mnist.test.images, y: mnist.test.labels})
    print("Final accuracy on test set:", acc_test)

('Final accuracy on test set:', 0.98810017)
