In [1]:
#Load MNIST Data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)

Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Extracting MNIST_data/train-images-idx3-ubyte.gz
Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


In [2]:
import tensorflow as tf
sess = tf.InteractiveSession()

In [3]:
# Placeholders are variables that we feed Tensorflow each time we want to run a computation
x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, shape=[None, 10])

In [6]:
# Variables are different from placeholders since they live between computations, 
# and these computations can change the variables. Weights and biases are typically
# set as variables

W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))

# Variables need to be initialized within a session before they can be used (not sure why)
sess.run(tf.global_variables_initializer())

In [7]:
# Forward propagation is easy..
y = tf.matmul(x,W) + b

# Defining the loss function

# tf.nn.softmax_cross_entropy_with_logits applies the softmax on the model's unnormalized model prediction 
# (each of the 10 possible results) and sums over all classes (digits 0-10)
# tf.reduce_mean takes the average over these sums. (Is this summing over an epoch? Makes sense to take 
# the average of that..)
cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

In [10]:
# Training the model
# Tensorflow uses automatic differentiation (still unsure what this is exactly.) 
# to find the gradients of the loss with respect to each of the variables

# Going to use steepest gradient descent to train this one. Tensorflow apparently has a variety of built-in
# optimization algorithms. (may want to check this out)
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

# Tutorial says that the above line adds a few new operations to the computation graph (there is a way to view
# this if I've understood correctly using TensorBoard) that include computing gradients, computing
# the parameter updates, and applying the updates to the parameters.

# train_step will apply the gradient descent updates to the parameters. Training is accomplished by repeatedly
# running train_step.
for _ in range(1000):
    batch = mnist.train.next_batch(100)
    train_step.run(feed_dict={x: batch[0], y_: batch[1]})

In [11]:
# Evaluating the model

# y is the prediction, y_ is the true value
# tf.argmax finds the index of the highest value of the tensor along the specified axis
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_,1))

# correct_prediction is a boolean array. The code below recasts "True" to 1 and "False" to 0.
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# prints the percent accuracy of the trained network.
print(accuracy.eval(feed_dict={x: mnist.test.images, y_: mnist.test.labels}))

0.9175


## Adding convolutional layers

In [12]:
# The neuron type used for the convolutional layers is different from the sigmoid one. See ReLU (rectifier neurons)
# The code below initializes the weights and biases for the convolutional part of the network
def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.1, shape=shape)
    return tf.Variable(initial)

In [13]:
# Convolution and pooling (downsampling the output image by applying some operation on a slice (from what I
# understand, a slice is the output image of a filter) e.g. taking the maximum of 2x2 squares) 
# they use the term "Stride size" (how much the filter
# moves between calculations of a pixel in the output) and "padding" (applying filters shrinks the size of the image
# and padding keeps the image a little larger by adding zeros. In the case below, I think it's keeping the 28x28
# size between convolutions).

def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
                         strides=[1, 2, 2, 1], padding='SAME')

In [14]:
# First convolutional layer

# The first layer will compute 32 features for a 5x5 patch.
# The arguments here are the patch size: 5,5; the number of input channels (color?): 1; 
# and the number of output channels: 32

# (I'm not so sure about the rationale here. Why 32 features from 25 pixels?)
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])

# the final dimension here is the # of color channels..
x_image = tf.reshape(x, [-1, 28, 28, 1])

h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)

In [15]:
# Second convolutional layer

# This layer will have 64 features for each 5x5 patch..
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])

h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

In [16]:
# Densely connected layer

# The tutorial notes that at this point the image has been reduced to 7x7. The next layer will be a fully connected
# layer with 1024 neurons.

W_fc1 = weight_variable([7 * 7 * 64, 1024])
b_fc1 = bias_variable([1024])

h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

In [17]:
# Dropout

# to reduce overfitting. Tensorflow automates the process of scaling the neuron activation and masking them. Not
# even sure how dropout (computes a probability that a neuron's output is kept) helps.

keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)

In [18]:
# Readout layer

W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])

y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

In [19]:
# Train and Evaluate

# Things that are different from the previous part:
# -Using a different optimizer here called ADAM (tutorial says it's more sophisticated)
# -have an additional parameter, keep_prob, to control the dropout rate
# -Add logging to every 100th iteration

cross_entropy = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(2000):
        batch = mnist.train.next_batch(50)
        if i % 100 == 0:
            train_accuracy = accuracy.eval(feed_dict={
                x: batch[0], y_: batch[1], keep_prob: 1.0})
            print('step: %d, training accuracy %g' % (i, train_accuracy))
        train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
    
    print('test accuracy: %g' % accuracy.eval(feed_dict={
        x: batch[0], y_: batch[1], keep_prob: 1}))

step: 0, training accuracy 0.1
step: 100, training accuracy 0.8
step: 200, training accuracy 0.88
step: 300, training accuracy 0.88
step: 400, training accuracy 0.86
step: 500, training accuracy 0.94
step: 600, training accuracy 1
step: 700, training accuracy 0.96
step: 800, training accuracy 0.92
step: 900, training accuracy 0.98
step: 1000, training accuracy 1
step: 1100, training accuracy 0.9
step: 1200, training accuracy 1
step: 1300, training accuracy 1
step: 1400, training accuracy 0.92
step: 1500, training accuracy 0.98
step: 1600, training accuracy 0.96
step: 1700, training accuracy 0.96
step: 1800, training accuracy 1
step: 1900, training accuracy 1
test accuracy: 0.94
