In [1]:
import tensorflow as tf
from math import exp

In [2]:
from tensorflow.examples.tutorials.mnist import input_data as mnist_data

# Download (into ./data) and load the MNIST images and labels:
#   mnist.test  -> 10K images+labels
#   mnist.train -> 60K images+labels (validation_size=0 keeps all of them here)
# Labels are one-hot encoded; reshape=False leaves the images un-flattened.
mnist = mnist_data.read_data_sets(
    "data",
    one_hot=True,
    reshape=False,
    validation_size=0,
)

Extracting data/train-images-idx3-ubyte.gz


Extracting data/train-labels-idx1-ubyte.gz
Extracting data/t10k-images-idx3-ubyte.gz
Extracting data/t10k-labels-idx1-ubyte.gz


In [3]:
# Constants
# - Image geometry, output size and the fixed learning rate used by V1-V4
width, height = 28, 28
area = width * height   # number of pixels in one flattened image
final_nodes = 10        # width of the output layer / of one target row
lr = .003               # learning rate handed to the optimizer

In [4]:
# Clear the default graph, and with it the tensor names registered so far.
# (Not required.)
tf.reset_default_graph()

In [66]:
# Placeholders
# - Graph inputs: their values are supplied through feed_dict whenever an op
#   that depends on them is run.

# Input images: 28 x 28 px, grey scale (1 channel); batch size is left open.
X = tf.placeholder(tf.float32, shape=[None, width, height, 1])

# Target results: one row of `final_nodes` values per image.
Y_ = tf.placeholder(tf.float32, shape=[None, final_nodes])

In [67]:
# Model
# - Flatten every image into one row of `area` values; the -1 lets
#   TensorFlow work out the batch dimension.
flat_shape = [-1, area]
XX = tf.reshape(X, flat_shape)

In [68]:
# Weights and Bias (v1)
# - Variables used by the network. They get their values when the
#   initializer op runs, so no data has to be fed for them.

# 1) One weight matrix and one bias vector, both initialized to zero
W = tf.Variable(initial_value=tf.zeros(shape=[area, final_nodes]))
B = tf.Variable(initial_value=tf.zeros(shape=[final_nodes]))

In [69]:
# Activation Functions
# - Requires more layers

In [70]:
# Regression Function (V1)
# - Weighted sum of the pixels plus bias, passed through softmax
scores = tf.matmul(XX, W) + B
Y = tf.nn.softmax(scores)

In [71]:
# Loss Function (V1-3)
# - Cross-entropy between the targets Y_ and the predictions Y, summed over
#   every sample and class in the batch
log_predictions = tf.log(Y)
cross_entropy = -tf.reduce_sum(Y_ * log_predictions)

In [72]:
# Optimizer (V1-3)
# - Gradient descent with the fixed rate `lr`; `minimize` builds the
#   back-propagation op that adjusts the weights and biases.
optimizer = tf.train.GradientDescentOptimizer(learning_rate=lr)
train_step = optimizer.minimize(loss=cross_entropy)

In [73]:
# Start Tensor Graph  (V1-5)
# - Open a session, then build and run the op that initializes all variables.
#   (tf.initialize_all_variables() is the deprecated spelling of that op.)
sess = tf.Session()

init = tf.global_variables_initializer()
sess.run(init)

In [74]:
# Training  (V1-4)
# - 1000 steps, each on a fresh batch of 100 training samples
for i in range(1000):
    # next batch of images with their correct answers
    batch_X, batch_Y = mnist.train.next_batch(100)

    # one optimizer step on that batch
    train_data = {X: batch_X, Y_: batch_Y}
    sess.run(train_step, feed_dict=train_data)

In [75]:
# Accuracy (V1-6)
# - Fraction of samples in the fed batch whose highest-scoring predicted
#   class equals the highest-scoring target class
predicted_class = tf.argmax(Y, 1)
target_class = tf.argmax(Y_, 1)
is_correct = tf.equal(predicted_class, target_class)
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

In [76]:
# Test  (V1-6)
# - Run accuracy and loss over the whole test set; only accuracy is printed
test_data = {
    X: mnist.test.images,
    Y_: mnist.test.labels,
}
a, c = sess.run([accuracy, cross_entropy], feed_dict=test_data)
print(a)

0.9201


## Version 2: Another layer and better starting points
* This section will add another layer of nodes in the middle (a hidden layer).
* New starting points for weights and biases

In [14]:
# Constants (v2)
# - Number of nodes in the new hidden layer
hidden_layer = 200

### Weights and Bias (v2)
* Adding Layers: The more layers, the more filters to pick up features in your data.
* Weights: truncated_normal() is used to provide various starting weights.
* Bias: `tf.ones()/10` now gives the hidden-layer biases a small positive starting value (0.1) instead of zero; the output-layer biases still start at zero.

In [None]:
# Weights and Bias (v2)
# - Weights: small random starting values (truncated normal, stddev 0.1)
# - Biases: 0.1 for the hidden layer, zero for the output layer

W1 = tf.Variable(tf.truncated_normal(shape=[area, hidden_layer], stddev=0.1))
W2 = tf.Variable(tf.truncated_normal(shape=[hidden_layer, final_nodes], stddev=0.1))

B1 = tf.Variable(tf.ones(shape=[hidden_layer]) / 10)
B2 = tf.Variable(tf.zeros(shape=[final_nodes]))

In [16]:
# Activation Function (V2)
# - An activation function is applied to a layer to decide how much of its
#   information is passed on.
# - Node products with small values are more likely to be ignored.

hidden_input = tf.matmul(XX, W1) + B1
Y = tf.nn.sigmoid(hidden_input)

In [None]:
# Regression Function V2
# - Output layer: the hidden-layer result held in Y goes through W2/B2
#   and softmax
output_scores = tf.matmul(Y, W2) + B2
Y = tf.nn.softmax(output_scores)

## Version 3
* This section will show how to add x amount of layers.
* **NOTE:** These new layers will cause a drop in accuracy. Why?

In [18]:
# Constants (v3)
# - Dynamic number of layers: the input size comes first, the output size
#   last, and every value in between is the size of one hidden layer
layers = [area, 200, 100, 60, 30, 10]

In [19]:
# Weights and Bias (v3-5)
# - One weight matrix and one bias vector for every pair of consecutive
#   entries in `layers`, built by looping over the list.
# - `name` is given to tf.Variable by keyword: the second positional
#   parameter of tf.Variable is `trainable`, so a positional string would
#   never become the variable's name.

WW = [
    tf.Variable(
        tf.truncated_normal([layers[i], layers[i+1]], stddev=0.1),
        name="Weight" + str(i)
    )
    for i in range(len(layers)-1)
]

BB = [
    tf.Variable(tf.ones([layers[i]])/10, name="Bias" + str(i))
    for i in range(1, len(layers))
]

In [18]:
# Activation Function (v3)
# - Loop over every layer except the last one: weighted sum, then sigmoid

Y = XX

i = 0  # keeps `i` defined even when the loop body never runs
for i in range(len(layers)-2):
    weighted_sum = tf.matmul(Y, WW[i], name="activate_" + str(i)) + BB[i]
    Y = tf.nn.sigmoid(weighted_sum)

In [None]:
# Regression Function (v3)
# - Formats the output into a form we can train against the target.
# - The output layer always uses the last weight/bias pair, so index from the
#   end of the lists rather than relying on a loop variable left behind by
#   another cell (which is wrong when that loop ran zero times or when `i`
#   has since been reused).

Y = tf.nn.softmax(tf.matmul(Y, WW[-1]) + BB[-1])

## Version 4: Function Swapping
*  Activation: Replaces `sigmoid` with `relu`
* Loss Function: Compute the loss from the pre-softmax results (logits) instead of the softmax output, which avoids `tf.log` being asked to compute 0
* Optimizer: Replace `GradientDescentOptimizer` with `AdamOptimizer`

In [20]:
# Activation Functions (v4)
# - Same loop as before, with relu in place of sigmoid

Y = XX
i = 0  # keeps `i` defined even when the loop body never runs
for i in range(len(layers)-2):
    weighted_sum = tf.matmul(Y, WW[i], name="activate_" + str(i)) + BB[i]
    Y = tf.nn.relu(weighted_sum)

In [21]:
# Regression Functions (v4)
# - Keep the pre-softmax values (logits) in their own tensor so the loss
#   function can use them directly.
# - The output layer always uses the last weight/bias pair, so index from the
#   end of the lists rather than relying on a loop variable left behind by
#   another cell.

Ylogits = tf.matmul(Y, WW[-1]) + BB[-1]
Y = tf.nn.softmax(Ylogits)

In [22]:
# Loss Function (V4)
# - Built from the logits (pre-softmax values) rather than the softmax output
# - Avoids the case where tf.log is asked to compute 0
# - Despite its name, `logits` below holds the value returned by the
#   cross-entropy op, not the network's logits.
logits = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Y_)
mean_loss = tf.reduce_mean(logits)
cross_entropy = mean_loss * 100

In [23]:
# Optimizer (V4)
# - TensorFlow ships many optimizers.
# - AdamOptimizer works well with large dimensional layers.

optimizer = tf.train.AdamOptimizer(learning_rate=lr)
train_step = optimizer.minimize(loss=cross_entropy)

## Version 5: Learning Rates
* Provides a dynamic learning rate that decreases over time.
* Without decay more recent data has a bigger impact.

In [24]:
# Constants (v5)
# - Parameters of the decaying learning rate used by the training loop

lrmax, lrmin = 0.003, 0.00001  # starting rate / value the rate decays toward
decay_speed = 2000.0           # divisor of the step index inside exp()

In [25]:
# Placeholders (V5)
# - Receives the current learning rate on every training step
L = tf.placeholder(dtype=tf.float32)

In [37]:
# Training (V5-6)
# - The learning rate shrinks exponentially from lrmax toward lrmin as the
#   step index grows.

for i in range(1000):
    decay = exp(-i / decay_speed)
    learning_rate = lrmin + (lrmax - lrmin) * decay

    batch_X, batch_Y = mnist.train.next_batch(100)
    train_data = {
        X: batch_X,
        Y_: batch_Y,
        L: learning_rate,
    }

    # one optimizer step on this batch
    sess.run(train_step, feed_dict=train_data)

## Version 6: Dropout
* Randomly shuts off nodes during training
* Prevents overfitting (otherwise the network could treat unrelated data as important)

In [None]:
# Constants (V6)
# - Value fed to the `pkeep` placeholder while training
keep_ratio = 0.9

In [31]:
# Placeholders (V6)
# - Keep probability handed to tf.nn.dropout
pkeep = tf.placeholder(dtype=tf.float32)

In [None]:
# Activation Functions (v6)
# - relu layers as before, each followed by dropout, which turns off some
#   nodes so the network is less likely to pick up false positives

Y = XX
i = 0  # keeps `i` defined even when the loop body never runs
for i in range(len(layers)-2):
    weighted_sum = tf.matmul(Y, WW[i], name="activate_" + str(i)) + BB[i]
    activated = tf.nn.relu(weighted_sum)
    Y = tf.nn.dropout(activated, pkeep)

In [None]:
# Training (V6)
# - Same decaying-rate loop, now also feeding the dropout keep probability

for i in range(1000):
    decay = exp(-i / decay_speed)
    learning_rate = lrmin + (lrmax - lrmin) * decay

    batch_X, batch_Y = mnist.train.next_batch(100)
    train_data = {X: batch_X, Y_: batch_Y, L: learning_rate, pkeep: keep_ratio}

    sess.run(train_step, feed_dict=train_data)

In [None]:
# Testing V6
# - Test with all nodes on (pkeep fed as 1.0).
test_data = {
    X: mnist.test.images,
    Y_: mnist.test.labels,
    pkeep: 1.0,
}
a, c = sess.run([accuracy, cross_entropy], feed_dict=test_data)
print(a)

In [50]:
# Final Constants

# Image Format
width, height = 28, 28
area = width * height

# Learning Rate Values
lrmax, lrmin = 0.003, 0.00001
decay_speed = 2000.0

# Layers (input size, hidden sizes, output size)
layers = [
    area,
    200,
    100,
    60,
    30,
    10,
]

# Dropout keep probability used while training
keep_ratio = 0.9

In [41]:
# Placeholders
# - Everything that is fed in at run time
X = tf.placeholder(tf.float32, shape=[None, 28, 28, 1])  # input images
Y_ = tf.placeholder(tf.float32, shape=[None, 10])        # target results
L = tf.placeholder(dtype=tf.float32)                     # learning rate
pkeep = tf.placeholder(dtype=tf.float32)                 # dropout keep probability

In [42]:
# Weights and Bias
# - One weight matrix and one bias vector for every pair of consecutive
#   entries in `layers`.
# - `name` is given to tf.Variable by keyword: the second positional
#   parameter of tf.Variable is `trainable`, so a positional string would
#   never become the variable's name.
WW = [
    tf.Variable(
        tf.truncated_normal([layers[i], layers[i+1]], stddev=0.1),
        name="Weight" + str(i)
    )
    for i in range(len(layers)-1)
]

BB = [
    tf.Variable(tf.ones([layers[i]])/10, name="Bias" + str(i))
    for i in range(1, len(layers))
]

In [43]:
# Model
# - Flatten every image into one row of `area` values; the -1 lets
#   TensorFlow work out the batch dimension.
flat_shape = [-1, area]
Y = tf.reshape(X, flat_shape)

In [44]:
# Activation Function
# - Every layer except the last one: weighted sum, relu, then dropout
i = 0  # keeps `i` defined even when the loop body never runs
for i in range(len(layers)-2):
    weighted_sum = tf.matmul(Y, WW[i], name="activate_" + str(i)) + BB[i]
    activated = tf.nn.relu(weighted_sum)
    Y = tf.nn.dropout(activated, pkeep)

In [45]:
# Regression Functions
# - Output layer. The pre-softmax values (logits) are kept separately so the
#   loss function can use them directly.
# - The output layer always uses the last weight/bias pair, so index from the
#   end of the lists rather than relying on a loop variable left behind by
#   another cell (the training loop below also assigns `i`).
Ylogits = tf.matmul(Y, WW[-1]) + BB[-1]
Y = tf.nn.softmax(Ylogits)

In [46]:
# Loss Function
# - Cross-entropy computed from the logits, averaged and scaled by 100.
# - Despite its name, `logits` below holds the value returned by the
#   cross-entropy op, not the network's logits.
logits = tf.nn.softmax_cross_entropy_with_logits(logits=Ylogits, labels=Y_)
mean_loss = tf.reduce_mean(logits)
cross_entropy = mean_loss * 100

In [47]:
# Optimizer
# - Adam driven by the `L` placeholder, so the decayed learning rate that the
#   training loop feeds on every step is the rate actually applied.
#   (`L` must therefore be present in feed_dict whenever train_step is run.)
optimizer = tf.train.AdamOptimizer(L)
train_step = optimizer.minimize(cross_entropy)

In [48]:
# Start Tensor Graph
# - Open a session, then build and run the op that initializes all variables
sess = tf.Session()

init = tf.global_variables_initializer()
sess.run(init)

In [51]:
# Training
# - 1000 steps on batches of 100 samples.
# - The learning rate decays exponentially from lrmax toward lrmin.
# - Dropout uses `keep_ratio` from the constants cell rather than a repeated
#   literal, so the value only has to be changed in one place.
for i in range(1000):
    batch_X, batch_Y = mnist.train.next_batch(100)
    learning_rate = lrmin + (lrmax - lrmin) * exp(-i / decay_speed)
    train_data = {
        X: batch_X,
        Y_: batch_Y,
        L: learning_rate,
        pkeep: keep_ratio
    }

    sess.run(train_step, feed_dict=train_data)

In [55]:
# Accuracy
# - Fraction of samples in the fed batch whose highest-scoring predicted
#   class equals the highest-scoring target class
predicted_class = tf.argmax(Y, 1)
target_class = tf.argmax(Y_, 1)
is_correct = tf.equal(predicted_class, target_class)
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

In [56]:
# Testing
# - Whole test set, with pkeep fed as 1.0; only the accuracy is printed
test_data = {
    X: mnist.test.images,
    Y_: mnist.test.labels,
    pkeep: 1.0,
}
a, c = sess.run([accuracy, cross_entropy], feed_dict=test_data)
print(a)

0.9645
