In [1]:
import tensorflow as tf
import numpy as np

In [2]:
# Build a minimal computation graph: two constant nodes feeding one add node.
# Nothing is evaluated here; these lines only describe the computation that
# a session will execute later.
a = tf.constant(3.0)
b = tf.constant(4.0)
c = tf.add(a, b)  # explicit form of the overloaded `a + b`

In [3]:
# A session is what actually executes nodes of the graph.
sess = tf.Session()
# Evaluate node `c`; as the last expression of the cell its value is displayed.
sess.run(c)

7.0

In a classical program, each statement is executed as soon as it is reached. In TensorFlow, however, the computation graph is built first; the whole graph is then loaded into memory and executed by a session.

In [4]:
# Placeholders are graph inputs whose values are supplied when the graph is run.
# Each one is given a data type here; a shape is not specified and the name is
# only a label for the node.
a = tf.placeholder(tf.float32, name='a')
b = tf.placeholder(tf.float32, name='b')
c = a + b

In [5]:
# Placeholders carry no value of their own: any run that depends on them must
# supply one through feed_dict, keyed by the placeholder tensor.
sess = tf.Session()
print(sess.run(c, feed_dict={a: 12.0, b: 13.0}))      # scalar inputs
print(sess.run(c, feed_dict={a: [1, 2], b: [3, 4]}))  # element-wise on vectors

25.0
[ 4.  6.]


In [6]:
# In machine learning, variables hold the parameters that training updates;
# the most frequently used ones are weights and biases.
# A variable is created from an initial value; here the data type is given
# explicitly as well, and no name is supplied.
W = tf.Variable([.3], dtype=tf.float32)
b = tf.Variable([-.3], dtype=tf.float32)

Let's try to fit a simple function
$2x_1 + 3 x_2 + 3 = y$

In [7]:
n_input = 2
n_output = 1
# Weights start from a random normal draw; the bias starts at -0.3.
W = tf.Variable(np.random.randn(n_input, n_output), dtype=tf.float32)
b = tf.Variable(-0.3, dtype=tf.float32)
# None leaves the number of samples (the columns of X) open until feed time.
X = tf.placeholder(tf.float32, shape=(n_input, None))
# X is laid out as (features, samples), so it is transposed inside the matmul.
z = tf.matmul(X, W, transpose_a=True) + b

In [8]:
# Variables hold no value until their initializer op has been run in a session.
init = tf.global_variables_initializer()
sess.run(init)

In [9]:
# Evaluate the model on three samples. X is declared as (n_input, samples),
# so each column of the fed array is one (x1, x2) pair.
sess.run(z, {X: np.array([[1.0, -1.0, 2.0],[3.0, 4.0, 0.0]])})

array([[ 4.42543316],
       [ 2.40192223],
       [ 2.78456187]], dtype=float32)

In [10]:
# A supervised model is scored by comparing its predictions with known targets.
# `y` receives the targets, one row per sample.
y = tf.placeholder(tf.float32, shape=(None, n_output), name='y')
# Sum of squared errors over all samples.
loss = tf.reduce_sum(tf.square(y - z))
sess.run(
    loss,
    feed_dict={
        X: np.array([[1.0, -1.0, 2.0], [3.0, 4.0, 0.0]]),
        y: [[14], [13], [7]],
    },
)

221.76149

In [11]:
# Sanity check: overwrite W and b with the exact coefficients of
# y = 2*x1 + 3*x2 + 3 and confirm that the loss on the same data becomes zero.
fixW = tf.assign(W, np.array([[2],[3]]))
fixb = tf.assign(b, 3)
# The assign ops only take effect once they are run in the session.
sess.run([fixW, fixb])
sess.run(loss, {X: np.array([[1.0, -1.0, 2.0],[3.0, 4.0, 0.0]]), y: [[14], [13], [7]]})

0.0

We have verified that the loss is 0 when the parameters are correct. Now we need a training operation that learns those parameters from the data.

In [12]:
# Adam optimizer with a base learning rate of 0.01.
# minimize() adds the gradient and update ops for `loss` to the graph and
# returns the op that performs one training step.
# (The variable was previously misspelled `optimizier`; no other cell used it.)
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
train_op = optimizer.minimize(loss)

In [13]:
# Re-run the initializers so training does not start from the hand-assigned
# solution, then take 2000 optimizer steps on the three training samples.
sess.run(tf.global_variables_initializer())
train_feed = {
    X: np.array([[1.0, -1.0, 2.0], [3.0, 4.0, 0.0]]),
    y: [[14], [13], [7]],
}
for step in range(2000):
    sess.run(train_op, feed_dict=train_feed)

In [14]:
# Inspect the learned parameters; the target function uses W = [[2], [3]], b = 3.
print(sess.run([W, b]))

[array([[ 2.27479506],
       [ 3.21417928]], dtype=float32), 2.2863684]


The following cell is a complete, self-contained example: it trains a single weight and a bias on a small amount of data.

In [15]:
# Problem size and number of training steps
n_input = 1     # input dimension
n_output = 1    # output dimension
epochs = 1000

# Training data sampled from the target function y = 2x + 1 at x = 1..10
X_feed = np.linspace(1, 10, 10).reshape(1, -1)
y_feed = 2 * X_feed + 1

# Graph inputs, laid out as (dimension, samples)
X = tf.placeholder(tf.float32, shape=(n_input, None), name='X')
y = tf.placeholder(tf.float32, shape=(n_output, None), name='y')

# Trainable parameters of the linear model
w = tf.Variable([-0.3], dtype=tf.float32, name='w')
b = tf.Variable([0.3], dtype=tf.float32, name='b')

# Model prediction
output = w * X + b

# Sum of squared errors between prediction and target
loss = tf.reduce_sum(tf.square(output - y))

# Plain gradient descent with learning rate 0.001
optimizer = tf.train.GradientDescentOptimizer(0.001)
train_op = optimizer.minimize(loss)

# The same data is fed on every step, so the feed dict is built once.
linear_feed = {X: X_feed, y: y_feed}

# The context manager closes the session when the block exits.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for step in range(epochs):
        sess.run(train_op, feed_dict=linear_feed)

    print(sess.run([w, b]))
    print(sess.run(loss, feed_dict=linear_feed))

[array([ 2.00077343], dtype=float32), array([ 0.99461448], dtype=float32)]
6.21487e-05


### We will now look into the classical MNIST classification example

In [16]:
from tensorflow.examples.tutorials.mnist import input_data

In [17]:
# Read the MNIST data set from the 'MNIST_data' directory, with labels
# returned as one-hot vectors.
mnist=input_data.read_data_sets('MNIST_data', one_hot=True)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


Previously, we ran TensorFlow by creating the graph, initializing all variables, and then running the graph in a session. Here we will try the interactive approach using tf.InteractiveSession().

Build a softmax regression model.

In [18]:
# An InteractiveSession registers itself as the default session, which is what
# lets a later cell call .eval() on a tensor without passing a session.
sess = tf.InteractiveSession()

In [19]:
# We will first create placeholder for input images and output class labels
X = tf.placeholder(dtype=tf.float32, shape=[None, 784])
y_ = tf.placeholder(dtype=tf.int32,  shape=[None, 10])

In [20]:
# Define variables that live in the tensorflow graph and get updated during training
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([1,10]))

In [21]:
# Generate prediction
y = tf.matmul(X, W) + b

In [22]:
# Calculate loss function for training
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))

Here we encounter two commonly used loss functions: tf.nn.softmax_cross_entropy_with_logits and tf.nn.sparse_softmax_cross_entropy_with_logits.

The function takes the output of the fully connected output layer (the logits, which may be positive or negative) and converts it into probabilities with the softmax function: each output is exponentiated and the results are normalized so that they sum to one. Finally, the cross-entropy between these probabilities and the labels is calculated.

In [23]:
# Specify the training parameters
train_step = tf.train.GradientDescentOptimizer(0.001).minimize(cross_entropy)

In [24]:
# Now we could start training the model
epochs = 1000
# Assgine initial values to variable
sess.run(tf.global_variables_initializer())
for _ in range(epochs):
    # Obtain the data
    batch = mnist.train.next_batch(100)
    sess.run(train_step, {X:batch[0], y_:batch[1]})

In [25]:
# Evaluate the model
correct_prediction = tf.equal(tf.arg_max(y, 1), tf.arg_max(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [26]:
# Accuracy over the whole test set. .eval() is called without a session
# argument, so it relies on a default session being registered.
print(accuracy.eval(feed_dict={X: mnist.test.images, y_: mnist.test.labels}))

0.7923


### Improve the MNIST classification accuracy with a small convolutional network

The convnet we are aiming for has the following architecture:

Image (m, 784) -> Reshape (m, 28, 28, 1) -> Conv-Relu1 -> Maxpool1 -> Conv-Relu2 -> Maxpool2 -> FC1 -> Dropout -> FC2 -> Softmax

We now have more variables to train. To avoid typing the initialization values every time, we will build helper functions that create and initialize the variables.

In [27]:
# Fresh placeholders for the convolutional model: flattened images (784 values
# each) and 10-way labels, with the batch dimension left open.
X = tf.placeholder(dtype=tf.float32, shape=[None, 784])
y_ = tf.placeholder(dtype=tf.float32, shape=[None, 10])

In [28]:
def weight_variable(shape):
    """Create a weight variable of the given shape.

    Initial values are drawn from a truncated normal distribution with a
    standard deviation of 0.1.
    """
    return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
    """Create a bias variable of the given shape with every entry set to 0.1."""
    return tf.Variable(tf.constant(0.1, shape=shape))

We will also use function to build the convolution and pooling layer

In [29]:
def conv2d(x, W):
    """Convolve `x` with the filters `W` using stride 1 and 'SAME' padding."""
    unit_strides = [1, 1, 1, 1]
    return tf.nn.conv2d(x, W, strides=unit_strides, padding='SAME')

def max_pool_2x2(x):
    """Max-pool `x` over 2x2 windows with stride 2 and 'SAME' padding."""
    window = [1, 2, 2, 1]
    return tf.nn.max_pool(x, ksize=window, strides=window, padding='SAME')

In [30]:
# First conv layer: 32 kernels of size 5 x 5 applied to the single input channel
W_conv1 = weight_variable([5,5,1,32])   # Filter shape is [filter_height, filter_width, input_channels, output_channels]
b_conv1 = bias_variable([32])

In [31]:
# Reshape the image to prepare for convolution
x_image = tf.reshape(X, [-1, 28, 28, 1])

Original Input shape is : 28 x 28 x 1

In [32]:
# Implement the first convolution layer, followed by 2x2 max pooling
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)   # each feature map has one bias, shared across all positions
h_pool1 = max_pool_2x2(h_conv1)

Conv1 output shape: 28 x 28 x 32

Pool1 output shape: (28 - 2)/2 + 1 = 14 x 14 x 32

In [33]:
# Second conv layer: 64 kernels of size 5 x 5 applied to 32 input channels
W_conv2 = weight_variable([5,5,32,64])   # Filter shape is [filter_height, filter_width, input_channels, output_channels]
b_conv2 = bias_variable([64])

In [34]:
# Implement the second convolution layer
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)

Conv2 output shape: 14 x 14 x 64

Pool2 output shape: (14 - 2)/2 + 1 = 7 x 7 x 64

In [35]:
# Implement the densely connected layer
W_fc1 = weight_variable([7 * 7 * 64, 1024])
b_fc2 = bias_variable([1024])

h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1)+b_fc2)

In [36]:
# Apply dropout before readout layer
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob=keep_prob)

In [37]:
# Create the readout layer
W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])

In [38]:
# Class scores (logits) of the convolutional model, computed from the
# dropout output of the dense layer
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2

Now it's time to train the model. There are several improvements mentioned by the tutorial that we need to realize:
1. Use AdamOptimizer
2. Pay attention to including the dropout probability
3. Add logging to every 100th iteration to prepare for visualization

In [39]:
# Training loss: mean softmax cross-entropy between the labels and the
# conv-net logits
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))

In [40]:
# Training op: Adam optimizer with learning rate `lr`, minimizing the
# cross-entropy
lr = 0.001
train_op = tf.train.AdamOptimizer(lr).minimize(cross_entropy)

In [47]:
# Define the accuracy so that performance can be evaluated: a prediction is
# correct when the highest-scoring class matches the class marked in the label.
# tf.argmax replaces the deprecated alias tf.arg_max.
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
# Casting the booleans to floats makes their mean the fraction that is correct.
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

In [48]:
epochs = 10000          # number of mini-batch training steps
batch_size = 128
log_every = 1000        # report the training accuracy every `log_every` steps
train_keep_prob = 0.5   # keep probability used for dropout while training

# The context manager closes the session when training finishes or is interrupted.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    # A named loop variable is used because its value is read below.
    for step in range(epochs):
        X_batch, y_batch = mnist.train.next_batch(batch_size)
        if step % log_every == 0:
            # Measure accuracy with dropout disabled (keep_prob = 1.0) and
            # print it; previously the value was computed and then discarded.
            train_accuracy = sess.run(
                accuracy, feed_dict={X: X_batch, y_: y_batch, keep_prob: 1.0})
            print('step %d, training accuracy %g' % (step, train_accuracy))

        # Train with dropout active. Feeding keep_prob = 1.0 here, as before,
        # makes the dropout layer a no-op.
        sess.run(train_op,
                 feed_dict={X: X_batch, y_: y_batch, keep_prob: train_keep_prob})

KeyboardInterrupt: 