
# Deep Learning
## Assignment 3

Previously in 2_fullyconnected.ipynb, you trained a logistic regression and a neural network model.

The goal of this assignment is to explore regularization techniques.


In [1]:
# These are all the modules we'll be using later. Make sure you can import them
# before proceeding further.
import numpy as np
import tensorflow as tf
from six.moves import cPickle as pickle

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


First reload the data we generated in notmist.ipynb.

In [2]:
pickle_file = 'notMNIST.pickle'

with open(pickle_file, 'rb') as f:
  save = pickle.load(f)
  train_dataset = save['train_dataset']
  train_labels = save['train_labels']
  valid_dataset = save['valid_dataset']
  valid_labels = save['valid_labels']
  test_dataset = save['test_dataset']
  test_labels = save['test_labels']
  del save  # hint to help gc free up memory
  print('Training set', train_dataset.shape, train_labels.shape)
  print('Validation set', valid_dataset.shape, valid_labels.shape)
  print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 28, 28) (200000,)
Validation set (10000, 28, 28) (10000,)
Test set (10000, 28, 28) (10000,)


Reformat into a shape that's more adapted to the models we're going to train:

* data as a flat matrix,
* labels as float 1-hot encodings.

In [3]:
image_size = 28
num_labels = 10

def reformat(dataset, labels):
  dataset = dataset.reshape((-1, image_size * image_size)).astype(np.float32)
  # Map 2 to [0.0, 1.0, 0.0 ...], 3 to [0.0, 0.0, 1.0 ...]
  labels = (np.arange(num_labels) == labels[:,None]).astype(np.float32)
  return dataset, labels
train_dataset, train_labels = reformat(train_dataset, train_labels)
valid_dataset, valid_labels = reformat(valid_dataset, valid_labels)
test_dataset, test_labels = reformat(test_dataset, test_labels)
print('Training set', train_dataset.shape, train_labels.shape)
print('Validation set', valid_dataset.shape, valid_labels.shape)
print('Test set', test_dataset.shape, test_labels.shape)

Training set (200000, 784) (200000, 10)
Validation set (10000, 784) (10000, 10)
Test set (10000, 784) (10000, 10)


In [4]:
def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

### Problem 1

Introduce and tune L2 regularization for both logistic and neural network models. Remember that L2 amounts to adding a penalty on the norm of the weights to the loss. In TensorFlow, you can compue the L2 loss for a tensor t using nn.l2_loss(t). The right amount of regularization should improve your validation / test accuracy.


###  logistic regression with l2 loss function

regularizing with beta = 0.01


In [25]:
# multinomial logistic regression 
train_subset = 10000
beta = 0.008

graph = tf.Graph()
with graph.as_default():

    # Input data.
    tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
    tf_train_labels = tf.constant(train_labels[:train_subset])
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
  
    # Variables    
    weights = tf.Variable(tf.truncated_normal([image_size * image_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))
  
    # Training computation.
    logits = tf.nn.softmax(tf.matmul(tf_train_dataset, weights) + biases)
    
    # loss function using l2
    loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels) )
    loss = tf.reduce_mean(loss + beta * tf.nn.l2_loss(weights) )
    
    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax( tf.matmul(tf_valid_dataset, weights) + biases )
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

In [26]:
num_steps = 801

def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print('Initialized')
    for step in range(num_steps):
        
        _, l, predictions = session.run([optimizer, loss, train_prediction])

        if (step % 100 == 0):
            print('Loss at step %d: %f' % (step, l))
            print('Training accuracy: %.1f%%' % accuracy(
            predictions, train_labels[:train_subset, :]))
            print('Validation accuracy: %.1f%%' % accuracy( valid_prediction.eval(), valid_labels) )

    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

Initialized
Loss at step 0: 26.480192
Training accuracy: 8.8%
Validation accuracy: 9.5%
Loss at step 100: 13.035336
Training accuracy: 23.8%
Validation accuracy: 23.9%
Loss at step 200: 6.849258
Training accuracy: 46.6%
Validation accuracy: 47.0%
Loss at step 300: 4.012233
Training accuracy: 65.0%
Validation accuracy: 64.9%
Loss at step 400: 2.774629
Training accuracy: 70.8%
Validation accuracy: 70.6%
Loss at step 500: 2.232380
Training accuracy: 72.7%
Validation accuracy: 72.2%
Loss at step 600: 1.947636
Training accuracy: 80.2%
Validation accuracy: 79.5%
Loss at step 700: 1.839847
Training accuracy: 81.0%
Validation accuracy: 80.3%
Loss at step 800: 1.793015
Training accuracy: 81.2%
Validation accuracy: 80.6%
Test accuracy: 87.6%


### neural network with l2 loss function

In [14]:
batch_size = 128
beta = 0.01

graph = tf.Graph()
with graph.as_default():

    # Input data.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
  
    # Variables.
    weights = tf.Variable( tf.truncated_normal([image_size * image_size, num_labels]) )
    biases = tf.Variable(tf.zeros([num_labels]))
  
    # Training computation.
    logits = tf.matmul(tf_train_dataset, weights) + biases
    
    loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits = logits,labels = tf_train_labels) )
    loss = tf.reduce_mean( loss + beta * tf.nn.l2_loss(weights) )
    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax( tf.matmul(tf_valid_dataset, weights) + biases)
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

In [29]:
batch_size = 128
n_input = image_size * image_size
n_hidden_1 = 1024
n_classes = 10
beta = 0.005


graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, n_input))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, n_classes))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  weights = {
      'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
      'out': tf.Variable(tf.random_normal([n_hidden_1, n_classes]))
  }
  biases = {
      'b1': tf.Variable(tf.random_normal([n_hidden_1])),
      'out': tf.Variable(tf.random_normal([n_classes]))
  }

    
  # Training computation.
  layer_1 = tf.add(tf.matmul(tf_train_dataset, weights['h1']), biases['b1'])
  tf_layer = tf.nn.relu(layer_1)
  logits = tf.matmul(tf_layer, weights['out']) + biases['out']
  
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
  loss = tf.reduce_mean( loss + beta * tf.nn.l2_loss(weights['out']) )
  
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
    
  valid_relu = tf.nn.relu(tf.add(tf.matmul(tf_valid_dataset, weights['h1']), biases['b1']))
  valid_prediction = tf.nn.softmax(tf.add(tf.matmul(valid_relu, weights['out']), biases['out']))

  test_relu = tf.nn.relu(tf.add(tf.matmul(tf_test_dataset, weights['h1']), biases['b1']))
  test_prediction = tf.nn.softmax(tf.matmul(test_relu, weights['out']) +  biases['out'])

In [31]:

num_steps = 5001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
          [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 521.647644
Minibatch accuracy: 3.9%
Validation accuracy: 25.7%
Minibatch loss at step 500: 51.746742
Minibatch accuracy: 64.8%
Validation accuracy: 76.9%
Minibatch loss at step 1000: 11.192792
Minibatch accuracy: 78.1%
Validation accuracy: 78.4%
Minibatch loss at step 1500: 3.274352
Minibatch accuracy: 89.1%
Validation accuracy: 79.8%
Minibatch loss at step 2000: 5.070520
Minibatch accuracy: 80.5%
Validation accuracy: 80.8%
Minibatch loss at step 2500: 6.661446
Minibatch accuracy: 78.1%
Validation accuracy: 80.2%
Minibatch loss at step 3000: 3.849973
Minibatch accuracy: 80.5%
Validation accuracy: 80.3%
Minibatch loss at step 3500: 5.995305
Minibatch accuracy: 76.6%
Validation accuracy: 77.4%
Minibatch loss at step 4000: 4.879106
Minibatch accuracy: 80.5%
Validation accuracy: 79.3%
Minibatch loss at step 4500: 4.956207
Minibatch accuracy: 82.0%
Validation accuracy: 80.5%
Minibatch loss at step 5000: 1.945428
Minibatch accuracy: 85.9%
Validation accu

### Problem 2

Let's demonstrate an extreme case of overfitting. Restrict your training data to just a few batches. What happens?

### log reg with l2 loss and small training data

In [32]:
# multinomial logistic regression 
train_subset = 100
beta = 0.01

graph = tf.Graph()
with graph.as_default():

    # Input data.
    tf_train_dataset = tf.constant(train_dataset[:train_subset, :])
    tf_train_labels = tf.constant(train_labels[:train_subset])
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
  
    # Variables    
    weights = tf.Variable(
    tf.truncated_normal([image_size * image_size, num_labels]))
    biases = tf.Variable(tf.zeros([num_labels]))
  
    # Training computation.
    logits = tf.matmul(tf_train_dataset, weights) + biases
    
    # loss function using l2
    loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits = logits, labels = tf_train_labels) )
    loss = tf.reduce_mean(loss + beta * tf.nn.l2_loss(weights) )
    
    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_prediction = tf.nn.softmax( tf.matmul(tf_valid_dataset, weights) + biases )
    test_prediction = tf.nn.softmax(tf.matmul(tf_test_dataset, weights) + biases)

In [36]:
num_steps = 801

def accuracy(predictions, labels):
  return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
          / predictions.shape[0])
init = tf.global_variables_initializer()
with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    for step in range(num_steps):
        
        _, l, predictions = session.run([optimizer, loss, train_prediction])

        if (step % 100 == 0):
            print('Loss at step %d: %f' % (step, l))
            print('Training accuracy: %.1f%%' % accuracy(
            predictions, train_labels[:train_subset, :]))
            print('Validation accuracy: %.1f%%' % accuracy( valid_prediction.eval(), valid_labels) )

    print('Test accuracy: %.1f%%' % accuracy(test_prediction.eval(), test_labels))

Initialized
Loss at step 0: 46.285088
Training accuracy: 9.0%
Validation accuracy: 14.2%
Loss at step 100: 10.687040
Training accuracy: 100.0%
Validation accuracy: 49.0%
Loss at step 200: 3.993535
Training accuracy: 100.0%
Validation accuracy: 56.1%
Loss at step 300: 1.549081
Training accuracy: 100.0%
Validation accuracy: 62.5%
Loss at step 400: 0.658004
Training accuracy: 100.0%
Validation accuracy: 65.6%
Loss at step 500: 0.333275
Training accuracy: 100.0%
Validation accuracy: 66.5%
Loss at step 600: 0.214715
Training accuracy: 100.0%
Validation accuracy: 66.9%
Loss at step 700: 0.171290
Training accuracy: 100.0%
Validation accuracy: 66.9%
Loss at step 800: 0.155299
Training accuracy: 100.0%
Validation accuracy: 67.0%
Test accuracy: 74.0%


Training accuracy goes to 100% due to the small sample size, however, the model is not as good the validation samples.


### neural network wiht l2 loss and small training data

In [37]:
batch_size = 128
n_input = image_size * image_size
n_hidden_1 = 1024
n_classes = 10
beta = 0.005


graph = tf.Graph()
with graph.as_default():

  # Input data. For the training data, we use a placeholder that will be fed
  # at run time with a training minibatch.
  tf_train_dataset = tf.placeholder(tf.float32,
                                    shape=(batch_size, n_input))
  tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, n_classes))
  tf_valid_dataset = tf.constant(valid_dataset)
  tf_test_dataset = tf.constant(test_dataset)
  
  weights = {
      'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
      'out': tf.Variable(tf.random_normal([n_hidden_1, n_classes]))
  }
  biases = {
      'b1': tf.Variable(tf.random_normal([n_hidden_1])),
      'out': tf.Variable(tf.random_normal([n_classes]))
  }

    
  # Training computation.
  layer_1 = tf.add(tf.matmul(tf_train_dataset, weights['h1']), biases['b1'])
  tf_layer = tf.nn.relu(layer_1)
  logits = tf.matmul(tf_layer, weights['out']) + biases['out']
  
  loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=tf_train_labels, logits=logits))
  loss = tf.reduce_mean( loss + beta * tf.nn.l2_loss(weights['out']) )
  
  # Optimizer.
  optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
  
  # Predictions for the training, validation, and test data.
  train_prediction = tf.nn.softmax(logits)
    
  valid_relu = tf.nn.relu(tf.add(tf.matmul(tf_valid_dataset, weights['h1']), biases['b1']))
  valid_prediction = tf.nn.softmax(tf.add(tf.matmul(valid_relu, weights['out']), biases['out']))

  test_relu = tf.nn.relu(tf.add(tf.matmul(tf_test_dataset, weights['h1']), biases['b1']))
  test_prediction = tf.nn.softmax(tf.matmul(test_relu, weights['out']) +  biases['out'])

In [38]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
          [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 487.220001
Minibatch accuracy: 9.4%
Validation accuracy: 34.1%
Minibatch loss at step 500: 29.892189
Minibatch accuracy: 73.4%
Validation accuracy: 79.2%
Minibatch loss at step 1000: 8.690788
Minibatch accuracy: 80.5%
Validation accuracy: 78.8%
Minibatch loss at step 1500: 6.251252
Minibatch accuracy: 84.4%
Validation accuracy: 79.8%
Minibatch loss at step 2000: 5.135604
Minibatch accuracy: 82.0%
Validation accuracy: 77.8%
Minibatch loss at step 2500: 7.449934
Minibatch accuracy: 77.3%
Validation accuracy: 79.7%
Minibatch loss at step 3000: 3.698263
Minibatch accuracy: 82.8%
Validation accuracy: 80.4%
Test accuracy: 87.6%


In [39]:
num_steps = 3001

train_dataset_2 = train_dataset[:500, :]
train_labels_2 = train_labels[:500]

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels_2.shape[0] - batch_size)
        
        batch_data = train_dataset_2[offset:(offset + batch_size), :]
        batch_labels = train_labels_2[offset:(offset + batch_size), :]
        
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
          [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 542.964111
Minibatch accuracy: 7.0%
Validation accuracy: 32.1%
Minibatch loss at step 500: 2.294342
Minibatch accuracy: 100.0%
Validation accuracy: 74.9%
Minibatch loss at step 1000: 0.188208
Minibatch accuracy: 100.0%
Validation accuracy: 75.1%
Minibatch loss at step 1500: 0.017448
Minibatch accuracy: 100.0%
Validation accuracy: 75.7%
Minibatch loss at step 2000: 0.005176
Minibatch accuracy: 100.0%
Validation accuracy: 76.8%
Minibatch loss at step 2500: 0.004388
Minibatch accuracy: 100.0%
Validation accuracy: 76.7%
Minibatch loss at step 3000: 0.004237
Minibatch accuracy: 100.0%
Validation accuracy: 76.6%
Test accuracy: 84.1%


We see overfitting occuring on neural network model when the training set is 3 times smaller.

## Problem 3

Introduce Dropout on the hidden layer of the neural network. Remember: Dropout should only be introduced during training, not evaluation, otherwise your evaluation results would be stochastic as well. TensorFlow provides nn.dropout() for that, but you have to make sure it's only inserted during training.

What happens to our extreme overfitting case?

In [40]:


batch_size = 128
beta = 0.001

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # new hidden layer
    hidden_nodes = 1024
    hidden_weights = tf.Variable( tf.truncated_normal([image_size * image_size, hidden_nodes]) )
    hidden_biases = tf.Variable( tf.zeros([hidden_nodes]))
    hidden_layer = tf.nn.relu( tf.matmul( tf_train_dataset, hidden_weights) + hidden_biases)
    
    # add dropout on hidden layer
    keep_prob = tf.placeholder("float")
    hidden_layer_drop = tf.nn.dropout(hidden_layer, keep_prob)

    # Variables.
    weights = tf.Variable( tf.truncated_normal([hidden_nodes, num_labels])) 
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = tf.matmul(hidden_layer_drop, weights) + biases
    loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels) )
    loss = tf.reduce_mean( loss + beta * tf.nn.l2_loss(weights) )

    # Optimizer.
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_relu = tf.nn.relu(  tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)
    valid_prediction = tf.nn.softmax( tf.matmul(valid_relu, weights) + biases) 

    test_relu = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights) + hidden_biases)
    test_prediction = tf.nn.softmax(tf.matmul(test_relu, weights) + biases)



In [41]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]

        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : 0.5}
        _, l, predictions = session.run( [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
          print("Minibatch loss at step %d: %f" % (step, l))
          print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
          print("Validation accuracy: %.1f%%" % accuracy( valid_prediction.eval(), valid_labels) )
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 395.268829
Minibatch accuracy: 14.8%
Validation accuracy: 31.1%
Minibatch loss at step 500: 16.951561
Minibatch accuracy: 69.5%
Validation accuracy: 79.9%
Minibatch loss at step 1000: 17.043293
Minibatch accuracy: 65.6%
Validation accuracy: 78.7%
Minibatch loss at step 1500: 7.137679
Minibatch accuracy: 75.8%
Validation accuracy: 79.3%
Minibatch loss at step 2000: 11.012438
Minibatch accuracy: 75.0%
Validation accuracy: 79.8%
Minibatch loss at step 2500: 9.976499
Minibatch accuracy: 70.3%
Validation accuracy: 79.3%
Minibatch loss at step 3000: 11.211316
Minibatch accuracy: 75.8%
Validation accuracy: 79.7%
Test accuracy: 86.9%


In [44]:
num_steps = 3001

train_dataset_2 = train_dataset[:500, :]
train_labels_2 = train_labels[:500]

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("Initialized")
    for step in range(num_steps):
        offset = (step * batch_size) % (train_labels_2.shape[0] - batch_size)
        
        batch_data = train_dataset_2[offset:(offset + batch_size), :]
        batch_labels = train_labels_2[offset:(offset + batch_size), :]
        
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : 0.4}
        _, l, predictions = session.run(
          [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
        valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 519.101990
Minibatch accuracy: 9.4%
Validation accuracy: 20.3%
Minibatch loss at step 500: 4.203056
Minibatch accuracy: 99.2%
Validation accuracy: 77.9%
Minibatch loss at step 1000: 2.073818
Minibatch accuracy: 100.0%
Validation accuracy: 77.8%
Minibatch loss at step 1500: 1.335716
Minibatch accuracy: 100.0%
Validation accuracy: 77.9%
Minibatch loss at step 2000: 0.885346
Minibatch accuracy: 100.0%
Validation accuracy: 78.0%
Minibatch loss at step 2500: 0.640769
Minibatch accuracy: 99.2%
Validation accuracy: 78.3%
Minibatch loss at step 3000: 0.450377
Minibatch accuracy: 100.0%
Validation accuracy: 78.0%
Test accuracy: 85.3%



### Problem 4

Try to get the best performance you can using a multi-layer model! The best reported test accuracy using a deep network is 97.1%.

One avenue you can explore is to add multiple layers.

Another one is to use learning rate decay:

    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, step, ...)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)




### trying smaller beta and bigger num steps


In [46]:


batch_size = 128
beta = 0.0001

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # new hidden layer
    hidden_nodes = 1024
    hidden_weights = tf.Variable( tf.truncated_normal([image_size * image_size, hidden_nodes]) )
    hidden_biases = tf.Variable( tf.zeros([hidden_nodes]))
    hidden_layer = tf.nn.relu( tf.matmul( tf_train_dataset, hidden_weights) + hidden_biases)
    
    # add dropout on hidden layer
    keep_prob = tf.placeholder("float")
    hidden_layer_drop = tf.nn.dropout(hidden_layer, keep_prob)

    # Variables.
    weights = tf.Variable( tf.truncated_normal([hidden_nodes, num_labels])) 
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = tf.matmul(hidden_layer_drop, weights) + biases
    loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=logits,labels=tf_train_labels) )
    loss = tf.reduce_mean( loss + beta * tf.nn.l2_loss(weights) )

    # Optimizer.
    global_step = tf.Variable(0, trainable=False)  # count the number of steps taken.
    starter_learning_rate = 0.5
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,100000, 0.96, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_relu = tf.nn.relu(  tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)
    valid_prediction = tf.nn.softmax( tf.matmul(valid_relu, weights) + biases) 

    test_relu = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights) + hidden_biases)
    test_prediction = tf.nn.softmax(tf.matmul(test_relu, weights) + biases)



In [47]:
num_steps = 6001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]

        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : 0.5}
        _, l, predictions = session.run( [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
          print("Minibatch loss at step %d: %f" % (step, l))
          print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
          print("Validation accuracy: %.1f%%" % accuracy( valid_prediction.eval(), valid_labels) )
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 450.186035
Minibatch accuracy: 12.5%
Validation accuracy: 27.9%
Minibatch loss at step 500: 18.637424
Minibatch accuracy: 73.4%
Validation accuracy: 79.1%
Minibatch loss at step 1000: 14.024601
Minibatch accuracy: 68.8%
Validation accuracy: 78.7%
Minibatch loss at step 1500: 7.348411
Minibatch accuracy: 78.1%
Validation accuracy: 78.1%
Minibatch loss at step 2000: 10.609426
Minibatch accuracy: 70.3%
Validation accuracy: 80.0%
Minibatch loss at step 2500: 10.423684
Minibatch accuracy: 67.2%
Validation accuracy: 79.7%
Minibatch loss at step 3000: 3.972826
Minibatch accuracy: 75.0%
Validation accuracy: 79.5%
Minibatch loss at step 3500: 13.815743
Minibatch accuracy: 63.3%
Validation accuracy: 80.4%
Minibatch loss at step 4000: 6.062233
Minibatch accuracy: 76.6%
Validation accuracy: 80.0%
Minibatch loss at step 4500: 5.052235
Minibatch accuracy: 76.6%
Validation accuracy: 79.7%
Minibatch loss at step 5000: 0.900898
Minibatch accuracy: 83.6%
Validation 

### trying keep prob of 0.1

In [48]:
batch_size = 128
beta = 0.0001

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # new hidden layer
    hidden_nodes = 1024
    hidden_weights = tf.Variable( tf.truncated_normal([image_size * image_size, hidden_nodes]) )
    hidden_biases = tf.Variable( tf.zeros([hidden_nodes]))
    hidden_layer = tf.nn.relu( tf.matmul( tf_train_dataset, hidden_weights) + hidden_biases)
    
    # add dropout on hidden layer
    keep_prob = tf.placeholder("float")
    hidden_layer_drop = tf.nn.dropout(hidden_layer, keep_prob)

    # Variables.
    weights = tf.Variable( tf.truncated_normal([hidden_nodes, num_labels])) 
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = tf.matmul(hidden_layer_drop, weights) + biases
    loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=logits,labels=tf_train_labels) )
    loss = tf.reduce_mean( loss + beta * tf.nn.l2_loss(weights) )

    # Optimizer.
    global_step = tf.Variable(0, trainable=False)  # count the number of steps taken.
    starter_learning_rate = 0.5
    learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,100000, 0.96, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_relu = tf.nn.relu(  tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)
    valid_prediction = tf.nn.softmax( tf.matmul(valid_relu, weights) + biases) 

    test_relu = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights) + hidden_biases)
    test_prediction = tf.nn.softmax(tf.matmul(test_relu, weights) + biases)

In [49]:
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]

        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : 0.1}
        _, l, predictions = session.run( [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
          print("Minibatch loss at step %d: %f" % (step, l))
          print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
          print("Validation accuracy: %.1f%%" % accuracy( valid_prediction.eval(), valid_labels) )
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

Initialized
Minibatch loss at step 0: 1105.149170
Minibatch accuracy: 11.7%
Validation accuracy: 25.5%
Minibatch loss at step 500: 163221.000000
Minibatch accuracy: 49.2%
Validation accuracy: 73.0%
Minibatch loss at step 1000: 9635577.000000
Minibatch accuracy: 41.4%
Validation accuracy: 63.4%
Minibatch loss at step 1500: 504115456.000000
Minibatch accuracy: 44.5%
Validation accuracy: 70.1%
Minibatch loss at step 2000: 26336778240.000000
Minibatch accuracy: 43.0%
Validation accuracy: 67.9%
Minibatch loss at step 2500: 1150298619904.000000
Minibatch accuracy: 50.0%
Validation accuracy: 71.2%
Minibatch loss at step 3000: 38355446267904.000000
Minibatch accuracy: 46.9%
Validation accuracy: 69.9%
Test accuracy: 76.1%


### trying with learning rate and multiple keep probs

In [51]:
batch_size = 128
beta = 0.001

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # new hidden layer
    hidden_nodes = 1024
    hidden_weights = tf.Variable( tf.truncated_normal([image_size * image_size, hidden_nodes]) )
    hidden_biases = tf.Variable( tf.zeros([hidden_nodes]))
    hidden_layer = tf.nn.relu( tf.matmul( tf_train_dataset, hidden_weights) + hidden_biases)
    
    # add dropout on hidden layer
    keep_prob = tf.placeholder("float")
    hidden_layer_drop = tf.nn.dropout(hidden_layer, keep_prob)

    # Variables.
    weights = tf.Variable( tf.truncated_normal([hidden_nodes, num_labels])) 
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = tf.matmul(hidden_layer_drop, weights) + biases
    loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels) )
    loss = tf.reduce_mean( loss + beta * tf.nn.l2_loss(weights) )

    # Optimizer.
    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, 100000, 0.95, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss, global_step=global_step)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_relu = tf.nn.relu(  tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)
    valid_prediction = tf.nn.softmax( tf.matmul(valid_relu, weights) + biases) 

    test_relu = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights) + hidden_biases)
    test_prediction = tf.nn.softmax(tf.matmul(test_relu, weights) + biases)

In [53]:
num_steps = 3001

for kp in [0.5, 0.6, 0.7, 0.9, 0.9, 1.0]:
    print( ">>with keep prob of "+ str(kp))
    with tf.Session(graph=graph) as session:
        tf.initialize_all_variables().run()
        print("Initialized")
        for step in range(num_steps):
            # Pick an offset within the training data, which has been randomized.
            # Note: we could use better randomization across epochs.
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            # Generate a minibatch.
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]

            feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : kp}
            _, l, predictions = session.run( [optimizer, loss, train_prediction], feed_dict=feed_dict)
            if (step % 500 == 0):
              print("Minibatch loss at step %d: %f" % (step, l))
              print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
              print("Validation accuracy: %.1f%%" % accuracy( valid_prediction.eval(), valid_labels) )
        print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

>>with keep prob of 0.5
Initialized
Minibatch loss at step 0: 464.414581
Minibatch accuracy: 10.2%
Validation accuracy: 39.0%
Minibatch loss at step 500: 29.257612
Minibatch accuracy: 65.6%
Validation accuracy: 79.2%
Minibatch loss at step 1000: 14.665298
Minibatch accuracy: 73.4%
Validation accuracy: 78.8%
Minibatch loss at step 1500: 8.486413
Minibatch accuracy: 78.9%
Validation accuracy: 78.7%
Minibatch loss at step 2000: 9.903625
Minibatch accuracy: 68.8%
Validation accuracy: 80.1%
Minibatch loss at step 2500: 8.026438
Minibatch accuracy: 71.1%
Validation accuracy: 78.6%
Minibatch loss at step 3000: 5.399191
Minibatch accuracy: 75.0%
Validation accuracy: 79.7%
Test accuracy: 86.8%
>>with keep prob of 0.6
Initialized
Minibatch loss at step 0: 551.104553
Minibatch accuracy: 3.1%
Validation accuracy: 31.9%
Minibatch loss at step 500: 25.116768
Minibatch accuracy: 76.6%
Validation accuracy: 79.8%
Minibatch loss at step 1000: 10.915282
Minibatch accuracy: 74.2%
Validation accuracy: 80.4


### trying different learning rates

In [54]:
batch_size = 128
beta = 0.001

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # new hidden layer
    hidden_nodes = 1024
    hidden_weights = tf.Variable( tf.truncated_normal([image_size * image_size, hidden_nodes]) )
    hidden_biases = tf.Variable( tf.zeros([hidden_nodes]))
    hidden_layer = tf.nn.relu( tf.matmul( tf_train_dataset, hidden_weights) + hidden_biases)
    
    # add dropout on hidden layer
    keep_prob = tf.placeholder("float")
    hidden_layer_drop = tf.nn.dropout(hidden_layer, keep_prob)

    # Variables.
    weights = tf.Variable( tf.truncated_normal([hidden_nodes, num_labels])) 
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = tf.matmul(hidden_layer_drop, weights) + biases
    loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels) )
    loss = tf.reduce_mean( loss + beta * tf.nn.l2_loss(weights) )

    # Optimizer.
    global_step = tf.Variable(0)  # count the number of steps taken.
    learnr = tf.placeholder("float")
    learning_rate = tf.train.exponential_decay(learnr, global_step, 100000, 0.95, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(0.5).minimize(loss, global_step=global_step)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)
    valid_relu = tf.nn.relu(  tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)
    valid_prediction = tf.nn.softmax( tf.matmul(valid_relu, weights) + biases) 

    test_relu = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights) + hidden_biases)
    test_prediction = tf.nn.softmax(tf.matmul(test_relu, weights) + biases)

In [57]:
for lr in np.arange(0.0001, 0.001, 0.0001).tolist():

    print('>> trying learning rate of ' + str(lr))
    with tf.Session(graph=graph) as session:
        tf.global_variables_initializer().run()
        print("Initialized")
        for step in range(num_steps):
            # Pick an offset within the training data, which has been randomized.
            # Note: we could use better randomization across epochs.
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            # Generate a minibatch.
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]

            feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : 1.0, learnr : lr}
            _, l, predictions = session.run( [optimizer, loss, train_prediction], feed_dict=feed_dict)
            if (step % 500 == 0):
              print("Minibatch loss at step %d: %f" % (step, l))
              print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
              print("Validation accuracy: %.1f%%" % accuracy( valid_prediction.eval(), valid_labels) )
        print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))


>> trying learning rate of 0.0001
Initialized
Minibatch loss at step 0: 379.620422
Minibatch accuracy: 10.9%
Validation accuracy: 35.7%
Minibatch loss at step 500: 10.390085
Minibatch accuracy: 78.9%
Validation accuracy: 80.6%
Minibatch loss at step 1000: 9.908013
Minibatch accuracy: 75.0%
Validation accuracy: 78.7%
Minibatch loss at step 1500: 3.756989
Minibatch accuracy: 85.9%
Validation accuracy: 80.2%
Minibatch loss at step 2000: 4.072092
Minibatch accuracy: 83.6%
Validation accuracy: 81.4%
Minibatch loss at step 2500: 6.320272
Minibatch accuracy: 76.6%
Validation accuracy: 81.0%
Minibatch loss at step 3000: 3.303612
Minibatch accuracy: 85.2%
Validation accuracy: 80.8%
Test accuracy: 87.6%
>> trying learning rate of 0.0002
Initialized
Minibatch loss at step 0: 366.295380
Minibatch accuracy: 11.7%
Validation accuracy: 22.0%
Minibatch loss at step 500: 7.961016
Minibatch accuracy: 81.2%
Validation accuracy: 81.3%
Minibatch loss at step 1000: 9.721804
Minibatch accuracy: 77.3%
Validat

### trying two layer neural network

In [59]:
batch_size = 128
beta = 0.001

hidden_nodes1 = 1024
hidden_nodes2 = 512

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # new hidden layer 1
    
    hidden_weights = tf.Variable( tf.truncated_normal([image_size * image_size, hidden_nodes1]) )
    hidden_biases = tf.Variable( tf.zeros([hidden_nodes1]))
    hidden_layer = tf.nn.relu( tf.matmul( tf_train_dataset, hidden_weights) + hidden_biases)
    
    # add dropout on hidden layer
    keep_prob = tf.placeholder("float")
    hidden_layer_drop = tf.nn.dropout(hidden_layer, keep_prob)
    
    # new hidden layer 2
    hidden_weights2 = tf.Variable( tf.truncated_normal([hidden_nodes1, hidden_nodes2]) )
    hidden_biases2 = tf.Variable( tf.zeros([hidden_nodes2]))
    hidden_layer2 = tf.nn.relu( tf.matmul( hidden_layer_drop, hidden_weights2) + hidden_biases2)
    
    # add dropout on hidden layer
    hidden_layer_drop2 = tf.nn.dropout(hidden_layer2, keep_prob)
    
    # Variables.
    weights = tf.Variable( tf.truncated_normal([hidden_nodes2, num_labels])) 
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = tf.matmul(hidden_layer_drop2, weights) + biases
    
    loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=tf_train_labels) )
    loss = tf.reduce_mean( loss + beta * tf.nn.l2_loss(weights) )

    # Optimizer.
    global_step = tf.Variable(0)  # count the number of steps taken.
    learnr = tf.placeholder("float")
    learning_rate = tf.train.exponential_decay(learnr, global_step, 100000, 0.95, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step= global_step)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)    
    
    valid_relu1 = tf.nn.relu(  tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)    
    valid_relu2 = tf.nn.relu(  tf.matmul(valid_relu1, hidden_weights2) + hidden_biases2)    
    
    valid_prediction = tf.nn.softmax( tf.matmul(valid_relu2, weights) + biases) 
    
    test_relu1 = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights) + hidden_biases)
    test_relu2 = tf.nn.relu( tf.matmul( test_relu1, hidden_weights2) + hidden_biases2)   
    
    test_prediction = tf.nn.softmax(tf.matmul(test_relu2, weights) + biases)


In [60]:
num_steps = 3001

with tf.Session(graph=graph) as session:
  tf.global_variables_initializer().run()
  print("Initialized")
  for step in range(num_steps):
    # Pick an offset within the training data, which has been randomized.
    # Note: we could use better randomization across epochs.
    offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
    # Generate a minibatch.
    batch_data = train_dataset[offset:(offset + batch_size), :]
    batch_labels = train_labels[offset:(offset + batch_size), :]
    # Prepare a dictionary telling the session where to feed the minibatch.
    # The key of the dictionary is the placeholder node of the graph to be fed,
    # and the value is the numpy array to feed to it.
    feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : 1.0, learnr : 0.001}
    _, l, predictions = session.run( [optimizer, loss, train_prediction], feed_dict=feed_dict )
    if (step % 500 == 0):
      print("Minibatch loss at step %d: %f" % (step, l))
      print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
      print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
  print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))


Initialized
Minibatch loss at step 0: 5591.298340
Minibatch accuracy: 5.5%
Validation accuracy: 10.2%
Minibatch loss at step 500: 360.979828
Minibatch accuracy: 77.3%
Validation accuracy: 76.1%
Minibatch loss at step 1000: 197.053528
Minibatch accuracy: 79.7%
Validation accuracy: 78.4%
Minibatch loss at step 1500: 140.675903
Minibatch accuracy: 86.7%
Validation accuracy: 79.2%
Minibatch loss at step 2000: 194.144760
Minibatch accuracy: 78.9%
Validation accuracy: 78.6%
Minibatch loss at step 2500: 190.313065
Minibatch accuracy: 78.9%
Validation accuracy: 79.9%
Minibatch loss at step 3000: 103.133148
Minibatch accuracy: 83.6%
Validation accuracy: 80.2%
Test accuracy: 87.4%


In [63]:
batch_size = 128
beta = 0.001

hidden_nodes1 = 1024
hidden_nodes2 = 512

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # new hidden layer 1
    
    hidden_weights = tf.Variable( tf.truncated_normal([image_size * image_size, hidden_nodes1]) )
    hidden_biases = tf.Variable( tf.zeros([hidden_nodes1]))
    hidden_layer = tf.nn.relu( tf.matmul( tf_train_dataset, hidden_weights) + hidden_biases)
    
    # add dropout on hidden layer
    keep_prob = tf.placeholder("float")
    hidden_layer_drop = tf.nn.dropout(hidden_layer, keep_prob)
    
    # new hidden layer 2
    hidden_weights2 = tf.Variable( tf.truncated_normal([hidden_nodes1, hidden_nodes2]) )
    hidden_biases2 = tf.Variable( tf.zeros([hidden_nodes2]))
    hidden_layer2 = tf.nn.relu( tf.matmul( hidden_layer_drop, hidden_weights2) + hidden_biases2)
    
    # add dropout on hidden layer
    hidden_layer_drop2 = tf.nn.dropout(hidden_layer2, keep_prob)
    
    # Variables.
    weights = tf.Variable( tf.truncated_normal([hidden_nodes2, num_labels])) 
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = tf.matmul(hidden_layer_drop2, weights) + biases
    
    loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits = logits,labels= tf_train_labels) )
    loss = tf.reduce_mean( loss + beta * tf.nn.l2_loss(weights) )

    # Optimizer.
    global_step = tf.Variable(0)  # count the number of steps taken.
    learnr = tf.placeholder("float")
    learning_rate = tf.train.exponential_decay(learnr, global_step, 100000, 0.95, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step= global_step)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)    
    
    valid_relu1 = tf.nn.relu(  tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)    
    valid_relu2 = tf.nn.relu(  tf.matmul(valid_relu1, hidden_weights2) + hidden_biases2)    
    
    valid_prediction = tf.nn.softmax( tf.matmul(valid_relu2, weights) + biases) 
    
    test_relu1 = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights) + hidden_biases)
    test_relu2 = tf.nn.relu( tf.matmul( test_relu1, hidden_weights2) + hidden_biases2)   
    
    test_prediction = tf.nn.softmax(tf.matmul(test_relu2, weights) + biases)

In [66]:
num_steps = 3001
scores = {}
for kp in [0.5, 0.6, 0.7, 0.9, 0.9, 1.0]:
    for lr in np.arange(0.0001, 0.001, 0.0001).tolist():
        print(">> with keep prob of " + str(kp))
        print(">> with learning rate  " + str(lr))
        with tf.Session(graph=graph) as session:
          tf.initialize_all_variables().run()
          print("Initialized")
          for step in range(num_steps):
            # Pick an offset within the training data, which has been randomized.
            # Note: we could use better randomization across epochs.
            offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
            # Generate a minibatch.
            batch_data = train_dataset[offset:(offset + batch_size), :]
            batch_labels = train_labels[offset:(offset + batch_size), :]
            # Prepare a dictionary telling the session where to feed the minibatch.
            # The key of the dictionary is the placeholder node of the graph to be fed,
            # and the value is the numpy array to feed to it.
            feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, keep_prob : kp, learnr : lr}
            _, l, predictions = session.run( [optimizer, loss, train_prediction], feed_dict=feed_dict )
            if (step % 500 == 0):
              print("Minibatch loss at step %d: %f" % (step, l))
              print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
              print("Validation accuracy: %.1f%%" % accuracy(valid_prediction.eval(), valid_labels))
          acc = accuracy(test_prediction.eval(), test_labels)
          scores[(kp, lr)] = acc
          print("Test accuracy: %.1f%%" % acc)
print(scores)

>> with keep prob of 0.5
>> with learning rate  0.0001
Initialized
Minibatch loss at step 0: 10255.742188
Minibatch accuracy: 9.4%
Validation accuracy: 12.8%
Minibatch loss at step 500: 3285.767090
Minibatch accuracy: 45.3%
Validation accuracy: 70.3%
Minibatch loss at step 1000: 1768.007324
Minibatch accuracy: 55.5%
Validation accuracy: 75.3%
Minibatch loss at step 1500: 1532.116699
Minibatch accuracy: 60.2%
Validation accuracy: 77.5%
Minibatch loss at step 2000: 1514.243164
Minibatch accuracy: 55.5%
Validation accuracy: 78.5%
Minibatch loss at step 2500: 1642.928711
Minibatch accuracy: 58.6%
Validation accuracy: 79.0%
Minibatch loss at step 3000: 1280.700317
Minibatch accuracy: 65.6%
Validation accuracy: 79.5%
Test accuracy: 86.5%
>> with keep prob of 0.5
>> with learning rate  0.0002
Initialized
Minibatch loss at step 0: 9713.031250
Minibatch accuracy: 5.5%
Validation accuracy: 5.6%
Minibatch loss at step 500: 2307.656494
Minibatch accuracy: 48.4%
Validation accuracy: 74.9%
Minibatch

Validation accuracy: 9.4%
Minibatch loss at step 500: 1408.159302
Minibatch accuracy: 60.2%
Validation accuracy: 77.5%
Minibatch loss at step 1000: 970.783386
Minibatch accuracy: 64.1%
Validation accuracy: 79.3%
Minibatch loss at step 1500: 559.201172
Minibatch accuracy: 74.2%
Validation accuracy: 80.5%
Minibatch loss at step 2000: 801.077515
Minibatch accuracy: 70.3%
Validation accuracy: 80.4%
Minibatch loss at step 2500: 803.508301
Minibatch accuracy: 68.8%
Validation accuracy: 80.7%
Minibatch loss at step 3000: 552.789185
Minibatch accuracy: 71.1%
Validation accuracy: 81.1%
Test accuracy: 87.8%
>> with keep prob of 0.6
>> with learning rate  0.0004
Initialized
Minibatch loss at step 0: 7368.016602
Minibatch accuracy: 9.4%
Validation accuracy: 10.6%
Minibatch loss at step 500: 1265.839844
Minibatch accuracy: 57.0%
Validation accuracy: 78.0%
Minibatch loss at step 1000: 956.933655
Minibatch accuracy: 66.4%
Validation accuracy: 79.7%
Minibatch loss at step 1500: 367.532745
Minibatch ac

Validation accuracy: 78.8%
Minibatch loss at step 1000: 562.237427
Minibatch accuracy: 69.5%
Validation accuracy: 80.2%
Minibatch loss at step 1500: 316.937744
Minibatch accuracy: 76.6%
Validation accuracy: 80.7%
Minibatch loss at step 2000: 465.422119
Minibatch accuracy: 69.5%
Validation accuracy: 80.5%
Minibatch loss at step 2500: 401.451782
Minibatch accuracy: 71.1%
Validation accuracy: 80.8%
Minibatch loss at step 3000: 272.632294
Minibatch accuracy: 75.0%
Validation accuracy: 81.2%
Test accuracy: 87.9%
>> with keep prob of 0.7
>> with learning rate  0.0006000000000000001
Initialized
Minibatch loss at step 0: 6635.467285
Minibatch accuracy: 4.7%
Validation accuracy: 10.3%
Minibatch loss at step 500: 877.913147
Minibatch accuracy: 62.5%
Validation accuracy: 79.3%
Minibatch loss at step 1000: 471.596985
Minibatch accuracy: 68.0%
Validation accuracy: 80.6%
Minibatch loss at step 1500: 252.216537
Minibatch accuracy: 85.2%
Validation accuracy: 81.0%
Minibatch loss at step 2000: 329.4057

Minibatch loss at step 1000: 362.060211
Minibatch accuracy: 73.4%
Validation accuracy: 80.0%
Minibatch loss at step 1500: 181.248505
Minibatch accuracy: 86.7%
Validation accuracy: 80.6%
Minibatch loss at step 2000: 266.221893
Minibatch accuracy: 77.3%
Validation accuracy: 80.8%
Minibatch loss at step 2500: 284.595520
Minibatch accuracy: 75.8%
Validation accuracy: 81.2%
Minibatch loss at step 3000: 140.987610
Minibatch accuracy: 82.0%
Validation accuracy: 81.3%
Test accuracy: 88.1%
>> with keep prob of 0.9
>> with learning rate  0.0008
Initialized
Minibatch loss at step 0: 6331.291504
Minibatch accuracy: 12.5%
Validation accuracy: 11.1%
Minibatch loss at step 500: 672.411926
Minibatch accuracy: 67.2%
Validation accuracy: 78.8%
Minibatch loss at step 1000: 293.635803
Minibatch accuracy: 76.6%
Validation accuracy: 79.9%
Minibatch loss at step 1500: 150.989502
Minibatch accuracy: 82.0%
Validation accuracy: 80.5%
Minibatch loss at step 2000: 266.244965
Minibatch accuracy: 78.1%
Validation a

Validation accuracy: 80.3%
Minibatch loss at step 1500: 191.750977
Minibatch accuracy: 83.6%
Validation accuracy: 80.7%
Minibatch loss at step 2000: 250.453552
Minibatch accuracy: 76.6%
Validation accuracy: 81.1%
Minibatch loss at step 2500: 218.897110
Minibatch accuracy: 74.2%
Validation accuracy: 81.2%
Minibatch loss at step 3000: 133.114609
Minibatch accuracy: 77.3%
Validation accuracy: 80.9%
Test accuracy: 88.4%
>> with keep prob of 1.0
>> with learning rate  0.0001
Initialized
Minibatch loss at step 0: 5712.914062
Minibatch accuracy: 7.8%
Validation accuracy: 9.7%
Minibatch loss at step 500: 754.090149
Minibatch accuracy: 60.2%
Validation accuracy: 65.2%
Minibatch loss at step 1000: 614.372986
Minibatch accuracy: 69.5%
Validation accuracy: 70.0%
Minibatch loss at step 1500: 280.915863
Minibatch accuracy: 78.9%
Validation accuracy: 72.0%
Minibatch loss at step 2000: 485.508514
Minibatch accuracy: 74.2%
Validation accuracy: 73.1%
Minibatch loss at step 2500: 456.068329
Minibatch acc

### trying 2 NN with different loss function

In [None]:
batch_size = 128
beta = 0.001

hidden_nodes1 = 1024
hidden_nodes2 = 512

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # new hidden layer 1
    
    hidden_weights = tf.Variable( tf.truncated_normal([image_size * image_size, hidden_nodes1]) )
    hidden_biases = tf.Variable( tf.zeros([hidden_nodes1]))
    hidden_layer = tf.nn.relu( tf.matmul( tf_train_dataset, hidden_weights) + hidden_biases)
    
    # add dropout on hidden layer
    keep_prob = tf.placeholder("float")
    hidden_layer_drop = tf.nn.dropout(hidden_layer, keep_prob)
    
    # new hidden layer 2
    hidden_weights2 = tf.Variable( tf.truncated_normal([hidden_nodes1, hidden_nodes2]) )
    hidden_biases2 = tf.Variable( tf.zeros([hidden_nodes2]))
    hidden_layer2 = tf.nn.relu( tf.matmul( hidden_layer_drop, hidden_weights2) + hidden_biases2)
    
    # add dropout on hidden layer
    hidden_layer_drop2 = tf.nn.dropout(hidden_layer2, keep_prob)
    
    # Variables.
    weights = tf.Variable( tf.truncated_normal([hidden_nodes2, num_labels])) 
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = tf.matmul(hidden_layer_drop2, weights) + biases
    
    loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels) )
    loss = tf.reduce_mean( loss + beta * ( tf.nn.l2_loss(weights) + tf.nn.l2_loss(hidden_weights) + tf.nn.l2_loss(hidden_weights2) ))

    # Optimizer.
    global_step = tf.Variable(0)  # count the number of steps taken.
    learnr = tf.placeholder("float")
    learning_rate = tf.train.exponential_decay(learnr, global_step, 100000, 0.95, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step= global_step)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)    
    
    valid_relu1 = tf.nn.relu(  tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)    
    valid_relu2 = tf.nn.relu(  tf.matmul(valid_relu1, hidden_weights2) + hidden_biases2)    
    
    valid_prediction = tf.nn.softmax( tf.matmul(valid_relu2, weights) + biases) 
    
    test_relu1 = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights) + hidden_biases)
    test_relu2 = tf.nn.relu( tf.matmul( test_relu1, hidden_weights2) + hidden_biases2)   
    
    test_prediction = tf.nn.softmax(tf.matmul(test_relu2, weights) + biases)


### trying NN with 5 layers

In [None]:
batch_size = 128
beta = 0.001

hidden_nodes1 = 1024
hidden_nodes2 = 512
hidden_nodes3 = 256
hidden_nodes4 = 128
hidden_nodes5 = 64

graph = tf.Graph()
with graph.as_default():

    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # new hidden layer 1
    
    hidden_weights = tf.Variable( tf.truncated_normal([image_size * image_size, hidden_nodes1]) )
    hidden_biases = tf.Variable( tf.zeros([hidden_nodes1]))
    hidden_layer = tf.nn.relu( tf.matmul( tf_train_dataset, hidden_weights) + hidden_biases)
    
    # add dropout on hidden layer
    keep_prob = tf.placeholder("float")
    hidden_layer_drop = tf.nn.dropout(hidden_layer, keep_prob)
    
    # new hidden layer 2
    hidden_weights2 = tf.Variable( tf.truncated_normal([hidden_nodes1, hidden_nodes2]) )
    hidden_biases2 = tf.Variable( tf.zeros([hidden_nodes2]))
    hidden_layer2 = tf.nn.relu( tf.matmul( hidden_layer_drop, hidden_weights2) + hidden_biases2)
    
    # add dropout on hidden layer 2
    hidden_layer_drop2 = tf.nn.dropout(hidden_layer2, keep_prob)
    
    # new hidden layer 3
    hidden_weights3 = tf.Variable( tf.truncated_normal([hidden_nodes2, hidden_nodes3]) )
    hidden_biases3 = tf.Variable( tf.zeros([hidden_nodes3]))
    hidden_layer3 = tf.nn.relu( tf.matmul( hidden_layer_drop2, hidden_weights3) + hidden_biases3)
    
    # add dropout on hidden layer 3
    hidden_layer_drop3 = tf.nn.dropout(hidden_layer3, keep_prob)
    
    # new hidden layer 4
    hidden_weights4 = tf.Variable( tf.truncated_normal([hidden_nodes3, hidden_nodes4]) )
    hidden_biases4 = tf.Variable( tf.zeros([hidden_nodes4]))
    hidden_layer4 = tf.nn.relu( tf.matmul( hidden_layer_drop3, hidden_weights4) + hidden_biases4)
    
    # add dropout on hidden layer 4
    hidden_layer_drop4 = tf.nn.dropout(hidden_layer4, keep_prob)
    
    # new hidden layer 5
    hidden_weights5 = tf.Variable( tf.truncated_normal([hidden_nodes4, hidden_nodes5]) )
    hidden_biases5 = tf.Variable( tf.zeros([hidden_nodes5]))
    hidden_layer5 = tf.nn.relu( tf.matmul( hidden_layer_drop4, hidden_weights5) + hidden_biases5)
    
    # add dropout on hidden layer 5
    hidden_layer_drop5 = tf.nn.dropout(hidden_layer5, keep_prob)
    
    # Variables.
    weights = tf.Variable( tf.truncated_normal([hidden_nodes5, num_labels])) 
    biases = tf.Variable(tf.zeros([num_labels]))

    # Training computation.
    logits = tf.matmul(hidden_layer_drop5, weights) + biases
    
    loss = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels) )
    loss = tf.reduce_mean( loss + beta * tf.nn.l2_loss(weights) )

    # Optimizer.
    global_step = tf.Variable(0)  # count the number of steps taken.
    learnr = tf.placeholder("float")
    learning_rate = tf.train.exponential_decay(learnr, global_step, 100000, 0.95, staircase=True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step= global_step)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)    
    
    valid_relu1 = tf.nn.relu(  tf.matmul(tf_valid_dataset, hidden_weights) + hidden_biases)    
    valid_relu2 = tf.nn.relu(  tf.matmul(valid_relu1, hidden_weights2) + hidden_biases2)  
    valid_relu3 = tf.nn.relu(  tf.matmul(valid_relu2, hidden_weights3) + hidden_biases3)   
    valid_relu4 = tf.nn.relu(  tf.matmul(valid_relu3, hidden_weights4) + hidden_biases4)
    valid_relu5 = tf.nn.relu(  tf.matmul(valid_relu4, hidden_weights5) + hidden_biases5)   
    
    valid_prediction = tf.nn.softmax( tf.matmul(valid_relu5, weights) + biases) 
    
    test_relu1 = tf.nn.relu( tf.matmul( tf_test_dataset, hidden_weights) + hidden_biases)
    test_relu2 = tf.nn.relu( tf.matmul( test_relu1, hidden_weights2) + hidden_biases2)   
    test_relu3 = tf.nn.relu( tf.matmul( test_relu2, hidden_weights3) + hidden_biases3)  
    test_relu4 = tf.nn.relu( tf.matmul( test_relu3, hidden_weights4) + hidden_biases4)   
    test_relu5 = tf.nn.relu( tf.matmul( test_relu4, hidden_weights5) + hidden_biases5)  
    
    test_prediction = tf.nn.softmax(tf.matmul(test_relu5, weights) + biases)
