In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import gzip
import os
import sys
import time

import numpy
import pandas as pd
import tensorflow as tf
from six.moves import urllib
from six.moves import xrange
from sklearn import utils
from sklearn.cross_validation import train_test_split

In [2]:
# --- Dataset description -------------------------------------------------
IMAGE_SIZE = 28        # Images are IMAGE_SIZE x IMAGE_SIZE pixels.
NUM_CHANNELS = 1       # Channels per image (last axis of the sample tensor).
PIXEL_DEPTH = 255      # Largest raw pixel value.
NUM_LABELS = 10        # Number of output classes.
# NOTE(review): not referenced by the visible cells -- confirm whether still needed.
WORK_DIRECTORY = 'input'
# NOTE(review): nominal validation-set size; the visible split cell slices by
# explicit index instead of reading this value -- confirm.
VALIDATION_SIZE = 5000

# --- Training hyper-parameters -------------------------------------------
SEED = 66478           # Set to None for random seed.
BATCH_SIZE = 64        # Examples per training step.
NUM_EPOCHS = 10        # Passes over the training set.

# --- Evaluation ----------------------------------------------------------
EVAL_BATCH_SIZE = 64   # Examples per evaluation run.
EVAL_FREQUENCY = 100   # Number of steps between evaluations.

# Handle on TensorFlow's flag registry (no flags are defined in the visible cells).
FLAGS = tf.app.flags.FLAGS

In [3]:
def data_type():
    """Return the TensorFlow dtype used for the model's tensors (tf.float32)."""
    float_dtype = tf.float32
    return float_dtype

def extract_data(filename):
    """
    Load labelled images from a CSV file into numpy arrays.

    The first CSV column is read as the label; every remaining column is a
    pixel value. Pixels are divided by PIXEL_DEPTH, so raw values in
    [0, 255] are rescaled to [0, 1].

    Args:
        filename: path of the CSV file to read.

    Returns:
        A tuple (samples, labels) where samples is a float32 4D tensor
        [image index, y, x, channels] and labels is an int32 vector with one
        entry per image.
    """
    # Read the file the caller asked for (previously 'train.csv' was
    # hard-coded and the argument was silently ignored).
    df = pd.read_csv(filename)
    labels = numpy.array(df.iloc[:, 0], dtype=numpy.int32)
    samples = numpy.array(df.iloc[:, 1:], dtype=numpy.float32) / PIXEL_DEPTH
    # Each flat pixel row becomes an IMAGE_SIZE x IMAGE_SIZE x NUM_CHANNELS image.
    samples = samples.reshape(samples.shape[0], IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS)
    print("Labels shape:", labels.shape)
    print("Samples shape:", samples.shape)
    return samples, labels

In [4]:
# Get the data.
train_data_filename = 'train.csv'

# Extract it into numpy arrays.
samples, labels = extract_data(train_data_filename)

# Sequential (unshuffled) split: 60% train / 14% validation / 26% test.
# The boundaries are derived from the row count with integer arithmetic, so
# for the 42000-row file they are exactly the former literals 25200 and 31080,
# while other dataset sizes keep the same proportions.
num_samples = samples.shape[0]
train_end = num_samples * 60 // 100
validation_end = num_samples * 74 // 100

train_data = samples[:train_end]
validation_data = samples[train_end:validation_end]
test_data = samples[validation_end:]

train_labels = labels[:train_end]
validation_labels = labels[train_end:validation_end]
test_labels = labels[validation_end:]

# The three subsets must partition the dataset exactly.
assert train_labels.shape[0] + validation_labels.shape[0] + test_labels.shape[0] == num_samples

print("Train set:", train_data.shape, train_labels.shape)
print("Validation set:", validation_data.shape, validation_labels.shape)
print("Test set:", test_data.shape, test_labels.shape)

# Consumed by the learning-rate schedule and the training loop.
num_epochs = NUM_EPOCHS
train_size = train_labels.shape[0]

Labels shape: (42000,)
Samples shape: (42000, 28, 28, 1)
Train set: (25200, 28, 28, 1) (25200,)
Validation set: (5880, 28, 28, 1) (5880,)
Test set: (10920, 28, 28, 1) (10920,)


In [5]:
# This is where training samples and labels are fed to the graph.
# These placeholder nodes will be fed a batch of training data at each
# training step using the {feed_dict} argument to the Run() call below.
train_data_node = tf.placeholder(data_type(), shape=(BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))
train_labels_node = tf.placeholder(tf.int64, shape=(BATCH_SIZE,))
eval_data = tf.placeholder(data_type(), shape=(EVAL_BATCH_SIZE, IMAGE_SIZE, IMAGE_SIZE, NUM_CHANNELS))

# The variables below hold all the trainable weights. They are passed an
# initial value which will be assigned when we call:
# {tf.initialize_all_variables().run()}
conv1_weights = tf.Variable(tf.truncated_normal([5, 5, NUM_CHANNELS, 32], # 5x5 filter, depth 32.
                                                stddev=0.1, seed=SEED, dtype=data_type()))
conv1_biases = tf.Variable(tf.zeros([32], dtype=data_type()))

conv2_weights = tf.Variable(tf.truncated_normal([5, 5, 32, 64], stddev=0.1,
                                                seed=SEED, dtype=data_type()))
conv2_biases = tf.Variable(tf.constant(0.1, shape=[64], dtype=data_type()))

# Side length of the feature map entering the first fully connected layer.
# model() applies two stride-2 max-pools with 'SAME' padding, each producing
# ceil(input / 2), i.e. ceil(IMAGE_SIZE / 4) overall. The previous expression
# `IMAGE_SIZE // 4 * IMAGE_SIZE // 4 * 64` was evaluated left to right as
# ((IMAGE_SIZE // 4) * IMAGE_SIZE) // 4 * 64 and only matched for some sizes
# (28 among them, where both forms give 7 * 7 * 64).
pooled_size = (IMAGE_SIZE + 3) // 4
fc1_weights = tf.Variable(tf.truncated_normal([pooled_size * pooled_size * 64, 512], # fully connected, depth 512.
                                              stddev=0.1, seed=SEED, dtype=data_type()))
fc1_biases = tf.Variable(tf.constant(0.1, shape=[512], dtype=data_type()))

fc2_weights = tf.Variable(tf.truncated_normal([512, NUM_LABELS], stddev=0.1, seed=SEED, dtype=data_type()))
fc2_biases = tf.Variable(tf.constant(0.1, shape=[NUM_LABELS], dtype=data_type()))

In [6]:
def model(data, train=False):
    """Build the forward pass of the network and return the logits.

    Two conv -> ReLU -> max-pool stages are followed by a ReLU fully
    connected layer and a linear output layer. The weights are the
    module-level conv*/fc* variables.

    Args:
        data: 4D input tensor laid out as [image index, y, x, depth].
        train: when true, 50% dropout is applied to the hidden layer.

    Returns:
        A 2D tensor of unnormalised class scores, one row per input image.
    """
    # 2D convolution, with 'SAME' padding (i.e. the output feature map has
    # the same size as the input). {strides} follows the data layout.
    conv1 = tf.nn.conv2d(data, conv1_weights, strides=[1, 1, 1, 1], padding='SAME')
    # Bias and rectified linear non-linearity.
    relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_biases))
    # Max pooling with a 2x2 window and a stride of 2.
    pool1 = tf.nn.max_pool(relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    conv2 = tf.nn.conv2d(pool1, conv2_weights, strides=[1, 1, 1, 1], padding='SAME')
    relu2 = tf.nn.relu(tf.nn.bias_add(conv2, conv2_biases))
    pool2 = tf.nn.max_pool(relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    # Flatten the feature map cuboid into a 2D matrix for the fully connected
    # layers. The batch dimension is left as -1 so the reshape does not
    # depend on a statically known batch size; only the per-image feature
    # count is taken from the static shape.
    height, width, depth = pool2.get_shape().as_list()[1:]
    flattened = tf.reshape(pool2, [-1, height * width * depth])
    # Fully connected layer. The '+' broadcasts the biases over the batch.
    hidden = tf.nn.relu(tf.matmul(flattened, fc1_weights) + fc1_biases)
    # Add a 50% dropout during training only. Dropout also scales
    # activations such that no rescaling is needed at evaluation time.
    if train:
        hidden = tf.nn.dropout(hidden, 0.5, seed=SEED)

    return tf.matmul(hidden, fc2_weights) + fc2_biases

In [7]:
# Training computation: logits + cross-entropy loss.
logits = model(train_data_node, True)
# labels/logits are passed by keyword: TensorFlow 1.0+ rejects positional
# arguments for this op, and the keyword form is also valid on older
# releases where the positional order was (logits, labels).
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=train_labels_node, logits=logits))

# L2 regularization for the fully connected parameters.
regularizers = (tf.nn.l2_loss(fc1_weights) + tf.nn.l2_loss(fc1_biases) +
                tf.nn.l2_loss(fc2_weights) + tf.nn.l2_loss(fc2_biases))
# Add the regularization term to the loss.
loss += 5e-4 * regularizers

# Optimizer: set up a variable that's incremented once per batch and
# controls the learning rate decay.
batch = tf.Variable(0, dtype=data_type())
# Decay once per epoch, using an exponential schedule starting at 0.01.
learning_rate = tf.train.exponential_decay(0.01,                # Base learning rate.
                                           batch * BATCH_SIZE,  # Current index into the dataset.
                                           train_size,          # Decay step.
                                           0.95,                # Decay rate.
                                           staircase=True)
# Use simple momentum for the optimization; minimize() also increments {batch}.
optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9).minimize(loss, global_step=batch)

# Predictions for the current training minibatch.
train_prediction = tf.nn.softmax(logits)

# Predictions for the test and validation, which we'll compute less often.
# model() is called with its default train=False, so no dropout is applied.
eval_prediction = tf.nn.softmax(model(eval_data))

In [8]:
# Small utility function to evaluate a dataset by feeding batches of data to
# {eval_data} and pulling the results from {eval_prediction}.
# Saves memory and enables this to run on smaller GPUs.
def eval_in_batches(data, sess):
    """Compute predictions for every row of `data`, EVAL_BATCH_SIZE rows at a time.

    Args:
        data: array of examples; the first axis indexes examples.
        sess: session used to run the {eval_prediction} tensor.

    Returns:
        A float32 array of shape (number of examples, NUM_LABELS).

    Raises:
        ValueError: if `data` holds fewer than EVAL_BATCH_SIZE examples.
    """
    num_examples = data.shape[0]
    if num_examples < EVAL_BATCH_SIZE:
        raise ValueError("batch size for evals larger than dataset: %d" % num_examples)

    predictions = numpy.empty((num_examples, NUM_LABELS), dtype=numpy.float32)
    start = 0
    while start < num_examples:
        stop = start + EVAL_BATCH_SIZE
        if stop > num_examples:
            # Final partial batch: the placeholder needs a full batch, so run
            # the last EVAL_BATCH_SIZE rows and keep only the rows not yet
            # covered by earlier batches.
            tail_predictions = sess.run(
                eval_prediction,
                feed_dict={eval_data: data[-EVAL_BATCH_SIZE:, ...]})
            predictions[start:, :] = tail_predictions[start - num_examples:, :]
        else:
            predictions[start:stop, :] = sess.run(
                eval_prediction,
                feed_dict={eval_data: data[start:stop, ...]})
        start = stop
    return predictions

def error_rate(predictions, labels):
    """Percentage of rows whose highest-scoring class differs from its label.

    Args:
        predictions: 2D array of per-class scores, one row per example.
        labels: 1D array of integer class ids, one per example.

    Returns:
        The error rate as a percentage in [0, 100].
    """
    predicted_classes = numpy.argmax(predictions, 1)
    num_correct = numpy.sum(predicted_classes == labels)
    accuracy_percent = 100.0 * num_correct / predictions.shape[0]
    return 100.0 - accuracy_percent

In [9]:
# Train inside a local session; the `with` block closes it on exit.
start_time = time.time()
num_steps = int(num_epochs * train_size) // BATCH_SIZE
with tf.Session() as sess:
    # Assign every variable its initial value before the first step.
    tf.initialize_all_variables().run()
    print('Initialized!')

    for step in xrange(num_steps):
        # Walk through the training set in order, wrapping around so that a
        # full batch is always available. No shuffling between epochs.
        offset = (step * BATCH_SIZE) % (train_size - BATCH_SIZE)
        batch_slice = slice(offset, offset + BATCH_SIZE)
        batch_data = train_data[batch_slice, ...]
        batch_labels = train_labels[batch_slice]

        # One optimisation step; also fetch the values needed for reporting.
        fetches = [optimizer, loss, learning_rate, train_prediction]
        feed_dict = {train_data_node: batch_data, train_labels_node: batch_labels}
        _, minibatch_loss, current_lr, minibatch_predictions = sess.run(fetches, feed_dict=feed_dict)

        # Only report every EVAL_FREQUENCY steps.
        if step % EVAL_FREQUENCY != 0:
            continue

        elapsed_time = time.time() - start_time
        start_time = time.time()
        epoch = float(step) * BATCH_SIZE / train_size
        ms_per_step = 1000 * elapsed_time / EVAL_FREQUENCY
        print('Step %d (epoch %.2f), %.1f ms' % (step, epoch, ms_per_step))
        print('Minibatch loss: %.3f, learning rate: %.6f' % (minibatch_loss, current_lr))
        print('Minibatch error: %.1f%%' % error_rate(minibatch_predictions, batch_labels))
        validation_predictions = eval_in_batches(validation_data, sess)
        print('Validation error: %.1f%%' % error_rate(validation_predictions, validation_labels))
        sys.stdout.flush()

    # Finally print the result on the held-out test set.
    test_predictions = eval_in_batches(test_data, sess)
    test_error = error_rate(test_predictions, test_labels)
    print('Test error: %.1f%%' % test_error)

Initialized!
Step 0 (epoch 0.00), 5.4 ms
Minibatch loss: 10.219, learning rate: 0.010000
Minibatch error: 95.3%
Validation error: 81.0%
Step 100 (epoch 0.25), 298.7 ms
Minibatch loss: 3.333, learning rate: 0.010000
Minibatch error: 3.1%
Validation error: 6.3%
Step 200 (epoch 0.51), 299.9 ms
Minibatch loss: 3.499, learning rate: 0.010000
Minibatch error: 14.1%
Validation error: 4.9%
Step 300 (epoch 0.76), 298.2 ms
Minibatch loss: 3.162, learning rate: 0.010000
Minibatch error: 6.2%
Validation error: 3.8%
Step 400 (epoch 1.02), 288.8 ms
Minibatch loss: 3.154, learning rate: 0.009500
Minibatch error: 4.7%
Validation error: 2.9%
Step 500 (epoch 1.27), 299.8 ms
Minibatch loss: 3.094, learning rate: 0.009500
Minibatch error: 6.2%
Validation error: 2.6%
Step 600 (epoch 1.52), 288.8 ms
Minibatch loss: 2.974, learning rate: 0.009500
Minibatch error: 1.6%
Validation error: 2.8%
Step 700 (epoch 1.78), 299.1 ms
Minibatch loss: 2.952, learning rate: 0.009500
Minibatch error: 1.6%
Validation error: 