# The MNIST Application

This is a conventional TensorFlow-based MNIST application. For reference implementations, see:

TensorFlow tutorial: https://www.tensorflow.org/tutorials/layers

Horovod's MNIST example: https://github.com/uber/horovod/blob/master/examples/tensorflow_mnist.py

In [1]:
import os
import time
from datetime import timedelta, datetime, tzinfo

## Train a single TensorFlow worker on the MNIST data

In [2]:
import tensorflow as tf
import mnist_app

# Start from an empty default graph so that re-running this cell in the same
# kernel does not add a second copy of the model on top of the first one.
tf.reset_default_graph()

log_string = mnist_app.get_log_string(1024)

# Extract the MNIST dataset
learn = tf.contrib.learn
mnist = learn.datasets.mnist.read_data_sets('MNIST-data')

# Build model...
# Inputs are fed through placeholders: batches of 784-value image vectors and
# one label per image. The network itself is defined in mnist_app.conv_model.
with tf.name_scope('input'):
    image = tf.placeholder(tf.float32, [None, 784], name='image')
    label = tf.placeholder(tf.float32, [None], name='label')
predict, loss = mnist_app.conv_model(image, label, learn.ModeKeys.TRAIN)

# The global step counter is incremented by every optimizer update and is
# what the hooks below use to decide when to log and when to stop.
global_step = tf.train.get_or_create_global_step()

opt = tf.train.RMSPropOptimizer(0.001)
train_opt = opt.minimize(loss=loss, global_step=global_step)

# The MonitoredTrainingSession takes care of session initialization,
# restoring from a checkpoint, saving to a checkpoint, and closing
# when done or an error occurs.

checkpoint_dir = './tf-checkpoints'

# NOTE: last_step is an absolute global-step value, not a number of steps to
# run. If checkpoint_dir already holds a checkpoint at (or past) that step,
# the session resumes from it and stops almost immediately; remove the
# directory to train from scratch.
hooks = [
    tf.train.StopAtStepHook(last_step=401),
    tf.train.LoggingTensorHook(tensors={'step': global_step, 'loss': loss},
                               every_n_iter=100),
]

# Allocate GPU memory on demand instead of reserving it all up front, and
# expose only GPU 0 to this process.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = "0"

with tf.train.MonitoredTrainingSession(checkpoint_dir=checkpoint_dir,
                                       hooks=hooks,
                                       config=config) as mon_sess:
    while not mon_sess.should_stop():
        # Run a training step synchronously on the next mini-batch of
        # 100 training examples.
        image_, label_ = mnist.train.next_batch(100)
        mon_sess.run(train_opt, feed_dict={image: image_, label: label_})

Extracting MNIST-data/train-images-idx3-ubyte.gz
Extracting MNIST-data/train-labels-idx1-ubyte.gz
Extracting MNIST-data/t10k-images-idx3-ubyte.gz
Extracting MNIST-data/t10k-labels-idx1-ubyte.gz
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Saving checkpoints for 1 into ./tf-checkpoints/model.ckpt.
INFO:tensorflow:loss = 2.313424, step = 1
INFO:tensorflow:global_step/sec: 57.9858
INFO:tensorflow:loss = 1.2679976, step = 101 (1.542 sec)
INFO:tensorflow:global_step/sec: 71.0995
INFO:tensorflow:loss = 0.12810637, step = 201 (1.406 sec)
INFO:tensorflow:global_step/sec: 71.1833
INFO:tensorflow:loss = 0.11259402, step = 301 (1.405 sec)
INFO:tensorflow:global_step/sec: 71.2567
INFO:tensorflow:loss = 0.07178667, step = 401 (1.404 sec)
INFO:tensorflow:Saving checkpoints for 401 into ./tf-checkpoints/model.ckpt.
