Test the data queuing and distortion and augmentation

In [1]:
import numpy as np
import tensorflow as tf
from local_lr import cifar10_input
import os
import tarfile
import sys
from six.moves import urllib
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
from datetime import datetime
import re
import pdb

import matplotlib.pyplot as plt
% matplotlib inline

In [2]:
# Load data from Alex websited and save locally

FLAGS = tf.app.flags.FLAGS

tf.app.flags.DEFINE_string("data_dir", "/tmp/cifar10_data", """Path to the CIFAR10-data director""")
tf.app.flags.DEFINE_integer('batch_size', 128, """Numbers of images to process in a batch.""")

tf.app.flags.DEFINE_string('train_dir', '/tmp/cifar10_train',
                           """Directory where to write event logs """
                           """and checkpoint.""")
tf.app.flags.DEFINE_integer('max_steps', 1000000,
                            """Number of batches to run.""")
tf.app.flags.DEFINE_boolean('log_device_placement', False,
                            """Whether to log device placement.""")
tf.app.flags.DEFINE_integer('log_frequency', 10,
                            """How often to log results to the console.""")
tf.app.flags.DEFINE_boolean('use_fp16', False,
                            """Train the model using fp16.""")
TOWER_NAME = 'tower'

DATA_URL = 'https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'

IMAGE_SIZE = cifar10_input.IMAGE_SIZE
NUM_CLASSES = cifar10_input.NUM_CLASSES
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN
NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL


# Constants describing the training process.
MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
NUM_EPOCHS_PER_DECAY = 350.0      # Epochs after which learning rate decays.
LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
INITIAL_LEARNING_RATE = 0.1       # Initial learning rate.

In [3]:
if not FLAGS.data_dir:
    raise ValueError('Please supply a data_dir')
    
data_dir = os.path.join(FLAGS.data_dir, 'cifar10-batches-bin')

dest_directory = FLAGS.data_dir

if not os.path.exists(dest_directory):
    os.makedirs(dest_directory)
    
filename = DATA_URL.split('/')[-1]
filepath = os.path.join(dest_directory, filename)

if not os.path.exists(filepath):
    filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath)
    statsinfo = os.stat(filepath)
    print('Successfullly downloaded', filename, statsinfo.st_size, '.bytes')
extracted_dir_path = os.path.join(dest_directory, 'cifar-10-batches-bin')

if not os.path.exists(extracted_dir_path):
    tarfile.open(filepath, 'r:gz').extractall(dest_directory)

In [4]:
# Create shuffled or unshuffled batch
def _generate_image_and_label_batch(image, label, min_queue_examples, batch_size, shuffle):
    
    pdb.set_trace()
    
    num_process_thread = 16
    if shuffle:
        images, label_batch = tf.train.shuffle_batch([image, label], batch_size=batch_size, num_threads=num_process_thread, 
                                                    capacity=min_queue_examples + 3 * batch_size, min_after_dequeue=min_queue_examples)
    else:
        images, label_batch = tf.train.batch([image, label], batch_size=batch_size, num_threads=num_process_thread, capacity=min_queue_examples + 3 * batch_size)
    
    tf.summary.image('images', images)
    
    return images, tf.reshape(label_batch, [batch_size])

In [5]:
# Extract image data from the queue information and return object with image, label and other information

def read_cifar10(filename_queue):
    
    class CIFAR10Record(object):
        pass
    
    result = CIFAR10Record()
    
    label_bytes = 1
    result.height = 32
    result.width = 32
    result.depth = 3
    
    image_bytes = result.height * result.width + result.depth
    
    record_bytes = label_bytes + image_bytes
    
    reader = tf.FixedLengthRecordReader(record_bytes=record_bytes)
    result.key, value = reader.read(filename_queue)
    
    record_bytes = tf.decode_raw(value, tf.uint8)
    result.label = tf.cast(tf.strided_slice(record_bytes, [0], [label_bytes]), tf.int32)
    
    depth_major = tf.reshape(tf.strided_slice(record_bytes, [label_bytes], [label_bytes+image_bytes]), [result.depth, result.height, result.width])
    result.unit8image = tf.transpose(depth_major, [1,2,0])
    
    return result

In [6]:
# Generate images and labels after augmentation, pre-precessing and batching
data_dir = os.path.join(FLAGS.data_dir, 'cifar10-batches-bin')
# ages, labels = distorted_input(data_dir = data_dir, batch_size=FLAGS.batch_size)
IMAGE_SIZE = 24
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = 50000

batch_size = FLAGS.batch_size

def distorted_inputs():
    filename = [os.path.join(data_dir, 'data_batch_%d.bin' % i) for i in range(1, 6)]

    filename_queue = tf.train.string_input_producer(filename)

    with tf.name_scope('data_augmentation'):

        read_input = read_cifar10(filename_queue)
        reshaped_image = tf.cast(read_input.unit8image, tf.float32)

        height = IMAGE_SIZE
        width = IMAGE_SIZE

        distorted_image = tf.random_crop(reshaped_image, [height, width, 3])

        distorted_image = tf.image.random_flip_left_right(distorted_image)

        distorted_image = tf.image.random_brightness(distorted_image, max_delta=63)

        distorted_image = tf.image.random_contrast(distorted_image, lower=0.2, upper = 1.8)

        float_image = tf.image.per_image_standardization(distorted_image)

        float_image.set_shape([height, width, 3])
        read_input.label.set_shape([1])

        min_fraction_of_examples_in_queue = 0.4
        min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN * min_fraction_of_examples_in_queue)
        
        print('Filling queue with %d CIFAR images before starting to train.This will takge a few minutes.' % min_queue_examples)
    return _generate_image_and_label_batch(float_image, read_input.label, min_queue_examples, batch_size, shuffle=True)

In [7]:
"""Helper to create a Variable stored on CPU memory."""
def _variable_on_cpu(name, shape, initializer):
    dtype = tf.float16 if FLAGS.use_fp16 else tf.float32
    with tf.device('/cpu:0'):
        var = tf.get_variable(name, shape, initializer=initializer, dtype=dtype)
    return var

In [8]:
# Helper to create an initialized Variable with weight decay.
def _variable_with_weight_decay(name, shape, stddev, wd):
    dtype = tf.float16 if FLAGS.use_fp16 else tf.float32
    var = _variable_on_cpu(name, shape, tf.truncated_normal_initializer(stddev=stddev, dtype=dtype))
    if wd is not None:
        weight_decay = tf.multiply(tf.nn.l2_loss(var), wd, name='weight_loss')
        tf.add_to_collection('losses', weight_decay)
    return var

In [9]:
def _activation_summary(x):
    tensor_name = re.sub('%s_[0-9]*/' % TOWER_NAME, '', x.op.name)
    tf.summary.histogram(tensor_name + '/activations', x)
    tf.summary.scalar(tensor_name + '/sparsity', tf.nn.zero_fraction(x))

In [10]:
def inference(images):
    # conv1
    with tf.variable_scope('conv1') as scope:
        kernel = _variable_with_weight_decay('weights', shape=[5, 5, 3, 64], stddev=5e-2, wd=None)
        conv = tf.nn.conv2d(images, kernel, [1,1,1,1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv1 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv1)
        
        pool1 = tf.nn.max_pool(conv1, ksize=[1, 3, 3, 1], strides=[1, 2, 2, 1], padding='SAME', name='pool1')
        norm1 = tf.nn.lrn(pool1, 4, bias=1.0, alpha=0.001 / 9.0, beta=0.75, name='norm1')
    
    with tf.variable_scope('conv2') as scope:
        kernel = _variable_with_weight_decay('weights', shape=[5, 5, 64, 64], stddev=5e-2, wd=None)
        conv = tf.nn.conv2d(norm1, kernel, [1,1,1,1], padding='SAME')
        biases = _variable_on_cpu('biases', [64], tf.constant_initializer(0.0))
        pre_activation = tf.nn.bias_add(conv, biases)
        conv2 = tf.nn.relu(pre_activation, name=scope.name)
        _activation_summary(conv1)
        
        norm2 = tf.nn.lrn(conv2, 4, bias=1.0, alpha=0.001/9, beta=0.75, name='norm2')
        pool2 = tf.nn.max_pool(norm2, ksize=[1, 3, 3, 1], strides = [1, 2, 2, 1], padding='SAME', name='pool2')
        
    # local3
    with tf.variable_scope('local3') as scope:
        # Move everything into depth so we can perform a single matrix multiply.
        reshape = tf.reshape(pool2, [images.get_shape().as_list()[0], -1])
        dim = reshape.get_shape()[1].value
        weights = _variable_with_weight_decay('weights', shape=[dim, 384],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [384], tf.constant_initializer(0.1))
        local3 = tf.nn.relu(tf.matmul(reshape, weights) + biases, name=scope.name)
        _activation_summary(local3)
    
    # local4
    with tf.variable_scope('local4') as scope:
        weights = _variable_with_weight_decay('weights', shape=[384, 192],
                                              stddev=0.04, wd=0.004)
        biases = _variable_on_cpu('biases', [192], tf.constant_initializer(0.1))
        local4 = tf.nn.relu(tf.matmul(local3, weights) + biases, name=scope.name)
        _activation_summary(local4)
        
    # linear layer(WX + b)
    with tf.variable_scope('softmax_linear') as scope:
        weights = _variable_with_weight_decay('weights', [192, NUM_CLASSES],
                                              stddev=1/192.0, wd=None)
        biases = _variable_on_cpu('biases', [NUM_CLASSES],
                                  tf.constant_initializer(0.0))
        softmax_linear = tf.add(tf.matmul(local4, weights), biases, name=scope.name)
        _activation_summary(softmax_linear)
        
    return softmax_linear

def loss(logits, labels):
    
    labels = tf.cast(labels, tf.int64)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits, name='cross_entropy_per_example')
    cross_entropy_mean = tf.reduce_mean(cross_entropy, name='cross_entropy')
    tf.add_to_collection('loss', cross_entropy_mean)
    
    return tf.add_n(tf.get_collection('losses'), name='total_loss')

In [11]:
def _add_loss_summaries(total_loss):
    
    loss_averages = tf.train.ExponentialMovingAverage(0.9, name='avg')
    losses = tf.get_collection('losses')
    loss_averages_op = loss_averages.apply(losses + [total_loss])
    
    # Attach a scalar summary to all individual losses and the total loss; do the
    # same for the averaged version of the losses.
    for l in losses + [total_loss]:
    # Name each loss as '(raw)' and name the moving average version of the loss
    # as the original loss name.
        tf.summary.scalar(l.op.name + ' (raw)', l)
        tf.summary.scalar(l.op.name, loss_averages.average(l))
        
    return loss_averages_op

In [12]:
# Build the trainer
def trainer(total_loss, global_step):
    
    num_batches_per_epoch = NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN / FLAGS.batch_size
    decay_steps = int(num_batches_per_epoch * NUM_EPOCHS_PER_DECAY)
    
    # Apply learning rate schedule
    lr = tf.train.exponential_decay(INITIAL_LEARNING_RATE,global_step,decay_steps,LEARNING_RATE_DECAY_FACTOR,staircase=True)
    tf.summary.scalar('learning_rate', lr)
    # Apply exponential moving average on loss
    loss_averages_op = _add_loss_summaries(total_loss)
    
    with tf.control_dependencies([loss_averages_op]):
        opt = tf.train.GradientDescentOptimizer(lr)
        grads = opt.compute_gradients(total_loss)

    # Apply gradients.
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)
    
    # Add histograms for trainable variables.
    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name, var)
        
    # Add histograms for gradients.
    for grad, var in grads:
        if grad is not None:
            tf.summary.histogram(var.op.name + '/gradients', grad)

    # Track the moving averages of all trainable variables.
    variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
    variables_averages_op = variable_averages.apply(tf.trainable_variables())
    
    with tf.control_dependencies([apply_gradient_op, variables_averages_op]):
        train_op = tf.no_op(name='train')
        
    return train_op

In [13]:
# train the model
def train():
    
    with tf.Graph().as_default():
        global_step = tf.contrib.slim.get_or_create_global_step()
        
        # Get images and labels for CIFAR-10.
        # Force input pipeline to CPU:0 to avoid operations sometimes ending up on
        # GPU and resulting in a slow down.
        with tf.device('/cpu:0'):
            images, labels = distorted_inputs()

        logits = inference(images)

        loss_tmp = loss(logits, global_step)

        train_op = trainer(loss_tmp, global_step)

        # Build a graph that trains the model with one batch of examples and update the model parameter
        class _LoggerHook(tf.train.SessionRunHook):
            """Logs loss and runtime."""
            def begin(self):
                self._step = -1
                self._start_time = time.time()

            def before_run(self, run_context):
                self._step += 1
                return tf.train.SessionRunArgs(loss)  # Asks for loss value.

            def after_run(self, run_context, run_values):
                if self._step % FLAGS.log_frequency == 0:
                    current_time = time.time()
                    duration = current_time - self._start_time
                    self._start_time = current_time

                loss_value = run_values.results
                examples_per_sec = FLAGS.log_frequency * FLAGS.batch_size / duration
                sec_per_batch = float(duration / FLAGS.log_frequency)

                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                                'sec/batch)')
                print (format_str % (datetime.now(), self._step, loss_value,
                                       examples_per_sec, sec_per_batch))

        with tf.train.MonitoredTrainingSession(
            checkpoint_dir=FLAGS.train_dir,
            hooks=[tf.train.StopAtStepHook(last_step=FLAGS.max_steps), tf.train.NanTensorHook(loss),LoggerHook()],config=tf.ConfigProto(log_device_placement=FLAGS.log_device_placement)) as mon_sess:
            while not mon_sess.should_stop():
                mon_sess.run(train_op)

In [14]:
if tf.gfile.Exists(FLAGS.train_dir):
    tf.gfile.DeleteRecursively(FLAGS.train_dir)
    tf.gfile.MakeDirs(FLAGS.train_dir)

train()

Filling queue with 20000 CIFAR images before starting to train.This will takge a few minutes.
> <ipython-input-4-e025f163290d>(6)_generate_image_and_label_batch()
-> num_process_thread = 16
(Pdb) image
<tf.Tensor 'data_augmentation/div:0' shape=(24, 24, 3) dtype=float32>
(Pdb) label
<tf.Tensor 'data_augmentation/Cast:0' shape=(1,) dtype=int32>
(Pdb) exit


BdbQuit: 

In [None]:
% debug

In [None]:
# Improve the model

In [None]:
# Build the network
# Global constants describing the CIFAR-10 data set.
IMAGE_SIZE = cifar10_input.IMAGE_SIZE
NUM_CLASSES = cifar10_input.NUM_CLASSES
NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN
NUM_EXAMPLES_PER_EPOCH_FOR_EVAL = cifar10_input.NUM_EXAMPLES_PER_EPOCH_FOR_EVAL


# Constants describing the training process.
MOVING_AVERAGE_DECAY = 0.9999     # The decay to use for the moving average.
NUM_EPOCHS_PER_DECAY = 350.0      # Epochs after which learning rate decays.
LEARNING_RATE_DECAY_FACTOR = 0.1  # Learning rate decay factor.
INITIAL_LEARNING_RATE = 0.1       # Initial learning rate.