In [1]:
import tensorflow as tf
import numpy as np
import cv2
from six.moves import cPickle as pickle
import math
from time import time
from utils import *

In [2]:
# Training split: unpickle the file, then pull out the 'features' and
# 'labels' entries.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only run this against trusted copies of train.p / test.p.
with open('train.p', 'rb') as train_f:
    train = pickle.load(train_f)
X_train = train['features']
y_train = train['labels']

# Test split: same layout as the training file.
with open('test.p', 'rb') as test_f:
    test = pickle.load(test_f)
X_test = test['features']
y_test = test['labels']

In [3]:
def normalize(image):
    """Min-max rescale one image into [-1.0, 1.0] as 32-bit floats.

    Delegates to ``cv2.normalize`` with ``cv2.NORM_MINMAX``; the bounds are
    passed as ``alpha`` (lower) and ``beta`` (upper) and the result is
    requested as ``cv2.CV_32F``.

    Parameters
    ----------
    image : array accepted by ``cv2.normalize``
        The image to rescale. It is not modified (``dst`` is ``None``).

    Returns
    -------
    The value returned by ``cv2.normalize``.
    """
    return cv2.normalize(
        image,
        None,
        alpha=-1.0,
        beta=1.0,
        norm_type=cv2.NORM_MINMAX,
        dtype=cv2.CV_32F,
    )

In [4]:
# normalize images: normalize() is applied to every image separately, so the
# per-image loop is kept; the results are stacked into one float32 array.
X_train = np.array([normalize(image) for image in X_train], dtype=np.float32)
X_test = np.array([normalize(image) for image in X_test], dtype=np.float32)

# flatten images: a single reshape collapses everything after the sample axis
# in C order (the same element order image.flatten() produces), without a
# Python-level loop or a second copy of the data.
X_train_flat = X_train.reshape(len(X_train), -1)
X_test_flat = X_test.reshape(len(X_test), -1)

In [5]:
# One-hot encode the labels of both splits, using 43 classes for each.
train_labels, test_labels = (
    dense_to_one_hot(dense_labels, n_classes=43)
    for dense_labels in (y_train, y_test)
)

In [6]:
# ---- Input / output geometry -------------------------------------------
im_size = 32
n_channels = 3
n_features = im_size * im_size * n_channels  # length of one flattened image
n_labels = 43

# ---- Localisation network and spatial transformer ----------------------
n_loc_layer_2, n_loc_layer_3 = 200, 20
n_tranforms = 6  # transformer() reshapes theta to (-1, 2, 3), i.e. 6 values
trans_out_size = (im_size,) * 2  # transformer output is (height, width)

# ---- Convolutional layers ----------------------------------------------
conv_layer_5_filter_size, conv_layer_5_num_filters = 5, 50
conv_layer_6_filter_size, conv_layer_6_num_filters = 5, 100

# ---- Fully connected layers --------------------------------------------
n_fc_layer_7, n_fc_layer_8 = 1024, 200

# ---- Graph inputs --------------------------------------------------------
# Images are fed flattened; labels are fed as rows of width n_labels.
features = tf.placeholder(tf.float32, [None, n_features])
labels = tf.placeholder(tf.float32, [None, n_labels])

# Image-shaped view of the flat input: [batch, im_size, im_size, n_channels].
features_tensor = tf.reshape(features, [-1, im_size, im_size, n_channels])

# Dropout keep probability, supplied through feed_dict on every run.
keep_prob = tf.placeholder(tf.float32)

In [7]:
def transformer(U, theta, out_size, name='SpatialTransformer', **kwargs):
    """Spatial Transformer Layer
    Implements a spatial transformer layer as described in [1]_.
    Based on [2]_ and edited by David Dao for Tensorflow.

    For every output pixel the layer applies the 2x3 matrix in ``theta`` to a
    regular grid of target coordinates in [-1, 1], and bilinearly samples
    ``U`` at the resulting source coordinates.

    Parameters
    ----------
    U : float
        The output of a convolutional net should have the
        shape [num_batch, height, width, num_channels].
    theta: float
        The output of the
        localisation network should be [num_batch, 6].
        It is reshaped to (-1, 2, 3) internally.
    out_size: tuple of two ints
        The size of the output of the network (height, width)
    name : str
        Name of the variable scope the ops are created under.
    **kwargs
        Accepted but not used by this implementation.

    Returns
    -------
    Tensor reshaped to [num_batch, out_size[0], out_size[1], num_channels].

    References
    ----------
    .. [1]  Spatial Transformer Networks
            Max Jaderberg, Karen Simonyan, Andrew Zisserman, Koray Kavukcuoglu
            Submitted on 5 Jun 2015
    .. [2]  https://github.com/skaae/transformer_network/blob/master/transformerlayer.py
    Notes
    -----
    To initialize the network to the identity transform init
    ``theta`` to :
        identity = np.array([[1., 0., 0.],
                             [0., 1., 0.]])
        identity = identity.flatten()
        theta = tf.Variable(initial_value=identity)

    NOTE(review): ``tf.pack``, ``tf.batch_matmul`` and the (axis, values)
    argument order of ``tf.concat`` look like the pre-1.0 TensorFlow API --
    confirm the pinned TensorFlow version before upgrading.
    """

    def _repeat(x, n_repeats):
        # Repeats every element of the flattened ``x`` ``n_repeats`` times in a
        # row (a, b -> a, a, ..., b, b, ...). Done as an outer product of the
        # column vector ``x`` with a [1, n_repeats] row of int32 ones.
        with tf.variable_scope('_repeat'):
            rep = tf.transpose(
                tf.expand_dims(tf.ones(shape=tf.pack([n_repeats, ])), 1), [1, 0])
            rep = tf.cast(rep, 'int32')
            x = tf.matmul(tf.reshape(x, (-1, 1)), rep)
            return tf.reshape(x, [-1])

    def _interpolate(im, x, y, out_size):
        # Bilinear sampling of ``im`` at the flat source coordinates ``x``/``y``
        # (one entry per output pixel per batch element). Returns one row per
        # sample with ``channels`` columns.
        with tf.variable_scope('_interpolate'):
            # constants
            num_batch = tf.shape(im)[0]
            height = tf.shape(im)[1]
            width = tf.shape(im)[2]
            channels = tf.shape(im)[3]

            x = tf.cast(x, 'float32')
            y = tf.cast(y, 'float32')
            height_f = tf.cast(height, 'float32')
            width_f = tf.cast(width, 'float32')
            out_height = out_size[0]
            out_width = out_size[1]
            zero = tf.zeros([], dtype='int32')
            # Largest valid row / column index of the input image.
            max_y = tf.cast(tf.shape(im)[1] - 1, 'int32')
            max_x = tf.cast(tf.shape(im)[2] - 1, 'int32')

            # scale indices from [-1, 1] to [0, width/height]
            # NOTE(review): +1 maps to ``width``/``height``, one past max_x/max_y;
            # the clipping below keeps the gather indices in range.
            x = (x + 1.0)*(width_f) / 2.0
            y = (y + 1.0)*(height_f) / 2.0

            # do sampling
            # Integer coordinates of the four pixels surrounding each sample.
            x0 = tf.cast(tf.floor(x), 'int32')
            x1 = x0 + 1
            y0 = tf.cast(tf.floor(y), 'int32')
            y1 = y0 + 1

            x0 = tf.clip_by_value(x0, zero, max_x)
            x1 = tf.clip_by_value(x1, zero, max_x)
            y0 = tf.clip_by_value(y0, zero, max_y)
            y1 = tf.clip_by_value(y1, zero, max_y)
            # Flat pixel index = batch offset + row * width + column.
            dim2 = width
            dim1 = width*height
            # One batch offset per output pixel, repeated for the whole image.
            base = _repeat(tf.range(num_batch)*dim1, out_height*out_width)
            base_y0 = base + y0*dim2
            base_y1 = base + y1*dim2
            idx_a = base_y0 + x0
            idx_b = base_y1 + x0
            idx_c = base_y0 + x1
            idx_d = base_y1 + x1

            # use indices to lookup pixels in the flat image and restore
            # channels dim
            im_flat = tf.reshape(im, tf.pack([-1, channels]))
            im_flat = tf.cast(im_flat, 'float32')
            Ia = tf.gather(im_flat, idx_a)
            Ib = tf.gather(im_flat, idx_b)
            Ic = tf.gather(im_flat, idx_c)
            Id = tf.gather(im_flat, idx_d)

            # and finally calculate interpolated values
            # Each corner is weighted by the area of the rectangle spanned by
            # the sample and the opposite corner. The weights use the corner
            # coordinates *after* clipping.
            x0_f = tf.cast(x0, 'float32')
            x1_f = tf.cast(x1, 'float32')
            y0_f = tf.cast(y0, 'float32')
            y1_f = tf.cast(y1, 'float32')
            wa = tf.expand_dims(((x1_f-x) * (y1_f-y)), 1)
            wb = tf.expand_dims(((x1_f-x) * (y-y0_f)), 1)
            wc = tf.expand_dims(((x-x0_f) * (y1_f-y)), 1)
            wd = tf.expand_dims(((x-x0_f) * (y-y0_f)), 1)
            output = tf.add_n([wa*Ia, wb*Ib, wc*Ic, wd*Id])
            return output

    def _meshgrid(height, width):
        # Builds the homogeneous target grid: three stacked rows holding the
        # x coordinates, the y coordinates and ones, each of length
        # height*width, with coordinates spanning [-1, 1].
        with tf.variable_scope('_meshgrid'):
            # This should be equivalent to:
            #  x_t, y_t = np.meshgrid(np.linspace(-1, 1, width),
            #                         np.linspace(-1, 1, height))
            #  ones = np.ones(np.prod(x_t.shape))
            #  grid = np.vstack([x_t.flatten(), y_t.flatten(), ones])
            #
            # x_t = tf.matmul(tf.ones(shape=tf.pack([height, 1])),
            #                 tf.transpose(tf.expand_dims(tf.linspace(-1.0, 1.0, width), 1), [1, 0]))
            # y_t = tf.matmul(tf.expand_dims(tf.linspace(-1.0, 1.0, height), 1),
            #                 tf.ones(shape=tf.pack([1, width])))

            x_t, y_t = tf.meshgrid(tf.linspace(-1.0, 1.0, width),
                                   tf.linspace(-1.0, 1.0, height))
            x_t_flat = tf.reshape(x_t, (1, -1))
            y_t_flat = tf.reshape(y_t, (1, -1))

            ones = tf.ones_like(x_t_flat)
            grid = tf.concat(0, [x_t_flat, y_t_flat, ones])
            return grid

    def _transform(theta, input_dim, out_size):
        # Applies the per-sample affine matrices in ``theta`` to the target
        # grid and samples ``input_dim`` at the resulting source coordinates.
        with tf.variable_scope('_transform'):
            num_batch = tf.shape(input_dim)[0]
            height = tf.shape(input_dim)[1]
            width = tf.shape(input_dim)[2]
            num_channels = tf.shape(input_dim)[3]
            theta = tf.reshape(theta, (-1, 2, 3))
            theta = tf.cast(theta, 'float32')

            # grid of (x_t, y_t, 1), eq (1) in ref [1]
            # NOTE(review): height_f / width_f are computed here but not used
            # again inside this function.
            height_f = tf.cast(height, 'float32')
            width_f = tf.cast(width, 'float32')
            out_height = out_size[0]
            out_width = out_size[1]
            grid = _meshgrid(out_height, out_width)
            # Replicate the grid once per batch element:
            # flatten -> tile num_batch times -> [num_batch, 3, H*W].
            grid = tf.expand_dims(grid, 0)
            grid = tf.reshape(grid, [-1])
            grid = tf.tile(grid, tf.pack([num_batch]))
            grid = tf.reshape(grid, tf.pack([num_batch, 3, -1]))

            # Transform A x (x_t, y_t, 1)^T -> (x_s, y_s)
            T_g = tf.batch_matmul(theta, grid)
            # Row 0 of the product holds the source x, row 1 the source y.
            x_s = tf.slice(T_g, [0, 0, 0], [-1, 1, -1])
            y_s = tf.slice(T_g, [0, 1, 0], [-1, 1, -1])
            x_s_flat = tf.reshape(x_s, [-1])
            y_s_flat = tf.reshape(y_s, [-1])

            input_transformed = _interpolate(
                input_dim, x_s_flat, y_s_flat,
                out_size)

            # Restore the image layout from the flat list of sampled pixels.
            output = tf.reshape(
                input_transformed, tf.pack([num_batch, out_height, out_width, num_channels]))
            return output

    with tf.variable_scope(name):
        output = _transform(theta, U, out_size)
        return output


In [8]:
# ---- Localisation network ------------------------------------------------
# Three tanh fully connected layers map the flat image to the n_tranforms
# values handed to the spatial transformer as theta. Dropout (shared
# keep_prob placeholder) follows the first two layers.
# fc_layer / conv_layer / flatten_layer are not defined in this notebook;
# presumably they come from the `utils` star-import -- verify.
loc_layer_1 = fc_layer(input=features,
                       num_inputs=n_features,
                       num_outputs=n_loc_layer_2,
                       non_lin='tanh')
loc_layer_1 = tf.nn.dropout(loc_layer_1, keep_prob=keep_prob)

loc_layer_2 = fc_layer(input=loc_layer_1,
                       num_inputs=n_loc_layer_2,
                       num_outputs=n_loc_layer_3,
                       non_lin='tanh')
loc_layer_2 = tf.nn.dropout(loc_layer_2, keep_prob=keep_prob)

# NOTE(review): tanh keeps each theta entry strictly inside (-1, 1), so the
# exact identity transform (which needs entries equal to 1.0) can only be
# approached, never produced -- confirm this is intended.
loc_layer_3 = fc_layer(input=loc_layer_2,
                       num_inputs=n_loc_layer_3,
                       num_outputs=n_tranforms,
                       non_lin='tanh')

# ---- Spatial transformer ---------------------------------------------------
# Resamples the image-shaped input with the predicted theta.
trans_layer_4 = transformer(features_tensor, loc_layer_3, trans_out_size)

# ---- Convolutional feature extractor -----------------------------------------
conv_layer_5 = conv_layer(input=trans_layer_4,
                          num_input_channels=n_channels,
                          filter_size=conv_layer_5_filter_size,
                          num_filters=conv_layer_5_num_filters,
                          use_pooling=True,
                          non_lin='relu')

conv_layer_6 = conv_layer(input=conv_layer_5,
                          num_input_channels=conv_layer_5_num_filters,
                          filter_size=conv_layer_6_filter_size,
                          num_filters=conv_layer_6_num_filters,
                          use_pooling=True,
                          non_lin='relu')

conv_layer_6_flat, layer_6_flat_feature_count = flatten_layer(conv_layer_6)

# ---- Classifier head -----------------------------------------------------------
# Two relu layers, each followed by dropout, then a layer with no
# non-linearity whose n_labels outputs are used as logits.
fc_layer_7 = fc_layer(input=conv_layer_6_flat,
                      num_inputs=layer_6_flat_feature_count,
                      num_outputs=n_fc_layer_7,
                      non_lin='relu')
fc_layer_7 = tf.nn.dropout(fc_layer_7, keep_prob=keep_prob)

fc_layer_8 = fc_layer(input=fc_layer_7,
                      num_inputs=n_fc_layer_7,
                      num_outputs=n_fc_layer_8,
                      non_lin='relu')
fc_layer_8 = tf.nn.dropout(fc_layer_8, keep_prob=keep_prob)

fc_layer_9 = fc_layer(input=fc_layer_8,
                      num_inputs=n_fc_layer_8,
                      num_outputs=n_labels,
                      non_lin=None)

# ---- Outputs, loss and training op ------------------------------------------------
# Softmax probabilities are exposed for inference; the loss is computed from
# the raw logits and averaged over the batch.
predictions = tf.nn.softmax(fc_layer_9)
cross_entorpy = tf.nn.softmax_cross_entropy_with_logits(logits=fc_layer_9,
                                                        labels=labels)
loss = tf.reduce_mean(cross_entorpy)

optimizer = tf.train.AdamOptimizer().minimize(loss)
# Built after minimize() so the initializer covers everything created above.
init = tf.global_variables_initializer()

In [9]:
def get_accuracy(predictions, acutals):
    """Fraction of rows whose arg-max column agrees between the two inputs.

    Parameters
    ----------
    predictions : 2-D array-like
        One row of scores per sample; the arg-max along axis 1 is taken as
        the predicted class.
    acutals : 2-D array-like
        One row per sample; the arg-max along axis 1 is taken as the true
        class.

    Returns
    -------
    numpy.float32
        Mean of the per-row match indicators.
    """
    hits = np.argmax(predictions, axis=1) == np.argmax(acutals, axis=1)
    return np.mean(hits.astype(np.float32))

In [10]:
def predict_class(in_features, in_labels, sess, batch_size=250):
    """Run the ``predictions`` op over ``in_features`` in mini-batches.

    Uses the module-level graph handles ``features``, ``labels``,
    ``keep_prob`` and ``predictions``. ``keep_prob`` is fed as 1.0.

    Parameters
    ----------
    in_features : array
        Model inputs, one row per sample.
    in_labels : array
        Labels aligned with ``in_features``. They are fed to the ``labels``
        placeholder and determine the shape of the returned array.
    sess : session object
        Object whose ``run(fetches, feed_dict=...)`` evaluates the graph.
    batch_size : int, optional
        Number of samples per ``sess.run`` call (default 250).

    Returns
    -------
    numpy.ndarray
        float32 array with the shape of ``in_labels`` holding the values
        returned for ``predictions``.

    Raises
    ------
    ValueError
        If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Allocate as float32 explicitly: zeros_like(in_labels) would inherit the
    # label dtype, and integer one-hot labels would truncate the predicted
    # values to 0 on assignment.
    class_predictions = np.zeros(np.shape(in_labels), dtype=np.float32)

    # Stepping by batch_size visits the trailing partial batch as well, with
    # no reliance on true division (which floors on Python 2).
    for batch_start in range(0, len(in_features), batch_size):
        batch_end = batch_start + batch_size
        feed_dict = {features: in_features[batch_start:batch_end],
                     labels: in_labels[batch_start:batch_end],
                     keep_prob: 1.0
                     }
        class_predictions[batch_start:batch_end] = sess.run(predictions, feed_dict=feed_dict)
    return class_predictions

In [11]:
batch_size = 100
epochs = 500

with tf.Session() as sess:
    sess.run(init)
    # Integer ceiling division: counts the trailing partial batch on both
    # Python 2 and 3 (len/batch_size would floor before math.ceil on Python 2).
    batch_count = (len(X_train) + batch_size - 1) // batch_size
    start_time = time()
    for epoch in range(epochs):
        for batch_features, batch_labels in batch_generator(X_train,
                                                            train_labels,
                                                            batch_count,
                                                            batch_size):
            # The graph takes flat feature rows, so flatten each image of the batch.
            batch_features_flat = np.array([image.flatten() for image in batch_features])
            feed_dict = {features: batch_features_flat,
                         labels: batch_labels,
                         keep_prob: 0.8
                         }
            # Only the training op is run; the loss value was fetched but
            # never used.
            sess.run(optimizer, feed_dict=feed_dict)

        # Every 25th epoch (including epoch 0) report the minutes elapsed
        # since the previous report and restart the timer.
        if epoch%25 == 0:
            print("Time taken = {}".format(round((time() - start_time)/60.0 , 3)))
            start_time = time()

    # Evaluate inside the session so the trained variables are still alive.
    train_predictions = predict_class(in_features=X_train_flat,
                                      in_labels=train_labels,
                                      sess=sess)
    test_predictions = predict_class(in_features=X_test_flat,
                                     in_labels=test_labels,
                                     sess=sess)

Time taken = 0.211
Time taken = 4.942
Time taken = 4.972
Time taken = 5.059
Time taken = 5.036
Time taken = 4.972
Time taken = 4.933
Time taken = 4.961
Time taken = 5.014
Time taken = 4.983
Time taken = 4.997
Time taken = 4.996
Time taken = 4.967
Time taken = 4.983
Time taken = 4.999
Time taken = 5.016
Time taken = 5.021
Time taken = 5.003
Time taken = 5.005
Time taken = 5.033


In [12]:
training_accuracy = get_accuracy(train_predictions, train_labels)
test_accuracy = get_accuracy(test_predictions, test_labels)
# get_accuracy returns a NumPy float32. Passing it through round(..., 3) did
# not give a clean 3-decimal display (a recorded run printed
# 0.9860000014305115), so format to three decimals directly instead.
print('Training Accuracy: {:.3f}'.format(training_accuracy))
print('Test Accuracy: {:.3f}'.format(test_accuracy))

Training Accuracy: 0.9860000014305115
Test Accuracy: 0.9449999928474426
