In [1]:
import mmh3

In [2]:
import functools

In [3]:
import iteration_utilities

In [4]:
import random

In [5]:
import numpy as np

In [6]:
import math

In [7]:
import tensorflow as tf

In [8]:
# tf.enable_eager_execution()

In [9]:
from tensorflow.python import debug as tf_debug

In [10]:
from enum import IntEnum

In [11]:
# Record the TensorFlow build this notebook was executed with.
print(f"TensorFlow version: {tf.VERSION}")

TensorFlow version: 1.8.0


In [12]:
# Alternative dataset, kept for reference:
# (x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data()
# Each split is used below as an (images, labels) pair: index 1 supplies the
# labels for `number_of_classes`, and the whole pair is sliced into
# (features, labels) examples by the dataset input functions.
training_data, testing_data = tf.keras.datasets.fashion_mnist.load_data()

In [13]:
# Class count derived from the largest label id in the test split
# (assumes label ids are contiguous integers starting at 0 — TODO confirm).
number_of_classes = int(np.max(testing_data[1])) + 1
# NOTE(review): not referenced by any other visible cell; model_fn adds the
# single channel axis itself via tf.expand_dims.
number_of_input_channels = 1

In [14]:
# Number of layers stacked by dense_offnet in model_fn; also part of the
# checkpoint directory name and the value passed to random.seed in model_fn.
number_of_hidden_layers = 32

In [15]:
# Size of the layer pool that dropout_offnet draws from; outside TRAIN mode
# model_fn enables all of them.
number_of_candidate_layers = 32

In [16]:
# How many candidate layers dropout_offnet evaluates per step in TRAIN mode.
number_of_enabled_layers = 16

In [17]:
# Channels produced by each dense_offnet / dropout_offnet layer.
number_of_features_per_layer = 8

In [18]:
# tf.pad spec used in model_fn: no padding on axis 0 (batch), two elements on
# each side of axes 1 and 2.
paddings = (0, 0), (2, 2), (2, 2)

In [19]:
# NOTE(review): assigned but not read by any other visible cell (model_fn seeds
# Python's `random` with number_of_hidden_layers directly) — confirm intent.
random_seed = number_of_hidden_layers

In [20]:
# Numerator of the initial per-layer scale, sqrt(weight_scale / fan_in), used
# by the scale initializers of offnet, dense_offnet and dropout_offnet.
weight_scale = 0.9

In [21]:
def grid_sample(images, offset_x, offset_y):
    """Resample `images` at per-element fractional offsets with bilinear-style weights.

    For every element (b, x, y, f) the four neighbours around
    (x + offset_x[b, x, y, f], y + offset_y[b, x, y, f]) are gathered from
    `images[b, :, :, f]` and blended. The blending weights carry a small
    epsilon, so each pair of weights sums to 1 + 2 * eps rather than exactly 1.

    Args:
        images: rank-4 tensor; axis 1 is treated as "width" and axis 2 as
            "height" by this function.
        offset_x: rank-4 float tensor of displacements along axis 1
            (expected to have the same shape as `images`).
        offset_y: rank-4 float tensor of displacements along axis 2
            (expected to have the same shape as `images`).

    Returns:
        A rank-4 tensor of interpolated values.

    Device behaviour:
        When no GPU is available every gather index (left, right, top and
        bottom) is clamped into the valid range, so out-of-image samples
        replicate the border. When a GPU is available indices are passed
        through unclamped, presumably relying on tf.gather_nd yielding 0 for
        out-of-range indices on GPU — TODO confirm.
    """
    tf.assert_rank(images, 4)
    tf.assert_rank(offset_x, 4)
    tf.assert_rank(offset_y, 4)
    image_shape = tf.shape(images)
    current_batch_size = image_shape[0]
    width = image_shape[1]
    height = image_shape[2]
    number_of_features = image_shape[3]

    # Query the device list once instead of on every clamp call.
    clamp_indices = not tf.test.is_gpu_available()

    def check_width(tensor):
        # Keep axis-1 indices inside [0, width - 1] when gathering on CPU.
        if clamp_indices:
            return tf.maximum(tf.minimum(tensor, width - 1), 0)
        return tensor

    def check_height(tensor):
        # Keep axis-2 indices inside [0, height - 1] when gathering on CPU.
        if clamp_indices:
            return tf.maximum(tf.minimum(tensor, height - 1), 0)
        return tensor

    offset_left = tf.floor(offset_x)
    offset_top = tf.floor(offset_y)
    # NOTE(review): the returned assert ops are discarded (no control
    # dependency), so these are presumably not evaluated at run time.
    tf.assert_equal(tf.shape(offset_left), image_shape)
    tf.assert_equal(tf.shape(offset_top), image_shape)

    eps = 1e-7

    # Fractional parts of the offsets, nudged by eps so no weight is exactly 0.
    factor_right = offset_x - offset_left + eps
    factor_left = 1.0 + 2.0 * eps - factor_right
    factor_bottom = offset_y - offset_top + eps
    factor_top = 1.0 + 2.0 * eps - factor_bottom

    # Coordinate grids holding each output element's own (b, x, y, f) index.
    image_index, x_index, y_index, feature_index = tf.meshgrid(
        tf.range(current_batch_size, dtype=tf.int32),
        tf.range(width, dtype=tf.int32),
        tf.range(height, dtype=tf.int32),
        tf.range(number_of_features, dtype=tf.int32),
        indexing='ij',
    )

    left_index = check_width(x_index + tf.cast(offset_left, tf.int32))
    top_index = check_height(y_index + tf.cast(offset_top, tf.int32))

    # The +1 neighbours must be clamped as well: after clamping, left/top can
    # already sit on the last valid index, and an unclamped +1 would hand
    # tf.gather_nd an out-of-range index on CPU.
    right_index = check_width(left_index + 1)
    bottom_index = check_height(top_index + 1)

    tf.assert_rank(left_index, 4)
    tf.assert_rank(right_index, 4)
    tf.assert_rank(bottom_index, 4)
    tf.assert_rank(top_index, 4)

    images_top_left = tf.gather_nd(images, tf.stack((image_index, left_index, top_index, feature_index), axis=4))
    images_top_right = tf.gather_nd(images, tf.stack((image_index, right_index, top_index, feature_index), axis=4))
    images_bottom_left = tf.gather_nd(images, tf.stack((image_index, left_index, bottom_index, feature_index), axis=4))
    images_bottom_right = tf.gather_nd(images, tf.stack((image_index, right_index, bottom_index, feature_index), axis=4))

    tf.assert_rank(images_top_left, 4)
    tf.assert_rank(images_top_right, 4)
    tf.assert_rank(images_bottom_left, 4)
    tf.assert_rank(images_bottom_right, 4)

    # Blend along axis 1 first, then along axis 2.
    lerp_top = factor_left * images_top_left + factor_right * images_top_right
    lerp_bottom = factor_left * images_bottom_left + factor_right * images_bottom_right
    output = factor_top * lerp_top + factor_bottom * lerp_bottom
    tf.assert_rank(output, 4)
    return output

In [22]:
def dropout_offnet(images, number_of_candidate_layers, number_of_features_per_layer, weight_scale, number_of_enabled_layers, name=None):
    """Append feature maps from a random subset of candidate layers to `images`.

    A random selection of `number_of_enabled_layers` candidate layer ids is
    drawn (in shuffled order). Each selected layer computes a score map from
    all channels accumulated so far and two offset maps from the original
    `images`, and feeds them to `grid_sample`; the result is concatenated onto
    the channel axis. Finally the channels are scattered to fixed positions in
    a tensor that has room for every candidate layer.

    Args:
        images: rank-4 tensor whose last (channel) dimension is statically known.
        number_of_candidate_layers: size of the layer pool; determines the
            variable shapes and the channel count of the result.
        number_of_features_per_layer: channels produced by each layer.
        weight_scale: numerator of the initial per-layer scale
            sqrt(weight_scale / fan_in).
        number_of_enabled_layers: how many candidate layers are evaluated.
        name: optional variable scope name (defaults to "dropout_offnet").

    Returns:
        A tensor whose last axis has
        number_of_input_features + number_of_features_per_layer * number_of_candidate_layers
        channels. Channels of layers that were not enabled receive no
        contribution from the segment sum (presumably zero — TODO confirm).
    """
    with tf.variable_scope(name, default_name="dropout_offnet"):
        input_shape = images.shape
        number_of_input_features = int(input_shape[3])
        # Random subset of candidate layer ids, in random (unsorted) order.
        enabled_layer_ids = tf.random_shuffle(tf.range(number_of_candidate_layers))[:number_of_enabled_layers]
        
        # For each channel of the tensor built by the loop below, its channel
        # position in the full output: the input channels first, then each
        # enabled layer's block at number_of_input_features + layer_id * features_per_layer.
        output_indices = tf.concat(
            (
                tf.range(number_of_input_features),
                tf.reshape(
                    tf.expand_dims(enabled_layer_ids, axis=1) * number_of_features_per_layer +
                        tf.expand_dims(tf.range(number_of_features_per_layer), axis=0),
                    (number_of_features_per_layer * number_of_enabled_layers, )
                ) + tf.constant(number_of_input_features, shape=(1, ))
            ),
            axis=0
        )
        number_of_features = int(number_of_features_per_layer * number_of_candidate_layers + number_of_input_features)
        # Row count of the packed score weight table; every layer reads rows
        # starting at its own `weight_start` (see score_map).
        dense_weight_size = int((number_of_features - number_of_features_per_layer + number_of_input_features) * (number_of_features - number_of_input_features) // 2)
        score_weight = tf.get_variable(
            name="score_weight",
            initializer=tf.random_normal_initializer(),
            dtype=tf.float32,
            shape=(dense_weight_size, number_of_features_per_layer)
        )
        # Offset weights are per candidate layer and only consume the input channels.
        offset_x_weight = tf.get_variable(
            name="offset_x_weight",
            initializer=tf.random_normal_initializer(),
            dtype=tf.float32,
            shape=(number_of_candidate_layers, number_of_input_features, number_of_features_per_layer)
        )
        offset_y_weight = tf.get_variable(
            name="offset_y_weight",
            initializer=tf.random_normal_initializer(),
            dtype=tf.float32,
            shape=(number_of_candidate_layers, number_of_input_features, number_of_features_per_layer)
        )
        # Biases are indexed by layer id; the singleton axes broadcast over the
        # leading three axes of the maps they are added to.
        score_bias = tf.get_variable(
            name="score_bias",
            initializer=tf.zeros_initializer(),
            dtype=tf.float32,
            shape=(number_of_candidate_layers, 1, 1, 1, number_of_features_per_layer),
        )
        offset_x_bias = tf.get_variable(
            name="offset_x_bias",
            initializer=tf.random_uniform_initializer(minval=-3.0, maxval=3.0),
            dtype=tf.float32,
            shape=(number_of_candidate_layers, 1, 1, 1, number_of_features_per_layer),
        )
        offset_y_bias = tf.get_variable(
            name="offset_y_bias",
            initializer=tf.random_uniform_initializer(minval=-3.0, maxval=3.0),
            dtype=tf.float32,
            shape=(number_of_candidate_layers, 1, 1, 1, number_of_features_per_layer),
        )

        # TODO: index should be started at number_of_input_features
        def scale_initializer():
            # One initial scale per candidate layer: sqrt(weight_scale / fan_in),
            # with fan_in = number_of_input_features + layer_id * features_per_layer.
            return tf.reshape(
                tf.sqrt(weight_scale / tf.range(number_of_input_features, number_of_features, delta=number_of_features_per_layer, dtype=tf.float32)),
                (number_of_candidate_layers, 1, 1)
            )
        
        # Trainable per-layer multipliers applied to the weights before use.
        score_scale = tf.get_variable(
            name="score_scale",
            initializer=scale_initializer(),
            dtype=tf.float32
        )
        offset_x_scale = tf.get_variable(
            name="offset_x_scale",
            initializer=scale_initializer(),
            dtype=tf.float32
        )
        offset_y_scale = tf.get_variable(
            name="offset_y_scale",
            initializer=scale_initializer(),
            dtype=tf.float32
        )


        def score_map(previous_layers, i):
            # Output of the i-th enabled layer, given every channel built so far.
            layer_id = enabled_layer_ids[i]
            number_of_previous_layers = number_of_input_features + layer_id * number_of_features_per_layer
            # First row of this layer's region in score_weight; depends on layer_id only.
            weight_start = (number_of_input_features + number_of_previous_layers) * (number_of_previous_layers - number_of_input_features) // 2
            tf.assert_rank(weight_start, 0)

            # TODO test if the performance improves when using SparseTensor
            tf.assert_rank(previous_layers, 4)
#             tf.assert_equal(tf.shape(previous_layers)[3], )

            # NOTE(review): rows are gathered at weight_start + output_indices[:k].
            # Because enabled_layer_ids is shuffled rather than sorted, those
            # indices are not bounded by number_of_previous_layers — confirm the
            # spacing of weight_start values leaves room for this.
            return grid_sample(
                score_bias[layer_id] + tf.tensordot(
                    previous_layers,
                    tf.gather(
                        score_weight,
                        weight_start + output_indices[:(number_of_input_features + i * number_of_features_per_layer)]
                    ) * score_scale[layer_id],
                    axes=1
                ),
                offset_x_bias[layer_id] + tf.tensordot(
                    images,
                    offset_x_weight[layer_id] * offset_x_scale[layer_id],
                    axes=1
                ),
                offset_y_bias[layer_id] + tf.tensordot(
                    images,
                    offset_y_weight[layer_id] * offset_y_scale[layer_id],
                    axes=1
                ),
            )

        # Grow the channel axis by one enabled layer per iteration. The shape
        # invariant leaves the channel dimension unknown because it changes
        # on every step.
        output, i = tf.while_loop(
            lambda layers, i: i < number_of_enabled_layers,
            lambda layers, i: (
                tf.concat(
                    (
                        layers,
                        score_map(
                            layers,
                            i
                        )
                    ),
                    axis=3,
                ),
                i + 1
            ),
            (
                images,
                0,
            ),
            shape_invariants=(
                tf.TensorShape((input_shape[0], input_shape[1], input_shape[2], None)), 
                tf.TensorShape(())
            )
        )
        # tf.assert_equal(i, number_of_features)
        
        # NOTE(review): not used by the return statement below; only the
        # commented-out IndexedSlices variant refers to it.
        dynamic_input_shape = tf.shape(images)

        # Move the channel axis to the front, scatter each channel to its
        # position from output_indices, then reverse the axes back.
        return tf.transpose(tf.unsorted_segment_sum(tf.transpose(output), output_indices, number_of_input_features + number_of_features_per_layer * number_of_candidate_layers))
#         return tf.IndexedSlices(
#             output,
#             output_indices,
#             (dynamic_input_shape[0], dynamic_input_shape[1], dynamic_input_shape[2], number_of_input_features + number_of_features_per_layer * number_of_candidate_layers)
#         )


In [23]:
# def dropout_offnet(input_images, number_of_additional_features, name=None):
#     """
#     input_images: a tensor of batch_size x width x height x number_of_input_channels
#     """
#     with tf.variable_scope(name, default_name="dropout_offnet"):
#         if input_images is IndexedSlices:
#             image_shape = input_images.dense_shape
#         else:
#             image_shape = input_images.shape
#         number_of_input_channels = image_shape[0]
#         batch_size = image_shape[1]
#         width = image_shape[2]
#         height = image_shape[3]

#         score_weight = tf.get_variable(
#             name="score_weight",
#             initializer=tf.random_normal_initializer(),
#             dtype=tf.float32,
#             shape=(number_of_additional_features, number_of_input_channels)
#         )
#         score_bias = tf.get_variable(
#             name="score_bias",
#             initializer=tf.random_normal_initializer(),
#             dtype=tf.float32,
#             shape=(number_of_input_channels, 1, 1, 1)
#         )
        
#         offset_x_weight = tf.get_variable(
#             name="offset_x_weight",
#             initializer=tf.random_normal_initializer(),
#             dtype=tf.float32,
#             shape=(number_of_additional_features, number_of_input_channels)
#         )
#         offset_x_bias = tf.get_variable(
#             name="offset_x_bias",
#             initializer=tf.random_uniform_initializer(minval=-3.0, maxval=3.0),
#             dtype=tf.float32,
#             shape=(number_of_additional_features, 1, 1, 1),
#         )

#         offset_y_weight = tf.get_variable(
#             name="offset_y_weight",
#             initializer=tf.random_normal_initializer(),
#             dtype=tf.float32,
#             shape=(number_of_additional_features, number_of_input_channels)
#         )
#         offset_y_bias = tf.get_variable(
#             name="offset_y_bias",
#             initializer=tf.random_uniform_initializer(minval=-3.0, maxval=3.0),
#             dtype=tf.float32,
#             shape=(number_of_additional_features, 1, 1, 1),
#         )
        
#         def indexed_sum(weight, bias):
#             if input_images is IndexedSlices:
#                 tf.tensordot(tf.gather(weight, input_images.indices), TODO)
                
# #                 input_images.values
# #                 values_shape = tf.shape(input_images.values)
                
# #                 tf.sparse_matmul(
# #                     weight,
# #                     tf.SparseTensor(
# #                         input_images.indices,
# #                         tf.reshape(input_images.values, (values_shape[0], values_shape[1] * values_shape[2] * values_shape[3])),
# #                         input_images.dense_shape,
# #                     )
# #                     tf.sparse_reshape(input_images)
# #                 )
# #                 TODO
#             else:
#                 tf.tensordot(weight, input_images, axes=1) + bias
#         indexed_sum(weight, score_bias)
#         indexed_sum(weight, score_bias)
#         indexed_sum(weight, score_bias)

In [24]:
def offnet(images, number_of_features, name=None):
    """Single offset-sampling layer mapping `images` to `number_of_features` channels.

    Three per-pixel linear projections of `images` (score, x offset, y offset)
    are computed, each with its own weight, bias and trainable scalar scale,
    and handed to `grid_sample`.

    Args:
        images: rank-4 tensor whose last (channel) dimension is statically known.
        number_of_features: number of output channels.
        name: optional variable scope name (defaults to "offnet").

    Returns:
        The tensor produced by `grid_sample` for the three projections.

    Note:
        The initial scale reads the module-level `weight_scale`.
    """
    with tf.variable_scope(name, default_name="offnet"):
        # TODO: accept tf.IndexedSlices input (densify it, e.g. with
        # tf.unsorted_segment_sum) in addition to plain tensors.
        channel_count = int(images.shape[3])
        weight_shape = (channel_count, number_of_features)
        bias_shape = (1, 1, 1, number_of_features)

        def new_weight(variable_name):
            # Normally-initialised projection matrix.
            return tf.get_variable(
                name=variable_name,
                initializer=tf.random_normal_initializer(),
                dtype=tf.float32,
                shape=weight_shape,
            )

        def new_offset_bias(variable_name):
            # Offset biases start uniformly in [-3, 3).
            return tf.get_variable(
                name=variable_name,
                initializer=tf.random_uniform_initializer(minval=-3.0, maxval=3.0),
                dtype=tf.float32,
                shape=bias_shape,
            )

        def new_scale(variable_name):
            # Trainable multiplier, initialised to sqrt(weight_scale / fan_in).
            initial_value = tf.reshape(tf.sqrt(weight_scale / channel_count), (1, 1))
            return tf.get_variable(
                name=variable_name,
                initializer=initial_value,
                dtype=tf.float32,
            )

        # Creation order matches the original: weights, biases, then scales.
        score_weight = new_weight("score_weight")
        offset_x_weight = new_weight("offset_x_weight")
        offset_y_weight = new_weight("offset_y_weight")

        score_bias = tf.get_variable(
            name="score_bias",
            initializer=tf.zeros_initializer(),
            dtype=tf.float32,
            shape=bias_shape,
        )
        offset_x_bias = new_offset_bias("offset_x_bias")
        offset_y_bias = new_offset_bias("offset_y_bias")

        score_scale = new_scale("score_scale")
        offset_x_scale = new_scale("offset_x_scale")
        offset_y_scale = new_scale("offset_y_scale")

        def project(weight, scale, bias):
            # Contract the channel axis of `images` with the scaled weight.
            return tf.tensordot(images, weight * scale, axes=1) + bias

        scores = project(score_weight, score_scale, score_bias)
        shift_x = project(offset_x_weight, offset_x_scale, offset_x_bias)
        shift_y = project(offset_y_weight, offset_y_scale, offset_y_bias)
        return grid_sample(scores, shift_x, shift_y)

        


        

In [25]:
def dense_offnet(images, number_of_additional_layers, number_of_features_per_layer, weight_scale, name=None):
    """Densely connected stack of offset-sampling layers.

    Starting from `images`, each of the `number_of_additional_layers` layers
    computes a score map and two offset maps as linear projections of ALL
    channels accumulated so far, resamples the score map with `grid_sample`,
    and concatenates the result onto the channel axis.

    Args:
        images: rank-4 tensor whose last (channel) dimension is statically known.
        number_of_additional_layers: number of layers to append.
        number_of_features_per_layer: channels produced by each layer.
        weight_scale: numerator of the initial per-layer scale
            sqrt(weight_scale / fan_in).
        name: optional variable scope name (defaults to "dense_offnet").

    Returns:
        A tensor with the leading three static dimensions of `images` and
        input_channels + number_of_additional_layers * number_of_features_per_layer
        channels on the last axis.
    """
    with tf.variable_scope(name, default_name="dense_offnet"):
        input_shape = images.shape
        number_of_input_features = int(input_shape[3])
        number_of_features = int(number_of_features_per_layer * number_of_additional_layers + number_of_input_features)
        # Row count of the packed weight tables; layer i uses the rows
        # [weight_start, weight_end) computed in score_map.
        dense_weight_size = int((number_of_features - number_of_features_per_layer + number_of_input_features) * (number_of_features - number_of_input_features) // 2)
        score_weight = tf.get_variable(
            name="score_weight",
            initializer=tf.random_normal_initializer(),
            dtype=tf.float32,
            shape=(dense_weight_size, number_of_features_per_layer)
        )
        offset_x_weight = tf.get_variable(
            name="offset_x_weight",
            initializer=tf.random_normal_initializer(),
            dtype=tf.float32,
            shape=(dense_weight_size, number_of_features_per_layer)
        )
        offset_y_weight = tf.get_variable(
            name="offset_y_weight",
            initializer=tf.random_normal_initializer(),
            dtype=tf.float32,
            shape=(dense_weight_size, number_of_features_per_layer)
        )
        # Biases are indexed by layer; the singleton axes broadcast over the
        # leading three axes of the maps they are added to.
        score_bias = tf.get_variable(
            name="score_bias",
            initializer=tf.zeros_initializer(),
            dtype=tf.float32,
            shape=(number_of_additional_layers, 1, 1, 1, number_of_features_per_layer),
        )
        offset_x_bias = tf.get_variable(
            name="offset_x_bias",
            initializer=tf.random_uniform_initializer(minval=-3.0, maxval=3.0),
            dtype=tf.float32,
            shape=(number_of_additional_layers, 1, 1, 1, number_of_features_per_layer),
        )
        offset_y_bias = tf.get_variable(
            name="offset_y_bias",
            initializer=tf.random_uniform_initializer(minval=-3.0, maxval=3.0),
            dtype=tf.float32,
            shape=(number_of_additional_layers, 1, 1, 1,  number_of_features_per_layer),
        )

        # TODO: index should be started at number_of_input_features
        def scale_initializer():
            # One initial scale per layer: sqrt(weight_scale / fan_in), with
            # fan_in = number_of_input_features + i * number_of_features_per_layer.
            return tf.sqrt(weight_scale / tf.range(number_of_input_features, number_of_features, delta=number_of_features_per_layer, dtype=tf.float32))

        # Trainable per-layer multipliers applied to the weight slices.
        score_scale = tf.get_variable(
            name="score_scale",
            initializer=scale_initializer(),
            dtype=tf.float32
        )
        offset_x_scale = tf.get_variable(
            name="offset_x_scale",
            initializer=scale_initializer(),
            dtype=tf.float32
        )
        offset_y_scale = tf.get_variable(
            name="offset_y_scale",
            initializer=scale_initializer(),
            dtype=tf.float32
        )


        def score_map(previous_layers, i):
            # Output of layer i, given every channel built so far.
            number_of_previous_layers = number_of_input_features + i * number_of_features_per_layer
            # Row range of layer i inside the packed weight tables: one row per
            # channel currently present in previous_layers.
            weight_start = (number_of_input_features + number_of_previous_layers) * (number_of_previous_layers - number_of_input_features) // 2
            weight_end = weight_start + number_of_previous_layers
            tf.assert_rank(weight_start, 0)

            # TODO test if the performance improves when using SparseTensor
            tf.assert_rank(previous_layers, 4)

            def indexed_sum(weight, bias, scale):
                # Linear projection of previous_layers with layer i's scaled
                # weight slice, plus layer i's bias.
                tf.assert_rank(weight, 2)
                weight_slice = weight[weight_start:weight_end] * scale[i]

                # NOTE(review): the returned assert op is discarded (no control
                # dependency), so it is presumably not evaluated at run time.
                tf.assert_equal(tf.shape(weight_slice)[0], tf.shape(previous_layers)[3])
                return tf.tensordot(
                    previous_layers,
                    weight_slice,
                    axes=1
                ) + bias[i]

            return grid_sample(
                indexed_sum(score_weight, score_bias, score_scale),
                indexed_sum(offset_x_weight, offset_x_bias, offset_x_scale),
                indexed_sum(offset_y_weight, offset_y_bias, offset_y_scale),
            )

        # Grow the channel axis by one layer per iteration. The shape invariant
        # leaves the channel dimension unknown because it changes on every step.
        output, i = tf.while_loop(
            lambda layers, i: i < number_of_additional_layers,
            lambda layers, i: (
                tf.concat(
                    (
                        layers,
                        score_map(
                            layers,
                            i
                        )
                    ),
                    axis=3,
                ),
                i + 1
            ),
            (
                images,
                0,
            ),
            shape_invariants=(
                tf.TensorShape((input_shape[0], input_shape[1], input_shape[2], None)), 
                tf.TensorShape(())
            )
        )
        #tf.assert_equal(i, number_of_features)

        
        # Restore the static channel count that the loop's shape invariant erased.
        statically_shaped_output = output + 0 # Workaround for a tensorflow bug
        statically_shaped_output.set_shape((input_shape[0], input_shape[1], input_shape[2], input_shape[3] + number_of_additional_layers * number_of_features_per_layer))
        return statically_shaped_output

In [26]:
def model_fn(features, labels, mode, params, config):
    """Estimator model function: dense_offnet -> dropout_offnet -> offnet classifier.

    Args:
        features: batch of images; cast to float32, mapped from [0, 255] to
            [-0.5, 0.5] and padded with the module-level `paddings`.
        labels: integer class ids, or None when no labels are supplied
            (prediction).
        mode: a tf.estimator.ModeKeys value. Only TRAIN restricts
            dropout_offnet to `number_of_enabled_layers`; other modes enable
            every candidate layer.
        params: unused.
        config: unused.

    Returns:
        A tf.estimator.EstimatorSpec. Without labels it carries predictions
        only; otherwise it also carries the loss and accuracy metric, and, in
        TRAIN mode, the training op.
    """
    # Seeds Python's `random` module only; the TensorFlow ops below (initializers,
    # tf.random_shuffle) do not draw from it.
    random.seed(number_of_hidden_layers)
    # NOTE(review): padding happens after centering, so the border value 0.0
    # corresponds to a raw intensity of 127.5 rather than 0 — confirm intended.
    padded_input = tf.pad(tf.cast(features, tf.float32) / 255.0 - 0.5, paddings)

    base_layers = dense_offnet(tf.expand_dims(padded_input, axis=3), number_of_hidden_layers, number_of_features_per_layer, weight_scale)
    feature_layers = dropout_offnet(
        base_layers,
        number_of_candidate_layers,
        number_of_features_per_layer,
        weight_scale,
        number_of_enabled_layers if mode == tf.estimator.ModeKeys.TRAIN else number_of_candidate_layers
    )
    layers = offnet(feature_layers, number_of_classes)

    # Average each class map over axes 1 and 2 to get one logit per class.
    scores = tf.reduce_mean(layers, axis=(1, 2))

    probabilities = tf.nn.softmax(logits=scores)
    predicted_classes = tf.argmax(scores, 1)
    predictions = {
        'probabilities' : probabilities,
        'scores': scores,
        'class': predicted_classes,
    }
    if labels is None:
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
        )

    loss = tf.losses.softmax_cross_entropy(logits=scores, onehot_labels=tf.one_hot(labels, number_of_classes))
    eval_metric_ops = {
        'accuracy': tf.metrics.accuracy(labels=labels, predictions=predicted_classes)
    }

    # Build the optimizer and its gradient graph only when training; the
    # evaluation graph never runs a training op.
    train_op = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdagradOptimizer(learning_rate=0.01)
        train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())

    return tf.estimator.EstimatorSpec(
        mode=mode,
        predictions=predictions,
        loss=loss,
        train_op=train_op,
        eval_metric_ops=eval_metric_ops,
    )

In [27]:
# Checkpoint directory name encodes the layer configuration.
model_directory = f"models/dropout3offnet{number_of_features_per_layer}x{number_of_hidden_layers}+{number_of_features_per_layer}x{number_of_candidate_layers}"

# Session settings: only the GPU allow_growth option is set.
session_config = tf.ConfigProto(
    gpu_options=tf.GPUOptions(allow_growth=True),
#     graph_options=tf.GraphOptions(
#         optimizer_options=tf.OptimizerOptions(
#             global_jit_level=tf.OptimizerOptions.ON_2,
#             do_function_inlining=True,
#             do_constant_folding=True,
#             do_common_subexpression_elimination=True,
#         ),
#     )
)

run_config = tf.estimator.RunConfig(
    model_dir=model_directory,
    session_config=session_config,
)

In [29]:
# Bind the model function to the run configuration defined above.
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config,
)

INFO:tensorflow:Using config: {'_model_dir': 'models/dropout3offnet8x32+8x32', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': gpu_options {
  allow_growth: true
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f014dc6ae48>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [30]:
# Examples per batch for both the training and testing input functions.
batch_size = 32

In [31]:
def training_dataset():
    """Input function for training: shuffled batches sliced from `training_data`.

    Uses a shuffle buffer of 1000 elements and the module-level `batch_size`.
    The dataset is not repeated, so it is exhausted after one pass.
    """
    examples = tf.data.Dataset.from_tensor_slices(training_data)
    shuffled_examples = examples.shuffle(1000)
    return shuffled_examples.batch(batch_size)

In [32]:
def testing_dataset():
    """Input function for evaluation: `testing_data` in its stored order, batched by `batch_size`."""
    examples = tf.data.Dataset.from_tensor_slices(testing_data)
    return examples.batch(batch_size)

In [33]:
# hook = tf_debug.TensorBoardDebugHook("localhost:6064")
# estimator.train(training_dataset,hooks=[hook])

In [None]:
# Both specs rely on their defaults apart from the input function.
train_spec = tf.estimator.TrainSpec(training_dataset)
eval_spec = tf.estimator.EvalSpec(testing_dataset)

tf.estimator.train_and_evaluate(estimator, train_spec=train_spec, eval_spec=eval_spec)

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 600 secs (eval_spec.throttle_secs) or training is finished.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into models/dropout3offnet8x32+8x32/model.ckpt.
INFO:tensorflow:loss = 2.3501825, step = 0


In [None]:
# estimator.evaluate(input_fn=lambda:tf.data.Dataset.from_tensor_slices(testing_data).batch(3).take(1))

In [None]:
# tuple(estimator.predict(input_fn=lambda:tf.data.Dataset.from_tensor_slices(testing_data).batch(3).take(1)))

In [None]:
# estimator.train(input_fn=training_dataset, steps=1)