In [4]:
%matplotlib inline
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from itertools import accumulate
from functools import wraps

# Activations, with various initialisations

Let's have a look at the histogram of activations per layer in a reasonably deep NN, with a few different inits/activation functions.

First we have to make some neural nets, so let's get that ready to go.

In [3]:
def dense_layer(input_var, size, name):
    """Apply a bias-free linear layer to `input_var`.

    Args:
        input_var: tensor whose second dimension (read via `get_shape()[1]`)
            gives the number of input features.
        size: number of output units, i.e. columns of the weight matrix.
        name: prefix for the weight variable, which is fetched with
            `tf.get_variable` under the name `name + '_W'`.

    Returns:
        The result of `tf.matmul(input_var, weights)`.
    """
    # No bias term for now: the layer is a plain matrix product.
    fan_in = input_var.get_shape()[1].value
    kernel = tf.get_variable(name + '_W', [fan_in, size])
    return tf.matmul(input_var, kernel)

def deep_net(input_var, layers, size, nonlinearity):
    """Return all activations for a net with `layers` layers each of width `size`.

    Args:
        input_var: the network input; it is also the first element returned.
        layers: number of dense layers to stack.
        size: width (number of units) of every layer.
        nonlinearity: callable applied to each layer's `dense_layer` output.

    Returns:
        A list `[input_var, act_1, ..., act_layers]`, where `act_k` is
        `nonlinearity(dense_layer(act_{k-1}, size, 'Layer<k>'))`.
    """
    # Build eagerly and return a list rather than a lazy iterator, so every
    # layer is constructed during this call (and therefore inside whatever
    # variable scope the caller has open), not when the result is consumed.
    activations = [input_var]
    for index in range(1, layers + 1):
        pre_activation = dense_layer(
            activations[-1], size, 'Layer{}'.format(index))
        activations.append(nonlinearity(pre_activation))
    return activations

We also need some initialisers. The first is the 'common heuristic' criticised in Glorot & Bengio (2010):
$$
    w_{ij} \sim \mathcal{U}\left[\frac{-1}{\sqrt{n}}, \frac{1}{\sqrt{n}} \right]
$$
where $n$ is the number of inputs to the layer (the number of rows of the weight matrix, given the way `dense_layer` multiplies the input by the weights above).

In [6]:
def scaled_uniform_init():
    """Build an initialiser drawing weights from U[-1/sqrt(n), 1/sqrt(n)].

    Here `n` is `shape[0]`, the number of rows of the weight matrix.

    Returns:
        A callable `_init(shape, dtype=tf.float32, partition_info=None)` that
        returns a numpy array of the given 2-D `shape` and raises `ValueError`
        for any other rank.
    """
    def _init(shape, dtype=tf.float32, partition_info=None):
        # `partition_info` is accepted and ignored so the callable can also be
        # invoked by TensorFlow 1.x versions that pass it to initialisers.
        if len(shape) != 2:
            raise ValueError('I can only handle matrices.')
        scale = 1.0 / np.sqrt(shape[0])
        # np.random.uniform takes no `dtype` keyword: sample first, cast after.
        # TensorFlow dtypes expose `as_numpy_dtype`; anything else is passed
        # to numpy unchanged.
        np_dtype = getattr(dtype, 'as_numpy_dtype', dtype)
        return np.random.uniform(-scale, scale, shape).astype(np_dtype)
    return _init

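And the proposed 'normalized initialisation', aka Glorot initialisation, which has variance $\frac{2}{n_i + n_j}$, where $n_i$ and $n_j$ are the numbers of rows and columns of the weight matrix (a uniform distribution on $\left[-\sqrt{6}/\sqrt{n_i + n_j},\ \sqrt{6}/\sqrt{n_i + n_j}\right]$ has exactly this variance).
We will do a version drawn from a normal distribution with this variance as well, because why not.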

In [7]:
def glorot_uniform_init():
    """Build a Glorot-style uniform initialiser.

    Weights are drawn from U[-b, b] with `b = sqrt(6) / sqrt(shape[0] + shape[1])`.

    Returns:
        A callable `_gu_init(shape, dtype=tf.float32, partition_info=None)`
        that returns a numpy array of the given 2-D `shape` and raises
        `ValueError` for any other rank.
    """
    def _gu_init(shape, dtype=tf.float32, partition_info=None):
        # `partition_info` is accepted and ignored so the callable can also be
        # invoked by TensorFlow 1.x versions that pass it to initialisers.
        if len(shape) != 2:
            raise ValueError('need shape of length 2')
        scale = np.sqrt(6) / np.sqrt(shape[0] + shape[1])
        # np.random.uniform takes no `dtype` keyword: sample first, cast after.
        # TensorFlow dtypes expose `as_numpy_dtype`; anything else is passed
        # to numpy unchanged.
        np_dtype = getattr(dtype, 'as_numpy_dtype', dtype)
        return np.random.uniform(-scale, scale, shape).astype(np_dtype)
    return _gu_init

def glorot_normal_init():
    """Build a Glorot-style normal initialiser.

    Weights are drawn from a zero-mean normal distribution with standard
    deviation `sqrt(2) / sqrt(shape[0] + shape[1])`.

    Returns:
        A callable `_gn_init(shape, dtype=tf.float32, partition_info=None)`
        that returns a numpy array of the given 2-D `shape` and raises
        `ValueError` for any other rank.
    """
    def _gn_init(shape, dtype=tf.float32, partition_info=None):
        # `partition_info` is accepted and ignored so the callable can also be
        # invoked by TensorFlow 1.x versions that pass it to initialisers.
        if len(shape) != 2:
            raise ValueError('need shape of length 2')
        # std dev should be sqrt(variance), variance should be 2/(shape[0] + shape[1])
        scale = np.sqrt(2) / np.sqrt(shape[0] + shape[1])
        # np.random.normal samples float64; cast to the requested dtype so the
        # result matches the uniform initialisers. TensorFlow dtypes expose
        # `as_numpy_dtype`; anything else is passed to numpy unchanged.
        np_dtype = getattr(dtype, 'as_numpy_dtype', dtype)
        return np.random.normal(scale=scale, size=shape).astype(np_dtype)
    # Hand the initialiser back to the caller (previously omitted, so the
    # factory returned None).
    return _gn_init

We are also interested in Saxe's orthogonal init; that is coming soon.

Now we can make some nets, we will run them on some MNIST digits eventually so let's make them that size.
We also want to play with a few activation functions, so we're going to eventually make something of a grid and let loose on that.

In [10]:
# Fixed batch of 300 rows with 784 features each.
inputs = tf.placeholder(tf.float32, [300, 784])  # we want to plot averages over a few inputs

with tf.variable_scope('standard', initializer=scaled_uniform_init()):
    # Materialise the activations while the scope is still open: if deep_net
    # hands back a lazy iterator, the layers (and their variables) would
    # otherwise only be built when it is consumed, after this block has exited.
    standard_outs = list(deep_net(inputs, 5, 500, tf.nn.tanh))