In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import rnndatasets.sequentialmnist as mnist

Just to see if this isn't a terrible idea, we are going to construct a deep feedforward network where each weight matrix is a slice of a 3-tensor, which is expressed in a decomposed form.

More precisely, only the middle (hidden) layers are built this way, because the decomposition we use requires all of their weight matrices to be the same size; the input and output layers are ordinary affine layers.

In [2]:
def get_tensor_layers(input_var, depth, width, rank, nonlinearity=tf.nn.relu):
    """Stack `depth` square layers whose weights are slices of a decomposed 3-tensor.

    Layer `l` uses the weight matrix `(A * B[l, :]) . C^T`, so the factor
    matrices `A`, `B` and `C` are shared by every layer; only the biases (and
    any variables `nonlinearity` itself creates) are per-layer.

    Args:
        input_var: tensor fed to the first layer; it is multiplied by a
            `[width, width]` matrix.
        depth: number of layers to build.
        width: number of units in every layer.
        rank: number of components in the decomposition.
        nonlinearity: callable applied to each layer's pre-activation. It is
            called inside that layer's variable scope.

    Returns:
        The output tensor of the last layer (`input_var` itself when `depth`
        is 0).
    """
    # Factor matrices of the CP decomposition, created once and shared.
    A = tf.get_variable('A', shape=[width, rank])
    B = tf.get_variable('B', shape=[depth, rank])
    C = tf.get_variable('C', shape=[width, rank])

    activations = input_var
    for idx in range(depth):
        with tf.variable_scope('tensor_layer_{}'.format(idx)):
            # Scale A's columns by this layer's row of B, then project with C^T.
            scaled_a = A * B[idx, :]
            layer_weights = tf.matmul(scaled_a, C, transpose_b=True)
            # Biases sit outside the decomposition: one fresh vector per layer.
            layer_bias = tf.get_variable('bias_{}'.format(idx),
                                         initializer=tf.constant_initializer(0.0),
                                         shape=[width])
            pre_activation = tf.nn.bias_add(
                tf.matmul(activations, layer_weights), layer_bias)
            activations = nonlinearity(pre_activation)
    return activations

In [3]:
def affine(input_var, new_size, name):
    """Apply a learned affine map `input_var . W + b` under variable scope `name`.

    Args:
        input_var: tensor whose second dimension gives the input size.
        new_size: number of output units.
        name: variable scope in which `weights` and `bias` are created.

    Returns:
        The tensor `matmul(input_var, weights) + bias`.
    """
    with tf.variable_scope(name):
        n_in = input_var.get_shape()[1].value
        kernel = tf.get_variable('weights', shape=[n_in, new_size])
        # The bias starts at zero; the weights use the scope's default initializer.
        offset = tf.get_variable('bias', shape=[new_size],
                                 initializer=tf.constant_initializer(0.0))

        projected = tf.matmul(input_var, kernel)
        return tf.nn.bias_add(projected, offset)

In [4]:
def maxine(activations):
    """Learned two-piece linear activation: `max(a * (x - c), b * (x - c))`.

    `a`, `b` and `c` are per-unit variables created in the current variable
    scope. `a` and `b` start at 1 and `c` at 0, so the function is the
    identity at initialisation.

    Args:
        activations: tensor whose second dimension is the number of units.

    Returns:
        A tensor holding the element-wise maximum of the two scaled branches.
    """
    n_units = activations.get_shape()[1].value

    def _per_unit(var_name, start_value):
        # One learnable scalar per unit, initialised to a constant.
        return tf.get_variable(var_name, shape=[n_units],
                               initializer=tf.constant_initializer(start_value))

    a = _per_unit('a', 1.0)
    b = _per_unit('b', 1.0)
    c = _per_unit('c', 0.0)

    # Both branches share the shift `c` and differ only in slope.
    return tf.maximum(a * (activations - c), b * (activations - c))

In [31]:
def orthonormal_init(seed=1234):
    """Build an initializer that fills 2-D variables with orthonormal vectors.

    The returned callable produces a matrix of exactly the requested shape:
    its columns are orthonormal when `rows >= cols`, otherwise its rows are.

    Args:
        seed: seed for the initializer's private random generator. Matrices
            drawn from one initializer differ from call to call, but the whole
            sequence is reproducible for a given seed.

    Returns:
        A function `_on_init(shape, dtype=None, partition_info=None)` that
        returns a NumPy array.
    """
    # A private generator: re-seeding the global NumPy RNG on every call would
    # silently reset unrelated randomness (e.g. batch shuffling) and would give
    # every variable of the same shape identical values.
    rng = np.random.RandomState(seed)

    def _on_init(shape, dtype=None, partition_info=None):
        """Return an orthonormal matrix of `shape`.

        Args:
            shape: sequence of two sizes `(rows, cols)`.
            dtype: a NumPy dtype, or any object exposing `as_numpy_dtype`
                (as TensorFlow dtypes do); `None` means float32.
            partition_info: accepted and ignored, so that callers which pass
                this keyword to initializers do not fail.

        Raises:
            ValueError: if `shape` is not two-dimensional.
        """
        if len(shape) != 2:
            raise ValueError('nope')
        rows, cols = (int(dim) for dim in shape)

        # Reduced QR of a tall (or square) Gaussian matrix gives a Q of the
        # same shape with orthonormal columns; 'complete' mode would instead
        # return a square Q that does not match the requested shape.
        tall, thin = max(rows, cols), min(rows, cols)
        q, _ = np.linalg.qr(rng.normal(size=(tall, thin)), mode='reduced')
        if rows < cols:
            # Wide request: transpose so the rows are the orthonormal set.
            q = q.T

        np_dtype = getattr(dtype, 'as_numpy_dtype', dtype)
        if np_dtype is None:
            np_dtype = np.float32
        return q.astype(np_dtype)
    return _on_init

def orthogonal_regulariser(beta=1.0):
    """Build a regulariser that pushes 2-D variables towards orthonormal rows.

    The penalty for a matrix `M` is `beta * sum((M . M^T - I) ** 2)`, with `I`
    sized by the first dimension of `M`. For a matrix with more rows than
    columns `M . M^T` is rank-deficient, so the penalty cannot reach zero.

    Args:
        beta: multiplier applied to the penalty.

    Returns:
        A function mapping a variable to a scalar penalty tensor, or to `None`
        when the variable is not two-dimensional.
    """
    def o_r(mat):
        mat_shape = mat.get_shape()
        if len(mat_shape) == 2:
            # Gram matrix of the rows, compared against the identity.
            gram = tf.matmul(mat, mat, transpose_b=True)
            identity = tf.constant(np.eye(mat_shape[0].value), dtype=tf.float32)
            return tf.reduce_sum(tf.square(gram - identity)) * beta
        # Vectors (biases, per-unit activation parameters) are not penalised.
        return None
    return o_r

In [44]:
# Start from an empty graph so re-running this cell does not collide with
# variables created by a previous run.
tf.reset_default_graph()

# Flattened 28x28 images and their integer class labels.
inputs = tf.placeholder(tf.float32, name='inputs', shape=[None, 784])
targets = tf.placeholder(tf.int32, name='targets', shape=[None])

DEPTH = 10   # number of tensor-sliced hidden layers
RANK = 50    # rank of the decomposition shared by those layers
WIDTH = 100  # units per hidden layer

with tf.variable_scope('net'):
    # Ordinary affine projection from pixels to the hidden width.
    input_proj = maxine(affine(inputs, WIDTH, 'input_layer'))

    # do the cool guy stuff: the regulariser is attached to every variable
    # created in this scope, but it only returns a penalty for 2-D ones.
    with tf.variable_scope('tensor_stuff', regularizer=orthogonal_regulariser(0.01)):
        t_out = get_tensor_layers(input_proj, DEPTH, WIDTH, RANK, nonlinearity=maxine)

    # and output layer
    net_out = affine(t_out, 10, 'output')

# Pass logits/labels by keyword: the argument names exist in both the older
# (logits, labels) signature and the newer one, which rejects positional use.
per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=net_out, labels=targets)
# Mean cross-entropy plus the sum of all collected regularisation penalties.
loss = tf.reduce_mean(per_example_loss) + tf.add_n(
    tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
accuracy = tf.contrib.metrics.accuracy(
    tf.cast(tf.argmax(net_out, 1), tf.int32), targets)

opt = tf.train.RMSPropOptimizer(0.001)
train_op = opt.minimize(loss)

In [45]:
# Load the train and test splits, then flatten every image into a
# 784-element row so it matches the `inputs` placeholder.
data, labels = mnist.get_data('train', 60000)
test_data, test_labels = mnist.get_data('test', 10000)
data, test_data = (images.reshape((-1, 784)) for images in (data, test_data))

In [46]:
# Open a session on the default graph and give every variable its initial value.
sess = tf.Session()
init_op = tf.initialize_all_variables()
sess.run(init_op)

In [None]:
def batch_iter(data, labels, batch_size):
    """Yield shuffled `(data, labels)` minibatches of exactly `batch_size` rows.

    Rows are visited in a fresh random order (drawn from the global NumPy RNG)
    each time the generator is started. Only full batches are produced, so up
    to `batch_size - 1` rows are left out of a pass.

    Args:
        data: array whose first axis indexes examples.
        labels: array indexed in step with `data`.
        batch_size: number of examples per yielded batch.

    Yields:
        Tuples `(data_batch, label_batch)` selected with the same indices.
    """
    n_rows = data.shape[0]
    n_full = n_rows // batch_size

    order = np.arange(len(data))
    np.random.shuffle(order)

    # Walk the shuffled order in non-overlapping windows of `batch_size`.
    for start in range(0, n_full * batch_size, batch_size):
        picked = order[start:start + batch_size]
        yield data[picked, ...], labels[picked]

In [None]:
EPOCHS = 50
BATCH_SIZE = 100
valid_accs = []
for epoch in range(EPOCHS):
    # Training pass: one optimiser step per minibatch, accumulating the loss.
    epoch_loss, epoch_steps = 0, 0
    for dbatch, tbatch in batch_iter(data, labels, BATCH_SIZE):
        batch_loss, _ = sess.run([loss, train_op], {inputs: dbatch, targets: tbatch})
        epoch_loss += batch_loss
        epoch_steps += 1

    # Evaluation pass over the test split; no training op is run here.
    valid_acc, valid_steps = 0, 0
    for dbatch, tbatch in batch_iter(test_data, test_labels, BATCH_SIZE):
        batch_acc = sess.run(accuracy, {inputs: dbatch, targets: tbatch})
        valid_acc += batch_acc
        valid_steps += 1

    # Record the mean per-batch accuracy for this epoch.
    valid_accs.append(valid_acc/valid_steps)

    # '\r' with end='' rewrites the same output line on every epoch.
    print('\r~~({:>3}) train loss: {:.5f}~~'.format(epoch+1, epoch_loss/epoch_steps), end='')

In [None]:
# Per-epoch accuracy curve, followed by the best value reached.
plt.plot(valid_accs)
best_valid_acc = max(valid_accs)
print(best_valid_acc)

In [None]:
# Weight counts for comparison; biases and activation parameters are left out
# of both totals.
# Dense baseline: input layer (784 x WIDTH), DEPTH full WIDTH x WIDTH hidden
# matrices, and the output layer (WIDTH x 10).
print(784 * WIDTH + DEPTH*WIDTH*WIDTH + 10*WIDTH)
# Decomposed network: the same input and output layers, plus the factors
# A and C (WIDTH x RANK each) and B (DEPTH x RANK). The input term is
# 784*WIDTH, matching the 784-wide input placeholder.
print(784*WIDTH + 10*WIDTH + 2*RANK*WIDTH + RANK*DEPTH)