In [1]:
from __future__ import absolute_import, print_function, division

In [2]:
import random
import numpy as np
import tensorflow as tf

## Synthetic data

We create a set of *boards* of size $5 \times 5$ with 3 channels for each position. The label at each position $(j,k)$ is computed as a function of the channel values $v_{jkl}$ at that position.

$$
    L_{jk} = \sum_{l=0}^2 a_l \cdot (v_{jkl})^{l+1}
$$

Here, $a_l$ denote arbitrary coefficients defined below. Note that this function is the same for every position. Thus, a sufficiently deep convolutional network with only $1 \times 1$ kernels should easily learn this function by simultaneously looking at all the positions of any given training board.

In [3]:
def create_data(N, seed=None):
    """Generate N random 5x5 boards with 3 channels and their per-position labels.

    Every channel value v is drawn uniformly from [-1, 1). The label at each
    position is the polynomial 0.9*v0 + 0.3*v1**2 - 0.2*v2**3 of the three
    channel values at that position.

    Parameters
    ----------
    N : int
        Number of boards to generate.
    seed : optional
        If given, values are drawn from a private ``random.Random(seed)`` so the
        result is reproducible and the global ``random`` state is left untouched.
        If None (default), the module-level ``random`` generator is used, exactly
        as before.

    Returns
    -------
    batch : numpy.ndarray, shape (N, 5, 5, 3)
        The channel values.
    labels : numpy.ndarray, shape (N, 5, 5, 1)
        The label for every board position.
    """
    # A private generator keeps seeded calls independent of the global state.
    draw = random.random if seed is None else random.Random(seed).random
    coeffs = [.9, .3, -.2]

    batch = np.zeros([N, 5, 5, 3])
    labels = np.zeros([N, 5, 5, 1])
    for i in range(N):
        for x in range(5):
            for y in range(5):
                for l, a_l in enumerate(coeffs):
                    # Map the uniform [0, 1) draw onto [-1, 1).
                    v = 2 * (draw() - 0.5)
                    batch[i, x, y, l] = v
                    # Channel l contributes a_l * v^(l+1) to the label.
                    labels[i, x, y, 0] += a_l * v ** (l + 1)
    return batch, labels

In [4]:
# Number of boards in each of the two data sets.
N = 100

# Two independent draws: the first is used for training, the second ("_t")
# is held out for testing. The tuple is evaluated left to right, so the
# training set is generated first.
(batch, labels), (batch_t, labels_t) = create_data(N), create_data(N)

In [5]:
# Sanity check of the layout: (boards, rows, columns, channels).
batch.shape

(100, 5, 5, 3)

Moving the channel axis of the first *board* in the batch to the front shows its three $5 \times 5$ channels:

In [48]:
# Reorder the first board from (row, column, channel) to (channel, row, column)
# so that each channel prints as its own 5x5 matrix.
first_board_channels = batch[0].transpose(2, 0, 1)
print(first_board_channels)

[[[ 0.74693969  0.48546746 -0.64032873  0.20740517 -0.52110262]
  [-0.24502381 -0.39422057 -0.79559477 -0.78173892 -0.26821271]
  [ 0.64203813 -0.10964741  0.10761636  0.20474502  0.88349296]
  [ 0.6045561   0.4825901   0.22126914 -0.96609026  0.77700791]
  [-0.74618991 -0.7185272   0.25961772 -0.40991138  0.32262537]]

 [[-0.70078064 -0.88537715 -0.98702052 -0.86351054 -0.06063682]
  [-0.2799541  -0.63746665 -0.60918102 -0.20878153 -0.46629432]
  [ 0.36279194  0.41720322 -0.38299768  0.4476159   0.65373987]
  [-0.43108833  0.63979487 -0.98023316  0.46543722  0.31619075]
  [ 0.83441546  0.29428425  0.12673373  0.03372938 -0.05205615]]

 [[ 0.21045353 -0.28477088  0.42438695  0.00186075 -0.94323097]
  [ 0.39970533 -0.18084252  0.15286042  0.71141744 -0.26074029]
  [-0.94848738 -0.04065104  0.38869737  0.62644538  0.71679899]
  [-0.79279819 -0.90494988 -0.28311771 -0.72705819  0.15688136]
  [-0.68228306 -0.38828652  0.1926327  -0.13308862  0.24756423]]]


In [49]:
# Graph inputs. The leading dimension is None so any number of boards can be fed.
# Boards: 5 x 5 positions with 3 channels each.
_inputs = tf.placeholder(dtype=tf.float32, shape=[None, 5, 5, 3])
# Targets: one label per board position.
_labels = tf.placeholder(dtype=tf.float32, shape=[None, 5, 5, 1])

### A special CNN
The convolutional network below can actually be regarded as a single convolutional layer whose kernel is itself a 5-layer feed-forward NN with layer widths $[3, 32, 128, 32, 1]$, applied independently at every board position.

In [50]:
# Every layer is a 1x1 convolution with unit stride, so each board position is
# processed independently by the same weights.
pointwise = dict(kernel_size=[1, 1], strides=[1, 1], padding='VALID')

# Three hidden layers with ELU activations: 3 -> 32 -> 128 -> 32 channels.
conv1 = tf.layers.conv2d(inputs=_inputs, filters=32, activation=tf.nn.elu, **pointwise)
conv2 = tf.layers.conv2d(inputs=conv1, filters=128, activation=tf.nn.elu, **pointwise)
conv3 = tf.layers.conv2d(inputs=conv2, filters=32, activation=tf.nn.elu, **pointwise)
# Linear output layer: a single predicted label per position.
conv4 = tf.layers.conv2d(inputs=conv3, filters=1, **pointwise)

# Mean squared error between the predicted and the true labels.
loss = tf.losses.mean_squared_error(labels=_labels, predictions=conv4)

# Note: despite its name, `optimizer` holds the training op returned by
# minimize(), not the optimizer object itself.
adam = tf.train.AdamOptimizer(learning_rate=3e-4)
optimizer = adam.minimize(loss)

### Training
We train the network on the full training set at every step and print the training loss and the test loss every 1000 steps.

In [51]:
# Both feeds are constant across steps, so build them once up front.
train_feed = {_inputs: batch, _labels: labels}
test_feed = {_inputs: batch_t, _labels: labels_t}

with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    # 10001 iterations so that step 10000 is reported as well.
    for i in range(10001):
        # One full-batch training step on the training boards.
        _ = session.run(optimizer, feed_dict=train_feed)
        if i % 1000 != 0:
            continue
        # Every 1000th step: print the loss on the training and held-out boards.
        l = session.run(loss, feed_dict=train_feed)
        l_t = session.run(loss, feed_dict=test_feed)
        print(l, l_t)

0.2959582 0.2887018
0.0007335471 0.00072026637
0.00014291795 0.00014278585
6.4077256e-05 6.6136825e-05
2.4939978e-05 2.5989855e-05
1.0700216e-05 1.134723e-05
1.8971024e-05 2.0532565e-05
6.2155605e-06 6.667616e-06
5.3054796e-06 5.7024954e-06
4.788528e-06 5.1525794e-06
4.8150964e-06 5.3615126e-06


The test loss falls together with the training loss, down to roughly $5 \times 10^{-6}$, so we can see that the network has indeed learned our label function rather than merely memorizing the training boards.