In [None]:
%load_ext autoreload
%autoreload 2 

In [None]:
from __future__ import print_function, absolute_import, division

In [None]:
import numpy as np
import tensorflow as tf
print(tf.__version__)
tf.test.is_gpu_available()

## The Game to Learn From

In [None]:
from wgomoku import (
    GomokuBoard, HeuristicGomokuPolicy, Heuristics, GomokuTools as gt,
    data_from_game)
# Recorded game as a compact move string, converted by GomokuTools.string_to_stones
# (presumably into a list of board coordinates -- confirm against wgomoku).
stones=gt.string_to_stones('e10g8g5f5f6e7f7f8e8g9h10d9g10f10h8h9i9g7e9j8h11i12e11e12g11f11f12e13g13h14i10g14j9k8i8k4i7')
heuristics = Heuristics(kappa=3.0)
# Drop the last five entries of the record, so the board used below stops
# five entries before the end of the recorded game.
stones = stones[:-5]
print(stones)

In [None]:
board_size=20

In [None]:
board = GomokuBoard(N=board_size, disp_width=10, heuristics=heuristics, stones=stones)
board.display()

In [None]:
from GomokuData import create_sample, to_matrix12
s = create_sample(board.stones, 20, 0)
to_matrix12(s)

---
## The Deep Q-Function
This function is designed to integrate into the estimator training.

In [None]:
def conv_gomoku(board_size, features, feature_columns, options):
    """
    Build the convolutional Q-function graph for a Gomoku board.

    Args:
        board_size: edge length N of the board. Every sample must hold
            (N+2)*(N+2)*2 values; they are reshaped to [-1, N+2, N+2, 2].
        features: dict of input tensors handed to
            tf.feature_column.input_layer.
        feature_columns: feature columns describing `features`. When empty or
            None, a single numeric column 'state' of shape (N+2)*(N+2)*2 is
            created.
        options: dict; options['layout'] is a flat sequence
            [filters_1, kernel_1, filters_2, kernel_2, ...] describing the
            hidden convolutional layers.

    Returns:
        The output tensor of the final single-filter convolution, of shape
        [-1, N+2, N+2, 1].

    Raises:
        ValueError: if options['layout'] is empty or has an odd number of
            entries.
    """
    N = board_size

    # Validate the layout first, so a bad configuration fails before any
    # node is added to the graph.
    layout = list(options['layout'])
    if len(layout) == 0 or len(layout) % 2 != 0:
        raise ValueError(
            "options['layout'] must be a non-empty flat list of "
            "(filters, kernel) pairs, got %r" % (layout,))
    conv_specs = [(int(layout[k]), int(layout[k + 1]))
                  for k in range(0, len(layout), 2)]

    # Honour the columns passed by the caller; only build the default column
    # when none are given. The local import keeps the function independent of
    # names imported in other notebook cells.
    if not feature_columns:
        from tensorflow.feature_column import numeric_column
        feature_columns = [numeric_column('state', shape=((N+2)*(N+2)*2))]

    input_layer = tf.feature_column.input_layer(
        features, feature_columns=feature_columns)

    layer = tf.reshape(input_layer, [-1, N+2, N+2, 2], name='reshape_input')

    for filters, kernel in conv_specs:
        layer = tf.layers.conv2d(inputs=layer, filters=filters,
                                 kernel_size=[kernel, kernel], strides=[1, 1],
                                 padding='SAME')

        # Exotic! Let the network learn efficient activation functions at each
        # layer: a cubic x * (x - beta_l) * (x - beta_r) with trainable roots.
        # Both roots are plain scalar variables (no accidental 1-tuple).
        beta_l = tf.Variable(-0.5)
        beta_r = tf.Variable(0.5)
        layer = layer * (layer - beta_l) * (layer - beta_r)

    # The output layer reuses the kernel size of the last hidden layer.
    _, last_kernel = conv_specs[-1]
    layer = tf.layers.conv2d(inputs=layer, filters=1,
                             kernel_size=[last_kernel, last_kernel],
                             strides=[1, 1], padding='SAME')

    return layer

In [None]:
# Flatten the sample to one row of 968 = (20+2)*(20+2)*2 values, the shape
# declared for the 'state' feature column.
state = tf.reshape(tf.constant(s, dtype=tf.float32), [-1, 968])

In [None]:
from tensorflow.feature_column import numeric_column as num
features = {'state': [state] * 5}
feature_columns = [num('state', shape=((board_size+2)*(board_size+2)*2))]

In [None]:
layout=[128, 3, 128, 3, 128, 3, 64, 3, 64, 3, 16, 3]
options={'layout': layout, 'learning_rate': 1e-4}

In [None]:
qf = conv_gomoku(20, features, feature_columns, options)
qf

--- 
## Masking the edges

In [None]:
# 1.0 on the playable fields, 0.0 on the one-field border around the board.
# np.pad adds the zero border, so the size follows board_size instead of
# hard-coded 22/21 indices; the trailing axis lets the mask broadcast over
# the channel dimension of the network output.
mask = tf.expand_dims(
    tf.constant(
        np.pad(np.ones([board_size, board_size], dtype=int), 1, mode='constant'),
        dtype=tf.float32),
    -1)
mask

In [None]:
qfm = qf * mask
qfm

In [None]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    q, m, qm = sess.run([qf, mask, qfm])

In [None]:
q.shape, m.shape, qm.shape

In [None]:
qvalues = np.rollaxis(qm[0], 2, 0)[0]

In [None]:
qvalues[:2], qvalues.shape

## The Labels From Heuristics

In [None]:
from wgomoku import heuristic_QF, wrap_sample

In [None]:
policy = HeuristicGomokuPolicy(bias=0.5, topn=5, style=2)
hqf, dval = heuristic_QF(board, policy)

In [None]:
hqf = wrap_sample(hqf, 0.0)

In [None]:
hqf[0:2], hqf.shape

In [None]:
# NOTE(review): this cell rebinds hqf to a rescaled copy of itself, so it is
# not idempotent -- running it twice divides by 100 twice.
hqf = hqf/100.0 # helps converge faster
hqf.shape

## Learning to QValuate a Single Situation
This is just to verify that the exotic choices in the hypotheses still provide good convergence.

In [None]:
s5 = [s]*5
hqf5 = np.array([hqf]*5)
hqf5 = np.expand_dims(hqf5,-1)
hqf5.shape, np.shape(s5)

In [None]:
def create_model(inputs, labels, options, board_size=20):
    """
    Build the Q-function graph together with its loss and training op.

    Args:
        inputs: samples, array-like of shape
            (N, board_size+2, board_size+2, 2).
        labels: target values, array-like of shape
            (N, board_size+2, board_size+2, 1), on the scale the network is
            trained on.
        options: dict providing 'learning_rate' and the 'layout' consumed by
            conv_gomoku.
        board_size: edge length of the board without its border; defaults
            to 20.

    Returns:
        A tuple (qf * 100.0, train_op, loss): the network output scaled by
        100, the Adam training op minimising `loss`, and the mean squared
        error between `labels` and the border-masked network output.
    """
    from tensorflow.feature_column import numeric_column

    learning_rate = options['learning_rate']
    n_values = (board_size + 2) * (board_size + 2) * 2

    # 1.0 on the board, 0.0 on the one-field border; the trailing axis lets
    # the mask broadcast over the channel dimension of the network output.
    border_mask = np.pad(
        np.ones([board_size, board_size], dtype=np.float32), 1, mode='constant')
    mask = tf.expand_dims(tf.constant(border_mask, dtype=tf.float32), -1)

    # One constant holding all samples, flattened to [N, n_values], instead of
    # one graph node per sample.
    state = tf.reshape(
        tf.constant(np.asarray(inputs, dtype=np.float32)), [-1, n_values])

    feature_columns = [numeric_column('state', shape=n_values)]
    features = {'state': state}
    qf = conv_gomoku(board_size, features, feature_columns, options)

    labels = tf.constant(labels, dtype=tf.float32)
    # The mask zeroes the network output on the border, so border fields get
    # no gradient. NOTE(review): non-zero border values in `labels` still add
    # a constant offset to the reported loss.
    loss = tf.losses.mean_squared_error(labels, qf * mask)
    train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

    return qf * 100.0, train_op, loss

In [None]:
# NOTE(review): hqf was already divided by 100 in an earlier cell, so the
# labels passed here are the heuristic values divided by 100 twice. The
# training cell below compensates by fetching 100.0 * qf on top of the factor
# of 100 that create_model applies to its returned tensor.
qf, opt, loss = create_model(s5, hqf5 / 100.0, options)

In [None]:
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    for i in range(3001):
        # Run the training op without binding its result: assigning to `_`
        # would shadow IPython's last-output variable.
        session.run(opt)
        if i % 200 == 0:
            # A single evaluation suffices: the inputs and weights are the
            # same for a second run, and there is no separate test loss.
            print(i, session.run(loss))
    # create_model returns the network output already scaled by 100; the extra
    # factor undoes the second division by 100 applied to the labels.
    res = session.run(100.0 * qf)

In [None]:
np.rollaxis(res[0], 2, 0)[0][8].astype(int)

Note that the first and last fields of this row (index 8) lie beyond the border of the board and don't contribute to the loss function.

In [None]:
(hqf*100.0)[8].astype(int)

# Learning from 8 different samples

In [None]:
from wgomoku import create_samples_and_qvalues

In [None]:
states, qvalues, _ = create_samples_and_qvalues(board, policy, heuristics)

In [None]:
states.shape, qvalues.shape

In [None]:
qf, opt, loss = create_model(states, qvalues / 100.0, options)

In [None]:
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    for i in range(5001):
        # Run the training op without binding its result: assigning to `_`
        # would shadow IPython's last-output variable.
        session.run(opt)
        if i % 500 == 0:
            # A single evaluation suffices: the inputs and weights are the
            # same for a second run, and there is no separate test loss.
            print(i, session.run(loss))
    res = session.run(qf)

All learned Q-Values are within a tolerance of $\pm 10$.

In [None]:
# For each of the 8 samples, report whether every learned value on the board
# deviates from its label by less than `tolerance`.
tolerance = 10.0
for i in range(8):
    # Strip the one-field border and the trailing channel axis.
    deepq = res[i][1:-1, 1:-1, 0]
    qvals = qvalues[i][1:-1, 1:-1, 0]
    print((np.abs(deepq - qvals) < tolerance).all())

## Learn From the Game's Entire History

In [None]:
from copy import deepcopy
from wgomoku import data_from_game

In [None]:
states, qvalues = data_from_game(deepcopy(board), policy, heuristics)

In [None]:
states.shape, qvalues.shape

In [None]:
options['learning_rate'] = 1e-4
qf, opt, loss = create_model(states, qvalues / 100.0, options)

In [None]:
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    for i in range(2001):
        # Run the training op without binding its result: assigning to `_`
        # would shadow IPython's last-output variable.
        session.run(opt)
        if i % 1000 == 0:
            # A single evaluation suffices: the inputs and weights are the
            # same for a second run, and there is no separate test loss.
            print(i, session.run(loss))
    res = session.run(qf)

In [None]:
# Largest absolute deviation between learned and label values on the board,
# for each of the first 8 samples.
for i in range(8):
    # Strip the one-field border and the trailing channel axis.
    deepq = res[i][1:-1, 1:-1, 0]
    qvals = qvalues[i][1:-1, 1:-1, 0]
    print(np.abs(deepq - qvals).max())

That's not going to work at all. Let's look at row index 6 (the seventh board row) of each of the 24 different game states. We can see that the network is particularly bad at critical states.

In [None]:
# Visit every 8th sample and print, for board row index 6, the label values
# and the deviation of the learned values from them.
for idx in range(0, 192, 8):
    # Value stored in the beyond-border corner field of this sample.
    default_value = qvalues[idx][0][0]
    # Strip the one-field border and the trailing channel axis.
    deepq = res[idx][1:-1, 1:-1, 0]
    qvals = qvalues[idx][1:-1, 1:-1, 0]
    print("Default: %s" % default_value)
    print("Differences: %s" % (deepq - qvals).astype(int)[6])
    print("Heuristics: %s" % qvals.astype(int)[6])

In [None]:
from wgomoku import to_matrix_xo
def to_matrix_xo(sample):
    """
    Render a sample as a text board, one line per row.

    The sample is converted once with to_matrix12. Cells holding 0 are drawn
    as '. '. Cells holding 1 and 2 are drawn as 'x ' and 'o ' when the number
    of non-zero cells is even, and as 'o ' and 'x ' when it is odd. Only the
    first 20 rows of the converted matrix are rendered.

    Note: this local definition shadows the `to_matrix_xo` imported from
    wgomoku in the same cell.
    """
    # Convert once and reuse the result for both the count and the rendering.
    im12 = to_matrix12(sample)
    if np.sum(im12 > 0) % 2 == 0:
        symbols = ['. ', 'x ', 'o ']
    else:
        symbols = ['. ', 'o ', 'x ']
    return "\n".join("".join(symbols[c] for c in im12[r]) for r in range(20))

In [None]:
print(to_matrix_xo(states[0]))

In [None]:
print(to_matrix_xo(states[8]))

---
## Learning A-Values Instead
The reason is obviously the massive variance of the default QValue. QValues are hard to learn. Let's try to learn Advantage values instead. In our case we can easily identify the default value used in the heuristic QFunction, since the beyond-border fields are filled by that value. Subtracting that default value from all fields provides a pretty good estimate for an advantage function.

In [None]:
# Advantage values: subtract from every field of a state that state's default
# Q-value, read from its beyond-border corner field [0][0][0]. Building a new
# array (instead of assigning into a never-created `avalues`) makes the cell
# run on a fresh kernel and covers however many states `qvalues` holds.
avalues = np.array([q - q[0][0][0] for q in qvalues])

In [None]:
options['learning_rate'] = 1e-4
qf, opt, loss = create_model(states, avalues / 100.0, options)

In [None]:
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    for i in range(10001):
        # Run the training op without binding its result: assigning to `_`
        # would shadow IPython's last-output variable.
        session.run(opt)
        if i % 100 == 0:
            # A single evaluation suffices: the inputs and weights are the
            # same for a second run, and there is no separate test loss.
            print(i, session.run(loss))
    res = session.run(qf)

In [None]:
# Visit every 8th sample, add the sample's default value back onto the learned
# values, and print board row index 6 of the labels and of the deviation.
for idx in range(0, 192, 8):
    # Value stored in the beyond-border corner field of this sample.
    default_value = qvalues[idx][0][0]
    # Strip the one-field border and the trailing channel axis.
    deepq = res[idx][1:-1, 1:-1, 0] + default_value
    qvals = qvalues[idx][1:-1, 1:-1, 0]
    print("Default: %s" % default_value)
    print("Differences: %s" % (deepq - qvals).astype(int)[6])
    print("Heuristics: %s" % qvals.astype(int)[6])

In [None]:
# Show the learned values of sample 32 on the board, with the sample's default
# value (read from its beyond-border corner field) added back.
idx = 32
default_value = qvalues[idx][0][0]
# Strip the one-field border and the trailing channel axis.
deepq = res[idx][1:-1, 1:-1, 0] + default_value
qvals = qvalues[idx][1:-1, 1:-1, 0]
deepq.astype(int)

---
### Summary
The advantage function can obviously be learned efficiently. The Q-function itself proved too hard for this network to learn.