In [1]:
import copy
import numpy as np

class Node:
    """One position of the search tree together with its training targets."""

    def __init__(self, observation, to_play):
        """Encode ``observation`` as a 3x3x3 array and start with empty targets.

        Args:
            observation: Board compared element-wise against 1 and -1;
                assumed to be a 3x3 NumPy array -- TODO confirm with the env.
            to_play: Player about to move. The value 1 produces an all-zero
                third plane, any other value an all-one third plane.
        """
        ones_mask = observation == 1
        minus_ones_mask = observation == -1
        turn_flag = 1 if to_play != 1 else 0

        planes = np.zeros(shape=(3, 3, 3))
        planes[ones_mask, 0] = 1        # plane 0: cells holding 1
        planes[minus_ones_mask, 1] = 1  # plane 1: cells holding -1
        planes[:, :, 2] = turn_flag     # plane 2: constant turn indicator

        # Encoded position.
        self.x = planes
        # Targets are left unset here and assigned by whoever builds the tree.
        self.value = None
        self.policy = None
        # Child nodes, stored under the key chosen by the tree builder.
        self.children = {}


class Minimax:
    """Exhaustive minimax search that records the explored game tree.

    Every position reached during the search becomes a ``Node`` whose
    ``value`` and ``policy`` are filled in from the search result.
    """

    def __init__(self, env):
        # Environment the search starts from. It is deep-copied before each
        # simulated step, so the search only ever steps copies.
        self._env = env

    def move(self, observation):
        """Search from the environment's current state and return the tree.

        Args:
            observation: Observation of the position the environment is in.

        Returns:
            The root ``Node``; its ``children``, ``value`` and ``policy`` are
            populated for the whole explored subtree.
        """
        root_player = self._env.to_play
        tree_root = Node(observation, root_player)
        self._minimax(self._env, observation, root_player, tree_root)
        return tree_root

    def _minimax(self, env, observation, player, root):
        """Expand ``root`` recursively and fill in its value and policy.

        Args:
            env: Environment positioned at ``root``; only copies are stepped.
            observation: Not read by this method; kept for the signature.
            player: Player the top-level search was started for. Decides
                whether a ply maximises or minimises.
            root: Node for the current position, completed in place.

        Returns:
            ``(selected_action, value)`` where ``value`` is the reward that
            propagates up from the selected action, as returned by the
            environment (not sign-adjusted).
        """
        mover = env.to_play
        best_action = None
        best_reward = None
        weights = {}

        for candidate in env.legal_actions():
            # Simulate the action on a private copy of the environment.
            sim = copy.deepcopy(env)
            next_obs, outcome, finished, _ = sim.step(candidate)

            child = Node(next_obs, sim.to_play)
            root.children[candidate] = child

            if finished:
                # Terminal node: store the reward multiplied by the sign of
                # the player to move there; there is nothing left to choose.
                child.value = outcome * sim.to_play
                child.policy = {}
            else:
                _, outcome = self._minimax(sim, next_obs, player, child)

            # Reward seen by the mover, shifted by +1 and halved (maps
            # [-1, 1] onto [0, 1] -- assumes rewards lie in that range).
            weights[candidate] = (outcome * mover + 1) / 2

            if (best_action is None or
                    self._better_value(best_reward, outcome, player, mover)):
                best_action, best_reward = candidate, outcome

        # Normalise the weights into a distribution; when every weight is
        # zero fall back to a uniform distribution over the actions.
        total = sum(weights.values())
        if total != 0:
            weights = {key: weight / total for key, weight in weights.items()}
        else:
            uniform = 1 / len(weights)
            weights = dict.fromkeys(weights, uniform)

        root.value = best_reward * mover
        root.policy = weights

        return best_action, best_reward

    @staticmethod
    def _better_value(current, new, player, to_play):
        """Tell whether ``new`` should replace ``current`` at this ply.

        Both rewards are first multiplied by ``player`` so that larger means
        better for the searching player. On that player's own turn a strictly
        larger reward wins; on the other player's turn a strictly smaller one
        does.
        """
        current_score = player * current
        new_score = player * new
        if player == to_play:
            return bool(new_score > current_score)
        return bool(new_score < current_score)

In [2]:
from game.tictactoe import TicTacToe


# Create the environment and expand the full minimax tree from the observation
# returned by `game.reset()`; `root` is the root Node of that tree.
game = TicTacToe()
tree = Minimax(game)
root = tree.move(game.reset())

In [3]:
def generate_dataset(node, include_root=False, num_actions=9):
    """Flatten a search tree into arrays suitable for supervised training.

    The tree is walked in pre-order (each child before its own subtree) and
    one sample is produced per visited node.

    Args:
        node: Root of the tree. Every visited node must expose ``x``,
            ``value``, ``policy`` (a dict mapping action index to weight)
            and ``children`` (a dict of child nodes).
        include_root: When True, ``node`` itself contributes the first
            sample. Defaults to False, in which case only its descendants
            are emitted.
        num_actions: Length of each dense policy vector. Defaults to 9.

    Returns:
        ``(x, (v, p))`` where ``x`` stacks the nodes' ``x`` arrays, ``v``
        holds their ``value`` and ``p`` is a ``(samples, num_actions)`` array
        with each node's policy scattered into a dense vector (zeros for
        actions missing from the policy dict).
    """

    def descendants(current):
        # Pre-order: emit a child, then everything below it.
        for child in current.children.values():
            yield child
            yield from descendants(child)

    def samples():
        if include_root:
            yield node
        yield from descendants(node)

    inputs = []
    values = []
    policies = []

    for sample in samples():
        inputs.append(sample.x)
        values.append(sample.value)
        dense = np.zeros(num_actions)
        dense[list(sample.policy.keys())] = list(sample.policy.values())
        policies.append(dense)

    return np.array(inputs), (np.array(values), np.array(policies))

In [4]:
# Flatten the minimax tree into network inputs `x`, value targets `v` and
# dense policy targets `p`.
x, (v, p) = generate_dataset(root)

In [5]:
# Sanity-check the dimensions of the three dataset arrays.
x.shape, v.shape, p.shape

((549945, 3, 3, 3), (549945,), (549945, 9))

In [6]:
from tfg.util import enable_gpu

# NOTE(review): project helper from tfg.util; presumably configures GPU usage
# for TensorFlow -- confirm against its definition.
enable_gpu()

In [107]:
from tfg.alphaZero import AlphaZero
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

# Instantiate the project's AlphaZero wrapper for this game with a custom
# network configuration, and take the model object it exposes so it can be
# trained directly on the minimax targets.
az = AlphaZero(game, nn_config=dict(residual_layers=3, kernel_size=(1, 1), regularizer_constant=0))
model = az.neural_network.model

# Compile for two outputs: mean squared error on 'value_head' and categorical
# cross-entropy on 'policy_head', weighted equally.
# NOTE(review): 'acc' is requested without naming an output, so it is probably
# reported for the value head too, where accuracy on a regression target is
# of limited meaning -- confirm whether that is intended.
model.compile(optimizer=Adam(learning_rate=az.neural_network.learning_rate),
              loss={
                  'value_head': tf.keras.losses.MeanSquaredError(),
                  'policy_head': tf.keras.losses.CategoricalCrossentropy()
              },
              loss_weights={'value_head': 0.5, 'policy_head': 0.5},
              metrics=['acc'])



In [108]:
# Use 80% of the samples for training and keep the remaining 20% aside.
size = int(len(x) * .8)

# Shuffle every sample index exactly once and cut the permutation in two, so
# the training and test indices are disjoint and contain no repeats.
# (Sampling with np.random.choice draws with replacement, and `~indices` on an
# integer array is a bitwise NOT -- it yields -i-1, i.e. the same rows counted
# from the end of the array, not the complement of the selection.)
shuffled = np.random.permutation(len(x))
indices = shuffled[:size]
test_indices = shuffled[size:]

x_train = x[indices]
y_train = v[indices], p[indices]

x_test = x[test_indices]
y_test = v[test_indices], p[test_indices]

# NOTE(review): the dataset is built from every node of the game tree, so the
# same board reached through different move orders likely appears more than
# once and can still land on both sides of the split; deduplicate upstream if
# a strict hold-out is required.

# Keep only the per-epoch metric history returned by fit().
history = model.fit(x_train, y_train, epochs=15, validation_split=.1).history

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [111]:
import plotly.express as px

# One x-axis value per recorded epoch, numbered from 1.
axis = list(range(1, len(history['value_head_loss']) + 1))
# Draw the training loss of both heads against the epoch number.
px.line(x=axis, y=[history['value_head_loss'], history['policy_head_loss']])

In [110]:
# Score the trained model on the x_test / y_test arrays built in the split cell.
model.evaluate(x_test, y_test)



[3.633585214614868,
 0.37848958373069763,
 6.888680458068848,
 0.4106842577457428,
 0.2630922198295593]

In [112]:
def print_example():
    """Print one randomly chosen sample next to the model's predictions.

    Reads the notebook-level arrays ``x``, ``v``, ``p`` and the notebook-level
    ``model``. Prints the signed board, the three input planes, the expected
    value/policy, the model outputs, and the argmax of the second output.
    """
    idx = np.random.choice(len(x))
    sample = x[idx:idx + 1]  # slicing keeps the leading batch axis
    target_value, target_policy = v[idx], p[idx]

    predictions = tuple(output.numpy() for output in model(sample))

    # Collapse the first two planes into one signed board: plane 0 marks 1,
    # plane 1 marks -1 (plane 1 takes precedence if both are set).
    plus_cells = sample[0, ..., 0] == 1
    minus_cells = sample[0, ..., 1] == 1
    board = np.where(minus_cells, -1.0, np.where(plus_cells, 1.0, 0.0))

    print(board)
    print()
    print(sample[..., 0], sample[..., 1], sample[..., 2], sep='\n\n')
    print()
    print("Expected values:")
    print(target_value, target_policy, sep=' \n')
    print()
    print("Predicted values")
    print(*predictions, sep='\n')
    print(np.argmax(predictions[1]))

In [123]:
# Show one random sample with its targets and the model's predictions.
print_example()

[[ 0. -1.  0.]
 [ 0. -1.  1.]
 [ 1.  1. -1.]]

[[[0. 0. 0.]
  [0. 0. 1.]
  [1. 1. 0.]]]

[[[0. 1. 0.]
  [0. 1. 0.]
  [0. 0. 1.]]]

[[[0. 0. 0.]
  [0. 0. 0.]
  [0. 0. 0.]]]

Expected values:
0.0 
[1. 0. 0. 0. 0. 0. 0. 0. 0.]

Predicted values
[[0.66630834]]
[[1.9943116e-06 3.2732897e-10 9.9978834e-01 2.0906444e-04 1.9628066e-08
  3.6516185e-10 1.0458939e-12 3.3853132e-07 2.0452914e-07]]
2


In [88]:
# Reset the environment and display the board.
game.reset()
game.render()

 | | 
-+-+-
 | | 
-+-+-
 | | 



In [98]:
# Play action 4, keep the resulting observation, and display the board.
# NOTE(review): the saved output shows seven marks on the board although this
# cell plays a single action after a reset, so it was evidently re-executed
# with different actions; a fresh Restart & Run All will not reproduce it.
observation, _, _, _ = game.step(4)
game.render()

X|X|O
-+-+-
X|X| 
-+-+-
O|O| 



In [99]:
# Encode the current position, add a batch axis, and query the network.
n = Node(observation, game.to_play)
v_, p_ = model(n.x.reshape(1, 3, 3, 3))
# Display both outputs and the index of the largest entry of the second one.
v_.numpy(), p_.numpy(), np.argmax(p_)

(array([[0.934794]], dtype=float32),
 array([[0.00608047, 0.30944157, 0.0160199 , 0.00049814, 0.00040816,
         0.17290425, 0.14250521, 0.1662804 , 0.18586186]], dtype=float32),
 1)