# Train the contender on self-play episodes of the champion

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to the project code take
# effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from alphazero.interfaces import TrainParams
from pickle import Unpickler
from alphazero.gomoku_model import GomokuModel
from alphazero.gomoku_board import GomokuBoard
from domoku.board import GomokuBoard as LegacyBoard

import numpy as np
from domoku import tools as gt

In [3]:
# Edge length of the playable Gomoku board (15 x 15 intersections).
# The model below is built with BOARD_SIZE + 2.
BOARD_SIZE: int = 15

----

## Training from pre-produced trajectories

In [4]:
# Both splits use the same file name; only the directory differs.
EXAMPLES_FILE_NAME = 'checkpoint_0.pth.tar.examples'

trainings_file = 'training_data/' + EXAMPLES_FILE_NAME
test_file = 'test_data/' + EXAMPLES_FILE_NAME

In [5]:
# NOTE: unpickling can execute arbitrary code - only load files you produced
# yourself or otherwise trust.
# Each file unpickles to an indexable object; element 0 is the collection of
# examples used throughout this notebook.
with open(trainings_file, "rb") as train_fh:
    train_payload = Unpickler(train_fh).load()
training_examples = train_payload[0]

with open(test_file, "rb") as test_fh:
    test_payload = Unpickler(test_fh).load()
test_examples = test_payload[0]

In [6]:
# Number of examples per split: training on the first line, test on the second.
print(len(training_examples), len(test_examples), sep="\n")

3168
240


In [8]:
# The model is sized two cells larger than the board; the stored states are
# 17 x 17 x 3 accordingly (presumably a one-cell border around the 15 x 15
# board - confirm against GomokuModel).
the_model = GomokuModel(BOARD_SIZE + 2, kernel_size=11)
# the_model.policy.summary()

# Build a batch of one from the first training example's state by adding a
# leading batch axis and casting to float.
first_state = training_examples[0][0]
sample = np.expand_dims(first_state, axis=0).astype(float)
print(sample.shape)

# Before training the network output is just noise:
# the_model.policy(sample, debug=True)

(1, 17, 17, 3)


In [12]:
# TODO: Consider two different parameter sets, one for data, one for training

# Single source of truth for the epoch budget. It used to be written twice
# (once in TrainParams, once in the train() call), so the two could silently
# drift apart.
N_EPOCHS = 2000  # up to 2000?

# NOTE(review): `params` is built here but not passed to anything in the
# visible notebook; the_model.train() only receives the epoch count. Confirm
# whether it is still needed in this notebook.
params = TrainParams(
    epochs_per_train=N_EPOCHS,
    update_threshold=0.6,
    max_queue_length=8192,    # Number of game examples to keep to train the neural networks.
    num_simulations=25, # may want way more
    arena_compare=2,         # Number of games to play during arena play to evaluate new network.
    cpuct=1.0,
    checkpoint_dir='./examples/',
    load_model=False,
    load_folder_file=('/dev/models/8x100x50', 'best.pth.tar'),
    num_iters_for_train_examples_history=4,
    num_iterations=2, # Will be way more when really going for it...
    num_episodes=2,  # Again, possibly way more
    temperature_threshold=12
)

# Long-running: fits the model on the training examples, passing the test
# examples and the epoch budget. The saved output shows one progress line
# every 100 epochs.
the_model.train(training_examples, test_examples, n_epochs=N_EPOCHS)

Epoch: 1, Training: 1.7313604354858398, Test: 5.843680381774902
Epoch: 101, Training: 1.6979451179504395, Test: 5.927306175231934
Epoch: 201, Training: 1.6564377546310425, Test: 5.989526271820068
Epoch: 301, Training: 1.6279456615447998, Test: 6.040534973144531


KeyboardInterrupt: 

In [14]:
# Pick one stored training example (the 81st from the end) for inspection.
# Presumably each example is a (state, policy, value) triple - the printout
# shows its length, the state's shape and the third entry.
ex = training_examples[-81]
n_fields, state_shape, third_entry = len(ex), ex[0].shape, ex[2]
print(n_fields, state_shape, third_entry)

# Second entry scaled by 100, laid out as the 15 x 15 board and truncated
# to integers for a compact view.
target_percent = (np.array(ex[1]) * 100).reshape((15, 15)).astype(int)
print(target_percent)

# Print the channels of the example's first entry.
gt.print_channels(ex[0])

3 (17, 17, 3) 0.07549842984757062
[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  2  0  0  1  0]
 [ 0  0  0  0  0  0  0  0  0  0  3  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  1  0  2  1  1  0]
 [ 0  0  0  0  0 13  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 16  3  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  3  0  0  0  0]
 [ 0  0  0  0  0  0  0  3  9  0 10  0  0  0  0]
 [ 0  0  0  0  0  0  0 15  0  0  0  7  0  0  0]
 [ 0  0  0  0  0  1  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  1  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
shape: (17, 17, 3)
[[3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]
 [3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3]
 [3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3]
 [3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3]
 [3 0 0 0 0 0 0 2 0 0 0 2 0 0 0 0 3]
 [3 0 0 0 0 0 0 0 1 1 0 2 0 0 0 0 3]
 [3 

In [15]:
# Run the trained model on the inspected example (batch of one, float input)
# so its output can be compared with the stored target printed above.
batch = np.expand_dims(ex[0], axis=0).astype(float)
p, v = the_model.predict(batch)

# First output scaled by 100 as a 15 x 15 integer grid, then a blank line.
predicted_percent = (p * 100).numpy().reshape((15, 15)).astype(int)
print(predicted_percent, end="\n\n")
print(f"Value: {v}")

[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  2  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  3  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  1  0  0  2  1  0  0]
 [ 0  0  0  0  0 11  0  0  0  0  0  1  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 13  4  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  1  0  0  0  0]
 [ 0  0  0  0  0  0  0  3 10  0  6  0  0  0  0]
 [ 0  0  0  0  0  0  0 19  0  0  0  3  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]

Value: [[0.10616709]]


## Testing the policy on unseen boards



In [18]:
# Same path that test_file was already given before training; restated here,
# presumably so this section can be run without scrolling back.
test_file = 'test_data/checkpoint_0.pth.tar.examples'

In [19]:
# Unpickler is already imported in the notebook's import cell, so the
# duplicate mid-notebook import is dropped.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
with open(test_file, "rb") as f:
    # Element 0 of the pickled object is the collection of examples.
    examples = Unpickler(f).load()[0]

In [20]:
# Pick one held-out example (the 63rd from the end) and show its length,
# the shape of its first entry and its third entry.
ex = examples[-63]
summary = (len(ex), ex[0].shape, ex[2])
print(*summary)

# Second entry scaled by 100, as a 15 x 15 integer grid.
scaled_target = np.array(ex[1]) * 100
print(scaled_target.reshape((15, 15)).astype(int))

# Print the channels of the example's first entry.
gt.print_channels(ex[0])

3 (17, 17, 3) -0.02214566632676415
[[ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 46  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 53  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
shape: (17, 17, 3)
[[3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3]
 [3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3]
 [3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3]
 [3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3]
 [3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3]
 [3 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 3]
 [3

In [21]:
# Run the trained model on the held-out example (batch of one, float input).
sample = np.expand_dims(ex[0], axis=0).astype(float)
p, v = the_model.predict(sample)

# First output scaled by 100 as a BOARD_SIZE x BOARD_SIZE integer grid.
print((p * 100).numpy().reshape((BOARD_SIZE, BOARD_SIZE)).astype(int))
print()

# The previous int(v) truncated toward zero, so any prediction of magnitude
# below 1 was displayed as "Value: 0". Print the scalar as a float instead.
print(f"Value: {float(np.squeeze(v)):.4f}")

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 1 0 1 0 0 5 0 1 0 0 0 0 0]
 [0 0 1 1 2 0 5 4 1 0 0 0 0 0 0]
 [0 0 0 3 1 0 0 6 2 0 0 0 0 0 0]
 [0 0 0 0 1 2 0 0 0 0 0 0 0 0 0]
 [0 0 4 3 0 0 2 0 0 0 0 0 0 0 0]
 [0 0 0 2 1 0 3 3 0 0 0 0 0 0 0]
 [0 0 0 2 4 0 0 0 2 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]

Value: 0
