In [1]:
from tfg.alphaZero import create_alphazero, AlphaZero
from tfg.alphaZeroCallbacks import Checkpoint
from tfg.alphaZeroAdapters import TicTacToeAdapter
from tfg.util import enable_gpu, play
from tfg.strategies import Minimax, MonteCarloTree
from game.tictactoe import TicTacToe

In [2]:
# Call the project's GPU helper before anything else touches the model.
# NOTE(review): presumably this configures TensorFlow GPU usage — confirm in tfg.util.
enable_gpu()

# Single TicTacToe instance; every strategy and match below is built around it.
game = TicTacToe()

In [4]:
# Checkpoint callback configured with name 'exp_chk', directory 'models/checkpoints' and delay=4.
# NOTE(review): `callback` is never used — the call below passes callbacks=None,
# so this Checkpoint is not attached to training. Confirm whether that is intended
# or whether the callback should be passed (check the expected type of `callbacks`).
callback = Checkpoint('exp_chk', 'models/checkpoints', delay=4)

# Build the AlphaZero agent for TicTacToe through the project factory.
# The captured output of this cell shows Keras-style epoch logs and "Games played"
# counters, so this call appears to run self-play training as well as construction
# — TODO confirm in tfg.alphaZero.create_alphazero.
# NOTE(review): no random seed is set anywhere visible, so results are presumably
# not reproducible run to run.
alphazero = create_alphazero(game, TicTacToeAdapter(),
                             max_workers=8, self_play_times=64,
                             max_games_counter=1024, buffer_size=512,
                             callbacks=None, mcts_iter=100, c_puct=1.25,
                             temperature=5, exploration_noise=(.25, .045))

2021-03-24 20:39:56,508	INFO services.py:1172 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


Epoch 1/10
4/4 - 2s - loss: 1.5691 - value_head_loss: 0.6122 - policy_head_loss: 2.3758
Epoch 2/10
4/4 - 0s - loss: 1.4381 - value_head_loss: 0.4824 - policy_head_loss: 2.1917
Epoch 3/10
4/4 - 0s - loss: 1.3864 - value_head_loss: 0.4417 - policy_head_loss: 2.0808
Epoch 4/10
4/4 - 0s - loss: 1.3749 - value_head_loss: 0.4337 - policy_head_loss: 2.0286
Epoch 5/10
4/4 - 0s - loss: 1.3416 - value_head_loss: 0.4108 - policy_head_loss: 1.9583
Epoch 6/10
4/4 - 0s - loss: 1.3026 - value_head_loss: 0.4072 - policy_head_loss: 1.8661
Epoch 7/10
4/4 - 0s - loss: 1.2304 - value_head_loss: 0.3793 - policy_head_loss: 1.7377
Epoch 8/10
4/4 - 0s - loss: 1.1726 - value_head_loss: 0.3453 - policy_head_loss: 1.6488
Epoch 9/10
4/4 - 0s - loss: 1.1459 - value_head_loss: 0.3682 - policy_head_loss: 1.5686
Epoch 10/10
4/4 - 0s - loss: 1.0808 - value_head_loss: 0.3298 - policy_head_loss: 1.4748
Games played: 64
Epoch 1/10
4/4 - 0s - loss: 1.5211 - value_head_loss: 0.5456 - policy_head_loss: 2.1390
Epoch 2/10
4/4

Epoch 3/10
4/4 - 0s - loss: 0.8371 - value_head_loss: 0.3822 - policy_head_loss: 1.0214
Epoch 4/10
4/4 - 0s - loss: 0.7923 - value_head_loss: 0.3493 - policy_head_loss: 0.9642
Epoch 5/10
4/4 - 0s - loss: 0.7522 - value_head_loss: 0.3143 - policy_head_loss: 0.9192
Epoch 6/10
4/4 - 0s - loss: 0.7341 - value_head_loss: 0.2987 - policy_head_loss: 0.9002
Epoch 7/10
4/4 - 0s - loss: 0.7146 - value_head_loss: 0.2912 - policy_head_loss: 0.8720
Epoch 8/10
4/4 - 0s - loss: 0.6988 - value_head_loss: 0.2666 - policy_head_loss: 0.8693
Epoch 9/10
4/4 - 0s - loss: 0.6874 - value_head_loss: 0.2583 - policy_head_loss: 0.8601
Epoch 10/10
4/4 - 0s - loss: 0.6731 - value_head_loss: 0.2391 - policy_head_loss: 0.8562
Games played: 640
Epoch 1/10
4/4 - 0s - loss: 1.0421 - value_head_loss: 0.4870 - policy_head_loss: 1.3523
Epoch 2/10
4/4 - 0s - loss: 0.8339 - value_head_loss: 0.3613 - policy_head_loss: 1.0639
Epoch 3/10
4/4 - 0s - loss: 0.7702 - value_head_loss: 0.3020 - policy_head_loss: 0.9939
Epoch 4/10
4/

In [4]:
# Persist the agent to disk; a later cell reloads this same path with alphazero.load(...).
alphazero.save('models/experiment_tictactoe.h5')

In [5]:
# Play 100 games with Minimax listed first and the AlphaZero agent second.
# The result is a 3-tuple; presumably (first-player wins, draws, second-player wins)
# — TODO confirm against tfg.util.play.
play(game, Minimax(game), alphazero, games=100)

(73, 27, 0)

In [5]:
# Rebind `alphazero` to a freshly constructed AlphaZero object (replacing the one built above).
# NOTE(review): no adapter argument is passed here, whereas the create_alphazero call
# used TicTacToeAdapter() — confirm the default is compatible with the saved model.
# NOTE(review): buffer_size=1024 differs from the 512 used for training — confirm it is irrelevant for inference.
alphazero = AlphaZero(game, buffer_size=1024, mcts_iter=100)

In [6]:
# Load the file written earlier by alphazero.save('models/experiment_tictactoe.h5').
alphazero.load('models/experiment_tictactoe.h5')

In [8]:
# Compare plain MCTS with the AlphaZero agent against Minimax at several search
# budgets, recording the draw count (second element of play's result) per budget.
mcts_draws = []
az_draws = []
minimax = Minimax(game)
iters = (2, 10, 50, 100, 200, 400)

for budget in iters:
    # Plain Monte Carlo tree search limited to `budget` iterations.
    mcts = MonteCarloTree(game, max_iter=budget, reset_tree=False)
    _, mcts_n_draws, _ = play(game, minimax, mcts, games=100, max_workers=10)
    mcts_draws.append(mcts_n_draws)

    # Same budget for the AlphaZero agent.
    alphazero.set_max_iter(budget)
    _, az_n_draws, _ = play(game, minimax, alphazero, games=100)
    az_draws.append(az_n_draws)

# Displayed as the cell output.
mcts_draws, az_draws

([0, 9, 35, 53, 60, 80], [79, 90, 86, 87, 85, 92])

In [25]:
import plotly.graph_objects as go


# Line chart of draws against Minimax (out of 100 games per point) as a function
# of the search iteration budget, one trace per agent.
fig = go.Figure()
for trace_name, draw_counts in (('MCTS draws', mcts_draws), ('AZ draws', az_draws)):
    fig.add_trace(go.Scatter(x=iters, y=draw_counts, mode='lines+markers', name=trace_name))
# A title lets the figure stand alone when the notebook is skimmed.
fig.update_layout(title_text='Draws against Minimax (100 games per iteration budget)')
fig.update_xaxes(title_text='Iterations')
fig.update_yaxes(title_text='Draws')
fig.show()

In [13]:
from tfg.strategies import HumanStrategy


# Raise the agent's iteration budget to 1000 for the interactive game.
alphazero.set_max_iter(1000)

# Interactive match: the human strategy is listed first, AlphaZero second.
# With render=True the board is printed after each move and the human is prompted
# for an action (see the captured output); print_results=True prints the outcome line.
# NOTE(review): this cell blocks on keyboard input, so it cannot run unattended
# under Restart & Run All.
play(game, HumanStrategy(game), alphazero, render=True, print_results=True)

 | | 
-+-+-
 | | 
-+-+-
 | | 


[PLAYER] choose action: 0, 1, 2, 3, 4, 5, 6, 7, 8
> 4
 | | 
-+-+-
 |X| 
-+-+-
 | | 

 | | 
-+-+-
 |X|O
-+-+-
 | | 


[PLAYER] choose action: 0, 1, 2, 3, 6, 7, 8
> 0
X| | 
-+-+-
 |X|O
-+-+-
 | | 

X| | 
-+-+-
 |X|O
-+-+-
 | |O


[PLAYER] choose action: 1, 2, 3, 6, 7
> 2
X| |X
-+-+-
 |X|O
-+-+-
 | |O

X| |X
-+-+-
 |X|O
-+-+-
O| |O


[PLAYER] choose action: 1, 3, 7
> 1
X|X|X
-+-+-
 |X|O
-+-+-
O| |O

PLAYER 1 WON


(1, 0, 0)

In [6]:
# Keep a handle on the agent's underlying network so it can be called directly on a board array.
model = alphazero.neural_network.model

In [9]:
import numpy as np

# Three 3x3 planes describing one hand-built position.
# NOTE(review): presumably one plane per player's pieces plus an all-zero third
# plane — confirm the encoding against TicTacToeAdapter.
first_plane = [[1, 0, 0],
               [0, 1, 1],
               [0, 0, 0]]
second_plane = [[0, 0, 0],
                [1, 0, 0],
                [1, 1, 0]]
third_plane = [[0, 0, 0],
               [0, 0, 0],
               [0, 0, 0]]

# Batch of one position: resulting array shape is (1, 3, 3, 3).
board = np.array([[first_plane, second_plane, third_plane]])

# The network returns two outputs; unwrap the single batch entry of each.
value_out, policy_out = model(board)
v = value_out.numpy()[0, 0]
p = policy_out.numpy()[0]

# Show the value, the first channel of the policy output, and the flat argmax index.
v, p[..., 0], np.argmax(p)

(-0.12563615,
 array([[0.11519483, 0.11831883, 0.09242978],
        [0.12356576, 0.11612709, 0.10849004],
        [0.10646517, 0.11223781, 0.10717058]], dtype=float32),
 3)