In [1]:
from tfg.alphaZero import create_alphazero, AlphaZero
from tfg.alphaZeroCallbacks import Checkpoint
from tfg.util import enable_gpu, play
from tfg.strategies import Minimax, MonteCarloTree
from game.tictactoe import TicTacToe

In [2]:
# Project helper from tfg.util; presumably configures GPU usage for the
# neural network backend — confirm in its definition.
enable_gpu()

# Game instance that is passed to every strategy and play() call below.
game = TicTacToe()

In [14]:
# Checkpoint callback handed to the agent factory below.
callback = Checkpoint('exp_chk', 'models/checkpoints', delay=4)

# Build the AlphaZero agent for `game`.
# NOTE(review): the captured output shows Keras-style epoch logs and
# "Games played" counters, so this call apparently runs self-play and
# training before returning — confirm in tfg.alphaZero.
alphazero = create_alphazero(
    game,
    max_workers=8,
    self_play_times=64,
    max_games_counter=1024,
    buffer_size=1024,
    callbacks=[callback],
    mcts_iter=100,
    c_puct=2,
    temperature=4,
    exploration_noise=(0.25, 0.045),
    nn_config={'residual_layers': 7, 'filters': 256},
)

Epoch 1/10
4/4 - 1s - loss: 1.7673 - value_head_loss: 0.7989 - policy_head_loss: 2.5845
Epoch 2/10
4/4 - 0s - loss: 1.5132 - value_head_loss: 0.5836 - policy_head_loss: 2.2378
Epoch 3/10
4/4 - 0s - loss: 1.4752 - value_head_loss: 0.5637 - policy_head_loss: 2.1317
Epoch 4/10
4/4 - 0s - loss: 1.4207 - value_head_loss: 0.5003 - policy_head_loss: 2.0476
Epoch 5/10
4/4 - 0s - loss: 1.3690 - value_head_loss: 0.4212 - policy_head_loss: 1.9948
Epoch 6/10
4/4 - 0s - loss: 1.3230 - value_head_loss: 0.3596 - policy_head_loss: 1.9444
Epoch 7/10
4/4 - 0s - loss: 1.2709 - value_head_loss: 0.3406 - policy_head_loss: 1.8454
Epoch 8/10
4/4 - 0s - loss: 1.2187 - value_head_loss: 0.3161 - policy_head_loss: 1.7566
Epoch 9/10
4/4 - 0s - loss: 1.1718 - value_head_loss: 0.2971 - policy_head_loss: 1.6764
Epoch 10/10
4/4 - 0s - loss: 1.1167 - value_head_loss: 0.2506 - policy_head_loss: 1.6098
Games played: 64
Epoch 1/10
4/4 - 0s - loss: 1.4966 - value_head_loss: 0.4923 - policy_head_loss: 2.1268
Epoch 2/10
4/4

Epoch 2/10
4/4 - 0s - loss: 1.0019 - value_head_loss: 0.2720 - policy_head_loss: 1.4736
Epoch 3/10
4/4 - 0s - loss: 0.9211 - value_head_loss: 0.2501 - policy_head_loss: 1.3336
Epoch 4/10
4/4 - 0s - loss: 0.8816 - value_head_loss: 0.1948 - policy_head_loss: 1.3077
Epoch 5/10
4/4 - 0s - loss: 0.8150 - value_head_loss: 0.1872 - policy_head_loss: 1.1796
Epoch 6/10
4/4 - 0s - loss: 0.7961 - value_head_loss: 0.1750 - policy_head_loss: 1.1528
Epoch 7/10
4/4 - 0s - loss: 0.7637 - value_head_loss: 0.1554 - policy_head_loss: 1.1077
Epoch 8/10
4/4 - 0s - loss: 0.7536 - value_head_loss: 0.1474 - policy_head_loss: 1.0967
Epoch 9/10
4/4 - 0s - loss: 0.7300 - value_head_loss: 0.1308 - policy_head_loss: 1.0684
Epoch 10/10
4/4 - 0s - loss: 0.7126 - value_head_loss: 0.1144 - policy_head_loss: 1.0533
Games played: 640
Epoch 1/10
4/4 - 0s - loss: 1.1057 - value_head_loss: 0.3126 - policy_head_loss: 1.6448
Epoch 2/10
4/4 - 0s - loss: 0.9157 - value_head_loss: 0.2425 - policy_head_loss: 1.3366
Epoch 3/10
4/

In [4]:
# Persist the agent to disk; the same path is read back by
# alphazero.load(...) further down.
alphazero.save('models/experiment_tictactoe.h5')

In [6]:
# Override c_puct on the existing agent (it was created with c_puct=2), then
# play 5 games with Minimax passed first and the agent second. The tuple
# returned by play() is the cell's displayed output.
alphazero.c_puct = 1.5
play(game, Minimax(game), alphazero, games=5)

(5, 0, 0)

In [5]:
# Rebind `alphazero` to a freshly constructed agent; every cell below uses
# this object instead of the one returned by create_alphazero(...).
# NOTE(review): unlike the create_alphazero(...) call, no nn_config is passed
# here — confirm that load() restores the trained architecture (or that the
# defaults match residual_layers=7, filters=256).
alphazero = AlphaZero(game, buffer_size=1024, mcts_iter=100)

In [6]:
# Restore the agent from the file written by alphazero.save(...) above.
alphazero.load('models/experiment_tictactoe.h5')

In [8]:
# Benchmark: for each search budget in `iters`, play 100 games with Minimax
# passed first against (a) a plain MonteCarloTree and (b) the AlphaZero agent,
# and record the middle element of play()'s result, which this notebook
# treats as the number of draws.
mcts_draws = []
az_draws = []
minimax = Minimax(game)
iters = (2, 10, 50, 100, 200, 400)

for m in iters:
    mcts = MonteCarloTree(game, max_iter=m, reset_tree=False)
    # Index the result instead of unpacking into `_`: binding `_` at notebook
    # top level makes IPython stop updating its `_`/`__`/`___` last-output
    # variables for the rest of the session.
    draws = play(game, minimax, mcts, games=100, max_workers=10)[1]
    mcts_draws.append(draws)

    # Mutates the shared agent; after the loop it presumably stays configured
    # with the last budget in `iters` until set_max_iter is called again.
    alphazero.set_max_iter(m)
    draws = play(game, minimax, alphazero, games=100)[1]
    az_draws.append(draws)

mcts_draws, az_draws

([0, 9, 35, 53, 60, 80], [79, 90, 86, 87, 85, 92])

In [25]:
import plotly.graph_objects as go


# Guard against stale or partial kernel state (e.g. an interrupted benchmark
# cell): each series must have exactly one value per entry in `iters`,
# otherwise the curves would be silently mis-plotted.
assert len(mcts_draws) == len(iters) == len(az_draws), \
    'mcts_draws / az_draws are out of sync with iters; re-run the benchmark cell'

# Draw counts of both strategies against Minimax, per search budget.
fig = go.Figure()
fig.add_trace(go.Scatter(x=iters, y=mcts_draws, mode='lines+markers', name='MCTS draws'))
fig.add_trace(go.Scatter(x=iters, y=az_draws, mode='lines+markers', name='AZ draws'))
# A title lets the figure stand alone when the notebook is skimmed or exported.
fig.update_layout(title_text='Draws against Minimax by search iterations')
fig.update_xaxes(title_text='Iterations')
fig.update_yaxes(title_text='Draws')
fig.show()

In [13]:
from tfg.strategies import HumanStrategy


# Set the agent's search budget to 1000 for the interactive game.
alphazero.set_max_iter(1000)

# Rendered match with the human strategy passed first and the agent second;
# `games` is left at play()'s default. The returned tuple is the cell output.
play(game, HumanStrategy(game), alphazero, render=True, print_results=True)

 | | 
-+-+-
 | | 
-+-+-
 | | 


[PLAYER] choose action: 0, 1, 2, 3, 4, 5, 6, 7, 8
> 4
 | | 
-+-+-
 |X| 
-+-+-
 | | 

 | | 
-+-+-
 |X|O
-+-+-
 | | 


[PLAYER] choose action: 0, 1, 2, 3, 6, 7, 8
> 0
X| | 
-+-+-
 |X|O
-+-+-
 | | 

X| | 
-+-+-
 |X|O
-+-+-
 | |O


[PLAYER] choose action: 1, 2, 3, 6, 7
> 2
X| |X
-+-+-
 |X|O
-+-+-
 | |O

X| |X
-+-+-
 |X|O
-+-+-
O| |O


[PLAYER] choose action: 1, 3, 7
> 1
X|X|X
-+-+-
 |X|O
-+-+-
O| |O

PLAYER 1 WON


(1, 0, 0)

In [7]:
# Keep a direct handle on the agent's underlying network so it can be called
# on a hand-built input in the next cell.
model = alphazero.neural_network.model

In [10]:
import numpy as np

# Hand-built input of shape (1, 3, 3, 3): one sample made of three 3x3 planes.
# NOTE(review): the planes presumably encode player-1 stones, player-2 stones
# and a side-to-move indicator — confirm against the project's board encoding.
board = np.zeros((1, 3, 3, 3), dtype=int)
board[0, 0, [0, 1], [0, 1]] = 1  # plane 0: cells (0, 0) and (1, 1)
board[0, 1, [1, 2], [0, 1]] = 1  # plane 1: cells (1, 0) and (2, 1)
# plane 2 stays all zeros

# The network returns two outputs, unpacked as (value, policy); strip the
# batch dimension from both.
v, p = model(board)
v, p = v.numpy()[0, 0], p.numpy()[0]

# Scalar value, policy vector, and the index of the highest-scoring move.
v, p, np.argmax(p)

(0.070715435,
 array([3.8721971e-02, 5.0386719e-02, 3.7229776e-01, 3.6403816e-02,
        3.0246456e-04, 2.9178083e-04, 3.6230662e-01, 5.1153586e-03,
        1.3417354e-01], dtype=float32),
 2)