In [1]:
from tfg.alphaZero import create_alphazero, AlphaZero
from tfg.alphaZeroCallbacks import GameStore
from tfg.alphaZeroAdapters import TicTacToeAdapter
from tfg.util import enable_gpu, play
from tfg.strategies import Minimax, MonteCarloTree
from game.tictactoe import TicTacToe

In [2]:
# Run the project's GPU setup helper before any model is created.
# NOTE(review): the exact effect lives in tfg.util.enable_gpu -- confirm there.
enable_gpu()

# Single TicTacToe environment instance reused by every later cell.
game = TicTacToe()

In [3]:
# Callback object handed to the trainer; its `games` attribute is read in a
# later cell to inspect what was played.
store = GameStore()
adapter = TicTacToeAdapter()

# Every keyword below is forwarded unchanged to create_alphazero.
# NOTE(review): the meaning of each option is defined in tfg.alphaZero --
# confirm there before tuning.
training_options = dict(
    max_workers=10,
    self_play_times=120,
    max_games_counter=480,
    buffer_size=1500,
    callbacks=[store],
    mcts_iter=200,
    c_puct=1,
    exploration_noise=(.25, .045),
    epochs=30,
    batch_size=512,
)

# Judging by the recorded output, this call also runs the training epochs.
alphazero = create_alphazero(game, adapter, **training_options)

2021-04-02 10:39:28,521	INFO services.py:1172 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


Epoch 1/30
16/16 - 7s - loss: 1.6253 - value_head_loss: 0.6730 - policy_head_loss: 2.2691
Epoch 2/30
16/16 - 0s - loss: 1.6239 - value_head_loss: 0.6008 - policy_head_loss: 2.1873
Epoch 3/30
16/16 - 0s - loss: 1.5903 - value_head_loss: 0.5487 - policy_head_loss: 2.1595
Epoch 4/30
16/16 - 0s - loss: 1.5386 - value_head_loss: 0.5203 - policy_head_loss: 2.1105
Epoch 5/30
16/16 - 0s - loss: 1.4897 - value_head_loss: 0.5193 - policy_head_loss: 2.0429
Epoch 6/30
16/16 - 0s - loss: 1.4290 - value_head_loss: 0.5035 - policy_head_loss: 1.9650
Epoch 7/30
16/16 - 0s - loss: 1.3651 - value_head_loss: 0.4773 - policy_head_loss: 1.8876
Epoch 8/30
16/16 - 0s - loss: 1.3335 - value_head_loss: 0.4684 - policy_head_loss: 1.8510
Epoch 9/30
16/16 - 0s - loss: 1.3057 - value_head_loss: 0.4840 - policy_head_loss: 1.7928
Epoch 10/30
16/16 - 0s - loss: 1.2551 - value_head_loss: 0.4365 - policy_head_loss: 1.7512
Epoch 11/30
16/16 - 0s - loss: 1.2298 - value_head_loss: 0.4382 - policy_head_loss: 1.7096
Epoch 12

Games played: 360
Epoch 1/30
16/16 - 0s - loss: 0.9555 - value_head_loss: 0.3746 - policy_head_loss: 1.3775
Epoch 2/30
16/16 - 0s - loss: 0.8690 - value_head_loss: 0.2873 - policy_head_loss: 1.2730
Epoch 3/30
16/16 - 0s - loss: 0.8385 - value_head_loss: 0.2671 - policy_head_loss: 1.2256
Epoch 4/30
16/16 - 0s - loss: 0.8057 - value_head_loss: 0.2312 - policy_head_loss: 1.1952
Epoch 5/30
16/16 - 0s - loss: 0.7837 - value_head_loss: 0.2219 - policy_head_loss: 1.1657
Epoch 6/30
16/16 - 0s - loss: 0.7686 - value_head_loss: 0.2090 - policy_head_loss: 1.1580
Epoch 7/30
16/16 - 0s - loss: 0.7590 - value_head_loss: 0.1932 - policy_head_loss: 1.1615
Epoch 8/30
16/16 - 0s - loss: 0.7444 - value_head_loss: 0.1864 - policy_head_loss: 1.1424
Epoch 9/30
16/16 - 0s - loss: 0.7428 - value_head_loss: 0.1831 - policy_head_loss: 1.1464
Epoch 10/30
16/16 - 0s - loss: 0.7350 - value_head_loss: 0.1786 - policy_head_loss: 1.1383
Epoch 11/30
16/16 - 0s - loss: 0.7482 - value_head_loss: 0.1840 - policy_head_los

In [4]:
# Persist the trained agent so a later cell can reload it from disk.
checkpoint_path = 'models/experiment_tictactoe.h5'
alphazero.save(checkpoint_path)

In [7]:
# Quick sanity check: ten games with Minimax listed first and the AlphaZero
# agent second. The cell displays the tuple returned by play(); its middle
# entry is the number of draws (see the benchmark cell further down).
# To try a larger search budget first, call alphazero.set_max_iter(400).
minimax_opponent = Minimax(game)

play(game, minimax_opponent, alphazero, games=10)

(6, 4, 0)

In [5]:
# Build a fresh agent directly from the class; the next cell loads the saved
# weights into it. Note that this rebinds the `alphazero` name.
alphazero = AlphaZero(
    game,
    buffer_size=1024,
    mcts_iter=100,
)

In [6]:
# Restore the agent from the file written by the earlier save cell.
checkpoint_path = 'models/experiment_tictactoe.h5'
alphazero.load(checkpoint_path)

In [8]:
# Compare plain MCTS and AlphaZero against Minimax for several search budgets,
# recording how many of the games end in a draw for each budget.
GAMES_PER_SETTING = 100
iters = (2, 10, 50, 100, 200, 400)

minimax = Minimax(game)
mcts_draws = []
az_draws = []

for budget in iters:
    # Plain Monte Carlo tree search limited to `budget` iterations.
    mcts = MonteCarloTree(game, max_iter=budget, reset_tree=False)
    _, mcts_drawn, _ = play(game, minimax, mcts,
                            games=GAMES_PER_SETTING, max_workers=10)
    mcts_draws.append(mcts_drawn)

    # Same budget for the AlphaZero agent. This mutates the shared agent, so
    # it keeps the last budget once the loop finishes.
    alphazero.set_max_iter(budget)
    _, az_drawn, _ = play(game, minimax, alphazero, games=GAMES_PER_SETTING)
    az_draws.append(az_drawn)

# Display both series, index-aligned with `iters`.
mcts_draws, az_draws

([0, 9, 35, 53, 60, 80], [79, 90, 86, 87, 85, 92])

In [25]:
import plotly.graph_objects as go


# Draw counts against search iterations, one line per strategy.
draw_series = (
    ('MCTS draws', mcts_draws),
    ('AZ draws', az_draws),
)

fig = go.Figure()
for label, counts in draw_series:
    fig.add_trace(go.Scatter(x=iters, y=counts, mode='lines+markers', name=label))
fig.update_xaxes(title_text='Iterations')
fig.update_yaxes(title_text='Draws')
fig.show()

In [None]:
from tfg.strategies import HumanStrategy


# Interactive game: a human (prompted for moves) is listed first and the
# AlphaZero agent second, with the agent's search budget set to 400.
alphazero.set_max_iter(400)
human = HumanStrategy(game)

play(game, human, alphazero, render=True, print_results=True)

 | | 
-+-+-
 | | 
-+-+-
 | | 


[PLAYER] choose action: 0, 1, 2, 3, 4, 5, 6, 7, 8
> 8
 | | 
-+-+-
 | | 
-+-+-
 | |X

O| | 
-+-+-
 | | 
-+-+-
 | |X


[PLAYER] choose action: 1, 2, 3, 4, 5, 6, 7


In [5]:
# Pull the underlying network object out of the agent so it can be called
# directly in the next cell.
network_wrapper = alphazero.neural_network
model = network_wrapper.model

In [6]:
import numpy as np

# All-zero input of shape (1, 3, 3, 3): a batch of one with three 3x3 planes,
# using the default integer dtype (the same array the nested literal built).
# NOTE(review): what each plane encodes is defined by the adapter -- confirm.
board = np.zeros((1, 3, 3, 3), dtype=int)

# The model returns two outputs; take the scalar from the first and the
# first row of the second.
value_out, policy_out = model(board)
v = value_out.numpy()[0, 0]
p = policy_out.numpy()[0]

# Display the value, the policy vector and the index of its largest entry.
v, p, np.argmax(p)

(0.46693388,
 array([0.35438573, 0.02306549, 0.01683437, 0.01510538, 0.5563846 ,
        0.01145097, 0.00673587, 0.01082927, 0.00520825], dtype=float32),
 4)

In [7]:
# Sum element 1 of every stored game record into a 3x3 accumulator, printing
# and restarting the total once per block of GAMES_PER_BATCH records.
# NOTE(review): element 1 is presumably a 3x3 array (its sum prints as a 3x3
# grid) -- confirm its meaning against GameStore.
GAMES_PER_BATCH = 120
games = np.zeros((3, 3))

for index, record in enumerate(store.games):
    slot = index % GAMES_PER_BATCH
    if slot == 0:
        # First record of a block: clear the running total in place.
        games[...] = 0
    games += record[1]
    if slot == GAMES_PER_BATCH - 1:
        # Last record of a block: emit the total followed by a blank line.
        print(games, end='\n\n')

[[15. 12.  6.]
 [23. 10. 19.]
 [ 7.  9. 19.]]

[[13.  0.  5.]
 [ 7. 85.  3.]
 [ 2.  2.  3.]]

[[17.  6.  2.]
 [ 3. 81.  3.]
 [ 1.  7.  0.]]

[[59.  1.  2.]
 [ 3. 49.  2.]
 [ 1.  3.  0.]]



In [45]:
# Replay a fixed opening (action 8, then action 0), keeping the agent informed
# of every move, then let the agent choose the next action.
observation = game.reset()
# NOTE(review): update(None) presumably resets the agent's internal search
# state for a new game -- confirm in tfg.alphaZero.
alphazero.update(None)

for opening_move in (8, 0):
    observation, _, _, _ = game.step(opening_move)
    alphazero.update(opening_move)
game.render()

# Ask the agent for its move from the latest observation and play it.
action = alphazero.move(observation)
game.step(action)
game.render()

O| | 
-+-+-
 | | 
-+-+-
 | |X

O| | 
-+-+-
 |X| 
-+-+-
 | |X



In [47]:
# Tabulate the search statistics of the agent's last move: share of visits
# (pi), the node's `probability` attribute (P) and its `value`, per action.
# NOTE(review): this reads the private `_mcts` attribute of the agent.
COLUMN_GAP = '    '

actions = alphazero._mcts.stats['actions']
N = sum(node.visit_count for node in actions.values())

header = ("action  ", " pi ", "   P  ", "value")
print(COLUMN_GAP.join(header))
for move, node in actions.items():
    row = (
        f"action {move}",
        f"{node.visit_count / N:.3f}",
        f"{node.probability:.3f}",
        f"{node.value:.3f}",
    )
    print(COLUMN_GAP.join(row))

action       pi        P      value
action 1    0.015    0.043    -0.169
action 2    0.251    0.257    0.067
action 3    0.005    0.009    -0.472
action 4    0.719    0.449    0.027
action 5    0.005    0.143    -0.977
action 6    0.000    0.000    0.000
action 7    0.005    0.099    -0.762


In [44]:
# Debug inspection: display the `root` attribute of the search tree's current
# root node (private attributes of the agent).
search_root = alphazero._mcts._root
search_root.root

True