# AlphaZero Algorithm - Testing

This notebook was built to conduct experiences on the AlphaZero Algorithm and better understand its implementation details

# Import libraries / modules

In [1]:
# Python libraries
import time

# z3rd party libraries
import matplotlib.pyplot as plt
import numpy as np

# Game-related libraries
import games_mod # Games
import policy_mod # neural network
from play_mod import Play
import training_mod
from game_utils import DotDict, policy_player_mcts, random_player, match_ai, network_only
from log_data import LogData

# Game, Training and Play Settings

In [2]:
# Game settings
game_settings = DotDict({
    "board_size": (3,3),
    "N": 3
})

# Self-play training settings
game_training_settings = DotDict({
    "comp_interval":1000,
    "episods": 100,
    "self_play_iterations": 50,
    "explore_steps": 500,
    "temp_threshold": [50, 0.01],
    "dir_eps": 0.25,
    "dir_alpha": 1.0 
})

# temp_threshold: [x,y] means "up to x episods, applies y temperature"

# neural network settings
nn_training_settings = DotDict({
    "load_policy": False,
    "ai_ckp": "",
    "lr": .01, 
    "weight_decay": 1.e-4,
    "training_steps":30,
    "buffer_size":1500,
    "batch_size": 20
})

# play settings
play_settings = DotDict({
    "explore_steps": 50,
    "temperature": 0.01                         
})

buffer_size = nn_training_settings.buffer_size
batch_size = nn_training_settings.batch_size

## Self-Play Competition

In [3]:
total_wins, total_losses = match_ai(game_settings, play_settings, network_only, network_only, total_rounds = 1)

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
tensor([[[[ 0.,  0.,  0.],
          [ 0.,  0.,  0.],
          [ 0.,  0.,  0.]]]]) [[0.13761409 0.0903748  0.15134129]
 [0.08667693 0.11487675 0.08641278]
 [0.14266936 0.07666049 0.11337356]]
[[0. 0. 1.]
 [0. 0. 0.]
 [0. 0. 0.]]
tensor([[[[-0., -0., -1.],
          [-0., -0., -0.],
          [-0., -0., -0.]]]]) [[0.08661634 0.1030232  0.        ]
 [0.08133116 0.22488971 0.10132253]
 [0.20639579 0.08938055 0.10673177]]
[[ 0.  0.  1.]
 [ 0. -1.  0.]
 [ 0.  0.  0.]]
tensor([[[[ 0.,  0.,  1.],
          [ 0., -1.,  0.],
          [ 0.,  0.,  0.]]]]) [[0.19766623 0.10648648 0.        ]
 [0.11144192 0.         0.15153173]
 [0.11327257 0.09444978 0.22206922]]
[[ 0.  0.  1.]
 [ 0. -1.  0.]
 [ 0.  0.  1.]]
tensor([[[[-0., -0., -1.],
          [-0.,  1., -0.],
          [-0., -0., -1.]]]]) [[0.08774607 0.14426005 0.        ]
 [0.09272014 0.         0.50685096]
 [0.05548812 0.11263628 0.        ]]
[[ 0.  0.  1.]
 [ 0. -1. -1.]
 [ 0.  0.  1.]]
tensor([[[[ 0., 

In [4]:
import torch

In [5]:

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

game_state1 = np.array([[1, 0, 0], [0, 0, 0], [-1, -1, 0]])

game_state2 = np.array([[-1, 0, 0], [0, 0, 0], [1, 1, 0]])


frame1 = torch.tensor(game_state1, dtype=torch.float, device=device).unsqueeze(0)
frame2 = torch.tensor(game_state2, dtype=torch.float, device=device).unsqueeze(0)

policy_path = "ai_ckp.pth"
policy = policy_mod.Policy()
policy.load_weights(policy_path)

new_tensor = torch.stack((frame1, frame2))
v, p = policy.forward_batch(new_tensor)
v1 = v.detach().numpy()
v2 = v.detach().numpy()

p1 = p.detach().numpy()
p2 = p.detach().numpy()

print("Probabilities = {}, {}; Values = {}, {}".format(p1, p2, v1, v2))

Probabilities = [[1.23858033e-03 1.09221607e-01 3.44767809e-01 1.17241390e-01
  1.60804167e-01 1.22051708e-01 1.23858033e-03 1.23858033e-03
  1.42197609e-01]
 [1.73365478e-07 1.00252895e-04 2.01736786e-03 2.14605927e-04
  2.68485583e-03 9.72873648e-04 1.73365478e-07 1.73365478e-07
  9.94009495e-01]], [[1.23858033e-03 1.09221607e-01 3.44767809e-01 1.17241390e-01
  1.60804167e-01 1.22051708e-01 1.23858033e-03 1.23858033e-03
  1.42197609e-01]
 [1.73365478e-07 1.00252895e-04 2.01736786e-03 2.14605927e-04
  2.68485583e-03 9.72873648e-04 1.73365478e-07 1.73365478e-07
  9.94009495e-01]]; Values = [[0.07414395]
 [0.99949706]], [[0.07414395]
 [0.99949706]]
