# AlphaZero Algorithm - Testing

This notebook was built to run experiments on the AlphaZero algorithm and to better understand its implementation details.

# Import libraries / modules

In [1]:
import time

import matplotlib.pyplot as plt
import numpy as np

import games_mod # Games
import policy_mod # neural network
from play_mod import Play #functionalities of game
import training_mod #neural network training
from replay_buffer_dict import ReplayBuffer #centralized buffer
from utils import DotDict #other utilities
from log_data import LogData #logging class for monitoring purposes

# Game, Training, Competition, Benchmark and Play Settings

In [2]:
# Game settings
# (presumably the board dimensions and the N-in-a-row win length; confirm in games_mod)
game_settings = DotDict(dict(
    board_size=(3, 3),
    N=3,
))

# Self-play training settings
# Rule of thumb (presumably for dir_alpha): alpha = 10 / average legal moves
# https://medium.com/oracledevs/lessons-from-alphazero-part-3-parameter-tweaking-4dceb78ed1e5
game_training_settings = DotDict(dict(
    generations=100,
    self_play_iterations=50,
    explore_steps=50,
    temp=1,
    dir_eps=0.25,
    dir_alpha=0.25,
    data_augmentation_times=1,
))

# Neural network settings
# policy_path and buffer_size_target / batch_size are read again below when the
# Policy and ReplayBuffer objects are created.
nn_training_settings = DotDict(dict(
    load_policy=False,
    policy_path="ai_ckp.pth",
    ckp_folder="ckp",
    lr=0.005,
    weight_decay=1e-4,
    buffer_size_target=1000,
    n_epochs=1,
    batch_size=50,
))

# Competition / benchmark settings
benchmark_competition_settings = DotDict(dict(
    compet_freq=0,
    compet_rounds=10,
    net_compet_threshold=0,
    benchmark_freq=5,
    benchmark_rounds=25,
    mcts_iterations=1000,
    mcts_random_moves=0,
))

# Play settings
play_settings = DotDict(dict(
    explore_steps=50,
    temperature=0.01,
))

# Training the AI

In [3]:
# Shared logger: three charts are registered, each given a CSV file name and
# the list of column names passed alongside it.
log_data = LogData()
log_data.add_chart("nn_loss", ["nn_loss.csv", ['iter', 'loss', 'value_loss', 'prob_loss']])
log_data.add_chart("buffer", ["buffer.csv", ['iter', 'wins', 'losses', 'draws']])
log_data.add_chart("compet", ["compet.csv",['iter', 'scores']])

# Game instance built from game_settings.
# NOTE(review): `game` is not referenced again in the visible cells.
game=games_mod.ConnectN(game_settings)

# Policy object created from the configured path, the network settings and the logger.
policy = policy_mod.Policy(nn_training_settings.policy_path, 
                           nn_training_settings, 
                           log_data)
# Presumably writes the freshly initialised weights to the checkpoint location
# before training starts; confirm in policy_mod.
policy.save_weights()

# Replay buffer configured with the target size and batch size from nn_training_settings.
buffer = ReplayBuffer(nn_training_settings.buffer_size_target, 
                      nn_training_settings.batch_size, 
                      log_data)

In [4]:
# Run the training pipeline and print the elapsed time in seconds.
# The measured span covers both building AlphaZeroTraining and running
# training_pipeline. perf_counter() is a monotonic clock, so the long-running
# measurement is not distorted by system clock adjustments (unlike time.time()).
t0 = time.perf_counter()
alpha_0 = training_mod.AlphaZeroTraining(
    game_settings,
    game_training_settings,
    nn_training_settings,
    benchmark_competition_settings,
    play_settings,
    policy,
    log_data)
alpha_0.training_pipeline(buffer)
t1 = time.perf_counter()
print(t1 - t0)

4105.611351013184


# Testing some positions

In [14]:
import torch
import numpy as np
import policy_mod  # neural network

def test_final_positions(game_state, policy_path="ckp/ai_ckp.pth"):
    """Print a saved policy network's output for a single board position.

    The board is converted to a float tensor, a ``policy_mod.Policy`` is
    created and its weights are loaded from ``policy_path``, then the input
    tensor and the network outputs are printed.

    Args:
        game_state: array-like board (the notebook passes 3x3 NumPy arrays
            containing -1, 0 and 1).
        policy_path: checkpoint file handed to both ``policy_mod.Policy`` and
            ``load_weights``. Defaults to the path that was previously
            hard-coded, so existing calls behave as before.

    Returns:
        None. Results are printed only.
    """
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Two unsqueeze(0) calls prepend two size-1 dimensions to the board
    # (presumably batch and channel; confirm against policy_mod).
    frame = torch.tensor(game_state, dtype=torch.float, device=device).unsqueeze(0).unsqueeze(0)
    # NOTE(review): a new Policy is built and its weights reloaded on every call.
    policy = policy_mod.Policy(policy_path)
    policy.load_weights(policy_path)
    print(frame)
    v, p = policy.forward_batch(frame)
    print("Probabilities = {}; Values = {}".format(p, v))
    

In [15]:
# Board literal written one row per line, then passed to test_final_positions.
game_state1 = np.array([
    [-1, 1, -1],
    [0, 1, 0],
    [0, 0, 0],
])
test_final_positions(game_state1)

tensor([[[[-1.,  1., -1.],
          [ 0.,  1.,  0.],
          [ 0.,  0.,  0.]]]])
Probabilities = tensor([[ 2.8055e-11,  2.8055e-11,  2.8055e-11,  2.9608e-05,  2.8055e-11,
          5.0483e-04,  1.0367e-05,  9.9900e-01,  4.5571e-04]]); Values = tensor([[ 0.9988]])


In [16]:
# Same top row and centre as game_state1, with an extra -1 at row 1, column 2.
game_state2 = np.array([
    [-1, 1, -1],
    [0, 1, -1],
    [0, 0, 0],
])
test_final_positions(game_state2)

tensor([[[[-1.,  1., -1.],
          [ 0.,  1., -1.],
          [ 0.,  0.,  0.]]]])
Probabilities = tensor([[ 2.7364e-10,  2.7364e-10,  2.7364e-10,  3.4019e-05,  2.7364e-10,
          2.7364e-10,  1.0967e-04,  9.9851e-01,  1.3413e-03]]); Values = tensor([[ 0.9975]])


## Game where the second move is not in the center

In [17]:
# Empty 3x3 board (all zeros, integer dtype).
game_state3 = np.zeros((3, 3), dtype=int)
test_final_positions(game_state3)

tensor([[[[ 0.,  0.,  0.],
          [ 0.,  0.,  0.],
          [ 0.,  0.,  0.]]]])
Probabilities = tensor([[ 0.0995,  0.0325,  0.1270,  0.0348,  0.1866,  0.0445,  0.2864,
          0.0515,  0.1372]]); Values = tensor([[ 0.2140]])


In [18]:
# Single -1 in the top-left corner; every other cell is 0.
game_state4 = np.array([
    [-1, 0, 0],
    [0, 0, 0],
    [0, 0, 0],
])
test_final_positions(game_state4)

tensor([[[[-1.,  0.,  0.],
          [ 0.,  0.,  0.],
          [ 0.,  0.,  0.]]]])
Probabilities = tensor([[ 0.0006,  0.0778,  0.0271,  0.0911,  0.5970,  0.0731,  0.0451,
          0.0466,  0.0417]]); Values = tensor([[-0.9739]])


In [19]:
# 1 in the top-left corner and -1 at row 1, column 2.
game_state5 = np.array([
    [1, 0, 0],
    [0, 0, -1],
    [0, 0, 0],
])
test_final_positions(game_state5)

tensor([[[[ 1.,  0.,  0.],
          [ 0.,  0., -1.],
          [ 0.,  0.,  0.]]]])
Probabilities = tensor([[ 0.0002,  0.0883,  0.5215,  0.0187,  0.0370,  0.0002,  0.3095,
          0.0018,  0.0226]]); Values = tensor([[ 0.9971]])


In [20]:
# -1 in both top corners and 1 at row 1, column 2.
game_state6 = np.array([
    [-1, 0, -1],
    [0, 0, 1],
    [0, 0, 0],
])
test_final_positions(game_state6)

tensor([[[[-1.,  0., -1.],
          [ 0.,  0.,  1.],
          [ 0.,  0.,  0.]]]])
Probabilities = tensor([[ 0.0002,  0.3189,  0.0002,  0.0737,  0.4490,  0.0002,  0.0321,
          0.1056,  0.0200]]); Values = tensor([[-0.9912]])


In [21]:
# Top row fully occupied (1, -1, 1) plus -1 at row 1, column 2.
game_state7 = np.array([
    [1, -1, 1],
    [0, 0, -1],
    [0, 0, 0],
])
test_final_positions(game_state7)

tensor([[[[ 1., -1.,  1.],
          [ 0.,  0., -1.],
          [ 0.,  0.,  0.]]]])
Probabilities = tensor([[ 0.0001,  0.0001,  0.0001,  0.0229,  0.4994,  0.0001,  0.3820,
          0.0158,  0.0797]]); Values = tensor([[ 0.9998]])


In [22]:
# Top row (-1, 1, -1), -1 in the centre and 1 at row 1, column 2.
game_state8 = np.array([
    [-1, 1, -1],
    [0, -1, 1],
    [0, 0, 0],
])
test_final_positions(game_state8)

tensor([[[[-1.,  1., -1.],
          [ 0., -1.,  1.],
          [ 0.,  0.,  0.]]]])
Probabilities = tensor([[ 0.0004,  0.0004,  0.0004,  0.0930,  0.0004,  0.0004,  0.4493,
          0.2257,  0.2300]]); Values = tensor([[-0.9852]])
