# Setup a game

In [1]:
from ConnectN import ConnectN

# Board size and the winning condition N, kept together so later cells can reuse them
game_setting = dict(size=(3, 3), N=3)

# Build a ConnectN game from those settings
game = ConnectN(**game_setting)



In [2]:
game.move((0, 1))
# state:  the board after the move
# player: first player is 1, second player is -1
# score:  None while the game is not over, 0 for a draw,
#         1 if the first player won, -1 if the second player won
print(game.state, game.player, game.score, sep="\n")

[[0. 1. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
-1
None


In [3]:
# Remaining moves of the scripted game, alternating between the two players
remaining_moves = [
    (0, 0),  # player -1
    (1, 1),  # player +1
    (1, 0),  # player -1
    (2, 1),  # player +1
]
for location in remaining_moves:
    game.move(location)

print(game.state, game.player, game.score, sep="\n")

[[-1.  1.  0.]
 [-1.  1.  0.]
 [ 0.  1.  0.]]
1
1


# Play a game interactively

In [4]:
import matplotlib.pyplot as plt
%matplotlib qt

from Play import Play
import os
# Setting KMP_DUPLICATE_LIB_OK avoids a crash of the ipykernel kernel
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"

# A player given as None is controlled by mouse clicks; here both sides are.
gameplay = Play(ConnectN(**game_setting), player1=None, player2=None)

plt.show()

# Initialize an AI to play the game
We need to define a neural network for tic-tac-toe that takes the game state as input and outputs both a policy (a probability for each move) and a critic value (an estimate of the game's outcome).

## Tentative Exercise:
Code up your own policy for training

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from math import *
import numpy as np
import random


class Policy(nn.Module):
    """Actor-critic network for the 3x3 board.

    A shared trunk (one 2x2 convolution followed by one linear layer) feeds
    two heads: a policy head with one logit per board cell and a value head
    squashed by tanh.
    """

    def __init__(self):
        super(Policy, self).__init__()

        # Shared trunk.
        # Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0, bias=True)
        # Output width is (W - F + 2P) / S + 1 = (3 - 2 + 0) / 1 + 1 = 2, so a
        # 3x3 board gives 16 feature maps of size 2x2.
        self.conv = nn.Conv2d(1, 16, kernel_size=2, stride=1, bias=False)
        self.size = 2*2*16  # flattened conv output: 16 maps of 2x2
        self.fc = nn.Linear(self.size, 32)

        # layers for the policy (one logit per board cell)
        self.fc_action1 = nn.Linear(32, 16)
        self.fc_action2 = nn.Linear(16, 9)

        # layers for the critic (a single scalar passed through tanh)
        self.fc_value1 = nn.Linear(32, 8)
        self.fc_value2 = nn.Linear(8, 1)
        self.tanh_value = nn.Tanh()

    def forward(self, x):
        """Evaluate a board.

        Args:
            x: board tensor with 9 cells whose occupied cells hold +1 or -1.
               The final ``view(3, 3)`` means a single board per call, e.g.
               shape (1, 1, 3, 3).

        Returns:
            (prob, value): ``prob`` is a (3, 3) tensor of move probabilities
            that is exactly zero on occupied cells and sums to one over the
            free cells; ``value`` is the tanh output of the critic head.
        """
        # shared trunk: [CONV-RELU] -> flatten -> [LINEAR-RELU]
        y = F.relu(self.conv(x))
        y = y.view(-1, self.size)
        y = F.relu(self.fc(y))

        # the action head: raw logits, one per cell
        a = F.relu(self.fc_action1(y))
        a = self.fc_action2(a)

        # A cell holding +1 or -1 is taken. The mask is computed from x
        # directly, so it lives on the same device as the input instead of
        # being forced onto the CPU.
        occupied = (torch.abs(x) == 1).reshape(-1, 9)

        # Masked softmax. Illegal moves get a logit of -inf, so their
        # probability is exactly zero. The max used for numerical stability
        # is taken over the legal logits only: a large logit on an occupied
        # cell can then no longer underflow every legal move to zero and
        # produce 0/0.
        masked_logits = a.masked_fill(occupied, float('-inf'))
        exp = torch.exp(masked_logits - torch.max(masked_logits))
        prob = exp / torch.sum(exp)

        # the value head (critic)
        value = F.relu(self.fc_value1(y))
        value = self.tanh_value(self.fc_value2(value))
        return prob.view(3, 3), value

# Instantiate the network and print its layer summary.
# No optimizer is created here: the "Training Loop" section further down
# builds a fresh Policy together with its Adam optimizer.
policy = Policy()
# (optimizer creation intentionally deferred to the training section)
print(policy)

Policy(
  (conv): Conv2d(1, 16, kernel_size=(2, 2), stride=(1, 1), bias=False)
  (fc): Linear(in_features=64, out_features=32, bias=True)
  (fc_action1): Linear(in_features=32, out_features=16, bias=True)
  (fc_action2): Linear(in_features=16, out_features=9, bias=True)
  (fc_value1): Linear(in_features=32, out_features=8, bias=True)
  (fc_value2): Linear(in_features=8, out_features=1, bias=True)
  (tanh_value): Tanh()
)


## Define a player that uses MCTS and the expert policy + critic to play a game

We've introduced a new parameter
$T$ = temperature

This tells us how to choose the next move based on the MCTS results

$$p_a = \frac{N_a^{\frac{1}{T}}}{\sum_{a'} N_{a'}^{\frac{1}{T}}}$$

As $T \rightarrow 0$, we choose the action with the largest $N_a$.

In [6]:
import MCTS

from copy import copy
import random

def Policy_Player_MCTS(game, n_explore=50, temperature=0.1):
    """Choose a move for ``game`` with MCTS guided by the module-level ``policy``.

    Args:
        game: the current game; a ``copy()`` of it becomes the root of the
            search tree.
        n_explore: number of ``explore`` calls made on the root before a
            move is chosen (default 50, the previously hard-coded value).
        temperature: temperature passed to ``Node.next`` when picking the
            move (default 0.1, the previously hard-coded value).

    Returns:
        The ``last_move`` of the game held by the chosen child node.
    """
    mytree = MCTS.Node(copy(game))  # make a copy and initialize the MCTS root
    for _ in range(n_explore):
        mytree.explore(policy)

    # next() also returns search statistics (v, nn_v, p, nn_p); only the
    # chosen child is needed to read off the move.
    mytreenext, _stats = mytree.next(temperature=temperature)

    return mytreenext.game.last_move

def Random_Player(game):
    """Return one of ``game.available_moves()`` chosen uniformly at random."""
    legal_moves = game.available_moves()
    return random.choice(legal_moves)


In [7]:
# Start from an empty board and ask the MCTS player for a move.
game = ConnectN(**game_setting)
print(game.state)
# Last expression of the cell, so the chosen move is displayed as the cell output.
Policy_Player_MCTS(game)

[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu!

# Play a game against the policy

In [None]:
# % matplotlib notebook


# player1=None: the first player moves by mouse click; the MCTS player answers as player 2.
gameplay = Play(ConnectN(**game_setting), player1=None, player2=Policy_Player_MCTS)


# Training Loop

In [None]:
# initialize our alphazero agent and optimizer
import torch.optim as optim

# A fresh game, a fresh network, and an Adam optimizer over the network's parameters
game = ConnectN(**game_setting)
policy = Policy()
optimizer = optim.Adam(policy.parameters(), lr=1e-2, weight_decay=1e-4)

## Tentative exercise:
Code up the AlphaZero loss function, defined to be
$$L = \sum_t \left\{ (v^{(t)}_\theta - z)^2  - \sum_a p^{(t)}_a \log \pi_\theta(a|s_t) \right\} + \textrm{constant}$$ 
The constant term is $\sum_t \sum_a p^{(t)}_a \log p^{(t)}_a$. It is added so that $L=0$ when $v_\theta^{(t)} = z$ and $p^{(t)}_a = \pi_\theta(a|s_t)$, which gives us a metric of progress.

In [None]:
import torch
import wandb
from tqdm import tqdm
from collections import deque
import MCTS

# Initialize wandb
wandb.init(project="alpha-zero-training", name="alpha-zero")

episodes = 800
outcomes = []  # final outcome of every self-play episode
losses = []    # scalar loss of every episode

with tqdm(total=episodes, desc="Training Progress") as pbar:
    for e in range(episodes):
        # each episode, initialize a top node for the MCTS
        mytree = MCTS.Node(ConnectN(**game_setting))

        vterm = []    # per-move critic values, multiplied by the player to move
        logterm = []  # per-move policy term of the loss (cross-entropy minus constant)

        # whenever the game is not over yet, explore the tree 50 steps
        while mytree.outcome is None:
            for _ in range(50):
                mytree.explore(policy)

            current_player = mytree.game.player  # keep track of the player

            # v: expected outcome computed from the tree search
            # nn_v: the critic value evaluating the current board
            # p: a list of prob of taking each action computed from MCTS
            # nn_p: similar to p but from the policy network directly
            mytree, (v, nn_v, p, nn_p) = mytree.next()  # increment to the next tree
            mytree.detach_mother()

            # Policy part of the loss: -sum_a p_a * log(nn_p_a), shifted by
            # sum_a p_a * log(p_a) so that it is zero when nn_p equals p.
            loglist = torch.log(nn_p) * p
            # zeros_like keeps the fill value on p's device and dtype
            # (a bare torch.tensor(0.) is always a CPU float32 scalar).
            constant = torch.where(p > 0, p * torch.log(p), torch.zeros_like(p))
            logterm.append(-torch.sum(loglist - constant))

            vterm.append(nn_v * current_player)

        # Compute the "policy_loss" for computing gradient
        outcome = mytree.outcome
        outcomes.append(outcome)

        loss = torch.sum((torch.stack(vterm) - outcome) ** 2 + torch.stack(logterm))
        optimizer.zero_grad()

        loss.backward()
        loss_value = float(loss)  # convert once; reused for the history and for wandb
        losses.append(loss_value)
        optimizer.step()

        # Log metrics to wandb
        wandb.log({"episode": e + 1, "loss": loss_value, "outcome": outcome})

        # Update tqdm progress bar
        pbar.update(1)

        del loss

# Finish wandb run
wandb.finish()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mstan_fu[0m ([33mstanfu[0m). Use [1m`wandb login --relogin`[0m to force relogin


Training Progress: 100%|██████████| 800/800 [01:56<00:00,  6.90it/s]


0,1
episode,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,█▅▆▅▅▄▇▅▂▆▂▂▂▃▁▃▂▆▁▂▁▂▁▁▁▂▂▁▁▂▁▁▂▂█▁▁▁▆▁
outcome,▁██▅▅▅██▅█▅▅▅▅▅▅▅▁▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅▅█▅▅▅█▅

0,1
episode,800.0
loss,0.51857
outcome,0.0


In [None]:
# plot your losses

import matplotlib.pyplot as plt

# % matplotlib notebook
# One point per training episode; labels make the figure readable on its own.
fig, ax = plt.subplots()
ax.plot(losses)
ax.set(xlabel="episode", ylabel="loss", title="Training loss per episode")
plt.show()


# Play a game against your alphazero agent !

In [None]:
# Play as the first player: player1=None means your moves come from mouse clicks.
# NOTE(review): the saved traceback for this cell shows Play.click calling
# self.player1 while it is None - confirm Play.py handles a human player1.
gameplay = Play(ConnectN(**game_setting), player1=None, player2=Policy_Player_MCTS)


Traceback (most recent call last):
  File "c:\Users\97575\anaconda3\envs\udacity-py36\lib\site-packages\matplotlib\cbook\__init__.py", line 224, in process
    func(*args, **kwargs)
  File "c:\ML_Projects\RL_review\2.codes\4.AlphaZero\Play.py", line 141, in click
    loc = self.player1(self.game)
TypeError: 'NoneType' object is not callable


In [None]:
%matplotlib qt

# Play as the second player: the MCTS agent is player 1, and player2=None
# means your moves come from mouse clicks.
gameplay = Play(ConnectN(**game_setting), player1=Policy_Player_MCTS, player2=None)


In [None]:
%matplotlib qt

# AI vs AI: both sides are driven by the MCTS agent.
gameplay = Play(ConnectN(**game_setting), player1=Policy_Player_MCTS, player2=Policy_Player_MCTS)
