### Import useful libraries

In [1]:
import random
import copy
import json
from itertools import tee
from tqdm import trange
from board import Board
from player import Player

In [2]:
# iterate over a list pairwise
def pairwise(iterable):
    """Return an iterator over overlapping neighbour pairs of ``iterable``.

    For input ``s0, s1, s2, ...`` the result yields ``(s0, s1), (s1, s2), ...``.
    Inputs with fewer than two elements produce no pairs.
    """
    current, following = tee(iterable, 2)
    # shift the second copy forward by one element; the default value keeps an
    # empty input from raising StopIteration
    next(following, None)
    return zip(current, following)

### Train the player

In [3]:
# player parameters
alpha = 0.5   # learning rate handed to the Player constructor below

# game board and learning player; both objects are reused by the later cells
board = Board()
player = Player(alpha)
# optional: start from previously saved values instead of training from scratch
# player.load_policy("params/values_10M_training.json")

In [4]:
# number of training matches
num_train = 10 ** 7
# number of completed matches between two learning-rate reductions
alpha_decay_interval = 10 ** 5

for n in trange(num_train):
    # reset board for new game
    board.reset()
    # board states recorded after every move; used to backpropagate the
    # value once the game has ended
    board_states = []

    # 1/2 of the time o-player plays first
    if random.random() < 0.5:
        # o-player starts in a random one of the 9 cells
        board.add_o(random.randrange(9))

    # continue playing until board is full or someone won
    while not board.is_full():
        # x-player chooses an action
        action = player.choose_action(board.get_state())
        # update board
        board.add_x(action)
        # store board state for training later
        board_states.append(board.get_state())
        # check if player won
        if player.has_won(board.get_state()):
            break
        # if nobody won yet, inverse the board
        # NOTE(review): presumably this swaps the two sides so the same player
        # object can also move for the opponent -- confirm in board.py
        board.inverse()

    # backpropagate value of game to update the policy: walk the recorded
    # states from last to first and update each earlier state (state_km1)
    # from its successor (state_k)
    board_states.reverse()
    for state_k, state_km1 in pairwise(board_states):
        player.update_values(state_km1, state_k)

    # reduce learning rate once every `alpha_decay_interval` completed matches;
    # counting completed matches (n + 1) keeps the very first match (n == 0)
    # from triggering a reduction
    if (n + 1) % alpha_decay_interval == 0:
        player.reduce_alpha()
    # alternative schedule (disabled):
    # player.set_alpha(1/(n+1))

# save player's policy
# NOTE(review): the traceback recorded under this cell mentions `save_policy`,
# so that output predates this call -- confirm Player.save_values exists before
# starting another long training run
player.save_values("params/values.json")

100%|██████████| 10000000/10000000 [32:16<00:00, 5164.22it/s]


AttributeError: 'Player' object has no attribute 'save_policy'

In [46]:
# Inspect the value stored for the state string "----x----" (a single 'x' in
# the fifth of the nine positions).
# NOTE(review): presumably `player.states` maps state strings to learned
# values -- confirm in player.py
player.states.get("----x----")

0.6597808029835397

### Test if the player learned the optimal strategy

In [8]:
# evaluation: put the player in playing mode so it takes no random actions
player.playing_mode()
# player.load_policy("params/values_10M_training.json")

# the o-player opens once from each corner of the board
o_indices = [0, 2, 6, 8]
# number of testing matches
num_test = len(o_indices)
# number of matches that finished in a draw
num_draw = 0

for o_idx in o_indices:
    # fresh board with the o-player's opening move in the chosen corner
    board.reset()
    board.add_o(o_idx)

    # the match counts as a draw unless a win is detected below
    is_draw = True
    while is_draw and not board.is_full():
        # player 'x' picks a move and places it on the board
        action = player.choose_action(board.get_state())
        board.add_x(action)
        if player.has_won(board.get_state()):
            # a win ends the match; clearing the flag also stops the loop
            is_draw = False
        else:
            # nobody won yet: inverse the board before the next move
            board.inverse()

    if is_draw:
        num_draw += 1

print("Finished testing")
print('   number of draws : ', num_draw, " of ", num_test)

Finished testing
   number of draws :  4  of  4


### Play against the trained player

In [87]:
# reset board for a new interactive game
board.reset()

In [91]:
o_idx = 7    # cell index (0-8) for the o-player's move, or None to skip it

# start a new game once the previous one has filled the board
if board.is_full():
    board.reset()

# o-player plays the chosen cell; None means no o move is made in this run
if o_idx is not None:
    board.add_o(o_idx)

if not board.is_full():
    # player 'x' plays
    action = player.choose_action(board.get_state())

    # update board
    board.add_x(action)

board.print()


['o', 'x', 'x']
['x', 'o', 'o']
['-', 'o', 'x']



In [80]:
# Print the value stored for the state string 'oxoxox---'.
# NOTE(review): presumably a learned state value from `player.states` --
# confirm the meaning in player.py
print(player.states.get('oxoxox---'))

9.799192918181171e-06


In [74]:
# Print the value stored for the state string 'xoxoxo---' (the previous
# lookup's state with 'x' and 'o' swapped).
# NOTE(review): presumably a learned state value from `player.states` --
# confirm the meaning in player.py
print(player.states.get('xoxoxo---'))

0.8361312809238517
