In [1]:
import random
import copy
import json
from itertools import tee
from tqdm import trange
from board import Board
from player import Player

In [2]:
def pairwise(iterable):
    """Return an iterator over consecutive overlapping pairs of `iterable`.

    For elements s0, s1, s2, ... the result yields (s0, s1), (s1, s2), ...
    An input with fewer than two elements yields nothing.
    """
    lead, follow = tee(iterable, 2)
    # Drop the first element of the second copy so the two copies are offset by one.
    next(follow, None)
    return zip(lead, follow)

In [3]:
# Hyperparameters handed to Player(epsilon, alpha) when the agent is created.
# NOTE(review): alpha looks like the initial learning rate (it is lowered by
# player.reduce_alpha() after every training game) — confirm in player.py.
alpha = 0.5
# NOTE(review): epsilon is presumably the exploration rate; 0 would mean the
# player never picks a random move while training — confirm this is intended.
epsilon = 0

### Train the player

In [4]:
# Shared game objects; the cells further down reuse `board` and `player`.
board = Board()
player = Player(epsilon, alpha)

# how many self-play games to train on
num_train = 10

for n in trange(num_train):
    # every game starts from a cleared board ...
    board.reset()
    # ... and a fresh history of the positions reached, used for the value backup below
    board_states = []

    # in roughly half of the games 'o' opens on a uniformly random square (0..8)
    if random.random() < 0.5:
        board.add_o(random.choice(range(9)))

    # Self-play: the same player moves as 'x' every turn. After a non-winning
    # move the board is inverted (presumably swapping the two sides' marks —
    # confirm in board.py) so the next move is made for the other side.
    game_over = board.is_full()
    while not game_over:
        action = player.choose_action(board.get_state())
        board.add_x(action)
        # remember the position right after the move
        board_states.append(board.get_state())
        if player.has_won(board.get_state()):
            # a win ends the game immediately, without inverting the board
            game_over = True
        else:
            board.inverse()
            game_over = board.is_full()

    # walk the game backwards, passing each (earlier, later) pair of positions
    # to the player so it can update its values
    board_states.reverse()
    for later_state, earlier_state in pairwise(board_states):
        player.update_values(earlier_state, later_state)
    player.reduce_alpha()
    # alternative schedule (disabled): player.set_alpha(1/(n+1))

# persist what was learned
player.save_policy("params/policy.json")

100%|██████████| 10/10 [00:00<00:00, 4685.85it/s]


In [5]:
# show the player's alpha after training (reduce_alpha() ran once per training game)
player.alpha

0.4950224401048741

### Test if the player learned the optimal strategy

In [6]:
# no random actions anymore
# Switch the player to playing mode (no random actions anymore), then load a stored policy.
# NOTE(review): this loads "params/policy_10M_training.json", not the
# "params/policy.json" file written by the training cell — confirm that is intended.
player.playing_mode()
player.load_policy("params/policy_10M_training.json")

# 'o' opens one game from each of these squares
# (presumably the four corners of the 3x3 board — confirm the index layout in board.py)
o_indices = [0,2,6,8]
# number of test matches: one per opening square
num_test = len(o_indices)
# number of matches that finished in a draw
num_draw = 0

for o_idx in o_indices:

    # fresh board, with 'o' opening on the square under test
    board.reset()
    board.add_o(o_idx)

    # a game counts as a draw unless some move wins it
    is_draw = True
    while is_draw and not board.is_full():
        # the player moves as 'x'
        action = player.choose_action(board.get_state())
        board.add_x(action)
        if player.has_won(board.get_state()):
            # decisive game; call board.print() here to inspect the final position
            is_draw = False
        else:
            # nobody has won yet: invert the board for the next move
            board.inverse()

    num_draw += int(is_draw)

print("Finished testing")
print('   number of draws : ', num_draw, " of ", num_test)

Finished testing
   number of draws :  2  of  4


### Play against it

In [36]:
# reset the board before starting an interactive game in the next cell
board.reset()

In [48]:
# Square the human ('o') plays this turn. Set it to None to skip the human
# move, e.g. to let the trained player make the first move of a game.
o_idx = 7    # None or index

# start a new game once the previous board has filled up
if board.is_full():
    board.reset()

# Place the human's mark. None means "no human move this turn", so add_o is
# only called when an actual index was given.
if o_idx is not None:
    board.add_o(o_idx)

# the trained player ('x') answers if there is still a free square
if not board.is_full():
    action = player.choose_action(board.get_state())
    board.add_x(action)

board.print()


['x', 'x', 'o']
['o', 'o', 'x']
['x', 'o', '-']



In [45]:
# look up the entry stored for one board state in the player's state table
# (presumably the learned value of that position; the 9-character string looks
# like the board encoding with '-' for an empty square — confirm in player.py)
player.states.get('----o---x')

0.31520587085280605