### Import useful libraries

In [1]:
import random
from datetime import datetime
from tqdm import trange
from board import Board
from player import Player

### Train the player

Analyze how the performance of the learning algorithm depends on the order of the norm, the learning rate, and the value assigned to a draw (not necessarily 0.5).

In [2]:
# Signs placed on the board: `me` is the human side, `agent` is the RL player.
me, agent = 'o', 'x'

# Training settings, passed to Player.train() in the training cells.
num_train = 1000000
lr = 0.2
# NOTE(review): presumably the interval (in training games) at which the
# learning rate is reduced — confirm in player.py.
lr_red_steps = 100000

# 3x3 board shared by the training, testing and playing cells.
board = Board(nrow=3, ncol=3, sign_play=[agent, me])

In [3]:
# player with full exploitation
player = Player(sign=agent, order=float('inf'))
player.train(board, num_train=num_train, lr=lr, lr_red_steps=lr_red_steps)
player.save_args("params/player-"+datetime.now().strftime("%y%m%d-%H%M"))

100%|██████████| 1000000/1000000 [2:36:39<00:00, 106.39it/s] 


In [4]:
# player with full exploration
player = Player(sign=agent, order=0)
player.train(board, num_train=num_train, lr=lr, lr_red_steps=lr_red_steps)
player.save_args("params/player-"+datetime.now().strftime("%y%m%d-%H%M"))

100%|██████████| 1000000/1000000 [2:10:31<00:00, 127.69it/s] 


In [5]:
# player with exploration and exploitation (order p = 1)
player = Player(sign=agent, order=1)
player.train(board, num_train=num_train, lr=lr, lr_red_steps=lr_red_steps)
player.save_args("params/player-"+datetime.now().strftime("%y%m%d-%H%M"))

100%|██████████| 1000000/1000000 [2:10:14<00:00, 127.97it/s] 


In [6]:
# player with exploration and exploitation (order p = 2)
player = Player(sign=agent, order=2)
player.train(board, num_train=num_train, lr=lr, lr_red_steps=lr_red_steps)
player.save_args("params/player-"+datetime.now().strftime("%y%m%d-%H%M"))

100%|██████████| 1000000/1000000 [2:27:15<00:00, 113.17it/s] 


In [3]:
# Player mixing exploration and exploitation (order p = 10): train it, then
# store the result via save_args() under a timestamped file name.
player = Player(sign=agent, order=10)
player.train(
    board,
    num_train=num_train,
    lr=lr,
    lr_red_steps=lr_red_steps,
)
save_path = "params/player-" + datetime.now().strftime("%y%m%d-%H%M")
player.save_args(save_path)

  n = sum( [ abs(v)**ord for v in values ] )
  0%|          | 2529/1000000 [00:27<3:00:08, 92.29it/s] 


KeyboardInterrupt: 

### Test if the player learned the optimal strategy

In [None]:
# Parameter file to evaluate; the name follows the "params/player-<yymmdd-HHMM>"
# pattern used by the training cells.
params_file = "params/player-201016-2059"
player.load_args(params_file)
# NOTE(review): playing_mode() presumably switches the player from training to
# playing behaviour — confirm in player.py.
player.playing_mode()

In [None]:
# Self-play evaluation: the trained player makes every move of each match and
# we count the matches that fill the board without anybody winning.
num_test = 1000   # number of testing matches
num_draw = 0      # number of matches that finished in a draw

for game in trange(num_test):
    # every match starts from a reset board
    board.reset()

    # with probability 0.5 the `me` sign is placed on a random cell first
    if random.random() < 0.5:
        start_row = random.choices(range(board.get_nrow()))[0]
        # NOTE(review): the column index is also drawn from get_nrow(); this is
        # fine for the square board used here — confirm before going non-square.
        start_col = random.choices(range(board.get_nrow()))[0]
        board.add(me, row=start_row, col=start_col)

    while not board.is_full():
        # the RL agent chooses its move and the board is updated with it
        move = player.choose_action(board)
        board.add(sign=agent, row=move[0], col=move[1])
        # a win ends the match; it does not count as a draw
        if board.is_won():
            break
        # nobody won yet: inverse the board before the next move
        # (presumably this swaps the two signs — confirm in board.py)
        board.inverse()
    else:
        # the loop ended without a win, i.e. the board is full: a draw
        num_draw += 1

print("Finished testing")
print('   number of draws : ', num_draw, " of ", num_test)

### Play against it

In [None]:
# Reset the board so the interactive game in the next cell starts a new game.
board.reset()

In [None]:
# One round against the trained player: an optional human move followed by the
# agent's reply. A full board is reset first so a new game can begin.
if board.is_full():
    board.reset()

# Human move as [row, col]; set to None to skip the human move.
my_pos = [1, 2]
if my_pos:
    board.add(me, row=my_pos[0], col=my_pos[1])

# Player 'x' replies only while the board is neither full nor won.
if not (board.is_full() or board.is_won()):
    board = player.play(board)

board.print()

In [None]:
# Set up a fixed position on a freshly reset board:
# the agent's sign at (row 0, col 0) and the human's sign at (row 1, col 1).
board.reset()
board.add(agent, row=0, col=0)
board.add(me, row=1, col=1)

# Let the trained player make its move; play() returns the board, which is rebound here.
board = player.play(board)

# Show the resulting position.
board.print()


### Similar boards converge to similar values

In [None]:
# The eight boards below are the rotations and reflections of one position
# (two 'o' and two 'x' placed); print the value the player holds for each of
# them to see whether symmetric boards ended up with similar values.
symmetric_boards = (
    # the four rotations
    'o---x-ox-',
    'o-oxx----',
    '-xo-x---o',
    '----xxo-o',
    # their mirror images
    'ox--x-o--',
    '---xx-o-o',
    '--o-x--xo',
    'o-o-xx---',
)
for state in symmetric_boards:
    print(player.get_value(state))

In [None]:
# Print the value the player holds for all eight rotations/reflections of one
# position (three 'x' and three 'o' placed), to check that symmetric boards
# converged to similar values.
#
# Fix: the last entry used to repeat 'xo-oxox--' (already listed among the
# rotations), so one of the eight symmetric variants was never inspected.
# The missing variant is the transpose of the first board: 'x--oxoxo-'.
symmetric_boards = (
    # the four rotations
    'xox-xo-o-',
    '--xoxo-ox',
    '-o-ox-xox',
    'xo-oxox--',
    # their mirror images
    '-o--xoxox',
    '-oxoxo--x',
    'xoxox--o-',
    'x--oxoxo-',
)
for state in symmetric_boards:
    print(player.get_value(state))