### Import useful libraries

In [1]:
import random
from datetime import datetime
from tqdm import trange
from board import Board
from player import Player
from utils import *

### Test if the player learned the optimal strategy

In [2]:
# Signs placed on the board: 'o' for the human side, 'x' for the agent.
me = 'o'
agent = 'x'

# 3x3 board; the agent's sign is listed first in sign_play.
board = Board(sign_play=[agent, me], nrow=3, ncol=3)
# NOTE(review): order=float('inf') presumably corresponds to the
# "data/values/order-inf" file loaded further down — confirm.
player = Player(order=float('inf'), sign=agent)

In [3]:
# Scratch (disabled): build a mid-game position by hand and inspect it.
# board.add(sign='o', row=0, col=0)
# board.add(sign='x', row=1, col=1)
# board.add(sign='o', row=2, col=2)
# board.add(sign='x', row=0, col=1)
#
# board.print()
# print(board.get_state())
# print(board.get_next_states(sign=agent))

# Run one million simulations from the current board and keep the two
# dictionaries returned next to the chosen action (N and Q, used below as
# visit counts and values). The recorded run of this cell took about 2h45m.
a, N, Q = player.choose_action_mcts(
    board,
    num_sim=1_000_000,
    return_dicts=True,
)

# Scratch (disabled): run a single simulation with empty tables instead.
# N = {}
# Q = {}
# P = {}
# c = 1
# N, Q = player.mcts_simulation(board, N, Q, P, c)

100%|██████████| 1000000/1000000 [2:44:39<00:00, 101.22it/s] 


In [22]:
# Persist both tables produced by the MCTS run; the file names carry the
# number of simulations (1000000).
for _table, _path in ((Q, "data/mcts-Q-1000000"), (N, "data/mcts-N-1000000")):
    save_dict(data=_table, file_name=_path)

In [11]:
import math

# Inputs of the UCB score computed in the next cell:
#   P - per-state lookup table (empty here, so the fallback value is used)
#   c - factor multiplying the exploration term
P = dict()
c = 1

In [31]:
random.seed(100)

# NOTE(review): `copy` is not imported by name in the import cell; it
# presumably arrives through `from utils import *` — confirm.
# Replay one simulation on a private copy so `board` itself is not modified.
board_cpy = copy.deepcopy(board)
# States reached after each move of this simulation, in order.
board_states = []

# The outcome defaults to a draw and is raised to 1 when a move wins the game.
reward = 0.5
while not board_cpy.is_full():

    # Count a visit of the position we are about to move from. This is needed
    # because the board is inverted between moves (self-play), so this key
    # differs from the one counted right after the previous move.
    current_state = board_cpy.get_state()
    N[current_state] = N.get(current_state, 0) + 1

    # Score every reachable state: stored value (0.5 if unseen) plus an
    # exploration bonus that shrinks with the state's own visit count.
    next_states = board_cpy.get_next_states(sign=agent)
    parent_visits = N.get(current_state)
    sqrt_parent = math.sqrt(parent_visits)
    ucb_state = [
        Q.get(child, 0.5)
        + c * P.get(child, 1/len(next_states)) * sqrt_parent / (1 + N.get(child, 0))
        for child in next_states
    ]

    print(["{0:0.10f}".format(score) for score in ucb_state])

    # Draw the move at random with weights normalize(ucb_state, 5).
    # NOTE(review): this assumes get_free_positions() lists squares in the same
    # order as get_next_states() lists the resulting states — confirm.
    action = random.choices(board_cpy.get_free_positions(), weights=normalize(ucb_state, 5))[0]
    # Variant with order float('inf'), kept for reference:
    # action = random.choices(board_cpy.get_free_positions(), weights=normalize(ucb_state, float('inf')))[0]
    board_cpy.add(sign=agent, row=action[0], col=action[1])

    board_cpy.print()

    # Count and record the position that the move produced.
    reached_state = board_cpy.get_state()
    N[reached_state] = N.get(reached_state, 0) + 1
    board_states.append(reached_state)

    # A winning move ends the simulation with the full reward.
    if board_cpy.is_won():
        reward = 1
        break
    # Otherwise swap sides so the next move is again made as `agent`.
    board_cpy.inverse()

['0.5033397155', '0.5033303513', '0.5033278869', '0.5033303493', '0.5033298234', '0.5033259254', '0.5033303858', '0.5033303493', '0.5033303432']

 x - - 
 - - - 
 - - - 
['0.4975853495', '0.4707377071', '0.4990018264', '0.5022875662', '0.5008225714', '0.4946384147', '0.4969784418', '0.4991832479']

 o - - 
 - x - 
 - - - 
['0.5008247948', '0.4991169679', '0.5007832216', '0.5008021714', '0.4886066180', '0.5008219737', '0.5006111639']

 x x - 
 - o - 
 - - - 
['0.5104387560', '0.5098797155', '0.5102564683', '0.5092206018', '0.5100964276', '0.5098797155']

 o o x 
 - x - 
 - - - 
['0.4951249631', '0.4951249631', '0.4998437795', '0.4951249631', '0.4951249631']

 x x o 
 - o - 
 x - - 
['0.5085656611', '0.5004061860', '0.3430008865', '0.5057092163']

 o o x 
 x x - 
 o - - 
['0.5029285032', '0.5016525977', '0.5017299253']

 x x o 
 o o x 
 x - - 
['0.5016460245', '0.5011898925']

 o o x 
 x x o 
 o x - 
['0.5051220678']

 x x o 
 o o x 
 x o x 


In [32]:
# States reachable from the current board by one move of the agent.
next_states = board.get_next_states(sign=agent)
# Number of recorded visits for each of those states (0 when never seen).
next_counts = [N.get(candidate, 0) for candidate in next_states]
# Print normalize(next_counts, 1) followed by the states the entries belong to;
# no action is selected in this cell.
print(normalize(next_counts, 1))
print(next_states)

[0.04899506909368722, 0.06684173000712987, 0.12458463289197505, 0.05242900384892687, 0.45335238630466024, 0.07098965119662727, 0.11263086001365974, 0.04912606660473451, 0.021050600038599265]
['x--------', '-x-------', '--x------', '---x-----', '----x----', '-----x---', '------x--', '-------x-', '--------x']


In [6]:
# Display the whole Q table (board-state string -> stored value); the output is long.
Q

{'-xxooxo-x': 1.0,
 '-o-xxox-o': 0.0,
 '-x--oxo-x': 0.6,
 '-o--x-x-o': 0.026315789473684223,
 '-x--o---x': 0.498323085522639,
 '----x---o': 0.5026752164090509,
 '--------x': 0.49805234905705065,
 '-ox-xoxox': 1.0,
 '-xo-ox-xo': 0.0,
 '-ox-x--ox': 1.0,
 '-xo----xo': 0.06666666666666667,
 '--x----ox': 0.9814814814814815,
 '-------xo': 0.0,
 'xxo-xoo-x': 1.0,
 '-ox-oxx-o': 0.0,
 '-x--xoo-x': 0.8552631578947368,
 '-o---xx-o': 0.0,
 '-x---o--x': 0.6666666666666667,
 '-----x--o': 0.1875,
 'xoo-x---x': 1.0,
 'oxx-----o': 0.0,
 'x-o-----x': 0.5,
 '--x-----o': 0.0,
 'xooooxxxx': 1.0,
 'oxxxxo-oo': 0.0,
 'xo-oox-xx': 1.0,
 'ox-xx--oo': 0.0,
 'xo-o---xx': 1.0,
 '-x-x---oo': 0.0,
 '-o-----xx': 0.9166666666666667,
 '-x------o': 0.1,
 'ooxoxxxox': 1.0,
 'xx-xoooxo': 0.0,
 '-o-oxxxox': 1.0,
 '-x-xoo-xo': 0.0,
 '---oxx-ox': 0.875,
 '---x-o-xo': 0.0,
 '---o-x--x': 0.5888888888888889,
 '---x----o': 0.0,
 'oxxxooxox': 0.5,
 'x-ooxxoxo': 0.5,
 'o-xx-oxox': 0.5,
 'x-oo-x-xo': 0.5,
 'o-xx-o--x': 0.5,
 'x--o

In [None]:
# Display the whole N table (board-state string -> visit count); the output is long.
N

In [None]:
# Load stored values into the player from "data/values/order-inf".
# NOTE(review): presumably these are the values learned with order = inf,
# matching the Player created at the top — confirm.
player.load_values("data/values/order-inf")
# Mode switches, currently disabled:
# player.playing_mode()
# player.training_mode()

In [None]:
# Let the agent play `num_test` games against itself (the board is inverted
# after every move) and count how many of them end without a winner.

# number of testing matches
num_test = 1000
# number of matches that finished in a draw
num_draw = 0

for n in trange(num_test):
    # reset board for new game
    board.reset()

    if random.random() < 0.5:
        # In about half of the games the other side opens on a random square.
        # The square is drawn from the free positions, so both row and column
        # are valid even on a non-square board (the column used to be drawn
        # from the row count). Assumes reset() leaves every square free.
        start = random.choices(board.get_free_positions())[0]
        board.add(me, row=start[0], col=start[1])

    # assume the game will end in a draw; undone below if somebody wins
    num_draw += 1
    while not board.is_full():
        # RL agent chooses an action
        action = player.choose_action(board)
        # update board
        board.add(sign=agent, row=action[0], col=action[1])
        # a win means the game was not a draw after all
        if board.is_won():
            num_draw -= 1
            break
        # if nobody won yet, inverse the board so the agent also plays the reply
        board.inverse()

print("Finished testing")
print('   number of draws : ', num_draw, " of ", num_test)

### Play against the agent

In [None]:
# Reset the board before starting a game against the agent.
board.reset()

In [None]:
# Run this cell once per turn: it places the human move, lets the agent
# answer and prints the board.

# Start over automatically once every square is taken.
# NOTE(review): a game that ended with a win before the board filled up is
# not reset here — run the reset cell above first in that case.
if board.is_full():
    board.reset()

# Human move as [row, col]; use None to skip the human move this turn.
my_pos = [2,1]
if my_pos:
    board.add(me, row=my_pos[0], col=my_pos[1])

# The agent ('x') only answers while the game is still open.
if not (board.is_full() or board.is_won()):
    board = player.play(board)

board.print()

In [None]:
# Print the stored value of eight hand-picked states, one per line.
# NOTE(review): presumably these are symmetric variants of the same
# position and should therefore share one value — confirm.
for _state_key in (
    'o---x-ox-', 'o-oxx----', '-xo-x---o', '----xxo-o',
    'ox--x-o--', '---xx-o-o', '--o-x--xo', 'o-o-xx---',
):
    print(player.get_value(_state_key))

In [None]:
# Expected value of every state printed below, depending on the order:
#   order = 0 -> 0.333
#   order = 1 -> 0.233
#   order = 2 -> 0.106
for _state_key in (
    'x-o-xx-oo', '--xox-oxo', 'oo-xx-o-x', 'oxo-xox--',
    'o-xxx-oo-', 'x---xooxo', '-oo-xxx-o', 'oxoox---x',
):
    print(player.get_value(_state_key))

In [None]:
# Expected value of every state printed below, depending on the order:
#   order = 0 -> 0.75
#   order = 1 -> 0.5
#   order = 2 -> 0.5
for _state_key in (
    'o-xxoo-xx', '-xoxo-xox', 'xx-ooxx-o', 'xox-oxox-',
    'x-oooxxx-', 'ox--oxxox', '-xxxooo-x', 'xoxxo--xo',
):
    print(player.get_value(_state_key))

In [None]:
# Single-state check of the stored value.
# for order = 0 expect value = 0

print(player.get_value('xxooxx-oo'))

In [None]:
# Scratch check of what `d[k] += d.get(k, 0) + 1` does to an existing key.
dico = dict(a=1, b=2)

# Expanded form of the augmented assignment: the old value (1) is added to
# get('a', 0) + 1 (= 2), so 'a' becomes 3 rather than 2.
dico['a'] = dico['a'] + (dico.get('a', 0) + 1)

print(dico)