In [None]:
import random
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tqdm
import tqdm.notebook  # plain `import tqdm` does not load this submodule

from mcts_agent import MCTSAgent
from min_max_agent import MinMaxAgent
from qlearning_agent import QleaningAgent, play_game
from tic_tac_toe import TicTacToe, TikTakCounter

Крестики-нолики -- это, конечно, далеко не го, и обычный альфа-бета поиск с отсечением здесь наверняка может работать идеально вплоть до довольно больших досок. Однако мы всё-таки для этого учебного задания будем реализовывать более практически релевантный метод MCTS -- заодно фактически получится и упражнение на многоруких бандитов.

Вспомогательные функции

In [16]:
def random_step(env: TicTacToe):
    """Return one move chosen uniformly at random from ``env.getEmptySpaces()``.

    Args:
        env: game environment; only its ``getEmptySpaces()`` method is used.

    Returns:
        A single element of the sequence returned by ``env.getEmptySpaces()``.
    """
    return random.choice(env.getEmptySpaces())


def update_counter(counter: TikTakCounter, reward):
    """Record the outcome of one finished game in ``counter``.

    The final reward selects which tally is incremented:
    ``1`` -> ``cross``, ``-1`` -> ``naughts``, ``0`` -> ``draw``,
    anything else -> ``invalid``. ``tot`` is always incremented.

    Args:
        counter: object with integer ``cross``, ``naughts``, ``draw``,
            ``invalid`` and ``tot`` attributes.
        reward: reward returned by the last ``env.step`` of the game.
    """
    reward_to_field = ((1, "cross"), (-1, "naughts"), (0, "draw"))
    field = next(
        (name for value, name in reward_to_field if reward == value),
        "invalid",  # any reward outside {1, -1, 0}
    )
    setattr(counter, field, getattr(counter, field) + 1)
    counter.tot += 1


def play_rand_game(agent, env, counter: TikTakCounter):
    """Play one game of ``agent`` (moving first) against a random opponent.

    The two sides alternate moves on ``env`` until ``env.step`` reports that
    the game is over; the final reward is then recorded via ``update_counter``.

    Args:
        agent: object exposing ``get_action(env)``.
        env: environment whose ``step(action)`` returns a 4-tuple with the
            reward and the done flag in positions 1 and 2.
        counter: tally object passed to ``update_counter``.
    """
    finished = False
    while not finished:
        # First player: the agent under evaluation.
        _, outcome, finished, _ = env.step(agent.get_action(env))
        if not finished:
            # Second player: a random legal move.
            _, outcome, finished, _ = env.step(random_step(env))
        if finished:
            update_counter(counter, outcome)

Сыграем в игру со случайным противником

In [9]:
# Evaluate the MCTS agent (moving first) against a random opponent over 100 games.
# NOTE(review): num_rounds is presumably the number of simulated games per tree build — confirm in mcts_agent.
mtcs_agent = MCTSAgent(num_rounds=2000)
random.seed(1000)  # seeds only Python's `random` module
counter = TikTakCounter()
env = TicTacToe(4, 4, 3)
# tqdm.notebook.tqdm replaces the deprecated tqdm.tqdm_notebook alias.
for i in tqdm.notebook.tqdm(range(100)):
    env.reset()
    play_rand_game(mtcs_agent, env, counter)
    if i % 10 == 0:
        # Print the running tally every 10th game.
        print(counter)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm.tqdm_notebook(range(100)):


HBox(children=(FloatProgress(value=0.0), HTML(value='')))

cross=1 naughts=0 tot=1 draw=0 invalid=0
cross=9 naughts=2 tot=11 draw=0 invalid=0
cross=14 naughts=7 tot=21 draw=0 invalid=0
cross=18 naughts=13 tot=31 draw=0 invalid=0
cross=24 naughts=17 tot=41 draw=0 invalid=0
cross=32 naughts=19 tot=51 draw=0 invalid=0
cross=39 naughts=22 tot=61 draw=0 invalid=0
cross=45 naughts=26 tot=71 draw=0 invalid=0
cross=50 naughts=30 tot=81 draw=1 invalid=0
cross=57 naughts=33 tot=91 draw=1 invalid=0



Стратегия иногда проигрывает, попробуем увеличить количество игр при построении дерева.

In [17]:
# Repeat the MCTS-vs-random evaluation with a larger search budget (10 games).
mtcs_agent = MCTSAgent(num_rounds=30_000)
random.seed(1000)  # seeds only Python's `random` module
counter2 = TikTakCounter()
env = TicTacToe(4, 4, 3)
# tqdm.notebook.tqdm replaces the deprecated tqdm.tqdm_notebook alias.
for i in tqdm.notebook.tqdm(range(10)):
    env.reset()
    play_rand_game(mtcs_agent, env=env, counter=counter2)
    print(counter2)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm.tqdm_notebook(range(10)):


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))

cross=1 naughts=0 tot=1 draw=0 invalid=0
cross=2 naughts=0 tot=2 draw=0 invalid=0
cross=3 naughts=0 tot=3 draw=0 invalid=0
cross=3 naughts=1 tot=4 draw=0 invalid=0
cross=4 naughts=1 tot=5 draw=0 invalid=0
cross=5 naughts=1 tot=6 draw=0 invalid=0
cross=5 naughts=2 tot=7 draw=0 invalid=0
cross=6 naughts=2 tot=8 draw=0 invalid=0
cross=7 naughts=2 tot=9 draw=0 invalid=0
cross=8 naughts=2 tot=10 draw=0 invalid=0



Натренируем Q-learning агента

In [20]:
# Train the Q-learning agent with `play_game` from qlearning_agent.
env = TicTacToe(4, 4, 4)
agent_q = QleaningAgent(env)
counter2 = TikTakCounter(100)
# Must be non-zero: the literal `00_000` is valid Python but evaluates to 0,
# which skips the loop and leaves agent_q untrained. The recorded run used
# 100000 games (roughly 25 it/s, so expect a long-running cell).
N_GAME = 100_000
for i in tqdm.tqdm(range(N_GAME)):
    agent_q.new_game(-1)
    # NOTE(review): meaning of the positional None/True/False flags is defined
    # in qlearning_agent.play_game — confirm there.
    play_game(env, None, agent_q, counter2,
              True, False, verbose=False)
    counter2.tot += 1
    counter2.update_history()

 59%|█████▊    | 58702/100000 [39:52<28:02, 24.54it/s]  


KeyboardInterrupt: 

print(counter2)

Сыграем agent_q против mtcs_agent

In [10]:
def play_game2(agent1, agent2, env, counter: TikTakCounter):
    """Play one game between two agents and record the result in ``counter``.

    ``agent1`` moves first via ``get_action(env)``; ``agent2`` answers via
    ``get_next_step(state, env, False)`` where ``state`` is ``env.getHash()``.
    The game ends as soon as ``env.step`` reports ``done``; the final reward
    is passed to ``update_counter``.

    Args:
        agent1: first player, exposing ``get_action(env)``.
        agent2: second player, exposing ``get_next_step(state, env, flag)``.
        env: environment with ``step(action)`` and ``getHash()``.
        counter: tally object passed to ``update_counter``.
    """
    done = False

    while not done:
        # Use the agent1 parameter; there is no `agent` in this function's scope.
        action = agent1.get_action(env)
        _, reward, done, _ = env.step(action)
        if done:
            update_counter(counter, reward)
            break
        state = env.getHash()
        # NOTE(review): the third argument (False) presumably disables
        # exploration/learning for the Q-agent — confirm in qlearning_agent.
        action = agent2.get_next_step(state, env, False)
        _, reward, done, _ = env.step(action)
        if done:
            update_counter(counter, reward)

In [None]:
# Match: MCTS agent (moving first) against the trained Q-learning agent.
# NOTE(review): agent_q was constructed with TicTacToe(4, 4, 4) while this
# match uses TicTacToe(4, 4, 3) — confirm the Q-agent is valid on this board.
env = TicTacToe(4, 4, 3)
mtcs_agent = MCTSAgent(num_rounds=10_000)
counter3 = TikTakCounter()
N_GAME = 20
for i in tqdm.tqdm(range(N_GAME)):
    env.reset()  # start every game from a fresh board
    agent_q.new_game(-1)
    # Third argument is the environment (not the agent).
    play_game2(mtcs_agent, agent_q, env, counter3)
    # `tot` is already incremented by update_counter inside play_game2,
    # so it is not incremented again here.
    counter3.update_history()

In [None]:
# Show the final tally of the MCTS-vs-Q-learning match.
print(counter3)