Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)

In [2]:
import numpy as np
import itertools
from tqdm import tqdm
import random

# Tic Tac Toe Implementation

In [3]:
class TicTacToe:
    """A 3x3 tic-tac-toe board.

    Cells are stored in a NumPy integer array: ``0`` is empty, ``1`` is an
    ``X`` and ``-1`` is an ``O``.  A line therefore sums to ``3`` (or ``-3``)
    exactly when all three of its cells belong to X (or O).
    """

    # Cell coordinates of the two diagonals, as (row, column) pairs.
    _MAIN_DIAG = ((0, 0), (1, 1), (2, 2))
    _ANTI_DIAG = ((0, 2), (1, 1), (2, 0))

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the board so that every cell is empty."""
        self._board = np.zeros((3, 3), dtype=int)

    def add_x(self, pos: tuple[int, int]):
        """Place an X at ``pos`` (row, column); the cell must be empty."""
        assert self._board[pos[0]][pos[1]] == 0
        self._board[pos[0]][pos[1]] = 1

    def add_o(self, pos: tuple[int, int]):
        """Place an O at ``pos`` (row, column); the cell must be empty."""
        assert self._board[pos[0]][pos[1]] == 0
        self._board[pos[0]][pos[1]] = -1

    def check_pos(self, player, pos):
        """Tell whether ``player`` owns a complete line passing through ``pos``.

        Args:
            player: ``"X"`` to look for X lines; any other value looks for O.
            pos: (row, column) of the cell whose lines are inspected.

        Returns:
            ``True`` if the row, the column, or a diagonal that contains
            ``pos`` is completely filled by ``player``; ``False`` otherwise.
        """
        target = 3 if player == "X" else -3
        row, col = pos[0], pos[1]
        if self._board[row, :].sum() == target or self._board[:, col].sum() == target:
            return True
        # Only inspect a diagonal when the cell actually lies on it: the main
        # diagonal for (0,0)/(1,1)/(2,2), the anti-diagonal for (0,2)/(1,1)/(2,0).
        cell = (row, col)
        for diagonal in (self._MAIN_DIAG, self._ANTI_DIAG):
            if cell in diagonal and sum(self._board[r, c] for r, c in diagonal) == target:
                return True
        return False

    def check(self):
        """Return ``1`` if X has a complete line, ``-1`` if O has one, else ``0``."""
        line_sums = [
            *self._board.sum(axis=1),  # the three rows
            *self._board.sum(axis=0),  # the three columns
            sum(self._board[r, c] for r, c in self._MAIN_DIAG),
            sum(self._board[r, c] for r, c in self._ANTI_DIAG),
        ]
        if 3 in line_sums:
            return 1
        if -3 in line_sums:
            return -1
        return 0

    def copy(self):
        """Return an independent game with the same cell contents."""
        new_game = TicTacToe()
        new_game._board = self._board.copy()
        return new_game

    def next_moves(self):
        """List the empty cells as (row, column) pairs, column by column."""
        return [(i, j) for j in range(3) for i in range(3) if self._board[i, j] == 0]

    def __str__(self):
        symbols = {0: "_", 1: "X", -1: "O"}
        return "\n".join("|".join(symbols[cell] for cell in row) for row in self._board)

    def new_state(self, move: tuple[int, int], player):
        """Return the state the board would have after ``player`` plays ``move``.

        The board itself is left untouched; ``player == "X"`` writes ``1``,
        any other value writes ``-1``.
        """
        val = 1 if player == "X" else -1
        board_copy = self._board.copy()
        board_copy[move[0], move[1]] = val
        return tuple(tuple(row) for row in board_copy)

    def get_state(self):
        """Return the board as a hashable tuple of three row tuples."""
        return tuple(tuple(row) for row in self._board)


In [None]:
def game():
    """Run an interactive two-player game on the console.

    Player 0 places X and player 1 places O.  Each move is typed as
    ``<row>-<col>`` using the board's zero-based indices.  The game ends when
    one side completes a line or when no empty cell is left.
    """
    inst = TicTacToe()
    print(inst)
    player = 0
    # Stop on a full board as well: a drawn game has no winner, so testing
    # check() alone would keep asking for moves that can never be legal.
    while inst.check() == 0 and inst.next_moves():
        move = input(f"Board:\n{inst}\n Player {player} enter move in the form <x-y>")
        try:
            xy = move.split("-")
            pos = (int(xy[0]), int(xy[1]))
        except (ValueError, IndexError):
            print("Invalid move: expected two integers separated by '-', e.g. 0-2")
            continue
        if pos not in inst.next_moves():
            print("Invalid move: the cell is outside the board or already taken")
            continue
        if player == 0:
            inst.add_x(pos)
        else:
            inst.add_o(pos)
        print(inst)
        print()
        player = 1 - player

    if inst.check() != 0:
        # `player` was already flipped after the winning move.
        print(f"Player {1 - player} won!")
    else:
        print("It's a draw guys!")

# Start the interactive two-player game; this call waits for console input.
game()

# Reinforcement Learning Strategy

The idea is to use a *Model-Free* Approach

Reward = 1 for game won, 0 for draw, -1 for game lost

# BackPropagation Strategy

In [188]:
# Training parameters
n_training_episodes: int = 30_000  # number of episodes run by the training loop
learning_rate: float = 0.7  # step size of the Q-value update in backpropagation()

# Environment parameters
gamma: float = 0.95  # discount applied to the best value of the following state

# Exploration parameters
# The training loop computes, for each episode,
#   epsilon = min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * episode)
# so exploration starts at max_epsilon and decays towards min_epsilon.
max_epsilon: float = 1.0
min_epsilon: float = 0.01
decay_rate: float = 0.005

In [189]:
def backpropagation(Qtable: dict, states: list, actions: list, reward):
    """Walk one finished episode backwards and update its Q-values in place.

    The state/action pairs are visited from the last one to the first one.
    Every visited pair receives the same ``reward``; all pairs but the last
    also receive the discounted best value of the state visited just before
    (i.e. the state that followed it in the game).  The module-level
    ``learning_rate``, ``gamma`` and ``default`` are read at call time.

    Args:
        Qtable: mapping ``state -> {action: value}``; modified in place.
        states: states in which the agent moved, in playing order.
        actions: the action taken in each of those states.
        reward: final outcome of the episode.

    Returns:
        The same ``Qtable`` object.
    """
    successor = None
    paired = min(len(states), len(actions))
    for back in range(1, paired + 1):
        state, act = states[-back], actions[-back]
        if successor:
            fut_max = max(Qtable[successor].values())
        else:
            # Last move of the episode: there is no following state.
            fut_max = default
        current = Qtable[state][act]
        Qtable[state][act] = current + learning_rate * (reward + gamma * fut_max - current)
        successor = state
    return Qtable

In [190]:
def epsilon_greedy_policy(Qtable: dict, state: TicTacToe, epsilon: float):
    """Pick a move for the current board with an epsilon-greedy rule.

    A number is drawn uniformly in [0, 1].  When it exceeds ``epsilon`` the
    legal move with the highest Q-value is returned (the first one on ties);
    otherwise a legal move is picked uniformly at random.

    Args:
        Qtable: mapping ``state -> {action: value}``.
        state: the game to move in; its ``next_moves()`` and ``get_state()``
            are used.
        epsilon: exploration threshold.

    Returns:
        The chosen move, one of ``state.next_moves()``.
    """
    draw = random.uniform(0, 1)
    candidates = state.next_moves()
    if draw > epsilon:
        # Exploit: best known move for this board.
        return max(candidates, key=lambda mv: Qtable[state.get_state()][mv])
    # Explore: uniformly random legal move.
    pick = random.randint(0, len(candidates) - 1)
    return candidates[pick]

# Training only against Random

In [162]:
# Q-table layout: {state: {action: value, ...}, ...}
Qtable = {}
default = 0.0
# Enumerate every filling of the nine cells with 0 / 1 / -1 and keep those whose
# cells sum to 0 or -1, reshaped into three rows of three.
disps = []
for flat in itertools.product([0, 1, -1], repeat=9):
    if sum(flat) in (-1, 0):
        disps.append(tuple(flat[start:start + 3] for start in range(0, 9, 3)))
# Every empty cell of a kept board is an available action, initialised to `default`.
for disp in disps:
    next_moves = [(i, j) for j in range(3) for i in range(3) if not disp[i][j]]
    Qtable[disp] = dict.fromkeys(next_moves, default)

env = TicTacToe()
# Train the agent, which always places X (+1), against an opponent that places
# O (-1) on a uniformly random empty cell.
for episode in tqdm(range(n_training_episodes)):
    # Exploration rate: starts at max_epsilon and decays exponentially towards min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    # Two games per episode: with n == 0 the random opponent opens, with n == 1 the agent opens.
    for n in [0, 1]:
        # Reset the environment
        env.reset()
        player = n
        done = 0
        steps = 0
        # simulating game
        state_action = env.get_state()
        # state_history collects the boards on which the agent had to move; when
        # the agent opens, the empty board is the first of them.
        state_history = [state_action] if n == 1 else []
        actions_history = []
        while done == 0 and steps <9:
            actions = env.next_moves()
            if player == 1:
                # Agent's turn: epsilon-greedy move, remembered for the update below.
                action = epsilon_greedy_policy(Qtable, env, epsilon)
                env.add_x(action)
                actions_history.append(action)
            else:
                # Opponent's turn: uniformly random legal move.
                action = actions[random.randint(0, len(actions) - 1)]
                env.add_o(action)
                # The board after the opponent's move is the next one the agent acts on.
                state_history.append(env.get_state())
            done = env.check()
            player = 1 - player
            steps += 1
        # updating states
        reward = done if done != 0 else 0.05 # +1 win, -1 loss, 0.05 draw
        if len(state_history) != len(actions_history):
            # The game ended right after an opponent move, so the last recorded
            # state has no matching agent action: drop it.
            state_history.pop()
        Qtable = backpropagation(Qtable, state_history, actions_history, reward)

100%|██████████| 30000/30000 [00:09<00:00, 3044.95it/s]


Training only against a random opponent is not fully effective, because the training phase tends not to explore all the possible states,

and in a model-free approach, which is deterministic, this leads to weaknesses in the trained agent. Since tic-tac-toe is a simple game
with few states, it is easy to find these weaknesses by playing against the agent 20-30 times.

We need to find a better trainer!

# MinMax Strategy as benchmark

In [191]:
# Memoisation tables keyed by (state, player to move): one for positions where
# the maximising player moves, one for positions where its opponent moves.
min_cache = {}
max_cache = {}


def minmax(game: TicTacToe, plyr, to_maximize):
    """Exhaustive minimax search with memoisation.

    Args:
        game: position to evaluate; it is never modified (moves are tried on copies).
        plyr: ``"X"`` if X is to move; any other value is treated as O.
        to_maximize: letter of the player whose outcome is maximised.

    Returns:
        ``(move, value)`` where ``value`` is the game outcome from the point of
        view of ``to_maximize`` (1 win, 0 draw, -1 loss) and ``move`` is the
        first legal move achieving it.  ``move`` is ``None`` when the game is
        already over.
    """
    maximizing = plyr == to_maximize
    cache = max_cache if maximizing else min_cache
    key = (game.get_state(), plyr)
    if key in cache:
        return cache[key]

    outcome = game.check()
    moves = game.next_moves()
    if outcome != 0 or not moves:
        # check() reports from X's point of view; flip it when O is maximised.
        return None, (-outcome if to_maximize == "O" else outcome)

    x_to_move = plyr == "X"
    following = "O" if x_to_move else "X"
    values = []
    for move in moves:
        child = game.copy()
        if x_to_move:
            child.add_x(move)
        else:
            child.add_o(move)
        values.append(minmax(child, following, to_maximize)[1])

    # max()/min() keep the first of several equally good moves.
    select = max if maximizing else min
    best_index = select(range(len(values)), key=values.__getitem__)
    result = (moves[best_index], values[best_index])
    cache[key] = result
    return result

In [192]:
initial_moves = [(i, j) for i in range(3) for j in range(3)]
gm = TicTacToe()
# Fill the minmax memoisation tables ahead of time.
# X maximising and opening from the empty board.
minmax(gm, "X", "X")
# X maximising after each possible opening move by O.  add_o()/add_x() modify
# the board in place and return None, so the copy has to be bound to a name
# before the move is applied, and the search must start from that copy.
for mv in initial_moves:
    gm_copy = gm.copy()
    gm_copy.add_o(mv)
    minmax(gm_copy, "X", "X")

# Same for O maximising: O opening, then O replying to each opening move by X.
minmax(gm, "O", "O")
for mv in initial_moves:
    gm_copy = gm.copy()
    gm_copy.add_x(mv)
    minmax(gm_copy, "O", "O")

In [193]:
class Player:
    """Base class for players: stores a display name and the letter played."""

    def __init__(self, name, letter):
        self._letter = letter
        self._name = name

    @property
    def name(self):
        """Display name given at construction."""
        return self._name

    @property
    def letter(self):
        """Letter given at construction (the subclasses test it against "X")."""
        return self._letter

    def get_name(self):
        """Return the display name (method form of ``name``)."""
        return self._name

    def move(self, game_inst: TicTacToe):
        """Play one move on ``game_inst``; the base class does nothing."""

In [194]:
class MinMaxPlayer(Player):
    """Player that plays the move returned by minmax() for its own letter.

    The constructor and ``get_name`` are the ones inherited from ``Player``.
    """

    def move(self, game_inst: TicTacToe):
        """Ask minmax() for a move, maximising for this player, and play it."""
        # Any letter other than "X" is handled as O.
        side = "X" if self._letter == "X" else "O"
        mv, _ = minmax(game_inst, side, side)
        place = game_inst.add_x if side == "X" else game_inst.add_o
        place(mv)

In [195]:
class RandomPlayer(Player):
    """Player that places its letter on a uniformly random empty cell.

    The constructor and ``get_name`` are the ones inherited from ``Player``.
    """

    def move(self, game_inst: TicTacToe):
        """Pick one of ``game_inst.next_moves()`` at random and play it."""
        acts = game_inst.next_moves()
        chosen = acts[random.randint(0, len(acts) - 1)]
        # Any letter other than "X" is handled as O.
        if self._letter == "X":
            game_inst.add_x(chosen)
        else:
            game_inst.add_o(chosen)

In [196]:
class QPlayer(Player):
    """Player that greedily follows a Q-table.

    Args:
        name: display name.
        qtable: mapping ``state -> {action: value}`` used to rank the moves.
        letter: ``"X"`` to play X; any other value plays O.
    """

    def __init__(self, name, qtable: dict, letter):
        super().__init__(name, letter)
        self._QTable = qtable

    def get_name(self):
        return self._name

    @staticmethod
    def _invert_state(state):
        """Return ``state`` with every cell negated (X and O swapped)."""
        return tuple(tuple(-cell for cell in row) for row in state)

    def move(self, game_inst: TicTacToe):
        """Play the legal move with the highest Q-value for the current board."""
        acts = game_inst.next_moves()
        state = game_inst.get_state()
        if self._letter != "X":
            # The table was learned from the +1 (X) side, so an O player looks
            # up the board with the two sides swapped.
            state = self._invert_state(state)
        # Use the table this player was built with, not a module-level one.
        values = self._QTable[state]
        best_move = max(acts, key=lambda act: values[act])
        if self._letter == "X":
            game_inst.add_x(best_move)
        else:
            game_inst.add_o(best_move)

# Training

In [204]:
# Q-table layout: {state: {action: value, ...}, ...}
Qtable = {}
default = 0.0
# Enumerate every filling of the nine cells with 0 / 1 / -1 and keep those whose
# cells sum to 0 or -1, reshaped into three rows of three.
disps = []
for flat in itertools.product([0, 1, -1], repeat=9):
    if sum(flat) in (-1, 0):
        disps.append(tuple(flat[start:start + 3] for start in range(0, 9, 3)))
# Every empty cell of a kept board is an available action, initialised to `default`.
for disp in disps:
    next_moves = [(i, j) for j in range(3) for i in range(3) if not disp[i][j]]
    Qtable[disp] = dict.fromkeys(next_moves, default)

env = TicTacToe()
min_max_player = MinMaxPlayer("Trainer", "O")
random_player = RandomPlayer("RandomTrainer", "O")

# Train the agent, which always places X (+1), against an O opponent that is
# either uniformly random or a minmax player.
for episode in tqdm(range(n_training_episodes)):
    # Exploration rate: starts at max_epsilon and decays exponentially towards min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    # Two games per episode: with n == 0 the opponent opens, with n == 1 the agent opens.
    for n in [0, 1]:
        # One game in three is played against minmax, the others against random.
        min_max_agent = random.choice([False, False, True])
        # Reset the environment
        env.reset()
        player = n
        done = 0
        steps = 0
        # state_history collects the boards on which the agent had to move; when
        # the agent opens, the empty board is the first of them.
        state_history = [env.get_state()] if n == 1 else []
        actions_history = []
        while done == 0 and steps < 9:
            if player == 1:
                # Agent's turn: epsilon-greedy move, remembered for the update below.
                action = epsilon_greedy_policy(Qtable, env, epsilon)
                env.add_x(action)
                actions_history.append(action)
            else:
                # The very first move of a game is always random, even when the
                # opponent is minmax (presumably to vary the openings).
                if not min_max_agent or steps == 0:
                    random_player.move(env)
                else:
                    min_max_player.move(env)
                # The board after the opponent's move is the next one the agent acts on.
                state_history.append(env.get_state())
            done = env.check()
            player = 1 - player
            steps += 1
        # Reward: +1 win, -1 loss, 0 draw.
        reward = done if done != 0 else 0.00
        if len(state_history) != len(actions_history):
            # The game ended right after an opponent move, so the last recorded
            # state has no matching agent action: drop it.
            state_history.pop()
        Qtable = backpropagation(Qtable, state_history, actions_history, reward)

100%|██████████| 30000/30000 [00:11<00:00, 2649.25it/s]


In [205]:
# Two players built on the same trained table: one plays X, the other plays O
# (QPlayer negates the board before querying the table when its letter is not "X").
perfectX_player = QPlayer("PerfectX Player", Qtable, "X")
perfectO_player = QPlayer("PerfectO Player", Qtable, "O")

In [211]:
def benchmark(player1: Player, player2: Player, n_games: int = 10_000):
    """Play ``n_games`` games and print player1's win, draw and loss rates.

    ``player1`` always moves first.  Its opening move is replaced by a random
    one, because deterministic players would otherwise replay the same game
    every time.  The two players are expected to use different letters.

    Args:
        player1: player moving first; the printed rates are from its side.
        player2: player moving second.
        n_games: number of games to play (must be positive).

    Raises:
        ValueError: if ``n_games`` is not positive.
    """
    if n_games <= 0:
        raise ValueError("n_games must be positive")
    winning = 0
    drawing = 0
    losing = 0
    # check() returns 1 when X has a line and -1 when O has one.
    winner = 1 if player1.letter == "X" else -1
    # Random stand-in that plays player1's letter for the opening move only.
    opener = RandomPlayer("Opener", player1.letter)
    for _ in tqdm(range(n_games)):
        inst = TicTacToe()
        player = 0
        steps = 0
        while inst.check() == 0 and steps < 9:
            if player == 0:
                if steps == 0:
                    opener.move(inst)
                else:
                    player1.move(inst)
            else:
                player2.move(inst)
            player = 1 - player
            steps += 1
        outcome = inst.check()
        if outcome == winner:
            winning += 1
        elif outcome == 0:
            drawing += 1
        else:
            losing += 1
    # Each rate is derived from its own counter, so the losing rate is not
    # affected by the rounding residue of a floating-point subtraction.
    win_rate = winning * 100 / n_games
    draw_rate = drawing * 100 / n_games
    lose_rate = losing * 100 / n_games
    print(f"{player1.name} winning Rate against {player2.name}: {win_rate}%")
    print(f"{player1.name} Drawing Rate against {player2.name}: {draw_rate}%")
    print(f"{player1.name} Losing Rate against {player2.name}: {lose_rate}%")

In [215]:
min_max_player = MinMaxPlayer("MinMax", "O")
random_player = RandomPlayer("Random", "O")
# In benchmark() the first argument moves first (with a random opening move)
# and the printed rates are from its point of view.
# Trained agent vs minmax, each side opening in turn.
benchmark(perfectX_player, min_max_player)
benchmark(min_max_player, perfectX_player)
# Trained agent vs random, each side opening in turn.
benchmark(perfectX_player, random_player)
benchmark(random_player, perfectX_player)
# Trained agent against itself, once as O first and once as X first.
benchmark(perfectO_player, perfectX_player)
benchmark(perfectX_player, perfectO_player)

100%|██████████| 10000/10000 [00:02<00:00, 4762.32it/s]


PerfectX Player winning Rate against MinMax: 0.0%
PerfectX Player Drawing Rate against MinMax: 100.0%
PerfectX Player Losing Rate against MinMax: 0.0%


100%|██████████| 10000/10000 [00:02<00:00, 4588.44it/s]


MinMax winning Rate against PerfectX Player: 0.0%
MinMax Drawing Rate against PerfectX Player: 100.0%
MinMax Losing Rate against PerfectX Player: 0.0%


100%|██████████| 10000/10000 [00:01<00:00, 6607.61it/s]


PerfectX Player winning Rate against Random: 86.28%
PerfectX Player Drawing Rate against Random: 11.04%
PerfectX Player Losing Rate against Random: 2.6799999999999997%


100%|██████████| 10000/10000 [00:01<00:00, 5957.41it/s]


Random winning Rate against PerfectX Player: 0.89%
Random Drawing Rate against PerfectX Player: 18.38%
Random Losing Rate against PerfectX Player: 80.73%


100%|██████████| 10000/10000 [00:02<00:00, 3969.80it/s]


PerfectO Player winning Rate against PerfectX Player: 11.15%
PerfectO Player Drawing Rate against PerfectX Player: 88.85%
PerfectO Player Losing Rate against PerfectX Player: 0.0%


100%|██████████| 10000/10000 [00:02<00:00, 3880.68it/s]

PerfectX Player winning Rate against PerfectO Player: 11.26%
PerfectX Player Drawing Rate against PerfectO Player: 88.74%
PerfectX Player Losing Rate against PerfectO Player: 0.0%





# It's time to play
Change the variable `player` to 1 to let the agent move first and play the second turn yourself.

In [None]:
def game_again_agent():
    """Play an interactive game against the trained agent on the console.

    The human is player 0 and places O with moves typed as ``<row>-<col>``;
    the agent (``perfectX_player``) is player 1 and places X.  At most nine
    moves are played, and the loop stops earlier once someone has a line.
    """
    inst = TicTacToe()
    print(inst)
    player = 0
    agent = perfectX_player
    for _ in range(9):
        if inst.check() != 0:
            break
        if player != 0:
            agent.move(inst)
        else:
            typed = input(f"Board:\n{inst}\n Player {player} enter move in the form <x-y>")
            fields = typed.split("-")
            cell = (int(fields[0]), int(fields[1]))
            inst.add_o(cell)
        print(inst)
        print()
        player = 1 - player
    if inst.check() == 0:
        print("It's a draw guys!")
    else:
        # `player` was already flipped after the winning move.
        print(f"Player {1 - player} won!")

# Start the interactive game against the trained agent; this call waits for console input.
game_again_agent()