Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)

In [2]:
import numpy as np
import itertools
from tqdm import tqdm
import random

In [411]:
class TicTacToe:
    """Tic-tac-toe board backed by a 3x3 NumPy integer array.

    Cells hold 0 (empty), 1 (X) or -1 (O), so a line fully owned by one
    player sums to +3 (X) or -3 (O).
    """

    # (row, col) coordinates of the two diagonals.
    _MAIN_DIAG = ((0, 0), (1, 1), (2, 2))
    _ANTI_DIAG = ((0, 2), (1, 1), (2, 0))

    def __init__(self):
        self._board = np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]])

    def reset(self):
        """Replace the board with a fresh, empty one."""
        self._board = np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]])

    def add_x(self, pos: tuple[int, int]):
        """Place an X (stored as 1) on the empty cell ``pos`` = (row, col)."""
        assert self._board[pos[0]][pos[1]] == 0
        self._board[pos[0]][pos[1]] = 1

    def add_o(self, pos: tuple[int, int]):
        """Place an O (stored as -1) on the empty cell ``pos`` = (row, col)."""
        assert self._board[pos[0]][pos[1]] == 0
        self._board[pos[0]][pos[1]] = -1

    def _line_sum(self, cells):
        """Return the sum of the board values on the given (row, col) cells."""
        return sum(self._board[r, c] for r, c in cells)

    def check_pos(self, player, pos):
        """Tell whether ``player`` owns a complete line passing through ``pos``.

        ``player`` is "X" for the +1 marks; any other value is treated as O.
        Only the row and column of ``pos`` are inspected, plus each diagonal
        that actually contains ``pos``.
        """
        target = 3 if player == "X" else -3
        row, col = pos[0], pos[1]
        if self._board[row, :].sum() == target or self._board[:, col].sum() == target:
            return True
        cell = (row, col)
        if cell in self._MAIN_DIAG and self._line_sum(self._MAIN_DIAG) == target:
            return True
        # The anti-diagonal must be tested against its own cells; testing the
        # main-diagonal corners here would miss wins through (0, 2) / (2, 0).
        if cell in self._ANTI_DIAG and self._line_sum(self._ANTI_DIAG) == target:
            return True
        return False

    def check(self):
        """Return 1 if X has a complete line, -1 if O has one, 0 otherwise."""
        col_sums = self._board.sum(axis=0)  # axis=0 collapses rows -> one sum per column
        row_sums = self._board.sum(axis=1)  # axis=1 collapses columns -> one sum per row
        diag_sums = (self._line_sum(self._MAIN_DIAG), self._line_sum(self._ANTI_DIAG))
        # X is tested before O, so X is reported if both somehow hold a line.
        for winner, target in ((1, 3), (-1, -3)):
            if (
                np.any(row_sums == target)
                or np.any(col_sums == target)
                or target in diag_sums
            ):
                return winner
        return 0

    def copy(self):
        """Return an independent game whose board is a copy of this one."""
        new_game = TicTacToe()
        new_game._board = self._board.copy()
        return new_game

    def next_moves(self):
        """List the empty cells as (row, col), scanning column by column."""
        return [(i, j) for j in range(3) for i in range(3) if self._board[i, j] == 0]

    def __str__(self):
        mapping = {0: "_", 1: "X", -1: "O"}
        return "\n".join(
            "|".join(mapping[int(value)] for value in row) for row in self._board
        )

    def new_state(self, move: tuple[int, int], player):
        """Return the state reached by playing ``move``, without mutating the board.

        ``player`` is "X" for a +1 mark; any other value places -1. The cell
        is overwritten on a copy with no emptiness check. The result has the
        same tuple-of-row-tuples form as :meth:`get_state`.
        """
        val = 1 if player == "X" else -1
        board_copy = self._board.copy()
        board_copy[move[0], move[1]] = val
        return tuple(tuple(row) for row in board_copy)

    def get_state(self):
        """Return the board as a hashable tuple of three row tuples."""
        return tuple(tuple(row) for row in self._board)


In [None]:
# Quick NumPy sanity check: sum of one row, then the sums along each axis.
board = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
print(board[1, :].sum())
print(board.sum(axis=0))
print(board.sum(axis=1))

# Exercise the TicTacToe helpers on a short hand-made position.
game = TicTacToe()
print(game)
game.add_o((0, 0))
game.add_x((1, 1))
game.add_o((0, 1))
# new_state() requires the player letter; X is next in the O, X, O sequence above.
print(game.new_state((0, 2), "X"))

In [None]:
def game():
    """Run an interactive two-human game on the console.

    Player 0 places X and player 1 places O. Moves are typed as ``<x-y>``
    and used as (row, column) indices into the board. The loop ends when a
    player completes a line or when no empty cell is left.
    """
    inst = TicTacToe()
    print(inst)
    player = 0
    # Stop on a full board too: otherwise a drawn game would keep asking for
    # moves that can no longer be placed.
    while inst.check() == 0 and inst.next_moves():
        move = input(f"Board:\n{inst}\n Player {player} enter move in the form <x-y>")
        xy = move.split("-")
        try:
            x, y = int(xy[0]), int(xy[1])
        except (ValueError, IndexError):
            print("Invalid move, try again.")
            continue
        # Reject out-of-range and already occupied cells instead of crashing.
        if (x, y) not in inst.next_moves():
            print("Invalid move, try again.")
            continue
        if player == 0:
            inst.add_x((x, y))
        else:
            inst.add_o((x, y))
        print(inst)
        print()
        player = 1 - player

    if inst.check() != 0:
        # `player` was already flipped after the winning move.
        print(f"Player {1 - player} won!")
    else:
        print("It's a draw guys!")


game()

# Reinforcement Learning Strategy

The idea is to use a *Model-Free* Approach

Reward = +1 for a game won, -1 for a game lost, and a small positive value (0.05 in the training loop below) for a draw

# BackPropagation Strategy

In [398]:
# ---- Hyper-parameters for the Q-table training run ----

# Learning schedule: number of training-loop iterations and the step size
# applied to each Q-value correction.
n_training_episodes, learning_rate = 30_000, 0.7

# Discount factor multiplying the best value of the following state.
gamma = 0.95

# Epsilon-greedy exploration: epsilon starts at max_epsilon and decays
# exponentially (rate decay_rate per episode) towards min_epsilon.
max_epsilon, min_epsilon = 1.0, 0.01
decay_rate = 0.005

In [399]:
# Q-table layout: {state: {action: value, ...}, ...}
# A state is a tuple of three row tuples; an action is a (row, col) cell.
default = 0.0
Qtable = {}

# Enumerate every 9-cell filling with values 0 / 1 / -1 and keep those whose
# cell sum is -1 or 0, then reshape each flat filling into three rows.
disps = []
for flat in itertools.product([0, 1, -1], repeat=9):
    if sum(flat) not in (-1, 0):
        continue
    disps.append((flat[0:3], flat[3:6], flat[6:9]))

# Every empty cell of a state is an available action, initialised to `default`.
for disp in disps:
    next_moves = []
    for j in range(3):
        for i in range(3):
            if disp[i][j] == 0:
                next_moves.append((i, j))
    Qtable[disp] = dict.fromkeys(next_moves, default)

In [400]:
def backpropagation(Qtable: dict, states: list, actions: list, reward):
    """Propagate an episode's reward backwards through the visited pairs.

    ``states`` and ``actions`` are paired from their last elements towards
    their first ones. Each pair is updated with the same ``reward`` plus the
    discounted best value of the state processed just before it (i.e. the
    one that came later in the episode); the last pair of the episode uses
    the module-level ``default`` instead.

    The table is modified in place and also returned.
    """
    later_state = None
    paired = min(len(states), len(actions))
    for back in range(1, paired + 1):
        state, act = states[-back], actions[-back]
        if later_state:
            fut_max = max(Qtable[later_state].values())
        else:
            fut_max = default
        old_value = Qtable[state][act]
        Qtable[state][act] = old_value + learning_rate * (reward + gamma * fut_max - old_value)
        later_state = state
    return Qtable

In [401]:
def epsilon_greedy_policy(Qtable: dict, state: TicTacToe, epsilon: float):
    """Choose a move for the current board with an epsilon-greedy rule.

    A uniform draw in [0, 1] is compared with ``epsilon``: when the draw is
    larger, the available move with the highest Q-value for the current
    state is returned; otherwise one of the available moves is picked at
    random.
    """
    draw = random.uniform(0, 1)
    candidates = state.next_moves()

    if draw > epsilon:
        # Exploit: rank the moves by the value stored for this state.
        def learned_value(move):
            return Qtable[state.get_state()][move]

        return max(candidates, key=learned_value)

    # Explore: uniformly random index into the available moves.
    pick = random.randint(0, len(candidates) - 1)
    return candidates[pick]

In [402]:
# Train the Q-table: the learning agent always plays X (epsilon-greedy on the
# Q-table) against an opponent that places O uniformly at random.
env = TicTacToe()
for episode in tqdm(range(n_training_episodes)):
    # Exploration rate decays exponentially from max_epsilon towards min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    # player = 0 # random.randint(0,1)
    # Two games per episode: with n == 1 the agent (X) moves first, with
    # n == 0 the random opponent (O) moves first.
    for n in [0, 1]:
        # Reset the environment
        env.reset()
        player = n
        done = 0
        steps = 0
        # simulating game
        state_action = env.get_state()
        # state_history collects the boards on which the agent has to choose a
        # move; the empty board belongs to it only when the agent opens.
        state_history = [state_action] if n == 1 else []
        actions_history = []
        # Play until someone wins (done != 0) or nine moves have been made.
        while done == 0 and steps <9:
            actions = env.next_moves()
            if player == 1:
                # Agent's turn: pick a move and remember it.
                action = epsilon_greedy_policy(Qtable, env, epsilon)
                env.add_x(action)
                actions_history.append(action)
            else:
                # Opponent's turn: random move; the resulting board is the
                # next state the agent would act on.
                action = actions[random.randint(0, len(actions) - 1)]
                env.add_o(action)
                state_history.append(env.get_state())
            done = env.check()
            player = 1 - player
            steps += 1
        # updating states
        reward = done if done != 0 else 0.05 # +1 win (X), -1 loss (O wins), 0.05 otherwise (draw)
        if len(state_history) != len(actions_history):
            # The opponent made the last move, so the final recorded state has
            # no agent action paired with it: drop it.
            # print(state_history)
            # print(actions_history)
            state_history.pop()
        Qtable = backpropagation(Qtable, state_history, actions_history, reward)

100%|██████████| 30000/30000 [00:09<00:00, 3030.47it/s]


In [403]:
# Inspect the learned action values for one hand-picked position
# (rows listed top to bottom; 1 = X, -1 = O, 0 = empty).
table = (
    (0, 1, -1),
    (1, -1, -1),
    (1, 0, 0),
)
print(Qtable[table])

{(0, 0): 0.7, (2, 1): 0.0, (2, 2): 0.0}


In [404]:
class Player:
    """Greedy tic-tac-toe player that reads its moves from a Q-table.

    The Q-table maps a state (tuple of three row tuples) to a dict of
    ``{(row, col): value}``. ``letter`` is "X" to place +1 marks; any other
    value makes the player place O marks.
    """

    def __init__(self, name, qtable: dict, letter):
        self._name = name
        self._QTable = qtable
        self._letter = letter

    def get_name(self):
        """Return the name given at construction."""
        return self._name

    @staticmethod
    def _invert_state(state):
        """Return ``state`` with the sign of every cell flipped."""
        return tuple(tuple(-el for el in row) for row in state)

    def move(self, game_inst: "TicTacToe"):
        """Play the highest-valued available move on ``game_inst``.

        The lookup uses the Q-table handed to the constructor. When playing
        O the state is sign-flipped first, because the table was learned
        from the point of view of the +1 (X) player.
        """
        acts = game_inst.next_moves()
        state = game_inst.get_state()
        if self._letter != "X":
            state = self._invert_state(state)
        # Use this player's own table rather than whatever module-level
        # Q-table happens to exist.
        values = self._QTable[state]
        best_move = max(acts, key=lambda act: values[act])
        if self._letter == "X":
            game_inst.add_x(best_move)
        else:
            game_inst.add_o(best_move)

In [None]:
def game_again_agent():
    """Play one console game: a human places O, the Q-table agent places X.

    The human (player 0) moves first and types moves as ``<x-y>``. At most
    nine moves are played; the game stops earlier as soon as the board
    reports a winner.
    """
    board = TicTacToe()
    print(board)
    agent = Player("Agent", Qtable, "X")
    turn = 0

    for _ in range(9):
        if board.check() != 0:
            break
        if turn == 0:
            raw = input(f"Board:\n{board}\n Player {turn} enter move in the form <x-y>")
            parts = raw.split("-")
            cell = (int(parts[0]), int(parts[1]))
            board.add_o(cell)
        else:
            agent.move(board)
        print(board)
        print()
        turn = 1 - turn

    if board.check() == 0:
        print("It's a draw guys!")
    else:
        # `turn` was flipped after the last move, so flip it back.
        print(f"Player {1 - turn} won!")


game_again_agent()

In [408]:
def proof(n_games: int = 10_000):
    """Measure the Q-table agent (X) against a uniformly random O player.

    Plays ``n_games`` games in which the random player always moves first,
    then prints the percentage of games won by the agent and the percentage
    that ended without a winner.

    Raises:
        ValueError: if ``n_games`` is not positive.
    """
    if n_games <= 0:
        raise ValueError("n_games must be positive")
    winning = 0
    drawing = 0
    agent0 = Player("Agent2", Qtable, "X")
    for _ in range(n_games):
        inst = TicTacToe()
        player = 1  # the random O player opens every game
        steps = 0
        while inst.check() == 0 and steps < 9:
            if player == 0:
                agent0.move(inst)
            else:
                # Random opponent: one uniform pick among the empty cells.
                moves = inst.next_moves()
                inst.add_o(moves[random.randint(0, len(moves) - 1)])
            player = 1 - player
            steps += 1
        result = inst.check()
        if result == 1:
            winning += 1
        elif result == 0:
            drawing += 1
    win_rate = winning * 100 / n_games
    draw_rate = drawing * 100 / n_games
    print(f"Winning Rate against Random: {win_rate}")
    print(f"Drawing Rate against Random: {draw_rate}")


proof()

Winning Rate against Random: 80.31
Drawing Rate against Random: 18.68


# MinMax Strategy as benchmark

In [430]:
def minmax(game: "TicTacToe", plyr, to_maximize):
    """Exhaustive minimax search from the given position.

    Args:
        game: position to analyse; it is never modified (moves are tried on
            copies).
        plyr: side to move, "X" or anything else for O.
        to_maximize: the side whose outcome is being scored, "X" or "O".

    Returns:
        ``(move, score)``. ``score`` is 1 / 0 / -1 for a win / no winner /
        loss of ``to_maximize``. ``move`` is the first move (in
        ``game.next_moves()`` order) that achieves the best score for the
        side to move, or ``None`` when the game is already over or the
        board is full.
    """
    over = game.check()
    next_actions = game.next_moves()
    if over != 0 or not next_actions:
        # check() reports from X's point of view; flip it when scoring for O.
        if to_maximize == "O":
            over = -over
        return None, over

    maximizing = plyr == to_maximize
    opponent = "O" if plyr == "X" else "X"
    # Best score this node can possibly reach: once found, later siblings
    # cannot beat it and ties go to the earliest move anyway, so the scan
    # can stop without changing the result.
    decisive = 1 if maximizing else -1

    best_move, best_score = None, None
    for move in next_actions:
        child = game.copy()
        if plyr == "X":
            child.add_x(move)
        else:
            child.add_o(move)
        _, score = minmax(child, opponent, to_maximize)
        if best_score is None:
            improved = True
        elif maximizing:
            improved = score > best_score
        else:
            improved = score < best_score
        if improved:
            best_move, best_score = move, score
        if best_score == decisive:
            break
    return best_move, best_score

In [None]:
# Build a small position by hand, then let minmax choose O's replies.
g = TicTacToe()
for place, cell in (
    (g.add_o, (2, 2)),
    (g.add_o, (0, 0)),
    (g.add_x, (0, 2)),
    (g.add_x, (2, 1)),
):
    place(cell)

# O answers with the move minmax recommends when maximising for O.
mov = minmax(g, "O", "O")[0]
g.add_o(mov)

# X replies at (2, 0), then O consults minmax once more.
g.add_x((2, 0))
mov = minmax(g, "O", "O")[0]
g.add_o(mov)
print(g)

In [439]:
class MinMaxPlayer:
    """Player that picks every move with the ``minmax`` search.

    ``letter`` is "X" to play the X marks; any other value plays O.
    """

    def __init__(self, name, letter):
        self._name = name
        self._letter = letter

    def get_name(self):
        """Return the name given at construction."""
        return self._name

    def move(self, game_inst: TicTacToe):
        """Search for the best move for this player's side and play it."""
        side = "X" if self._letter == "X" else "O"
        # The player both moves and is the side being maximised.
        mv, _ = minmax(game_inst, side, side)
        place = game_inst.add_x if side == "X" else game_inst.add_o
        place(mv)

In [447]:
def benchmark(n_games: int = 100):
    """Measure the Q-table agent (X) against the minmax player (O).

    Plays ``n_games`` games in which the Q-table agent always moves first,
    then prints the percentage of games it won and the percentage that ended
    without a winner.

    Raises:
        ValueError: if ``n_games`` is not positive.
    """
    if n_games <= 0:
        raise ValueError("n_games must be positive")
    winning = 0
    drawing = 0
    agent0 = Player("Agent0", Qtable, "X")
    agent1 = MinMaxPlayer("Agent1", "O")
    for _ in tqdm(range(n_games)):
        inst = TicTacToe()
        player = 0  # the Q-table agent opens every game
        steps = 0
        while inst.check() == 0 and steps < 9:
            if player == 0:
                agent0.move(inst)
            else:
                agent1.move(inst)
            player = 1 - player
            steps += 1
        result = inst.check()
        if result == 1:
            winning += 1
        elif result == 0:
            drawing += 1
    win_rate = winning * 100 / n_games
    draw_rate = drawing * 100 / n_games
    # The opponent here is the minmax player, not the random one.
    print(f"Winning Rate against MinMax: {win_rate}")
    print(f"Drawing Rate against MinMax: {draw_rate}")


benchmark()

100%|██████████| 100/100 [01:24<00:00,  1.18it/s]

Winning Rate against Random: 0.0
Drawing Rate against Random: 100.0



