Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty submission than not to commit at all)

In [2]:
import numpy as np
import itertools
from tqdm import tqdm
import random

In [3]:
class TicTacToe:
    """Tic-tac-toe board backed by a 3x3 NumPy integer array.

    Cells hold 0 (empty), 1 (X) or -1 (O), so a line fully owned by one
    player sums to 3 (X) or -3 (O). Positions are ``(row, column)`` pairs
    with zero-based indices.
    """

    # (row indices, column indices) selecting each diagonal through NumPy
    # integer-array indexing.
    _MAIN_DIAG = ([0, 1, 2], [0, 1, 2])
    _ANTI_DIAG = ([0, 1, 2], [2, 1, 0])

    def __init__(self):
        self._board = np.zeros((3, 3), dtype=int)

    def reset(self):
        """Clear the board back to nine empty cells."""
        self._board = np.zeros((3, 3), dtype=int)

    def add_x(self, pos: tuple[int, int]):
        """Place an X (stored as 1) on the empty cell ``pos``."""
        assert self._board[pos[0]][pos[1]] == 0, f"cell {pos} is already taken"
        self._board[pos[0]][pos[1]] = 1

    def add_o(self, pos: tuple[int, int]):
        """Place an O (stored as -1) on the empty cell ``pos``."""
        assert self._board[pos[0]][pos[1]] == 0, f"cell {pos} is already taken"
        self._board[pos[0]][pos[1]] = -1

    def check_pos(self, player, pos):
        """Return True if ``player`` owns a complete line passing through ``pos``.

        ``player`` is "X" for the X player; any other value is treated as O.
        Only the row and column of ``pos`` are inspected, plus each diagonal
        that actually contains ``pos``.
        """
        target = 3 if player == "X" else -3
        row, col = pos
        if self._board[row, :].sum() == target or self._board[:, col].sum() == target:
            return True
        # Main diagonal: (0, 0), (1, 1), (2, 2).
        if row == col and self._board[self._MAIN_DIAG].sum() == target:
            return True
        # Anti-diagonal: (0, 2), (1, 1), (2, 0). The original guarded this
        # test with the main-diagonal cells, so wins completed on a corner of
        # the anti-diagonal were never reported.
        if row + col == 2 and self._board[self._ANTI_DIAG].sum() == target:
            return True
        return False

    def check(self):
        """Return 1 if X owns a full line, -1 if O does, 0 otherwise.

        X is tested first, so X is reported if both players own a line.
        """
        line_sums = [
            *self._board.sum(axis=1),  # the three rows
            *self._board.sum(axis=0),  # the three columns
            self._board[self._MAIN_DIAG].sum(),
            self._board[self._ANTI_DIAG].sum(),
        ]
        if 3 in line_sums:
            return 1
        if -3 in line_sums:
            return -1
        return 0

    def next_moves(self):
        """List the empty cells as ``(row, column)`` pairs, column by column."""
        return [(i, j) for j in range(3) for i in range(3) if self._board[i, j] == 0]

    def __str__(self):
        mapping = {0: "_", 1: "X", -1: "O"}
        return "\n".join(
            "|".join(mapping[int(cell)] for cell in row) for row in self._board
        )

    def new_state(self, move: tuple[int, int]):
        """Return the board as nested tuples after an X is written at ``move``.

        The live board is not modified and the target cell is not checked for
        emptiness. The mark written is always X's (1). Values are plain
        ``int`` so the result prints and hashes the same on every NumPy
        version.
        """
        board_copy = self._board.copy()
        board_copy[move[0], move[1]] = 1
        return tuple(tuple(row) for row in board_copy.tolist())

    def get_state(self):
        """Return the current board as a hashable tuple of three row tuples."""
        return tuple(tuple(row) for row in self._board.tolist())


In [4]:
# NumPy warm-up on a 3x3 grid whose rows are all 0s, all 1s and all 2s:
# sum of the middle row, then the per-column sums, then the per-row sums.
board = np.array([[0] * 3, [1] * 3, [2] * 3])
print(board[1].sum())
print(board.sum(0))
print(board.sum(1))

# Smoke test of the board class: show the empty grid, place O, X, O, then
# preview the state that results from an extra X at (0, 2).
game = TicTacToe()
print(str(game))
game.add_o(pos=(0, 0))
game.add_x(pos=(1, 1))
game.add_o(pos=(0, 1))
# A second add_o((0, 1)) at this point would trip the occupancy assert.
print(game.new_state(move=(0, 2)))

3
[3 3 3]
[0 3 6]
_|_|_
_|_|_
_|_|_
((-1, -1, 1), (0, 1, 0), (0, 0, 0))


In [63]:
def game():
    """Play an interactive two-human game of tic-tac-toe on the console.

    Player 0 places X and player 1 places O. Moves are typed as
    ``<row>-<col>`` with zero-based indices. Input that is malformed, off the
    board or aimed at an occupied cell is rejected and asked for again
    instead of crashing the game. The game stops on the first completed line
    or when the board is full, and the result is printed.
    """
    inst = TicTacToe()
    print(inst)
    player = 0
    while not inst.check():
        free_cells = inst.next_moves()
        if not free_cells:
            # Board is full and nobody owns a line: stop instead of
            # prompting for a move that cannot be played.
            break
        while True:
            move = input(f"Board:\n{inst}\n Player {player} enter move in the form <x-y>")
            try:
                x_text, y_text = move.split("-")
                cell = (int(x_text), int(y_text))
            except ValueError:
                print("Invalid move: type it as <row>-<col>, e.g. 1-2.")
                continue
            if cell in free_cells:
                break
            print("Invalid move: that cell is off the board or already taken.")
        if player == 0:
            inst.add_x(cell)
        else:
            inst.add_o(cell)
        print(inst)
        print()
        player = 1 - player

    if inst.check():
        # `player` was flipped after the last move, so flip it back.
        print(f"Player {1 - player} won!")
    else:
        print("It's a draw guys!")

game()

_|_|_
_|_|_
_|_|_
_|X|_
_|_|_
_|_|_

_|X|_
_|O|_
_|_|_

_|X|_
_|O|_
_|_|X

_|X|_
_|O|O
_|_|X

X|X|_
_|O|O
_|_|X

X|X|_
O|O|O
_|_|X

Player 1 won!


# Reinforcement Learning Strategy

The idea is to use a *Model-Free* Approach

Reward = 1 for game won, 0 for draw, -1 for game lost

In [5]:
# Enumerate every assignment of {0, 1, -1} to the nine cells, keep those whose
# cell sum lies in [-1, 1], and reshape each survivor into three row tuples.
disps = [
    (cells[0:3], cells[3:6], cells[6:9])
    for cells in itertools.product((0, 1, -1), repeat=9)
    if abs(sum(cells)) <= 1
]
# Every retained board starts with a value of 0.
Qtable = dict.fromkeys(disps, 0)

In [6]:
# Show the number of enumerated boards and the number of Q-table keys, one per line.
print(len(disps), len(Qtable), sep="\n")

8953
8953


In [7]:
def epsilon_greedy_policy(Qtable: dict, state: TicTacToe, epsilon: float):
    """Pick a move for the current board with an epsilon-greedy rule.

    A uniform draw in [0, 1] is compared with ``epsilon``. When the draw is
    strictly greater, the move whose resulting board (``state.new_state``)
    has the highest ``Qtable`` value is returned; ties go to the earliest
    move in ``state.next_moves()``. Otherwise a uniformly random legal move
    is returned.
    """
    draw = random.uniform(0, 1)
    candidates = state.next_moves()

    if draw > epsilon:
        # Exploit: rank each candidate by the stored value of the board it leads to.
        return max(candidates, key=lambda move: Qtable[state.new_state(move)])

    # Explore: pick any legal move by index.
    pick = random.randint(0, len(candidates) - 1)
    return candidates[pick]

In [8]:
# Training parameters
n_training_episodes = 100_000  # number of simulated games per training loop
learning_rate = 0.7  # step size applied to each Q-table update

# Evaluation parameters
n_eval_episodes = 100  # NOTE(review): not referenced anywhere in the visible cells

# Environment parameters
gamma = 0.95  # discount applied to the bootstrapped next-state value

# Exploration parameters
# epsilon for episode k is min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * k)
max_epsilon = 1.0  # epsilon used for the first episode
min_epsilon = 0.05  # value epsilon decays toward
decay_rate = 0.0005  # exponential decay constant per episode

In [9]:
# Train the Q-table by letting the agent (X, moves first) play against an
# opponent (O) that picks uniformly random moves. Values are stored per board
# reached right after one of X's moves.
env = TicTacToe()
for episode in tqdm(range(n_training_episodes)):
    # Exploration rate decays exponentially from max_epsilon toward min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    # Reset the environment
    env.reset()
    player = 1  # 1 = learning agent (X), 0 = random opponent (O)
    done = 0  # becomes non-zero (the reward) as soon as someone owns a line
    steps = 0  # moves played so far; a game has at most 9
    # simulating game
    state_action = env.get_state()
    while done == 0 and steps <9:
        actions = env.next_moves()
        if player == 1:
            action = epsilon_greedy_policy(Qtable, env, epsilon)
            env.add_x(action)
            # Key of the entry to update: the board right after X's move.
            state_action = env.get_state()
            reward = env.check()
            if reward:
                # X's move ended the game, so update immediately.
                # NOTE(review): the target adds gamma * 1 on top of the reward
                # although no move follows a finished game; confirm this is intended.
                max_rew = 1
                Qtable[state_action] = Qtable[state_action] + learning_rate * (
                    reward + gamma * max_rew - Qtable[state_action])
            # A non-winning X move (including a draw on the ninth move) is not
            # updated here; it is updated after O's reply, if there is one.
        else:
            # Opponent replies with a uniformly random legal move.
            action = actions[random.randint(0, len(actions) - 1)]
            env.add_o(action)
            reward = env.check()  # -1 if O just completed a line, else 0
            actions = env.next_moves()
            if actions:
                # Best stored value among the boards X can reach with its next move.
                # NOTE(review): this is also computed when O has just won
                # (reward == -1), i.e. it bootstraps past a finished game.
                max_act = max(actions, key=lambda act: Qtable[env.new_state(act)])
                max_rew  = Qtable[env.new_state(max_act)]
            else:
                max_rew = 0
            # Update the board produced by X's previous move with the outcome of O's reply.
            Qtable[state_action] = Qtable[state_action] + learning_rate * (
                    reward + gamma * max_rew - Qtable[state_action])
        done = reward
        player = 1 - player
        steps += 1

100%|██████████| 100000/100000 [00:20<00:00, 4954.44it/s]


In [None]:
# Alternative training loop (this cell has no execution count): the Q-table
# entry for the board after X's move is updated right away, before the random
# opponent (O) replies.
env = TicTacToe()
for episode in tqdm(range(n_training_episodes)):
    # Exploration rate decays exponentially from max_epsilon toward min_epsilon.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    # Reset the environment
    env.reset()
    player = 1  # 1 = learning agent (X), 0 = random opponent (O)
    done = 0
    steps = 0
    # simulating game
    # NOTE(review): `steps <=9` lets the loop run a tenth time after a drawn
    # nine-move game; `actions` is then empty and the random pick below would
    # raise. The other loops in this notebook use `steps < 9`.
    while done == 0 and steps <=9:
        actions = env.next_moves()
        if player == 1:
            action = epsilon_greedy_policy(Qtable, env, epsilon)
            env.add_x(action)
            state_action = env.get_state()
            reward = env.check()
            # NOTE(review): `actions` was collected before add_x, so it still
            # contains the cell X just took. new_state() also stacks another X
            # on a board that already holds X's move; such boards have a cell
            # sum of 2, which looks outside the keys the Q-table was built
            # with (cell sum in [-1, 1]) and may raise KeyError. Verify before
            # running this cell.
            if actions:
                max_act = max(actions, key=lambda act: Qtable[env.new_state(act)])
                max_rew  = Qtable[env.new_state(max_act)]
            else:
                max_rew = 0
            Qtable[state_action] = Qtable[state_action] + learning_rate * (
                    reward + gamma * max_rew - Qtable[state_action])
            done = reward
        else:
            # Opponent replies with a uniformly random legal move; a win by O
            # ends the episode without any Q-table update.
            action = actions[random.randint(0, len(actions) - 1)]
            env.add_o(action)
            done = env.check()
        player = 1 - player
        steps += 1

In [None]:
# Print all values currently stored in the Q-table (one per board key).
print(Qtable.values())

In [13]:
class Player:
    """Greedy tic-tac-toe player driven by a table of board values.

    Args:
        name: Display name of the player.
        qtable: Mapping from board state (as returned by the game's
            ``new_state``) to its learned value.
        letter: "X" to play with ``add_x``; any other value plays with ``add_o``.
    """

    def __init__(self, name, qtable: dict, letter):
        self._name = name
        self._QTable = qtable
        self._letter = letter

    def get_name(self):
        """Return the player's display name."""
        return self._name

    def move(self, game_inst: "TicTacToe"):
        """Play the legal move whose resulting board has the highest stored value.

        Ties go to the earliest move in ``game_inst.next_moves()``. Raises
        ``ValueError`` (from ``max``) when no legal move is left and
        ``KeyError`` when a resulting board is missing from the table.

        NOTE(review): candidate boards come from ``game_inst.new_state``,
        which in this notebook always places an X mark, so an "O" player is
        ranked with X-perspective values. Confirm before using letter "O".
        """
        acts = game_inst.next_moves()
        # Use this player's own table; the original looked up the global
        # `Qtable`, silently ignoring the table passed to the constructor.
        best_move = max(acts, key=lambda act: self._QTable[game_inst.new_state(act)])
        if self._letter == "X":
            game_inst.add_x(best_move)
        else:
            game_inst.add_o(best_move)

In [None]:
def game_again_agent():
    """Play an interactive console game: the trained agent (X) against a human (O).

    The agent is player 1 and moves first, choosing moves greedily through
    ``Player`` and the global ``Qtable``. The human is player 0 and types
    moves as ``<row>-<col>`` with zero-based indices. Input that is malformed,
    off the board or aimed at an occupied cell is rejected and asked for
    again instead of crashing the game. The game ends on the first completed
    line or after nine moves, and the result is printed.
    """
    inst = TicTacToe()
    print(inst)
    player = 1
    agent = Player("Agent", Qtable, "X")
    steps = 0
    while inst.check() == 0 and steps < 9:
        if player == 0:
            free_cells = inst.next_moves()
            while True:
                move = input(f"Board:\n{inst}\n Player {player} enter move in the form <x-y>")
                try:
                    x_text, y_text = move.split("-")
                    cell = (int(x_text), int(y_text))
                except ValueError:
                    print("Invalid move: type it as <row>-<col>, e.g. 1-2.")
                    continue
                if cell in free_cells:
                    break
                print("Invalid move: that cell is off the board or already taken.")
            inst.add_o(cell)
        else:
            agent.move(inst)
        print(inst)
        print()
        player = 1 - player
        steps += 1
    if inst.check() != 0:
        # `player` was flipped after the last move, so flip it back.
        print(f"Player {1 - player} won!")
    else:
        print("It's a draw guys!")

game_again_agent()