Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [2]:
import numpy as np
import itertools
from tqdm import tqdm
import random

In [17]:
class TicTacToe:
    def __init__(self):
        self._board = np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]])

    def reset(self):
        self._board = np.array([[0, 0, 0], [0, 0, 0], [0, 0, 0]])

    def add_x(self, pos: tuple[int]):
        assert self._board[pos[0]][pos[1]] == 0
        self._board[pos[0]][pos[1]] = 1

    def add_o(self, pos: tuple[int]):
        assert self._board[pos[0]][pos[1]] == 0
        self._board[pos[0]][pos[1]] = -1

    def check_pos(self, player, pos):
        target = 3 if player == "X" else -3
        center = (1, 1)
        right_diag = {(0, 0), (2, 2)}
        left_diag = {(0, 2), (2, 0)}
        # checking row and column
        if self._board[pos[0], :].sum() == target or self._board[:, pos[1]].sum() == target:
            return True
        # check right_diag
        if pos in right_diag or pos == center:
            if sum([self._board[dx, dy] for dx, dy in [(0,0), (1, 1), (2, 2)]]) == target:
                return True
        # check left_diag
        if pos in right_diag or pos == center:
            if sum([self._board[dx, dy] for dx, dy in [(0,2), (1, 1), (2, 0)]]) == target:
                return True
        return False

    def check(self):
        rows = self._board.sum(axis = 0)
        cols = self._board.sum(axis = 1)
        if(np.any((rows == 3)) or  np.any((cols == 3))
                or sum([self._board[dx, dy] for dx, dy in [(0,0), (1, 1), (2, 2)]]) == 3
                or sum([self._board[dx, dy] for dx, dy in [(0,2), (1, 1), (2, 0)]]) == 3):
            return 1
        if(np.any((rows == -3)) or  np.any((cols == -3))
                or sum([self._board[dx, dy] for dx, dy in [(0,0), (1, 1), (2, 2)]]) == -3
                or sum([self._board[dx, dy] for dx, dy in [(0,2), (1, 1), (2, 0)]]) == -3):
            return -1
        return 0

    def next_moves(self):
        return [(i, j) for j in range(3) for i in range(3) if self._board[i, j] == 0]

    def __str__(self):
        mapping = {0: "_", 1: "X", -1: "O"}
        pretty = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
        for i in range(3):
            for j in range(3):
                pretty[i][j] = mapping[self._board[i, j]]
        return "\n".join(["|".join(el) for el in pretty])

    def new_state(self, move: tuple[int], player):
        val = 1 if player == "X" else -1
        # always player O
        board_copy = self._board.copy()
        board_copy[move[0], move[1]] = val
        return tuple([tuple(row) for row in list(board_copy)])

    def get_state(self):
         return tuple([tuple(row) for row in list(self._board)])


In [4]:
board = np.array([[0, 0, 0], [1, 1, 1], [2, 2, 2]])
print(board[1, :].sum())
print(board.sum(axis = 0))
print(board.sum(axis=1))

game = TicTacToe()
print(game)
game.add_o((0, 0))
game.add_x((1, 1))
game.add_o((0, 1))
# game.add_o((0, 1))
print(game.new_state((0, 2)))

3
[3 3 3]
[0 3 6]
_|_|_
_|_|_
_|_|_
((-1, -1, 1), (0, 1, 0), (0, 0, 0))


In [63]:
def game():
    inst = TicTacToe()
    print(inst)
    player = 0
    while not inst.check():
        move = input(f"Board:\n{inst}\n Player {player} enter move in the form <x-y>")
        xy = move.split("-")
        x, y = int(xy[0]), int(xy[1])
        if player == 0:
            inst.add_x((x, y))
        else:
            inst.add_o((x, y))
        print(inst)
        print()
        player = 1 - player

    print(f"Player {1 - player} won!")

game()

_|_|_
_|_|_
_|_|_
_|X|_
_|_|_
_|_|_

_|X|_
_|O|_
_|_|_

_|X|_
_|O|_
_|_|X

_|X|_
_|O|O
_|_|X

X|X|_
_|O|O
_|_|X

X|X|_
O|O|O
_|_|X

Player 1 won!


# Reiforcement Learning Strategy

The idea is to use a *Model-Free* Approach

Reward = 1 for game won, 0 for draw, -1 for game lost

In [98]:
disps = [disp for disp in list(itertools.product([0, 1, -1], repeat = 9)) if -1 <= sum(disp) <= 1] # filter valid states
disps = [tuple([disp[ind:ind+3] for ind in range(0, 9, 3)]) for disp in disps]
Qtable = {disp: 0 for disp in disps}

In [99]:
print(len(disps))
print(len(Qtable))
print(sum(Qtable.values()))

8953
8953
0


In [100]:
def epsilon_greedy_policy(Qtable: dict, state: TicTacToe, epsilon: float):
  random_int = random.uniform(0,1)
  actions = state.next_moves()
  if random_int > epsilon:
    return max(actions, key= lambda act: Qtable[state.new_state(act, "X")])
  else:
    action = actions[random.randint(0, len(actions) - 1)]
  return action

In [109]:
# Training parameters
n_training_episodes = 800_000
learning_rate = 0.8

# Environment parameters
gamma = 0.95

# Exploration parameters
max_epsilon = 1.0
min_epsilon = 0.01
decay_rate = 0.0007

In [110]:
env = TicTacToe()
for episode in tqdm(range(n_training_episodes)):
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    # Reset the environment
    env.reset()
    player = random.randint(0,1)
    done = 0
    steps = 0
    # simulating game
    state_action = env.get_state()
    while done == 0 and steps <9:
        actions = env.next_moves()
        if player == 1:
            action = epsilon_greedy_policy(Qtable, env, epsilon)
            env.add_x(action)
            state_action = env.get_state()
            reward = env.check()
            if reward:
                max_rew = 1
                Qtable[state_action] = Qtable[state_action] + learning_rate * (
                    reward + gamma * max_rew - Qtable[state_action])
        else:
            action = actions[random.randint(0, len(actions) - 1)]
            env.add_o(action)
            reward = env.check()
            actions = env.next_moves()
            if actions:
                max_act = max(actions, key=lambda act: Qtable[env.new_state(act, "X")])
                max_rew  = Qtable[env.new_state(max_act, "X")]
            else:
                max_rew = -1
            Qtable[state_action] = Qtable[state_action] + learning_rate * (
                    reward + gamma * max_rew - Qtable[state_action])
        done = reward
        player = 1 - player
        steps += 1

100%|██████████| 800000/800000 [02:50<00:00, 4695.73it/s]


In [111]:
print(Qtable.values())

dict_values([1.3452060674839026, 0.8578690382335633, 0, 1.566316690783284, -0.3080970129634528, 0, -0.22681014341328676, 1.7575010090398253, -0.49983546899886777, 1.4810108621689262, -0.17832115286965666, 1.759758309289858, 0, 0, -0.5396832835151537, -0.6973410251456719, -0.02635733841331822, 0, 0, 1.4985318771622071, -0.303531252059436, 0.035142808127334335, 1.233626053801455, 1.8524999999999803, 0, 0.804485256658124, 1.7597395062469068, -0.7570256101136602, -0.40519459712523537, 1.852352984019301, 0, 1.6780950206640424, 1.3357393350942848, 0, 1.837537536786431, 0, -0.16398244166902953, -0.3318346247856667, 1.8524990515175717, 0, 0, 0.024276874668557102, 1.7606159999975715, 0, 0.10096295823014567, -0.8691276720178478, 0, -0.7284975665637882, 0, 0, -0.8465308163222544, 1.5820754252095552, 1.4790259263696197, 0.6801519278266159, 1.7249735602616139, 1.7605922879999996, 0, 0.6539246653299564, 1.7600231923514569, 1.8489899009538802, 1.748735639093041, 0.43899052254072735, 0, 0.652137433469

In [112]:
class Player:
    def __init__(self, name, qtable: dict, letter):
        self._name = name
        self._QTable = qtable
        self._letter = letter

    def get_name(self):
        return self._name

    def move(self, game_inst: TicTacToe):
        acts = game_inst.next_moves()
        if self._letter == "X":
            best_move = max(acts, key=lambda act: Qtable[game_inst.new_state(act, self._letter)])
        else:
            # Tha agent learned to play with +1 strategy => we need to invert the state to properly query the QTable
            def invert_state(state):
                brd = []
                for i in range(3):
                    brd.append([])
                    for el in state[i]:
                        brd[i].append(-el)
                brd = tuple([tuple(row) for row in brd])
                return Qtable[brd]
            best_move = max(acts, key=lambda act: invert_state(game_inst.new_state(act, self._letter)))
        if self._letter == "X":
            game_inst.add_x(best_move)
        else:
            game_inst.add_o(best_move)

In [114]:
def game_again_agent():
    inst = TicTacToe()
    print(inst)
    player = 0
    agent = Player("Agent", Qtable, "X")
    steps = 0
    while inst.check() == 0 and steps < 9:
        if player == 0:
            move = input(f"Board:\n{inst}\n Player {player} enter move in the form <x-y>")
            xy = move.split("-")
            x, y = int(xy[0]), int(xy[1])
            inst.add_o((x, y))
        else:
            agent.move(inst)
        print(inst)
        print()
        player = 1 - player
        steps += 1
    if inst.check() != 0:
        print(f"Player {1 - player} won!")
    else:
        print(f"It's a draw guys!")

game_again_agent()

_|_|_
_|_|_
_|_|_
_|_|_
_|O|_
_|_|_

_|X|_
_|O|_
_|_|_

_|X|_
_|O|_
O|_|_

_|X|X
_|O|_
O|_|_

O|X|X
_|O|_
O|_|_

O|X|X
_|O|_
O|_|X



AssertionError: 

In [118]:
def proof():
    error = 0
    agent1 = Player("Agent1", Qtable, "X")
    agent0 = Player("Agent2", Qtable, "O")
    for i in range(10_000):
        inst = TicTacToe()
        player = 1
        steps = 0
        history = []
        while inst.check() == 0 and steps < 9:
            if player == 0:
                agent0.move(inst)
            else:
                # add randomness otherwise it is performed the same strategy evertime
                if steps == 0:
                    act = inst.next_moves()[random.randint(0, 8)]
                    inst.add_x(act)
                else:
                    agent1.move(inst)
            history.append(f"{inst}")
            player = 1 - player
            steps += 1
        if inst.check() != 0:
            for step in history:
                print(step)
                print()
            print()
            error += 1
    err_rate = error * 100 / 10_000
    print(f"Error Rate: {err_rate}")

proof()

_|_|_
X|_|_
_|_|_

_|_|_
X|_|_
_|_|O

_|_|_
X|_|_
X|_|O

O|_|_
X|_|_
X|_|O

O|_|_
X|X|_
X|_|O

O|_|_
X|X|O
X|_|O

O|_|X
X|X|O
X|_|O


_|_|_
_|X|_
_|_|_

_|O|_
_|X|_
_|_|_

_|O|_
X|X|_
_|_|_

O|O|_
X|X|_
_|_|_

O|O|_
X|X|X
_|_|_


_|_|_
X|_|_
_|_|_

_|_|_
X|_|_
_|_|O

_|_|_
X|_|_
X|_|O

O|_|_
X|_|_
X|_|O

O|_|_
X|X|_
X|_|O

O|_|_
X|X|O
X|_|O

O|_|X
X|X|O
X|_|O


X|_|_
_|_|_
_|_|_

X|_|_
O|_|_
_|_|_

X|X|_
O|_|_
_|_|_

X|X|O
O|_|_
_|_|_

X|X|O
O|X|_
_|_|_

X|X|O
O|X|_
_|_|O

X|X|O
O|X|_
_|X|O


_|X|_
_|_|_
_|_|_

_|X|_
_|O|_
_|_|_

_|X|_
X|O|_
_|_|_

_|X|_
X|O|_
_|_|O

X|X|_
X|O|_
_|_|O

X|X|_
X|O|O
_|_|O

X|X|_
X|O|O
X|_|O


X|_|_
_|_|_
_|_|_

X|_|_
O|_|_
_|_|_

X|X|_
O|_|_
_|_|_

X|X|O
O|_|_
_|_|_

X|X|O
O|X|_
_|_|_

X|X|O
O|X|_
_|_|O

X|X|O
O|X|_
_|X|O


_|_|_
X|_|_
_|_|_

_|_|_
X|_|_
_|_|O

_|_|_
X|_|_
X|_|O

O|_|_
X|_|_
X|_|O

O|_|_
X|X|_
X|_|O

O|_|_
X|X|O
X|_|O

O|_|X
X|X|O
X|_|O


_|X|_
_|_|_
_|_|_

_|X|_
_|O|_
_|_|_

_|X|_
X|O|_
_|_|_

_|X|_
X|O|_
_|_|O

X|X|_
X|O|_
_|_|O

X|X|_