Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned on Monday, December 4
* You need to commit in order to be selected as a reviewer (i.e., it is better to commit an empty work than not to commit at all)

In [2]:
import numpy as np

In [50]:
class TicTacToe:
    """A 3x3 tic-tac-toe board backed by a NumPy integer array.

    Cell encoding: 0 is empty, 1 is an X and -1 is an O. With this encoding
    a line sums to 3 exactly when X owns all three cells and to -3 exactly
    when O does, which is what every win check below relies on.

    Positions are ``(row, column)`` tuples with both indices in ``0..2``.
    """

    def __init__(self):
        self._board = np.zeros((3, 3), dtype=int)

    def _place(self, pos, mark):
        """Write ``mark`` (1 or -1) into the empty cell at ``pos``."""
        # Kept as an assert (not a raise) so callers that relied on the
        # original AssertionError keep working.
        assert self._board[pos[0], pos[1]] == 0, f"cell {pos} is already occupied"
        self._board[pos[0], pos[1]] = mark

    def _diagonal_sums(self):
        """Return ``(main, anti)``: the sums of the two board diagonals."""
        main = self._board.diagonal().sum()
        # Flipping the columns turns the anti-diagonal (0,2)-(1,1)-(2,0)
        # into the main diagonal of the flipped view.
        anti = np.fliplr(self._board).diagonal().sum()
        return main, anti

    def add_x(self, pos: tuple[int, int]):
        """Place an X at ``pos``; the cell must be empty (asserted)."""
        self._place(pos, 1)

    def add_o(self, pos: tuple[int, int]):
        """Place an O at ``pos``; the cell must be empty (asserted)."""
        self._place(pos, -1)

    def check_pos(self, player, pos):
        """Tell whether ``player`` owns a complete line passing through ``pos``.

        Args:
            player: ``"X"`` selects X; any other value selects O.
            pos: ``(row, column)`` of the cell whose lines are inspected.

        Returns:
            True if the row or column through ``pos``, or a diagonal that
            contains ``pos``, is entirely filled by ``player``; else False.
        """
        target = 3 if player == "X" else -3
        row, col = pos[0], pos[1]

        # Row and column through pos.
        if self._board[row, :].sum() == target or self._board[:, col].sum() == target:
            return True

        main_diag, anti_diag = self._diagonal_sums()
        # Main diagonal: (0,0), (1,1), (2,2) -> row == col.
        if row == col and main_diag == target:
            return True
        # Anti-diagonal: (0,2), (1,1), (2,0) -> row + col == 2. The original
        # tested membership in the *main* diagonal here, so wins completed
        # on the anti-diagonal corners were never detected.
        if row + col == 2 and anti_diag == target:
            return True
        return False

    def check(self):
        """Return the winner of the current board.

        Returns:
            1 if X owns a full row, column or diagonal, -1 if O does, and 0
            if nobody has won. X is tested first.
        """
        line_sums = [
            *self._board.sum(axis=0),  # one sum per column
            *self._board.sum(axis=1),  # one sum per row
            *self._diagonal_sums(),
        ]
        if 3 in line_sums:
            return 1
        if -3 in line_sums:
            return -1
        return 0

    def next_moves(self):
        """List the empty cells as ``(row, column)`` tuples, column by column."""
        return [(i, j) for j in range(3) for i in range(3) if self._board[i, j] == 0]

    def __str__(self):
        """Render the board as three lines of ``|``-separated ``_``/``X``/``O``."""
        mapping = {0: "_", 1: "X", -1: "O"}
        return "\n".join(
            "|".join(mapping[cell] for cell in row) for row in self._board.tolist()
        )




In [51]:
# Scratch check of the NumPy reductions used by the win detection:
# one row sum, then the per-column (axis=0) and per-row (axis=1) sums.
board = np.array([[value] * 3 for value in range(3)])
print(np.sum(board[1]))
print(np.sum(board, axis=0))
print(np.sum(board, axis=1))

# Smoke test of the board class: print the empty board, play three marks,
# then list the cells that are still free.
game = TicTacToe()
print(game)
game.add_o((0, 0))
game.add_x((1, 1))
game.add_o((0, 1))
# Placing a second mark on (0, 1) at this point would trip the occupancy
# assert inside add_o.
print(game.next_moves())

3
[3 3 3]
[0 3 6]
_|_|_
_|_|_
_|_|_
[(1, 0), (2, 0), (2, 1), (0, 2), (1, 2), (2, 2)]


In [42]:
def game():
    """Play an interactive two-player tic-tac-toe match on the console.

    Player 0 places X and player 1 places O, alternating turns. Each move is
    read with ``input`` in the form ``<x-y>``, where ``x`` is the row index
    and ``y`` the column index (both 0..2). The board is printed after every
    accepted move.

    The match ends when one player completes a line, or when the board is
    full with no winner (a draw). Malformed input and moves onto occupied or
    out-of-range cells are rejected and the same player is asked again.
    """
    inst = TicTacToe()
    print(inst)
    player = 0
    while not inst.check():
        free_cells = inst.next_moves()
        if not free_cells:
            # Full board and nobody won: without this exit the loop would
            # keep prompting and crash on the next (necessarily occupied) cell.
            print("Draw!")
            return

        move = input(f"Player {player} enter move in the form <x-y>")
        xy = move.split("-")
        try:
            x, y = int(xy[0]), int(xy[1])
        except (ValueError, IndexError):
            print("Invalid move: expected two integers in the form <x-y>")
            continue
        # Checking against the free cells rejects occupied cells, indices
        # past the board, and negative indices (which NumPy would wrap).
        if (x, y) not in free_cells:
            print("Invalid move: cell is occupied or outside the board")
            continue

        if player == 0:
            inst.add_x((x, y))
        else:
            inst.add_o((x, y))
        print(inst)
        print()
        player = 1 - player

    # `player` was already switched after the winning move, so flip it back.
    print(f"Player {1 - player} won!")

# Reinforcement Learning Strategy

The idea is to use a *model-free* approach.

Reward = 1 for game won, 0 for draw, -1 for game lost

In [None]:
# Q-learning globals.
# The table starts empty and is filled in while the agent runs.
Q_Table = {}
# Step size applied to each Q-value correction.
learning_rate = 5e-4
# Discount factor applied to the best Q-value of the next state.
gamma = 2e-3

In [None]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable):
    """Run tabular Q-learning episodes and return the updated table.

    For every episode the exploration rate decays exponentially from
    ``max_epsilon`` towards ``min_epsilon``; the environment is reset and
    then stepped at most ``max_steps`` times, applying one Q-value update
    per step.

    Args:
        n_training_episodes: number of episodes to run.
        min_epsilon: lower bound of the exploration rate.
        max_epsilon: exploration rate used for episode 0.
        decay_rate: exponent coefficient of the epsilon decay.
        env: object exposing ``reset()`` (returns the initial state) and
            ``step(action)`` (returns ``new_state, reward, done, info``).
        max_steps: maximum number of steps per episode.
        Qtable: mapping indexed as ``Qtable[state][action]``; updated in place.

    Returns:
        The same ``Qtable`` object that was passed in.

    Note:
        ``learning_rate``, ``gamma`` and ``epsilon_greedy_policy`` are read
        from the enclosing module scope, not passed as arguments.
    """
    for episode in range(n_training_episodes):
        # Exponentially decayed exploration rate for this episode.
        decay = np.exp(-decay_rate * episode)
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * decay

        # Start the episode from a freshly reset environment.
        state = env.reset()

        for _ in range(max_steps):
            action = epsilon_greedy_policy(Qtable, state, epsilon)
            new_state, reward, done, _info = env.step(action)

            # Move the current estimate towards the one-step target
            # reward + gamma * max_a Q(new_state, a).
            current = Qtable[state][action]
            target = reward + gamma * np.max(Qtable[new_state])
            Qtable[state][action] = current + learning_rate * (target - current)

            # The episode is over once the environment reports completion.
            if done:
                break

            # Continue from the state just reached.
            state = new_state
    return Qtable