Copyright **`(c)`** 2023 Giovanni Squillero `<giovanni.squillero@polito.it>`  
[`https://github.com/squillero/computational-intelligence`](https://github.com/squillero/computational-intelligence)  
Free for personal or classroom use; see [`LICENSE.md`](https://github.com/squillero/computational-intelligence/blob/master/LICENSE.md) for details.  

# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: [Dies Natalis Solis Invicti](https://en.wikipedia.org/wiki/Sol_Invictus)
* Reviews: [Befana](https://en.wikipedia.org/wiki/Befana)

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [3]:
from itertools import combinations
from collections import namedtuple, defaultdict
import random as random
from copy import deepcopy

from tqdm.auto import tqdm
import numpy as np
import json

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
class Tic_Tac_Toe:
    def __init__(self, size=3) -> None:
        self._board = np.ones((size, size), dtype=np.uint8) * -1
        self._size = size
        
    def print_board(self):
        """Nicely prints the board"""
        for r in range(3):
            for c in range(3):
                if self._board[r][c] == 0:
                    print('X', end='')
                elif self._board[r][c] == 1:
                    print('O', end='')
                else:
                    print('_', end='')
            print()
        print()

    def get_board(self):
        return self._board
    
    def get_string_board(self) -> str:
        return json.dumps(self._board.tolist())
        #return self._board.tobytes()

    def make_move(self, player, action):
        if(player == 0) or (player == 1):
            self._board[action[0]][action[1]] = player
        else:
            raise ValueError("Giocatore non valido")
    
    def is_available(self, move):
        if(self._board[move[0]][move[1]] == -1):
            return True
        else:
            return False

    def check_winner(self) -> int:
        '''Check the winner. Returns the player ID of the winner if any, otherwise returns -1'''
        # for each row
        for x in range(self._board.shape[0]):
            # if a player has completed an entire row
            if self._board[x, 0] != -1 and all(self._board[x, :] == self._board[x, 0]):
                # return the relative id
                return self._board[x, 0]

        # for each column
        for y in range(self._board.shape[1]):
            # if a player has completed an entire column
            if self._board[0, y] != -1 and all(self._board[:, y] == self._board[0, y]):
                # return the relative id
                return self._board[0, y]
            
        # if a player has completed the principal diagonal
        if self._board[0, 0] != -1 and all(
            [self._board[x, x]
                for x in range(self._board.shape[0])] == self._board[0, 0]
        ):
            # return the relative id
            return self._board[0, 0]
        # if a player has completed the secondary diagonal
        if self._board[0, -1] != -1 and all(
            [self._board[x, -(x + 1)]
             for x in range(self._board.shape[0])] == self._board[0, -1]
        ):
            # return the relative id
            return self._board[0, -1]
        
        return -1

In [5]:
tmp = Tic_Tac_Toe(3)

tmp.make_move(1, [0, 0])
tmp.make_move(1, [0, 1])
tmp.make_move(1, [0, 2])

print( tmp.is_available( (1,1) ) )

print( tmp.check_winner() )


True
1


In [27]:
#size = dim playing grid
#learning_rate = Check how much the agent should update its values 
#discount_factor = This value reflects how much the agent takes into account future rewards during learning
#exploration_prob = Represents the probability that the agent performs random action instead of choosing the action that maximizes the value Q.
class QAgent:
    def __init__(self, size=3, learning_rate=0.1, discount_factor=0.9, exploration_prob=0.1):
        self.size = size
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob

        self.Q = {} # self.Q = {(state, player): {action: value} }

    
    def choose_action(self, state:Tic_Tac_Toe, current_player, available):
        #full state to Q dict
        hash_state = (state.get_string_board(), current_player[0])

        #check if use random or max from q table
        if random.uniform(0, 1) > self.exploration_prob:
            #best action to do in that state
            if hash_state in self.Q:
                return max(self.Q[hash_state], key=lambda k: self.Q[hash_state][k])
        
        #if there is no exploration or no state in Q 
        action = random.choice(list(available))

        return action

    def evaluate_reward(self, state: Tic_Tac_Toe, q_player, available):
        # Check if there is a win or if the game ended in a draw
        if state.check_winner() == q_player :
            done = True
            reward = 2  # positive reward for win the game
        elif state.check_winner() != -1 :
            done = True
            reward = -2  # negative reward for lose the game
        elif not available:
            done = True
            reward = 0  # Zero reward for draw
        else:
            done = False
            reward = 0  # No reward if the game is not over

        return reward, done

    def update_q_value(self, state:Tic_Tac_Toe, next_state:Tic_Tac_Toe, action, current_player, reward, available):
        hash_state = (state.get_string_board(), current_player[0])
        hash_next_state = (next_state.get_string_board(), 1-current_player[0])
        
        # Q(s,a)
        current_q = self.Q[hash_state].get(action, 0)  # Returns 0 if the key does not exist
        # Q(s',a')
        if not hash_next_state in self.Q:
            best_next_q = 0
        else:
            best_next_q = max([self.Q[hash_next_state].get(next_action, 0) for next_action in available], default=0)
        #q learning forumla => Q(s,a)=(1−α)⋅Q(s,a)+α⋅(r+γ⋅maxa′ Q(s',a'))
        self.Q[hash_state][action] = (1 - self.learning_rate) * current_q + self.learning_rate * (reward + self.discount_factor * best_next_q)


    def play(self, episodes=1000):
        for _ in tqdm(range(episodes)):
            #initial state
            state = Tic_Tac_Toe()
            #avaiable moves for this episode
            available = [(r, c) for r in range(state._size) for c in range(state._size)]
            #random choose training player
            q_player = random.choice((0,1))
            #check if the game is over
            done = False
            #inizialize current player
            current_player = [random.choice((0,1))]

            #Play until there are moves to make and neither player has won
            while not done:
                if(q_player == current_player[0]):
                    # control whether to take a move from those available or from Q table
                    action = self.choose_action(state, current_player, available)
                else:
                    #random choise a move
                    action = random.choice(list(available))
  
                #check if the action is valid, despite the pick of the action is generally from available list,
                #sometimes it can arrive from max() of q table and in this case it is could be not valid
                if(action in available):
                    # remove the action from available 
                    available.remove(action)

                    #full state to Q dict
                    hash_state = (state.get_string_board(), current_player[0])
                    #if there isn't this state create it in q table
                    if not hash_state in self.Q:
                        self.Q[hash_state] = {}
                    #add this action to q table in this state
                    self.Q[hash_state][action] = 0
                    
                    # Copies the current state so you don't change it directly
                    next_state = deepcopy(state)
                    # play the action in x or o according to the player turn
                    next_state.make_move(current_player[0], action)
                  
                    #evaluate_reward
                    reward, done = self.evaluate_reward(next_state, q_player, available)
                    
                    #update the q table
                    if(q_player == current_player[0]):
                        self.update_q_value(state, next_state, action, current_player, reward, available)
                    
                    #update the state with the new one
                    state = next_state
                    
                    #change player turn
                    current_player[0] = 1 - current_player[0]
                else:
                    #reward -1?
                    pass
            

In [31]:
q_agent = QAgent()
q_agent.play(episodes=100_000)

100%|██████████| 100000/100000 [00:59<00:00, 1689.75it/s]


In [30]:
print(q_agent.Q)

{('[[-1, -1, -1], [-1, -1, -1], [-1, -1, -1]]', 1): {(2, 2): 0.0, (1, 0): 0, (0, 2): 0, (2, 1): 0.0, (1, 1): 0, (0, 0): 0, (1, 2): 0, (0, 1): 0.0, (2, 0): 0}, ('[[-1, -1, -1], [-1, -1, -1], [-1, -1, 1]]', 0): {(1, 1): 0, (2, 1): 0, (0, 0): 0, (0, 2): 0, (0, 1): 0, (1, 2): 0, (2, 0): 0, (1, 0): 0}, ('[[-1, -1, -1], [-1, 0, -1], [-1, -1, 1]]', 1): {(0, 1): 0.0, (0, 0): 0, (1, 0): 0, (1, 2): 0, (0, 2): 0, (2, 0): 0, (2, 1): 0}, ('[[-1, 1, -1], [-1, 0, -1], [-1, -1, 1]]', 0): {(0, 2): 0, (1, 0): 0, (1, 2): 0, (2, 1): 0, (2, 0): 0, (0, 0): 0}, ('[[-1, 1, 0], [-1, 0, -1], [-1, -1, 1]]', 1): {(2, 1): 0.018000000000000002, (1, 0): 0, (2, 0): 0}, ('[[-1, 1, 0], [-1, 0, -1], [-1, 1, 1]]', 0): {(2, 0): 0.2, (1, 0): 0}, ('[[-1, -1, -1], [-1, -1, -1], [-1, -1, -1]]', 0): {(0, 0): 0.0, (0, 2): 0, (2, 2): 0, (2, 0): 0, (2, 1): 0, (1, 1): 0, (1, 0): 0, (0, 1): 0, (1, 2): 0}, ('[[0, -1, -1], [-1, -1, -1], [-1, -1, -1]]', 1): {(2, 2): 0, (1, 1): 0, (2, 0): 0, (0, 1): 0, (1, 0): 0, (1, 2): 0, (2, 1): 0, 