# LAB10

Use reinforcement learning to devise a tic-tac-toe player.

### Deadlines:

* Submission: Sunday, December 17 ([CET](https://www.timeanddate.com/time/zones/cet))
* Reviews: Dies Natalis Solis Invicti ([CET](https://en.wikipedia.org/wiki/Sol_Invictus))

Notes:

* Reviews will be assigned  on Monday, December 4
* You need to commit in order to be selected as a reviewer (ie. better to commit an empty work than not to commit)

In [1]:
from abc import ABC, abstractmethod
from itertools import combinations, product
from tqdm import trange
import numpy as np
from copy import deepcopy

### Game

In [427]:
class Player(ABC):
    def __init__(self) -> None:
        '''You can change this for your player if you need to handle state/have memory'''
        pass

    @abstractmethod
    def make_move(self, game: 'Tic_Tac_Toe',id: int) -> tuple[int, int]:
        '''
        game: the Quixo game. You can use it to override the current game with yours, but everything is evaluated by the main game
        return values: this method shall return a tuple of X,Y positions and a move among TOP, BOTTOM, LEFT and RIGHT
        '''
        pass


class Tic_Tac_Toe(object):
    def __init__(self) -> None:
        self._board = np.array([[1, 6, 5], [8, 4, 0], [3, 2, 7]])
        self._o_cells = [] # start the game  (player_id = 0)
        self._x_cells = [] # play for second (player_id = 1)
        

    def get_state(self):
        return (deepcopy(sorted(self._o_cells)), deepcopy(sorted(self._x_cells)))

    def draw(self):
        return ((len(self._x_cells) + len(self._o_cells)) == 9) and self.check_winner() == -1

    def won(self, cells):
        return any(sum(h) == 12 for h in combinations(cells, 3))

    def check_winner(self) -> int:
        if self.won(self._o_cells):
            return 0
        elif self.won(self._x_cells):
            return 1
        return -1

    def print(self):
        '''Prints the board. -1 are neutral pieces, 0 are pieces of player 0, 1 pieces of player 1'''

        pretty_board = np.chararray(self._board.shape, itemsize=1, unicode=True)
        for r in range(self._board.shape[0]):
            for c in range(self._board.shape[1]):
                if self._board[r, c] in self._x_cells:
                    pretty_board[(r, c)] = '❌'
                elif self._board[r, c] in self._o_cells:
                    pretty_board[(r, c)] = '⭕'
                else:
                    pretty_board[(r, c)] = '⬜'

        print(f'Board:\n{pretty_board}')

    def play(self, player1: Player, player2: Player,view=True) -> int:
        '''Play the game. Returns the winning player'''
        players = [player1, player2]
        current_player_idx = 1
        winner = -1
        draw = False
        while winner < 0 and not draw:
            current_player_idx += 1
            current_player_idx %= len(players)
            ok = False
            while not ok:
                pos = players[current_player_idx].make_move(self,current_player_idx)
                ok = self.move(pos, current_player_idx)
            winner = self.check_winner()
            draw = self.draw()
            if view:
                self.print()
        return winner

    def move(self, pos: tuple[int, int], player_id: int) -> bool:
        '''Perform a move'''
        if player_id > 1:
            return False
        acceptable: bool = self.valid_move(pos)
        if acceptable:
            # put the player id in the piece
            if player_id == 0:
                self._o_cells.append(self._board[pos])
            elif player_id == 1:
                self._x_cells.append(self._board[pos])
        return acceptable

    def valid_move(self, pos):
        return (self._board[pos] not in self._x_cells) and (self._board[pos] not in self._o_cells)

### Random Player

In [283]:
import random

class RandomPlayer(Player):
    def __init__(self) -> None:
        super().__init__()
        self.__name__ = 'Random Player'


    def make_move(self, game: 'Tic_Tac_Toe',id:int) -> tuple[tuple[int, int]]:
        # choose random position (row,col)
        pos = (random.randint(0, 2), random.randint(0, 2))
        return pos

### Q-Learning Player


In [393]:
class QLearningPlayer(Player):
    def __init__(self, num_episodes=100_000, learning_rate=0.1, discount_rate=0.99, exploration_rate=1, max_expolation_rate = 1, min_exploration_rate = 0.05, exploration_decay_rate = 5e-5 ) -> None:
        super().__init__()
        self._num_episodes = num_episodes
        self._learning_rate = learning_rate
        self._discount_rate = discount_rate
        self._exploration_rate = exploration_rate
        self._max_expolation_rate = max_expolation_rate
        self._min_exploration_rate = min_exploration_rate
        self._exploration_decay_rate = exploration_decay_rate
        self._q_table = {}
        self._action_to_move = dict(enumerate(product([0, 1, 2], repeat=2)))
        self.__name__ = 'Q-Learning RL Player'

    def make_move(self, game: 'Tic_Tac_Toe',id: int) -> tuple[tuple[int, int]]:
        state = (game.get_state(), id)

        # check if the agent have the state in the q_table
        if str(state) in self._q_table.keys():
            action = np.argmax(self._q_table[str(state)])
            move = self._action_to_move[action]
            if game.valid_move(move):
                return move
    
        # choose random position (row,col)
        return (random.randint(0, 2), random.randint(0, 2))

    def training(self, game: Tic_Tac_Toe) -> None:
        rewards_all_episodes = []

        pbar = trange(0, self._num_episodes)
        for episode in pbar:
            if episode>0:
                pbar.set_description(f"Rewards: {rewards_all_episodes[episode-1]}, Exploration_rate:{self._exploration_rate}")

            g = deepcopy(game)

            rewards_current_episode = 0

            # random start
            players = [RandomPlayer(), self]
            np.random.shuffle(players)
            current_player_idx = 1

            winner = -1
            draw = False

            # game
            while winner < 0 and not draw:
                current_player_idx += 1
                current_player_idx %= 2
                ok = False

                # valid move
                while not ok:
                    
                    if players[current_player_idx] == self:
                        state = (g.get_state(),current_player_idx)

                        # if the state is not in the table, add a row with 0 values
                        if str(state) not in self._q_table.keys():
                            self._q_table[str(state)] = [0] * 9

                        if random.random() > self._exploration_rate:
                            action = np.argmax(self._q_table[str(state)])
                            move = self._action_to_move[action]
                        else:
                            action = random.choice(range(9))
                            move = self._action_to_move[action]
                            
                        # check if the move is valid
                        if g.valid_move(move):         
                            # apply the move and get +1 reward
                            g.move(move, current_player_idx )
                            reward = 1
                            new_state = (g.get_state(), current_player_idx)

                            # update for my move
                            self.update_q_table(new_state,state,action,reward)

                            rewards_current_episode += reward
                            ok = True
                        else:
                            # -inf q-value for illegal moves
                            self._q_table[str(state)][action] = float('-inf')
                            ok = False
                    else:
                        
                        # random move for the opponent
                        pos = players[current_player_idx].make_move(g,current_player_idx)
                        ok = g.move(pos, current_player_idx)

                    winner = g.check_winner()
                    draw = g.draw()

            if winner == -1:
                # 0 for draw
                reward = 0
            elif players[winner] == self:
                # 10 for win
                reward = 10
            else:
                # -10 for lose
                reward = -10

            rewards_current_episode += reward
            self.update_q_table(new_state,state,action,reward)

            # update exploration rate
            self._exploration_rate = self._min_exploration_rate + (self._max_expolation_rate - self._min_exploration_rate) * np.exp(-self._exploration_decay_rate*episode)
            rewards_all_episodes.append(rewards_current_episode)
        print(f'Mean rewards: {sum(rewards_all_episodes)/self._num_episodes}\nKnowed states:{len(self._q_table)}')
            
        return rewards_all_episodes

        
    def update_q_table(self, new_state, state, action, reward) -> None:
        # if the new_state is not in the table, add a row with 0 values
        if str(new_state) not in self._q_table.keys():
            self._q_table[str(new_state)] = [0] * 9

        # update q_table
        self._q_table[str(state)][action] = self._q_table[str(state)][action] * (
            1 - self._learning_rate
        ) + self._learning_rate * (
            reward + self._discount_rate * np.max([self._q_table[str(new_state)]])
        )
        return

#### Training

In [401]:
g = Tic_Tac_Toe()
q_learning_player = QLearningPlayer()
rewards = q_learning_player.training(g)

Rewards: 13, Exploration_rate:0.056401689786102424: 100%|██████████| 100000/100000 [02:59<00:00, 557.65it/s]

Mean rewards: 9.64301
Knowed states:9878





#### Test

In [415]:
def test(player_0,player_1,reverse=False,num_games=1_000):
    wins = 0
    draws = 0
    loses = 0
    for _ in range(num_games):
        g = Tic_Tac_Toe()
        if reverse:
            winner = g.play(player_1,player_0)
            if winner == 1:
                wins += 1
            elif winner == 0:
                loses += 1
            else:
                draws += 1
        else:
            winner = g.play(player_0,player_1)
            if winner == 0:
                wins += 1
            elif winner == 1:
                loses += 1
            else:
                draws += 1
    if reverse:
        print(f'{player_0.__name__} playing as second\nWins:{wins}\nLoses:{loses}\nDraws:{draws}\nPercentage not loses:{(wins+draws)/num_games:0.2%}')
    else:
        print(f'{player_0.__name__} playing as first\nWins:{wins}\nLoses:{loses}\nDraws:{draws}\nPercentage not loses:{(wins+draws)/num_games:0.2%}')


#### Test vs Random

In [421]:
test(q_learning_player,RandomPlayer())
print()
test(q_learning_player,RandomPlayer(),reverse=True)

Q-Learning RL Player playing as first
Wins:892
Loses:6
Draws:102
Percentage not loses:99.40%

Q-Learning RL Player playing as second
Wins:672
Loses:73
Draws:255
Percentage not loses:92.70%


### Montecarlo Learning

In [387]:
class MonteCarloLearningPlayer(Player):
    def __init__(self, num_episodes=100_000, discount_rate=0.99,exploration_rate=1, max_expolation_rate = 1, min_exploration_rate = 0.05, exploration_decay_rate = 5e-5) -> None:
        super().__init__()
        self._num_episodes = num_episodes
        self._discount_rate = discount_rate
        self._exploration_rate = exploration_rate
        self._max_expolation_rate = max_expolation_rate
        self._min_exploration_rate = min_exploration_rate
        self._exploration_decay_rate = exploration_decay_rate
        self._q_table = {}
        self._q_returns = {}
        self._action_to_move = dict(enumerate(product([0, 1, 2], repeat=2)))
        self.__name__ = 'Montecarlo RL Player'

    def make_move(self, game: 'Tic_Tac_Toe',id: int) -> tuple[tuple[int, int]]:
        state = (game.get_state(), id)

        # check if the agent have the state in the q_table
        if str(state) in self._q_table.keys():
            action = np.argmax(self._q_table[str(state)])
            move = self._action_to_move[action]
            if game.valid_move(move):
                return move
    
        # choose random position (row,col)
        return (random.randint(0, 2), random.randint(0, 2))

    def training(self, game: Tic_Tac_Toe) -> None:
        rewards_all_episodes = []

        pbar = trange(0, self._num_episodes)
        for episode in pbar:
            if episode>0:
                pbar.set_description(f"Rewards: {rewards_all_episodes[episode-1]}, Exploration_rate:{self._exploration_rate}")

            g = deepcopy(game)

            rewards_current_episode = 0

            # random start
            players = [RandomPlayer(), self]
            np.random.shuffle(players)
            current_player_idx = 1

            winner = -1
            draw = False

            visited_states = []
            actions = []
            rewards = []

            # game
            while winner < 0 and not draw:
                current_player_idx += 1
                current_player_idx %= 2
                ok = False

                
                # valid move
                while not ok:
                    # my turn
                    if players[current_player_idx] == self:
                        state = (g.get_state(),current_player_idx)

                        # if the state is not in the table, add a row with 0 values
                        if str(state) not in self._q_table.keys():
                            self._q_table[str(state)] = [0] * 9
                        if str(state) not in self._q_returns.keys():
                            self._q_table[str(state)] = [0] * 9

                        # check for exploration-exploitation trade-off
                        if random.random() > self._exploration_rate:
                            action = np.argmax(self._q_table[str(state)])
                            move = self._action_to_move[action]
                        else:
                            action = random.choice(range(9))
                            move = self._action_to_move[action]
                            
                        # check if the move is valid
                        if g.valid_move(move):         
                            # apply the move and get +1 reward
                            g.move(move, current_player_idx)
                            reward = 1
                            rewards_current_episode += reward
                            
                            # save trajectory
                            visited_states.append(state)
                            rewards.append(reward)
                            actions.append(action)
                            
                            ok = True
                        else:
                            # -inf q-value for illegal moves
                            self._q_table[str(state)][action] = float('-inf')
                            ok = False
                    else:
                        
                        # random move for the opponent
                        pos = players[current_player_idx].make_move(g,current_player_idx)
                        ok = g.move(pos, current_player_idx)

                    winner = g.check_winner()
                    draw = g.draw()

            if winner == -1:
                # 0 for draw
                reward = 0
            elif players[winner] == self:
                # 10 for win
                reward = 10
            else:
                # -10 for lose
                reward = -10

            # chenge final reward
            rewards[-1] = reward

            rewards_current_episode += reward
            
            Gt = 0
            for t in range(len(visited_states)-1,0,-1):

                Gt = self._discount_rate*Gt + rewards[t]

                self.update_q_table(visited_states[t],actions[t],Gt)


            # update exploration rate
            self._exploration_rate = self._min_exploration_rate + (self._max_expolation_rate - self._min_exploration_rate) * np.exp(-self._exploration_decay_rate*episode)
            rewards_all_episodes.append(rewards_current_episode)
        print(f'Mean rewards: {sum(rewards_all_episodes)/self._num_episodes}\nKnowed states:{len(self._q_table)}')
            
        return rewards_all_episodes

        
    def update_q_table(self,state, action, gt) -> None:
        # if the new_state is not in the table, add a row with 0 values
        if str(state) not in self._q_table.keys():
            self._q_table[str(state)] = [0] * 9

        if str(state) not in self._q_returns.keys():
            self._q_returns[str(state)] = [0] * 9

        self._q_returns[str(state)][action] += 1

        # update q_table
        self._q_table[str(state)][action] = (gt - self._q_table[str(state)][action]) * (1 / self._q_returns[str(state)][action]) + self._q_table[str(state)][action]
    
        return



In [388]:
g = Tic_Tac_Toe()
montecarlo_player = MonteCarloLearningPlayer()
rewards = montecarlo_player.training(g)

Rewards: 13, Exploration_rate:0.056401689786102424: 100%|██████████| 100000/100000 [02:42<00:00, 616.74it/s]

Mean rewards: 10.8927
Knowed states:4497





Test vs Random

In [422]:
test(montecarlo_player,RandomPlayer())
print()
test(montecarlo_player,RandomPlayer(),True)

Montecarlo RL Player playing as first
Wins:983
Loses:0
Draws:17
Percentage not loses:100.00%

Montecarlo RL Player playing as second
Wins:875
Loses:49
Draws:76
Percentage not loses:95.10%


#### Montecarlo vs Q-Learning

In [424]:
test(q_learning_player,montecarlo_player)
print()
test(q_learning_player,montecarlo_player,True)

Q-Learning RL Player playing as first
Wins:507
Loses:0
Draws:493
Percentage not loses:100.00%

Q-Learning RL Player playing as second
Wins:0
Loses:0
Draws:1000
Percentage not loses:100.00%


#### Observing agents playing

In [425]:
def view_single_match(player1,player2):    
    g = Tic_Tac_Toe()
    winner = g.play(player1,player2,view=True)
    g.print()
    if winner == -1:
        print("Game endend in Draw.")
    else:
        print(f"Winner: Player {winner}")

In [454]:
view_single_match(montecarlo_player,q_learning_player)

Board:
[['⭕' '⬜' '⬜']
 ['⬜' '⬜' '⬜']
 ['⬜' '⬜' '⬜']]
Board:
[['⭕' '⬜' '⬜']
 ['⬜' '⬜' '⬜']
 ['❌' '⬜' '⬜']]
Board:
[['⭕' '⬜' '⬜']
 ['⬜' '⭕' '⬜']
 ['❌' '⬜' '⬜']]
Board:
[['⭕' '⬜' '⬜']
 ['⬜' '⭕' '⬜']
 ['❌' '⬜' '❌']]
Board:
[['⭕' '⬜' '⬜']
 ['⬜' '⭕' '⬜']
 ['❌' '⭕' '❌']]
Board:
[['⭕' '❌' '⬜']
 ['⬜' '⭕' '⬜']
 ['❌' '⭕' '❌']]
Board:
[['⭕' '❌' '⬜']
 ['⭕' '⭕' '⬜']
 ['❌' '⭕' '❌']]
Board:
[['⭕' '❌' '⬜']
 ['⭕' '⭕' '❌']
 ['❌' '⭕' '❌']]
Board:
[['⭕' '❌' '⭕']
 ['⭕' '⭕' '❌']
 ['❌' '⭕' '❌']]
Board:
[['⭕' '❌' '⭕']
 ['⭕' '⭕' '❌']
 ['❌' '⭕' '❌']]
Game endend in Draw.
