Tic Tac Toe
---

<img style="float:center" src="../images/tris.png" alt="drawing" width="200"/>

In [8]:
import numpy as np
import pickle
from tqdm.auto import tqdm

### Board State
---
The ``TicTacToe`` class reflects the state of the board.
We use 1 to indicate player1 and -1 for player 2.

#### Parameter description
- ``board``: numpy array of dimension 3x3 that represents the game board.
- ``p1``: player1 class.
- ``p2``: player2 class.
- ``current_player``: indicates who has to take the turn.
- ``isEnd``: boolean that indicates whether the game has finished.
- ``boardHash``: it is the board status but as a string.

#### Methods description
- ``available_positions``: it returns an array with the list of possible moves (each element is a tuple of two integers that indicates where to play).
- ``make_move``: takes as input the location of where a player played and puts the value of the ``current player`` in that place on the board, i.e. who is playing at that moment. It also gives the other player the turn by changing the ``current_player`` parameter.
- ``get_hash``: it returns the board state but in a string format.
- ``check_winner``: It checks if there is a winner. 
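- ``reward``: It calls the players' ``give_rew`` method, which updates their state-value estimates with the final reward: 1 for the winner and 0 for the loser. On a draw, ``player1`` receives 0.1 and ``player2`` receives 0.5, so a tie is worth less than a win.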
- ``reset``: It resets the game to its initial state: the board is emptied and ``boardHash``, ``isEnd`` and ``current_player`` are restored to their starting values.
- ``show_board``: It prints the board status. ``player1`` is the X and ``player2`` is the O.
- ``train``: We used two agents that use Reinforcement Learning to play against each other. During training the process of each player is: look for available positions, choose action, update board state and add the action to player's states, judge if reach the end of the game and give reward accordingly.
- ``test``: We test our trained policy with a random player.

In [1]:
class TicTacToe:
    """3x3 tic-tac-toe board together with the loops that drive two players.

    A cell holds 1 for player 1, -1 for player 2 and 0 while it is empty.
    The player objects must provide ``choose_action``; ``train`` and
    ``reward`` additionally call ``add_state``, ``give_rew`` and ``reset``
    on them.
    """

    def __init__(self, p1, p2):
        self.p1, self.p2 = p1, p2
        self.board = np.zeros((3, 3))
        self.boardHash = None
        self.isEnd = False
        self.current_player = 1  # 1 -> p1 moves next, -1 -> p2 moves next

    def available_positions(self):
        """Return the ``(row, col)`` tuples of all empty cells, row by row."""
        return [
            (row, col)
            for row in range(3)
            for col in range(3)
            if self.board[row, col] == 0
        ]

    def make_move(self, position):
        """Mark ``position`` for the current player and hand over the turn.

        A position that is not currently free is ignored: neither the board
        nor the turn changes. The method always returns ``None``.
        """
        if position in self.available_positions():
            self.board[position] = self.current_player
            self.current_player = -self.current_player

    def get_hash(self):
        """Store and return the flattened board rendered as a string."""
        flat = self.board.reshape(9)
        self.boardHash = str(flat)
        return self.boardHash

    def check_winner(self):
        """Report the game outcome and set ``isEnd`` when the game is over.

        Returns 1 if player 1 owns a full line, -1 if player 2 does, 0 when
        the board is full without a winner, and ``None`` while moves remain.
        """
        row_totals = [sum(self.board[r, :]) for r in range(3)]
        col_totals = [sum(self.board[:, c]) for c in range(3)]
        main_diag = [sum(self.board[k, k] for k in range(3))]
        anti_diag = [sum(self.board[k, 2 - k] for k in range(3))]

        # Rows first, then columns, then the two diagonals; within each
        # group player 1 (sum 3) is looked for before player 2 (sum -3).
        for totals in (row_totals, col_totals, main_diag, anti_diag):
            for mark in (1, -1):
                if 3 * mark in totals:
                    self.isEnd = True
                    return mark

        if not self.available_positions():
            self.isEnd = True
            return 0  # board is full and nobody completed a line

        return None  # still playable

    def reward(self, result):
        """Pass the end-of-game reward to both players (p1 first, then p2)."""
        if result == 1:
            p1_gain, p2_gain = 1, 0
        elif result == -1:
            p1_gain, p2_gain = 0, 1
        else:
            # Draw: rewarded less than a win because ties are not the goal.
            p1_gain, p2_gain = 0.1, 0.5
        self.p1.give_rew(p1_gain)
        self.p2.give_rew(p2_gain)

    def reset(self):
        """Clear the board and restore the initial turn and flags."""
        self.isEnd = False
        self.current_player = 1
        self.boardHash = None
        self.board = np.zeros((3, 3))

    def show_board(self):
        """Print the board; player 1 is drawn as 'x', player 2 as 'o'."""
        rule = '-------------'
        glyphs = {1: 'x', -1: 'o', 0: ' '}
        for row in range(3):
            print(rule)
            cells = ''.join(glyphs[self.board[row, col]] + ' | ' for col in range(3))
            print('| ' + cells)
        print(rule)

    def _take_turn(self, player, record):
        """Let ``player`` move once and return ``check_winner()`` afterwards.

        With ``record`` set, the resulting board hash is also appended to the
        player's visited states through ``add_state``.
        """
        free_cells = self.available_positions()
        move = player.choose_action(free_cells, self.board, self.current_player)
        self.make_move(move)
        if record:
            player.add_state(self.get_hash())
        return self.check_winner()

    def train(self, rounds=10000):
        """Play ``rounds`` games between p1 and p2, rewarding them after each.

        Once a game ends, both players receive their reward, their per-game
        state lists are cleared and the board is reset for the next game.
        """
        for _ in tqdm(range(rounds)):
            while not self.isEnd:
                outcome = None
                for player in (self.p1, self.p2):
                    outcome = self._take_turn(player, record=True)
                    if outcome is not None:
                        break
                if outcome is None:
                    continue  # both players moved, game still open

                self.reward(outcome)
                self.p1.reset()
                self.p2.reset()
                self.reset()
                break

    def test(self):
        """Play one game without learning and return its result.

        Returns 1, -1 or 0 as ``check_winner`` does. The board is left in its
        final state, so the caller has to ``reset()`` before the next game.
        """
        while not self.isEnd:
            for player in (self.p1, self.p2):
                outcome = self._take_turn(player, record=False)
                if outcome is not None:
                    return outcome

## Reinforcement Learning Player
---

This class represents a player that uses Reinforcement Learning to make decisions in Tic Tac Toe. More precisely, our player uses [Temporal difference (TD) learning](https://it.wikipedia.org/wiki/Temporal_difference_learning). TD learning is an unsupervised technique that is very commonly used in reinforcement learning to predict the total reward expected in the future. Essentially, TD learning is a combination of Monte Carlo ideas and dynamic programming (DP) ideas. Like Monte Carlo methods, TD methods can learn directly from raw experience without a model of the environment’s dynamics. Like DP, TD methods perform state value function updates based on current estimates.

#### Parameters description
- ``states_value``: Dictionary that has as key the states that a player has seen during all the matches and as value the parameter that we want to train.
- ``exp_rate``: Probability of doing a random move instead of the action with max Q-value.
- ``decay_gamma``: The discount factor $\gamma$ of the Bellman equation; it scales the value of the next state when the state values are updated.
- ``states``: All board states (stored as hash strings) that a player has reached with its own moves during a single match. It is used at the end of each match to update the ``states_value``.

#### Methods description
- ``get_hash``: it returns the board state but in a string format.
- ``add_state``: It adds to the ``states`` array the state that a player has seen during a game.
- ``reset``: It resets the ``states`` array to be able to start a new game.
- ``choose_action``: It receives as input all the possible ``positions`` to play, the ``current_board`` that is the status of the board and the ``symbol`` that indicates the current player (1 for player1 and -1 for player2). This function decides the move of a player, which can be random or based on the values in the dictionary. For each possible move, it looks up in the dictionary the value associated with the state of the board after that move; the move with the maximum value is the one to execute. We use the following recursive formula (Bellman equation) to compute the state-value table: 
$$
V(S_t) \leftarrow V(S_t) + \alpha \left( \gamma \, V(S_{t+1}) - V(S_t) \right)
$$
The formula simply tells us that the updated value of state $S_t$ equals its current value plus a correction term: the difference between the value of the next state $S_{t+1}$, multiplied by the discount factor $\gamma$ of the Bellman equation, and the current value of $S_t$. This difference is multiplied by the learning rate $\alpha$. The logic is that we update the current value slowly based on our latest observation.
- ``give_rew``: This function is called at the end of each game. It updates the values of the ``states_value`` dictionary based on the states that the player has seen during the game and the reward that the game has provided.
- ``save_policy``: It saves the ``states_value`` dictionary that we have trained to a file.
- ``load_policy``: It loads the ``states_value`` dictionary from a file.

In [3]:
class RLPlayer:
    """Tic-tac-toe player that learns a table of board-state values.

    Parameters:
        name: identifier of the player; used to build the policy file name.
        lr: learning rate applied to every value update.
        decay_gamma: discount factor applied to the reward that is
            propagated backwards through the visited states.
        exp_rate: probability of playing a random move instead of the move
            with the highest known value.
    """

    def __init__(self, name, lr=0.2, decay_gamma=0.9, exp_rate = 0.2):
        self.name = name
        self.states = []  # board hashes visited during the current game
        self.lr = lr
        self.exp_rate = exp_rate
        self.decay_gamma = decay_gamma
        self.states_value = {}  # board hash -> estimated value

    def get_hash(self, board):
        """Return the flattened 3x3 ``board`` rendered as a string."""
        return str(board.reshape(3 * 3))

    def add_state(self, state):
        """Remember ``state`` as visited in the current game."""
        self.states.append(state)

    def choose_action(self, positions, current_board, symbol):
        """Pick one of ``positions`` for the player marked ``symbol``.

        With probability ``exp_rate`` a random position is returned.
        Otherwise every candidate move is applied to a copy of
        ``current_board`` and the move whose resulting state has the highest
        stored value wins; unknown states count as 0 and ties go to the
        candidate that appears last in ``positions``.

        Raises:
            ValueError: if ``positions`` is empty.
        """
        if len(positions) == 0:
            raise ValueError("choose_action needs at least one available position")

        if np.random.uniform(0, 1) <= self.exp_rate:
            # Exploration: any free position is equally likely.
            return positions[np.random.choice(len(positions))]

        # Exploitation: evaluate the board that each move would produce.
        best_value = float("-inf")
        action = None
        for p in positions:
            next_board = current_board.copy()  # never touch the real board
            next_board[p] = symbol
            value = self.states_value.get(self.get_hash(next_board), 0)
            if value >= best_value:  # ">=" keeps the last of equal candidates
                best_value = value
                action = p
        return action

    def reset(self):
        """Forget the states of the finished game."""
        self.states = []

    def give_rew(self, reward):
        """Propagate the final ``reward`` backwards through the visited states.

        Walking from the last state to the first, each value is updated as
        V(s) <- V(s) + lr * (decay_gamma * target - V(s)), where ``target``
        starts as ``reward`` and then becomes the freshly updated value of
        the state processed just before. Unseen states start at 0.
        """
        for st in reversed(self.states):
            current = self.states_value.get(st, 0)
            current += self.lr * (self.decay_gamma * reward - current)
            self.states_value[st] = current
            reward = current

    def save_policy(self):
        """Pickle ``states_value`` to the file ``policy_<name>``."""
        # The context manager closes the file even if pickling fails.
        with open('policy_' + str(self.name), 'wb') as fw:
            pickle.dump(self.states_value, fw)

    def load_policy(self, file):
        """Replace ``states_value`` with the table pickled in ``file``.

        NOTE: unpickling can execute arbitrary code, so only load policy
        files that come from a trusted source.
        """
        with open(file, 'rb') as fr:
            self.states_value = pickle.load(fr)

## Random Player

In [11]:
class RandomPlayer:
    """Opponent that plays a uniformly random free position and never learns.

    It exposes the same methods as the learning player so it can be plugged
    into the same game loop; the learning hooks do nothing.
    """

    def __init__(self, name):
        # Keep the caller's name instead of discarding it.
        self.name = name

    def choose_action(self, positions, board, current_player):
        """Return one element of ``positions`` chosen uniformly at random.

        ``board`` and ``current_player`` are accepted for interface
        compatibility and are not used. An empty ``positions`` makes
        ``np.random.randint`` raise ``ValueError``.
        """
        # randint's upper bound is exclusive, so passing len(positions)
        # covers every index, including the last one and the one-element case.
        x = np.random.randint(len(positions))
        return positions[x]

    def add_state(self, state):
        """Ignore the visited state; a random player keeps no history."""
        pass

    def give_rew(self, reward):
        """Ignore the reward; a random player does not learn."""
        pass

    def reset(self):
        """Nothing to reset between games."""
        pass

## Hyperparameters
---
- ``epochs``: training epochs
- ``alpha``: learning rate
- ``epsilon``: probability of doing a random move instead of the action with max value
- ``discount_factor``: the discount rate of the Bellman equation
- ``num_games``: number of games for testing

In [1]:
# Settings shared by the training and evaluation cells.
epochs = 50_000         # number of training games
alpha = 0.2             # learning rate
epsilon = 0.2           # probability of a random (exploratory) move
discount_factor = 0.9   # discount rate of the Bellman equation
num_games = 1_000       # number of games played when testing

## Let's do some computation

In [5]:
# Two learners with identical hyperparameters play against each other;
# they differ only in name.
p1, p2 = (
    RLPlayer(tag, lr=alpha, decay_gamma=discount_factor, exp_rate=epsilon)
    for tag in ("p1_RL", "p2_RL")
)
st = TicTacToe(p1, p2)

print("training...")
st.train(rounds=epochs)

## Test Reinforcement Learning

In [27]:
# Evaluate the trained p1 against a random opponent over num_games games.
p2 = RandomPlayer("Random")
st = TicTacToe(p1, p2)
win_comp = 0
num_draws = 0

for _ in range(num_games):
    win = st.test()
    if win == 1:
        win_comp += 1
    elif win == 0:
        num_draws += 1
    st.reset()  # test() leaves the finished board in place

# Every statistic is derived from num_games, the number of games actually
# played, rather than from a hard-coded 1000 or the training epoch count.
num_losses = num_games - win_comp - num_draws
print(f"Over {num_games} matches: {win_comp} wins, {num_losses} losses, {num_draws} draws")
print(f"Wins + Draws percentage: {(win_comp + num_draws) / num_games * 100}")

Over 1000 matches: 910 wins, 51 losses, 39 draws
Wins + Draws percentage: 94.89999999999999
