In [1]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice
from copy import deepcopy
from abc import ABC, abstractmethod

from tqdm.auto import tqdm
import numpy as np

## Problem : Reinforcement Learning for Tic-Tac-Toe Game

In general, to develop a reinforcement learning algorithm, we need to define the following components:
* **Environment** 
  * Possible states in the game environment
  * Possible actions in each state
  * Rewards for each action in each state

* **Agent**
  * Policy: the strategy to choose an action given a state
  * Value function: the expected return of each state under a given policy
  * Model: the agent's representation of the environment

* **Learning Algorithm**
  * How the agent updates its policy and value function based on the experience

In this problem, we will implement a reinforcement learning algorithm for the Tic-Tac-Toe game.

State is a namedtuple with two fields, x and o, representing the positions of X and O in the board.

MAGIC is a list of values that can be used to check whether a player has won the game. They are based on the magic square of order 3.

<table>
  <tr>
    <td>2</td>
    <td>7</td>
    <td>6</td>
  </tr>
  <tr>
    <td>9</td>
    <td>5</td>
    <td>1</td>
  </tr>
  <tr>
    <td>4</td>
    <td>3</td>
    <td>8</td>
  </tr>
</table>

In this way, the sum of three numbers in any row, column, or diagonal is always 15.

In [2]:
State = namedtuple('State', ['x', 'o'])
MAGIC = [2, 7, 6, 9, 5, 1, 4, 3, 8]

## General Player

In [3]:
# stolen from quixo repo
class Player(ABC):
    def __init__(self) -> None:
        '''You can change this for your player if you need to handle state/have memory'''
        pass

    @abstractmethod
    def make_move(self):
        '''
        game: the Quixo game. You can use it to override the current game with yours, but everything is evaluated by the main game
        return values: this method shall return a tuple of X,Y positions and a move among TOP, BOTTOM, LEFT and RIGHT
        '''
        pass

## Game

In [4]:
class Game:
    def __init__(self):
        self.state = State(set(), set()) # actual state of the game
        self.trajectory = list() # list of states of the game
        self.available_moves = set(range(1, 10)) # available moves
        self.winner = None # winner of the game

    def play(self, player1, player2):
        """Play a game between two players"""
        # print("available moves: ", self.available_moves)
        local_winner = -1
        while local_winner == -1 and len(self.available_moves) > 0:
            # player1 makes a move
            move = player1.make_move(self.state, self.available_moves)
            # print("player1 move: ", move)

            # the move is added to the state
            self.state.x.add(move)

            # the trajectory is updated
            self.trajectory.append(deepcopy(self.state))

            # the move is removed from the available moves
            self.available_moves.remove(move)
            # print("available moves: ", self.available_moves)

            # check if the game is over
            local_winner = self.check_winner()
            if local_winner != -1 or len(self.available_moves) == 0:
                break

            # same for player2
            move = player2.make_move(self.state, self.available_moves)
            # print("player2 move: ", move)
            self.state.o.add(move)
            self.trajectory.append(deepcopy(self.state))
            self.available_moves.remove(move)
            # print("available moves: ", self.available_moves)
            local_winner = self.check_winner()
            if local_winner != -1 or len(self.available_moves) == 0:
                break

        self.winner = local_winner
        # print("winner: ", self.winner)

    def check_winner(self):
        """Set the winner: 1 for player1, 2 for player2, -1 for draw"""
        if self.win(self.state.x):
            return 1
        elif self.win(self.state.o):
            return 2
        else:
            return -1
        
    # win() function checks if any of the combinations of 3 elements in the set sums to 15 (winning condition)
    def win(self, elements):
        """Checks if elements is winning"""
        return any(sum(c) == 15 for c in combinations(elements, 3))

defaultdict is a subclass of dict that returns a default value when the key is not found, so that it is not needed to check whether a key is in the dictionary.

frozenset is an immutable version of set, which can be used as a key in a dictionary.

## Random Player

In [5]:
class RandomPlayer(Player):
    def __init__(self):
        super().__init__()

    def make_move(self, state, available_moves):
        return choice(list(available_moves))

## Reinforcement Learning Player

In [6]:
class reinforcement_player():
    def __init__(self, player_index, random_move = 0.0):
        self.value_dictionary = defaultdict(float) # state of the game and its value
        # self.trajectory = list() # list of states visited during the game, used to update the value_dictionary
        self.hit_state = defaultdict(int) # state of the game and how many times it was visited during the training phase
        self.epsilon = 0.2 # learning rate
        self.player_index = player_index # index of the player (1 or 2)
        self.random_move = random_move # a value between 0 and 1, used to choose a random move when training

    # in make_moves we have to sometimes choose a random move when training
    def make_move(self, state, available_moves):
        """Returns best move for the actual state"""
        # it checks the value of the new_state for each possible move and returns the move with the highest value
        best_move_score = -10_000
        best_move = None
        if np.random.rand() < self.random_move:
            return choice(list(available_moves))
        else:
            for move in available_moves:
                new_state = deepcopy(state)
                if self.player_index == 1:
                    new_state.x.add(move)
                else:
                    new_state.o.add(move)
                hashable_state = (frozenset(new_state.x), frozenset(new_state.o))
                actual_move_score = self.value_dictionary[hashable_state]
                if actual_move_score > best_move_score:
                    best_move_score = actual_move_score
                    best_move = move

        return best_move
    
    def give_reward(self, reward, trajectory):
        """Updates the value of the states visited during the game"""
        for state in reversed(trajectory):
            hashable_state = (frozenset(state.x), frozenset(state.o))
            self.hit_state[hashable_state] += 1
            self.value_dictionary[hashable_state] += self.epsilon * (0.9 * reward - self.value_dictionary[hashable_state])
            reward = self.value_dictionary[hashable_state]

    def print_value_dictionary(self):
        """Prints the value of each state"""
        return sorted(self.value_dictionary.items(), key=lambda e: e[1], reverse=True)
    
    def set_random_move(self, random_move):
        """Sets the value of random_move"""
        self.random_move = random_move

    def set_self_debug(self, debug_value):
        """Sets the value of debug"""
        self.debug = debug_value


In [8]:
player1 = reinforcement_player(1, 0.3)
player2 = reinforcement_player(2, 0.3)
random_player = RandomPlayer()

# training phase
for _ in tqdm(range(100_000)):
    game = Game()
    game.play(player1, random_player)
    if game.winner == 1:
        player1.give_reward(1, game.trajectory)
    elif game.winner == 2:
        player1.give_reward(-1, game.trajectory)
    else:
        player1.give_reward(0, game.trajectory)


for _ in tqdm(range(100_000)):
    game = Game()
    game.play(random_player, player2)
    if game.winner == 1:
        player2.give_reward(-0.8, game.trajectory)
    elif game.winner == 2:
        player2.give_reward(1, game.trajectory)
    else:
        player2.give_reward(0.5, game.trajectory)

  0%|          | 0/100000 [00:00<?, ?it/s]

  0%|          | 0/100000 [00:00<?, ?it/s]

In [10]:
# testing phase
player1.set_random_move(0.0)
player2.set_random_move(0.0)

winning_rate_1 = 0
loss_rate_1 = 0
draw_rate_1 = 0

for _ in tqdm(range(10_000)):
    game = Game()
    game.play(player1, random_player)
    if game.winner == 1:
        winning_rate_1 += 1
    elif game.winner == 2:
        loss_rate_1 += 1
    else:
        draw_rate_1 += 1


winning_rate_2 = 0
loss_rate_2 = 0
draw_rate_2 = 0

for _ in tqdm(range(10_000)):
    game = Game()
    game.play(random_player, player2)
    if game.winner == 1:
        loss_rate_2 += 1
    elif game.winner == 2:
        winning_rate_2 += 1
    else:
        draw_rate_2 += 1

print("player 1:")
print("winning rate: ", winning_rate_1/100)
print("loss rate: ", loss_rate_1/100)
print("draw rate: ", draw_rate_1/100)
print("--------------------------------------")
print("player 2:")
print("winning rate: ", loss_rate_2/100)
print("loss rate: ", winning_rate_2/100)
print("draw rate: ", draw_rate_2/100)

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

player 1:
winning rate:  98.74
loss rate:  0.0
draw rate:  1.26
--------------------------------------
player 2:
winning rate:  2.88
loss rate:  89.0
draw rate:  8.12


In [None]:
train_matches = [100_000, 250_000, 500_000]
test_matches = 10_000
test_game = Game()
# player1 = [reinforcement_player(1, 1), reinforcement_player(1, 0.5), reinforcement_player(1, 0.3)]
params = [1, 0.5, 0.3]
# player1 = [reinforcement_player(1, param) for param in params]
rewards_player1 = [[1, -1, 0.1], [1, 0, 0.1]]
# player2 = [reinforcement_player(2, 1), reinforcement_player(2, 0.5), reinforcement_player(2, 0.3)]
# player2 = [reinforcement_player(2, param) for param in params]
rewards_player2 = [[1, -1, 0.3], [1, 0, 0.3]]
player_random = RandomPlayer()

# Train Phase
for matches in train_matches:
    for rewards in rewards_player1:
        player1 = [reinforcement_player(1, 1), reinforcement_player(1, 0.5), reinforcement_player(1, 0.3)]
        for reinforcement_player1 in player1:
            for _ in range(matches):
                test_game = Game()
                test_game.play(reinforcement_player1, player_random)
                if (test_game.winner == 1):
                    reinforcement_player1.give_reward(rewards[0], test_game.trajectory)
                elif (test_game.winner == 2):
                    reinforcement_player1.give_reward(rewards[1], test_game.trajectory)
                else:
                    if(reinforcement_player1.player_index == 1):
                        reinforcement_player1.give_reward(rewards[2], test_game.trajectory)
                    else:
                        reinforcement_player1.give_reward(0.3, test_game.trajectory)

            print("Player 1")
            print("random_rate: ", reinforcement_player1.random_move)

            win_rate = 0
            draw_rate = 0
            loss_rate = 0
            reinforcement_player1.set_random_move(0)
            for _ in range(test_matches):
                test_game = Game()
                test_game.play(reinforcement_player1, player_random)
                if (test_game.winner == 1):
                    win_rate += 1
                elif (test_game.winner == -1):
                    draw_rate += 1
                else:
                    loss_rate += 1

            print("training matches: ", matches)
            print("rewards: ", rewards)
            print("wins: ", win_rate/100, "%")
            print("draws: ", draw_rate/100, "%")
            print("losses: ", loss_rate/100, "%")
            print("---------------------------------------------")
        
    for rewards in rewards_player2:
        player2 = [reinforcement_player(2, 1), reinforcement_player(2, 0.5), reinforcement_player(2, 0.3)]
        for reinforcement_player2 in player2:
            for _ in range(matches):
                test_game = Game()
                test_game.play(player_random, reinforcement_player2)
                if (test_game.winner == 2):
                    reinforcement_player2.give_reward(rewards[0], test_game.trajectory)
                elif (test_game.winner == 1):
                    reinforcement_player2.give_reward(rewards[1], test_game.trajectory)
                else:
                    if(reinforcement_player2.player_index == 1):
                        reinforcement_player2.give_reward(0.3, test_game.trajectory)
                    else:
                        reinforcement_player2.give_reward(rewards[2], test_game.trajectory)


            print("Player 2")
            print("random_rate: ", reinforcement_player2.random_move)

            win_rate = 0
            draw_rate = 0
            loss_rate = 0
            reinforcement_player2.set_random_move(0)
            for _ in range(test_matches):
                test_game = Game()
                test_game.play(player_random, reinforcement_player2)
                if (test_game.winner == 2):
                    win_rate += 1
                elif (test_game.winner == -1):
                    draw_rate += 1
                else:
                    loss_rate += 1

            print("training matches: ", matches)
            print("rewards: ", rewards)
            print("wins: ", win_rate/100, "%")
            print("draws: ", draw_rate/100, "%")
            print("losses: ", loss_rate/100, "%")
            print("---------------------------------------------")