In [1]:
from itertools import combinations
from collections import namedtuple, defaultdict
from random import choice
from copy import deepcopy
from abc import ABC, abstractmethod
import os
import pickle

from tqdm.auto import tqdm
import numpy as np

## Problem: Reinforcement Learning for the Tic-Tac-Toe Game

In general, to develop a reinforcement learning algorithm, we need to define the following components:
* **Environment** 
  * Possible states in the game environment
  * Possible actions in each state
  * Rewards for each action in each state

* **Agent**
  * Policy: the strategy to choose an action given a state
  * Value function: the expected return of each state under a given policy
  * Model: the agent's representation of the environment

* **Learning Algorithm**
  * How the agent updates its policy and value function based on the experience

In this problem, we will implement a reinforcement learning algorithm for the Tic-Tac-Toe game.

`State` is a namedtuple with two fields, `x` and `o`: the sets of squares occupied by X and by O on the board. Each square is identified by its number in the magic square shown below.

MAGIC is a list of values that can be used to check whether a player has won the game. They are based on the magic square of order 3.

<table>
  <tr>
    <td>2</td>
    <td>7</td>
    <td>6</td>
  </tr>
  <tr>
    <td>9</td>
    <td>5</td>
    <td>1</td>
  </tr>
  <tr>
    <td>4</td>
    <td>3</td>
    <td>8</td>
  </tr>
</table>

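In this way, the sum of the three numbers in any row, column, or diagonal is always 15. Conversely, the only triples of distinct numbers from 1 to 9 that sum to 15 are exactly these eight lines, so a player has won as soon as any three of their squares sum to 15.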

In [2]:
# A board position: `x` and `o` hold the squares taken by X and by O.
State = namedtuple("State", "x o")

# Magic square of order 3, listed row by row (board index -> square number).
MAGIC = [
    2, 7, 6,
    9, 5, 1,
    4, 3, 8,
]

## General Player

In [3]:
# skeleton borrowed from the quixo repo
class Player(ABC):
    """Common interface shared by every Tic-Tac-Toe player in this notebook."""

    def __init__(self) -> None:
        '''You can change this for your player if you need to handle state/have memory'''
        pass

    @abstractmethod
    def make_move(self, state, available_moves):
        '''
        Choose the next move.

        state: the current State of the game (squares taken by X and by O)
        available_moves: the collection of squares that are still free
        return value: the chosen square, which must be one of available_moves
        '''
        pass

## Game

In [4]:
class Game:
    """One Tic-Tac-Toe match on the magic-square encoding of the board.

    Squares are the numbers 1..9; a side wins when three of its squares sum to 15.
    """

    def __init__(self):
        self.state = State(set(), set()) # actual state of the game
        self.trajectory = list() # snapshot of the state after every move
        self.available_moves = set(range(1, 10)) # squares that are still free
        self.winner = None # set by play(): 1, 2 or -1

    def play(self, player1, player2):
        """Play a game between two players.

        player1 places X and moves first, player2 places O. When the game ends
        self.winner is 1 (player1 won), 2 (player2 won) or -1 (draw).
        Raises ValueError if a player returns a move that is not available.
        """
        # the board is only shown when a human takes part in the game
        show_board = isinstance(player1, Human_Player) or isinstance(player2, Human_Player)
        local_winner = -1
        while local_winner == -1 and len(self.available_moves) > 0:
            for player, marks in ((player1, self.state.x), (player2, self.state.o)):
                if show_board:
                    self.print_board()
                local_winner = self._play_turn(player, marks)
                # stop right after the move that ends the game
                if local_winner != -1 or len(self.available_moves) == 0:
                    break

        if show_board:
            # show the final position, otherwise the last move is never displayed
            self.print_board()
        self.winner = local_winner

    def _play_turn(self, player, marks):
        """Ask player for a move, apply it to marks and return check_winner()."""
        move = player.make_move(self.state, self.available_moves)
        # reject illegal moves before touching the board, so the state stays consistent
        if move not in self.available_moves:
            raise ValueError(
                f"Illegal move {move!r}; available moves: {sorted(self.available_moves)}"
            )
        marks.add(move)
        # deepcopy: the trajectory must keep a snapshot, not a reference to the live sets
        self.trajectory.append(deepcopy(self.state))
        self.available_moves.remove(move)
        return self.check_winner()

    def check_winner(self):
        """Return 1 if player1 (X) has won, 2 if player2 (O) has won, -1 otherwise.

        -1 therefore means either a draw or a game that is still running.
        """
        if self.win(self.state.x):
            return 1
        elif self.win(self.state.o):
            return 2
        else:
            return -1

    # win() function checks if any of the combinations of 3 elements in the set sums to 15 (winning condition)
    def win(self, elements):
        """Checks if elements is winning"""
        return any(sum(c) == 15 for c in combinations(elements, 3))

    def print_board(self):
        """Nicely prints the board"""
        for r in range(3):
            for c in range(3):
                # MAGIC maps the row-major board index to the square number
                i = r * 3 + c
                if MAGIC[i] in self.state.x:
                    print('❌', end='')
                elif MAGIC[i] in self.state.o:
                    print('⭕️', end='')
                else:
                    print('⬜️', end='')
            print()
        print()

`defaultdict` is a subclass of `dict` that returns a default value when a key is not found, so there is no need to check whether a key is already in the dictionary before using it.

frozenset is an immutable version of set, which can be used as a key in a dictionary.

## Random Player

In [5]:
class RandomPlayer(Player):
    """Baseline opponent that ignores the board and plays uniformly at random."""

    def make_move(self, state, available_moves):
        """Return one of the available moves, picked at random."""
        candidates = tuple(available_moves)
        return choice(candidates)

## Reinforcement Learning Player

In [6]:
class reinforcement_player(Player):
    """Player that learns a value for every board position it has seen.

    Values are stored in a table keyed by (frozenset of X squares, frozenset of
    O squares). After each game give_reward() propagates the final reward
    backwards through the trajectory of the game.
    """

    def __init__(self, player_index, random_move = 0.0):
        super().__init__()
        self.value_dictionary = defaultdict(float) # state of the game and its value
        self.hit_state = defaultdict(int) # state of the game and how many times give_reward() updated it
        self.epsilon = 0.1 # learning rate
        self.player_index = player_index # index of the player (1 or 2)
        self.random_move = random_move # a value between 0 and 1, used to choose a random move when training

    # in make_moves we have to sometimes choose a random move when training
    def make_move(self, state, available_moves):
        """Returns best move for the actual state.

        With probability random_move a random available move is returned instead.
        Otherwise every available move is scored by the stored value of the
        position it leads to, and the first move with the highest value wins.
        Raises ValueError if player_index is neither 1 nor 2.
        """
        if np.random.rand() < self.random_move:
            return choice(list(available_moves))

        if self.player_index not in (1, 2):
            raise ValueError("player_index must be 1 or 2")

        # freeze the current position once instead of deep-copying it for every candidate
        x_marks = frozenset(state.x)
        o_marks = frozenset(state.o)

        best_move = None
        best_move_score = float('-inf')
        for move in available_moves:
            if self.player_index == 1:
                hashable_state = (x_marks | {move}, o_marks)
            else:
                hashable_state = (x_marks, o_marks | {move})
            actual_move_score = self.value_dictionary[hashable_state]
            # strict comparison: on ties the first move encountered is kept
            if actual_move_score > best_move_score:
                best_move_score = actual_move_score
                best_move = move

        return best_move

    def give_reward(self, reward, trajectory):
        """Updates the value of the states visited during the game.

        The trajectory is walked from the last state to the first. Each value is
        moved a fraction epsilon towards the current target; the updated value
        then becomes the target for the state that preceded it.
        """
        for state in reversed(trajectory):
            hashable_state = (frozenset(state.x), frozenset(state.o))
            self.hit_state[hashable_state] += 1
            self.value_dictionary[hashable_state] += self.epsilon * (reward - self.value_dictionary[hashable_state])
            reward = self.value_dictionary[hashable_state]

    def print_value_dictionary(self):
        """Returns the (state, value) pairs sorted from the highest to the lowest value"""
        return sorted(self.value_dictionary.items(), key=lambda e: e[1], reverse=True)

    # used to switch between train and test phases
    def set_random_move(self, random_move):
        """Sets the value of random_move"""
        self.random_move = random_move

    # used to switch between player 1 and player 2
    def set_player_index(self, player_index):
        """Sets the value of player_index"""
        self.player_index = player_index

    # creates the policy file where it is stored the value of each state
    def create_policy(self):
        """Creates the policy file 'policy_<player_index>' in the working directory"""
        # the with-block closes the file even if pickling fails
        with open('policy_' + str(self.player_index), 'wb') as fw:
            pickle.dump(self.value_dictionary, fw)

    # loads the policy file
    def load_policy(self):
        """Loads the policy file 'policy_<player_index>' from the working directory"""
        # NOTE: pickle can execute arbitrary code; only load policy files you created yourself
        with open('policy_' + str(self.player_index), 'rb') as fr:
            self.value_dictionary = pickle.load(fr)



## Human Player

In [7]:
class Human_Player(Player):
    """Player whose moves are typed in by the user."""

    def __init__(self):
        super().__init__()

    # just ask the user to choose a move between the available ones
    def make_move(self, state, available_moves):
        """Prompt until the user enters one of the available moves and return it."""
        os.system('cls' if os.name == 'nt' else 'clear')  # Clear the terminal
        while True:
            print("Available moves: ", available_moves)
            answer = input("Your move: ")
            try:
                move = int(answer)
            except ValueError:
                # not a number: ask again instead of crashing the game
                print(f"'{answer}' is not a number, please try again.")
                continue
            if move in available_moves:
                return move
            # a taken or out-of-range square would corrupt the game state
            print(f"{move} is not an available move, please try again.")

## Train Player 1 vs Random Player

In [8]:
# player 1 picks a random move 30% of the time while it is learning
player1 = reinforcement_player(1, random_move=0.3)
random_player = RandomPlayer()

# training phase
for _ in tqdm(range(100_000)):
    game = Game()
    game.play(player1, random_player)
    # reward seen from player 1: +1 for a win, -1 for a loss, 0 for a draw
    player1.give_reward({1: 1, 2: -1}.get(game.winner, 0), game.trajectory)

# save the policy
player1.create_policy()

  0%|          | 0/100000 [00:00<?, ?it/s]

## Test Player 1 vs Random Player

In [9]:
# greedy player (no random moves) using the policy saved on disk
player1 = reinforcement_player(1, 0.0)
player1.load_policy()
n_test_games = 10_000
wins_player1 = 0
loss_player1 = 0
draw_player1 = 0

# testing phase
for _ in tqdm(range(n_test_games)):
    game = Game()
    game.play(player1, random_player)
    if game.winner == 1:
        wins_player1 += 1
    elif game.winner == 2:
        loss_player1 += 1
    else:
        draw_player1 += 1

# percentages are derived from n_test_games so they stay correct if it changes
print(f"Player 1 wins: {100 * wins_player1 / n_test_games} %")
print(f"Player 1 losses: {100 * loss_player1 / n_test_games} %")
print(f"Player 1 draws: {100 * draw_player1 / n_test_games} %")


  0%|          | 0/10000 [00:00<?, ?it/s]

Player 1 wins: 98.9 %
Player 1 losses: 0.0 %
Player 1 draws: 1.1 %


## Train Random Player vs Player 2

In [10]:
# player 2 picks a random move 30% of the time while it is learning
player2 = reinforcement_player(2, random_move=0.3)
random_player = RandomPlayer()

# training phase
for _ in tqdm(range(100_000)):
    game = Game()
    game.play(random_player, player2)
    # reward seen from player 2: +1 for a win, -1 for a loss, 0 for a draw
    player2.give_reward({1: -1, 2: 1}.get(game.winner, 0), game.trajectory)

# save the policy
player2.create_policy()

  0%|          | 0/100000 [00:00<?, ?it/s]

## Test Random Player vs Player 2

In [11]:
# greedy player (no random moves) using the policy saved on disk
player2 = reinforcement_player(2, 0.0)
player2.load_policy()
n_test_games = 10_000
wins_player2 = 0
loss_player2 = 0
draw_player2 = 0

# testing phase
for _ in tqdm(range(n_test_games)):
    game = Game()
    game.play(random_player, player2)
    if game.winner == 1:
        loss_player2 += 1
    elif game.winner == 2:
        wins_player2 += 1
    else:
        draw_player2 += 1

# percentages are derived from n_test_games so they stay correct if it changes
print(f"Player 2 wins: {100 * wins_player2 / n_test_games} %")
print(f"Player 2 losses: {100 * loss_player2 / n_test_games} %")
print(f"Player 2 draws: {100 * draw_player2 / n_test_games} %")

  0%|          | 0/10000 [00:00<?, ?it/s]

Player 2 wins: 91.16 %
Player 2 losses: 1.06 %
Player 2 draws: 7.78 %


## Train Player 1 vs Player 2

In [12]:
# both learners pick a random move 30% of the time during self-play
player1 = reinforcement_player(1, random_move=0.3)
player2 = reinforcement_player(2, random_move=0.3)

# training phase
for _ in tqdm(range(100_000)):
    game = Game()
    game.play(player1, player2)
    # each player is rewarded from its own point of view: +1 win, -1 loss, 0 draw
    player1.give_reward({1: 1, 2: -1}.get(game.winner, 0), game.trajectory)
    player2.give_reward({1: -1, 2: 1}.get(game.winner, 0), game.trajectory)

# save the policies
player1.create_policy()
player2.create_policy()

  0%|          | 0/100000 [00:00<?, ?it/s]

## Test Player 1 vs Player 2

In [13]:
# greedy players (no random moves) using the policies saved on disk
player1 = reinforcement_player(1, 0.0)
player1.load_policy()
player2 = reinforcement_player(2, 0.0)
player2.load_policy()

n_test_games = 10_000

wins_player1 = 0
loss_player1 = 0
draw_player1 = 0

wins_player2 = 0
loss_player2 = 0
draw_player2 = 0

# testing phase
for _ in tqdm(range(n_test_games)):
    game = Game()
    game.play(player1, player2)
    if game.winner == 1:
        wins_player1 += 1
        loss_player2 += 1
    elif game.winner == 2:
        loss_player1 += 1
        wins_player2 += 1
    else:
        draw_player1 += 1
        draw_player2 += 1

# percentages are derived from n_test_games so they stay correct if it changes
print(f"Player 1 wins: {100 * wins_player1 / n_test_games} %")
print(f"Player 1 losses: {100 * loss_player1 / n_test_games} %")
print(f"Player 1 draws: {100 * draw_player1 / n_test_games} %")
print("--------------------------------------")
print(f"Player 2 wins: {100 * wins_player2 / n_test_games} %")
print(f"Player 2 losses: {100 * loss_player2 / n_test_games} %")
print(f"Player 2 draws: {100 * draw_player2 / n_test_games} %")

  0%|          | 0/10000 [00:00<?, ?it/s]

Player 1 wins: 0.0 %
Player 1 losses: 0.0 %
Player 1 draws: 100.0 %
--------------------------------------
Player 2 wins: 0.0 %
Player 2 losses: 0.0 %
Player 2 draws: 100.0 %


In [14]:
# Reminder of the square numbers in board order (MAGIC, row by row), handy when
# typing moves in the human game. The string is not assigned to anything; the
# cell only displays it.
"""
MAGIC = [2, 7, 6, 
         9, 5, 1, 
         4, 3, 8]
"""

'\nMAGIC = [2, 7, 6, \n         9, 5, 1, \n         4, 3, 8]\n'

## Test Human vs Player 2

In [15]:
# the human places X and moves first; player2 is the trained agent loaded above
game = Game()
human_player = Human_Player()
game.play(human_player, player2)

# 1: the human won, 2: the agent won, -1: draw
print(game.winner)

⬜️⬜️⬜️
⬜️⬜️⬜️
⬜️⬜️⬜️

[H[2JAvailable moves:  {1, 2, 3, 4, 5, 6, 7, 8, 9}


⬜️⬜️⬜️
⬜️❌⬜️
⬜️⬜️⬜️

⬜️⬜️⬜️
⬜️❌⬜️
⭕️⬜️⬜️

[H[2JAvailable moves:  {1, 2, 3, 6, 7, 8, 9}
❌⬜️⬜️
⬜️❌⬜️
⭕️⬜️⬜️

❌⬜️⬜️
⬜️❌⬜️
⭕️⬜️⭕️

[H[2JAvailable moves:  {1, 3, 6, 7, 9}
❌⬜️⬜️
⬜️❌⬜️
⭕️❌⭕️

❌⭕️⬜️
⬜️❌⬜️
⭕️❌⭕️

[H[2JAvailable moves:  {1, 6, 9}
❌⭕️⬜️
❌❌⬜️
⭕️❌⭕️

❌⭕️⬜️
❌❌⭕️
⭕️❌⭕️

[H[2JAvailable moves:  {6}
-1
