In [1]:
import numpy as np

class QLearningAgent:
    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.q_table = {}
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

    def get_q_value(self, state, action):
        return self.q_table.get(state, {}).get(action, 0)

    def choose_action(self, state, possible_actions):
        if np.random.uniform(0, 1) < self.epsilon:
            # Exploration: Choose a random action from possible actions
            return possible_actions[np.random.randint(len(possible_actions))]
        else:
            # Exploitation: Choose the action with the highest Q-value
            q_values = {action: self.get_q_value(state, action) for action in possible_actions}
            max_q = max(q_values.values())
            actions_with_max_q = [action for action, q in q_values.items() if q == max_q]
            # Randomly choose among the actions with the highest Q-value
            return actions_with_max_q[np.random.randint(len(actions_with_max_q))]

    def update_q_table(self, state, action, reward, next_state):
        max_q_next = max(self.q_table.get(next_state, {}).values(), default=0)
        self.q_table.setdefault(state, {})[action] = self.get_q_value(state, action) + \
                                                      self.alpha * (reward + self.gamma * max_q_next - self.get_q_value(state, action))


In [4]:
from tictactoe_class import TicTacToe
from td_utils import state_to_string

def train_agent(episodes=1000, agent=QLearningAgent(), show_progress=False):

    for episode in range(episodes):
        env = TicTacToe()
        game_over = False
        current_state = state_to_string(env.board)
        possible_actions = [(i, j) for i in range(3) for j in range(3) if env.board[i][j] == 0]
        action = agent.choose_action(current_state, possible_actions)

        while not game_over:
            # Take action and observe the outcome.
            env.make_move(*action)
            next_state = state_to_string(env.board)
            game_over, reward = env.is_game_over()

            if not game_over:
                # Choose next action based on policy (ε-greedy).
                next_possible_actions = [(i, j) for i in range(3) for j in range(3) if env.board[i][j] == 0]
                next_action = agent.choose_action(next_state, next_possible_actions)
            else:
                next_action = None  # No next action if game is over

            agent.update_q_table(current_state, action, reward, next_state)

            current_state, action = next_state, next_action

        if (episode + 1) % 100 == 0 and show_progress:
            print(f"Episode {episode + 1}: Training in progress...")

    print("Training completed.")
    return agent

In [5]:
agent = QLearningAgent(alpha=0.1, gamma=0.9, epsilon=0.2)
trained_agent = train_agent(episodes=500_000, agent=agent)

Training completed.


In [6]:
from td_utils import simulate_game

simulate_game(trained_agent)

Agent's move (Player 1):
╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ - │ X │ - │
├───┼───┼───┤
│ - │ - │ - │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ O │ X │ - │
├───┼───┼───┤
│ - │ - │ - │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ X │ - │ - │
├───┼───┼───┤
│ O │ X │ - │
├───┼───┼───┤
│ - │ - │ - │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ X │ O │ - │
├───┼───┼───┤
│ O │ X │ - │
├───┼───┼───┤
│ - │ - │ - │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ X │ O │ - │
├───┼───┼───┤
│ O │ X │ - │
├───┼───┼───┤
│ X │ - │ - │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ X │ O │ - │
├───┼───┼───┤
│ O │ X │ O │
├───┼───┼───┤
│ X │ - │ - │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ X │ O │ X │
├───┼───┼───┤
│ O │ X │ O │
├───┼───┼───┤
│ X │ - │ - │
╘═══╧═══╧═══╛
Trained agent wins!


In [9]:

def train_agent_self_play(episodes=1000):
    agent = QLearningAgent(alpha=0.1, gamma=0.9, epsilon=0.2)
    for episode in range(episodes):
        env = TicTacToe()
        game_over = False
        while not game_over:
            # Agent plays for both player 1 and player 2 in alternate turns
            for player in [1, 2]:
                env.current_player = player
                current_state = state_to_string(env.board, player)
                possible_actions = [(i, j) for i in range(3) for j in range(3) if env.board[i][j] == 0]

                action = agent.choose_action(current_state, possible_actions)
                env.make_move(*action)

                next_state = state_to_string(env.board, player)
                game_over, reward = env.is_game_over()

                # Adjust reward based on player perspective
                if player == 2:
                    reward = -reward

                agent.update_q_table(current_state, action, reward, next_state)

                if game_over:
                    break

        if (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1}: Training in progress...")

    print("Training completed.")
    return agent

def state_to_string(board, player):
    """Converts the board state to a string, adjusting the perspective based on the player."""
    # Adjust the board representation based on the player's perspective
    state = ''.join(str((cell if cell != player else 1) if cell != 0 else cell) for row in board for cell in row)
    return state.replace(str(3-player), '2')  # Convert the opponent's marks to '2'


In [15]:
train_agent1 = train_agent_self_play(episodes=50_000)
train_agent2 = train_agent_self_play(episodes=100_000)

Episode 100: Training in progress...
Episode 200: Training in progress...
Episode 300: Training in progress...
Episode 400: Training in progress...
Episode 500: Training in progress...
Episode 600: Training in progress...
Episode 700: Training in progress...
Episode 800: Training in progress...
Episode 900: Training in progress...
Episode 1000: Training in progress...
Episode 1100: Training in progress...
Episode 1200: Training in progress...
Episode 1300: Training in progress...
Episode 1400: Training in progress...
Episode 1500: Training in progress...
Episode 1600: Training in progress...
Episode 1700: Training in progress...
Episode 1800: Training in progress...
Episode 1900: Training in progress...
Episode 2000: Training in progress...
Episode 2100: Training in progress...
Episode 2200: Training in progress...
Episode 2300: Training in progress...
Episode 2400: Training in progress...
Episode 2500: Training in progress...
Episode 2600: Training in progress...
Episode 2700: Trainin

In [13]:
def simulate_game(agent1, agent2, verbose=True):
    env = TicTacToe()
    game_over = False
    env.current_player = 1  # Player 1 starts

    while not game_over:
        current_state = state_to_string(env.board, env.current_player)
        possible_actions = [(i, j) for i in range(3) for j in range(3) if env.board[i][j] == 0]

        if env.current_player == 1:
            action = agent1.choose_action(current_state, possible_actions)
        else:
            action = agent2.choose_action(current_state, possible_actions)

        env.make_move(*action)

        if verbose:
            print(f"Player {env.current_player}'s move: {action}")
            env.render()

        game_over, reward = env.is_game_over()

        # Switch to the other player
        env.current_player = 1 if env.current_player == 2 else 2

    if verbose:
        if reward == 1:
            print("Player 1 wins!")
        elif reward == -1:
            print("Player 2 wins!")
        else:
            print("It's a draw!")

    return reward


In [16]:
simulate_game(train_agent1, train_agent2)

Player 2's move: (1, 2)
╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ - │ - │ X │
├───┼───┼───┤
│ - │ - │ - │
╘═══╧═══╧═══╛
Player 2's move: (2, 1)
╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ - │ - │ X │
├───┼───┼───┤
│ - │ X │ - │
╘═══╧═══╧═══╛
Player 2's move: (0, 2)
╒═══╤═══╤═══╕
│ - │ - │ X │
├───┼───┼───┤
│ - │ - │ X │
├───┼───┼───┤
│ - │ X │ - │
╘═══╧═══╧═══╛
Player 2's move: (2, 0)
╒═══╤═══╤═══╕
│ - │ - │ X │
├───┼───┼───┤
│ - │ - │ X │
├───┼───┼───┤
│ X │ X │ - │
╘═══╧═══╧═══╛
Player 2's move: (1, 1)
╒═══╤═══╤═══╕
│ - │ - │ X │
├───┼───┼───┤
│ - │ X │ X │
├───┼───┼───┤
│ X │ X │ - │
╘═══╧═══╧═══╛
Player 1 wins!


1