In [10]:
import numpy as np

class SARSAAgent:
    """Tabular SARSA learner with an epsilon-greedy behaviour policy.

    Q-values are kept in a nested dict, ``q_table[state][action]``. Any
    (state, action) pair that has not been written yet is read as 0.
    """

    def __init__(self, alpha=0.1, gamma=0.9, epsilon=0.1):
        """Start with an empty Q-table and remember the hyperparameters.

        Args:
            alpha: Learning rate (step size of each Q-value update).
            gamma: Discount factor applied to the next pair's Q-value.
            epsilon: Probability of taking a uniformly random action.
        """
        self.q_table = {}
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

    def get_q_value(self, state, action):
        """Return the stored Q-value for ``(state, action)``; 0 if unseen."""
        action_values = self.q_table.get(state, {})
        return action_values.get(action, 0)

    def choose_action(self, state, possible_actions):
        """Pick an action from ``possible_actions`` epsilon-greedily.

        With probability ``epsilon`` any action may be returned; otherwise one
        of the actions sharing the highest Q-value is returned, with ties
        broken uniformly at random.
        """
        exploring = np.random.uniform(0, 1) < self.epsilon
        if exploring:
            # Exploration: every available action is a candidate.
            candidates = possible_actions
        else:
            # Exploitation: keep only the actions tied for the best Q-value.
            value_of = {move: self.get_q_value(state, move) for move in possible_actions}
            best_value = max(value_of.values())
            candidates = [move for move, value in value_of.items() if value == best_value]
        return candidates[np.random.randint(len(candidates))]

    def update_q_table(self, state, action, reward, next_state, next_action):
        """Apply one SARSA step to ``Q(state, action)``.

        Q(s, a) <- Q(s, a) + alpha * (reward + gamma * Q(s', a') - Q(s, a))

        A pair that was never stored (for example ``next_action=None``)
        contributes a Q-value of 0 through ``get_q_value``.
        """
        current_q = self.get_q_value(state, action)
        td_target = reward + self.gamma * self.get_q_value(next_state, next_action)
        self.q_table.setdefault(state, {})[action] = current_q + self.alpha * (td_target - current_q)


In [14]:
from tictactoe_class import TicTacToe
from td_utils import state_to_string

def _open_cells(board):
    """Return the (row, col) pairs of a 3x3 board whose value is 0 (empty)."""
    return [(row, col) for row in range(3) for col in range(3) if board[row][col] == 0]


def train_agent(episodes=1000, agent=None, show_progress=False):
    """Train a SARSA agent on TicTacToe for a number of episodes.

    Each episode creates a fresh ``TicTacToe`` environment and applies the
    on-policy SARSA update after every move until ``env.is_game_over()``
    reports that the game has ended.

    Args:
        episodes: Number of games to play.
        agent: Agent to train in place. When omitted, a new ``SARSAAgent()``
            is created for this call. The previous default,
            ``agent=SARSAAgent()``, was evaluated once at definition time, so
            every call without an explicit agent kept training the same
            shared instance.
        show_progress: If True, print a progress line every 100 episodes.

    Returns:
        The trained agent (the same object that was passed in, if any).
    """
    if agent is None:
        agent = SARSAAgent()

    for episode in range(episodes):
        env = TicTacToe()
        current_state = state_to_string(env.board)
        action = agent.choose_action(current_state, _open_cells(env.board))

        game_over = False
        while not game_over:
            # Take the action and observe the outcome.
            env.make_move(*action)
            next_state = state_to_string(env.board)
            game_over, reward = env.is_game_over()

            if game_over:
                # Terminal step: no follow-up action, so the agent reads
                # Q(next_state, None) as 0 in the update below.
                next_action = None
            else:
                # On-policy: the action used in the update is the one that
                # will actually be played next.
                next_action = agent.choose_action(next_state, _open_cells(env.board))

            # SARSA update
            agent.update_q_table(current_state, action, reward, next_state, next_action)

            current_state, action = next_state, next_action

        if show_progress and (episode + 1) % 100 == 0:
            print(f"Episode {episode + 1}: Training in progress...")

    print("Training completed.")
    return agent

In [13]:
# Seed NumPy's global RNG so the agent's epsilon-greedy draws repeat across runs.
# NOTE(review): TicTacToe / td_utils may draw from other random sources — confirm.
np.random.seed(42)

agent = SARSAAgent(alpha=0.1, gamma=0.9, epsilon=0.2)
# train_agent updates `agent` in place and returns it, so `trained_agent`
# refers to the same object.
trained_agent = train_agent(episodes=500_000, agent=agent)

Episode 100: Training in progress...
Episode 200: Training in progress...
Episode 300: Training in progress...
Episode 400: Training in progress...
Episode 500: Training in progress...
Episode 600: Training in progress...
Episode 700: Training in progress...
Episode 800: Training in progress...
Episode 900: Training in progress...
Episode 1000: Training in progress...
Episode 1100: Training in progress...
Episode 1200: Training in progress...
Episode 1300: Training in progress...
Episode 1400: Training in progress...
Episode 1500: Training in progress...
Episode 1600: Training in progress...
Episode 1700: Training in progress...
Episode 1800: Training in progress...
Episode 1900: Training in progress...
Episode 2000: Training in progress...
Episode 2100: Training in progress...
Episode 2200: Training in progress...
Episode 2300: Training in progress...
Episode 2400: Training in progress...
Episode 2500: Training in progress...
Episode 2600: Training in progress...
Episode 2700: Trainin

In [15]:
from td_utils import simulate_game

# Play one demo game with the trained agent. Per the recorded output, the
# agent moves first as X against a random opponent (O), and the board is
# printed after every move.
simulate_game(trained_agent)

Agent's move (Player 1):
╒═══╤═══╤═══╕
│ - │ - │ X │
├───┼───┼───┤
│ - │ - │ - │
├───┼───┼───┤
│ - │ - │ - │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ - │ - │ X │
├───┼───┼───┤
│ - │ - │ - │
├───┼───┼───┤
│ - │ - │ O │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ - │ X │ X │
├───┼───┼───┤
│ - │ - │ - │
├───┼───┼───┤
│ - │ - │ O │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ - │ X │ X │
├───┼───┼───┤
│ - │ - │ - │
├───┼───┼───┤
│ O │ - │ O │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ - │ X │ X │
├───┼───┼───┤
│ - │ - │ - │
├───┼───┼───┤
│ O │ X │ O │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ O │ X │ X │
├───┼───┼───┤
│ - │ - │ - │
├───┼───┼───┤
│ O │ X │ O │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ O │ X │ X │
├───┼───┼───┤
│ - │ X │ - │
├───┼───┼───┤
│ O │ X │ O │
╘═══╧═══╧═══╛
Trained agent wins!


In [16]:
# Second demo game; the recorded output labels the opponent as random, so
# each run can play out differently.
simulate_game(trained_agent)

Agent's move (Player 1):
╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ X │ - │ - │
├───┼───┼───┤
│ - │ - │ - │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ - │ - │ O │
├───┼───┼───┤
│ X │ - │ - │
├───┼───┼───┤
│ - │ - │ - │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ - │ - │ O │
├───┼───┼───┤
│ X │ X │ - │
├───┼───┼───┤
│ - │ - │ - │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ - │ - │ O │
├───┼───┼───┤
│ X │ X │ - │
├───┼───┼───┤
│ O │ - │ - │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ - │ - │ O │
├───┼───┼───┤
│ X │ X │ X │
├───┼───┼───┤
│ O │ - │ - │
╘═══╧═══╧═══╛
Trained agent wins!


In [6]:
# Uses `trained_agent`: no `sarsa_agent` is defined in the visible cells, so
# the old name would raise NameError after Restart Kernel -> Run All.
simulate_game(trained_agent)

Agent's move (Player 1):
╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ - │ - │ - │
├───┼───┼───┤
│ X │ - │ - │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ - │ - │ O │
├───┼───┼───┤
│ X │ - │ - │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ X │ - │ - │
├───┼───┼───┤
│ - │ - │ O │
├───┼───┼───┤
│ X │ - │ - │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ X │ O │ - │
├───┼───┼───┤
│ - │ - │ O │
├───┼───┼───┤
│ X │ - │ - │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ X │ O │ - │
├───┼───┼───┤
│ X │ - │ O │
├───┼───┼───┤
│ X │ - │ - │
╘═══╧═══╧═══╛
Trained agent wins!


In [17]:
# Further demo game with the same trained agent, for comparison with the
# earlier runs.
simulate_game(trained_agent)

Agent's move (Player 1):
╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ X │ - │ - │
├───┼───┼───┤
│ - │ - │ - │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ X │ - │ - │
├───┼───┼───┤
│ - │ O │ - │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ X │ - │ - │
├───┼───┼───┤
│ X │ - │ - │
├───┼───┼───┤
│ - │ O │ - │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ X │ - │ - │
├───┼───┼───┤
│ X │ - │ - │
├───┼───┼───┤
│ - │ O │ O │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ X │ - │ - │
├───┼───┼───┤
│ X │ - │ - │
├───┼───┼───┤
│ X │ O │ O │
╘═══╧═══╧═══╛
Trained agent wins!


In [8]:
# Uses `trained_agent`: `sarsa_agent` is not defined by any visible cell, so
# this call only worked with leftover kernel state.
simulate_game(trained_agent)

Agent's move (Player 1):
╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ - │ - │ - │
├───┼───┼───┤
│ X │ - │ - │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ - │ O │ - │
├───┼───┼───┤
│ - │ - │ - │
├───┼───┼───┤
│ X │ - │ - │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ X │ O │ - │
├───┼───┼───┤
│ - │ - │ - │
├───┼───┼───┤
│ X │ - │ - │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ X │ O │ - │
├───┼───┼───┤
│ O │ - │ - │
├───┼───┼───┤
│ X │ - │ - │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ X │ O │ X │
├───┼───┼───┤
│ O │ - │ - │
├───┼───┼───┤
│ X │ - │ - │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ X │ O │ X │
├───┼───┼───┤
│ O │ - │ - │
├───┼───┼───┤
│ X │ O │ - │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ X │ O │ X │
├───┼───┼───┤
│ O │ X │ - │
├───┼───┼───┤
│ X │ O │ - │
╘═══╧═══╧═══╛
Trained agent wins!


In [18]:
# One more demo game; in the recorded output this one runs longer than the
# earlier games before finishing.
simulate_game(trained_agent)

Agent's move (Player 1):
╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ - │ - │ - │
├───┼───┼───┤
│ - │ - │ X │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ O │ - │ - │
├───┼───┼───┤
│ - │ - │ X │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ O │ - │ - │
├───┼───┼───┤
│ X │ - │ X │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ - │ - │ - │
├───┼───┼───┤
│ O │ - │ - │
├───┼───┼───┤
│ X │ O │ X │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ - │ - │ X │
├───┼───┼───┤
│ O │ - │ - │
├───┼───┼───┤
│ X │ O │ X │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ O │ - │ X │
├───┼───┼───┤
│ O │ - │ - │
├───┼───┼───┤
│ X │ O │ X │
╘═══╧═══╧═══╛
Agent's move (Player 1):
╒═══╤═══╤═══╕
│ O │ X │ X │
├───┼───┼───┤
│ O │ - │ - │
├───┼───┼───┤
│ X │ O │ X │
╘═══╧═══╧═══╛
Random agent's move (Player 2):
╒═══╤═══╤═══╕
│ O │ X │ X │
├───┼───┼───┤
│ O │ O │ - │
├───┼───┼───┤
│ X │ O │ X │
╘═