In [27]:
import numpy as np
import random

In [28]:
# constants for the game
BOARD_SIZE = 3  # size of the ttt board
# for a board of size 3x3=9 each slot with 3 state we have 3^9 possible states 
EMPTY = 0       # represents an empty cell on the board
PLAYER_X = 1
PLAYER_O = -1 
REWARDS = {'win': 1.0, 'lose': -1.0, 'draw': 0.5, 'move': 0.0}  # rewards for different outcomes

In [29]:
def available_actions(state):
    """ Function to get the list of available cells of the current state
    
    Args:
        state (np.array): state of the board
        
    Returns: 
        list of tuples: (i, j) of available cells"""
    # remember the grid is board_size*board_size
    return [(i, j) for i in range(BOARD_SIZE) for j in range(BOARD_SIZE) if state[i, j] == EMPTY]

In [30]:
def board_to_key(state):
    """Function to convert the board state to a string
    Args:
        state (np.array): state of the board
    """
    return str(state.reshape(BOARD_SIZE * BOARD_SIZE))

In [31]:

def apply_action(state, action, player):
    """ Function to apply an action to the board
    
    Args:
        state (np.array): state of the board
        action (tuple (i,j)): possible action
        player (int): player playing, used to place their mark
        
    Returns: 
        new_state (np.array): state of the board
    """
    new_state = state.copy()
    new_state[action] = player  
    # action  is a tuple so you are accessing in 2 dimension    
    # and applying the number of the player associated with that turn to that 
    return new_state

In [32]:
def best_action(state, q_table):
    """ Function to choose the best action based on the Q-values
    
    Args: 
        state (np.array): state of the board
        q_table (dict): maps states to possible actions and their q-value
        
    Returns: 
        best action"""
    # picks all empty cells, which are where the possible actions can be placed (list of tuples (i,j))
    actions = available_actions(state)
    if not actions: # if no actions available
        return None

    # get q-values for each possible action
    max_q_values = []
    for action in actions:
        next_state = apply_action(state, action, PLAYER_X)  # apply action to get the next state
        next_state_key = board_to_key(next_state)   # convert next state to string key

        # get all q-values for the next state and find the maximum q-value
        next_state_q_values = q_table.get(next_state_key, {})   # returns empty dict if not found
        if next_state_q_values:
            max_q_value = max(next_state_q_values.values())
        else:   # default to 0 if there are no q-values for the next state
            max_q_value = 0  

        max_q_values.append(max_q_value)

    # return the action corresponding to the highest future reward
    return actions[np.argmax(max_q_values)]

In [33]:
def choose_action(state, q_table, epsilon):
    """ Function to choose an action using the epsilon-greedy strategy, meaning it chooses a random action if it is less than epsilon or the best action otherwise

    Args:
        state (np.array): state of the board
        q_table (dict): maps states to possible actions and their q-value
        epsilon (float): to balance between exploration/exploitation

    Returns:
        tuple: The best action or a random action
    """    
    # exploration: choose a random action
    if random.uniform(0, 1) < epsilon:
        return random.choice(available_actions(state))
    # exploitation: choose the best known action
    else:
        return best_action(state, q_table)

In [34]:
def check_winner(state):
    """Function to check if the game is over and return the winner"""
    # check rows, columns, and diagonals for a win
    for player in [PLAYER_X, PLAYER_O]:
        # first cond: if any row is filled with the player's mark
        # second cond: if any column is filled with the player's mark
        # third/fourth cond: if any row is filled with the player's mark
        if any(np.all(state[i, :] == player) for i in range(BOARD_SIZE)) or \
           any(np.all(state[:, j] == player) for j in range(BOARD_SIZE)) or \
           np.all(np.diag(state) == player) or \
           np.all(np.diag(np.fliplr(state)) == player):
            return player   # if you get here it means the player won
    # check for a draw 
    if not available_actions(state):    # no empty spaces left
        return 'draw'
    # if no win and still available actions remain
    return None

In [35]:
def update_q_table(q_table, state, action, reward, next_state, alpha, gamma):
    '''Updates the q_table based on the action done. We want to maximize the discounted (gamma) return of rewards
    Args: 
        q_table (dict): maps states to possible actions and their q-value
        state (np.array): state of the board
        action (tuple (i,j)): action to be executed
        reward (float): reward of the action
        next_state (np.array): next state of the board
        alpha (float): learning rate of the q_value formula
        gamma (float): discount rate
    Returns:
        q_table (dict): updated q_table duh
    '''
 
    key = board_to_key(state)   # get the str which works as a key for q_table
    next_key = board_to_key(next_state) # same
    
    # calculate the maximum q-value for the next state
    # if the get does not find the next state's key, it returns an empty dictionary
    # which means getting value 0, then you retrieve the maximum value inside the nested dictionary's values
    # this wil be used to dictate the next best action
    next_max = max(q_table.get(next_key, {}).values(), default=0)   
    
    # update the Q-value for the current state and action
    # formula taken from slide 50 of pack 10 RL, which states:
    # watch the agent make action a, transition to a new state s', and receive reward r,
    # then update Q-table based on how much (alpha) you want the new action to impact
    # an alpha close to 1 will overload the old learning, while close to 0 will give 
    # more importance to ld experiences and slower convergence, we set it low to give
    # a slow and stable learning pattern
    q_table.setdefault(key, {})[action] = q_table.get(key, {}).get(action, 0) + \
                                          alpha * (reward + gamma * next_max - q_table.get(key, {}).get(action, 0))
    # what the previous line does:
    # setdefault returns the value of 'key' or insert the key and then returns {} if it does not exist
    # then inside the dictionary access the key 'action' (remember it is a nested dictionary)
    # now we update the q_value with the old q_value and the new information


In [36]:
def print_board(state):
    """Function to print the current board state
    Args:
        state (np.array): state of the board
    """    
    chars = {PLAYER_X: 'X', PLAYER_O: 'O', EMPTY: ' '}
    for row in state:
        # convert each cell to the str associated and separate them with |
        print(' | '.join(chars[cell] for cell in row))
        # this is used just to create a horizontal separation line
        print('-' * (BOARD_SIZE * 4 - 1))

In [37]:
def train(games, alpha, gamma, epsilon, epsilon_decay):
    """Train the player

    Args:
        games (int): how many games to play
        alpha (float): learning rate of the q_value formula
        gamma (float): discount rate
        epsilon (float): to balance between exploration/exploitation
        epsilon_decay (float): how much to decay epsilon

    Returns:
        q_table (dict): maps states to possible actions and their q-value
    """    
    q_table = {}    # dictionary that stores the state (as a str) and the action's q-values inside nested dictionaries
    for game in range(games):
        state = np.zeros((BOARD_SIZE, BOARD_SIZE), dtype=int)   # create an empty board
        # print(f"Game {game + 1}:")
        starting_player = random.choice([PLAYER_X, PLAYER_O])
        done = False
        while not done:
            if starting_player == PLAYER_X:
                # choose and apply action
                action = choose_action(state, q_table, epsilon) # choose either random or best action
                state = apply_action(state, action, PLAYER_X)   # apply action to the board, return new board
                #print("Your move:")
                #print_board(state)
                winner = check_winner(state)
                if winner or not available_actions(state):
                    done = True
                else:
                    starting_player = PLAYER_O

            else:
            # opponent's turn, plays following a random approach
                action = random.choice(available_actions(state))   # between all actions choose a random one
                state = apply_action(state, action, PLAYER_O)  # apply action to the board, return new board
                #print("Opponent's move:")
                #print_board(state)
                winner = check_winner(state)
                if winner or not available_actions(state):
                    done = True
                else:
                    starting_player = PLAYER_X

            # determine reward and update q-table
            # this does not go into an if win since we just check it inside the get, and the move action continues the game without impact
            reward = REWARDS.get('win' if winner == PLAYER_X else 'lose' if winner == PLAYER_O else 'draw' if winner == 'draw' else 'move', 0)
            update_q_table(q_table, state, action, reward, state, alpha, gamma) # updates q_table based on the action done
            if winner is not None or not available_actions(state):  # if there is a winner or there are no more actions available
                done = True
            else:
                done = False
                
        # print the outcome of the game
        # if winner == PLAYER_X:
        #     print("Result: player X wins.")
        # elif winner == PLAYER_O:
        #     print("Result: player O wins.")
        # else:
        #     print("Result: it's a draw.")

        # diminish epsilon for less exploration over time
        epsilon *= epsilon_decay
    return q_table

In [38]:
# training parameters
games = 10000
alpha = 0.1
gamma = 0.9
epsilon = 0.9
decay = 0.995

In [39]:
q_table = train(games=games, alpha=alpha, gamma=gamma, epsilon=epsilon, epsilon_decay=decay)
print("Training finished")

Training finished


In [40]:
def evaluate_agent(q_table, num_games=1000):
    results = {"win": 0, "loss": 0, "draw": 0}
    for _ in range(num_games):
        state = np.zeros((BOARD_SIZE, BOARD_SIZE), dtype=int)
        while True:
            # random player move
            random_action = random.choice(available_actions(state))
            state = apply_action(state, random_action, PLAYER_O)

            winner = check_winner(state)
            if winner or not available_actions(state):
                results["loss" if winner == PLAYER_X else "draw" if winner == 'draw' else "win"] += 1
                break

            # AI move
            ai_action = best_action(state, q_table)
            state = apply_action(state, ai_action, PLAYER_X)

            winner = check_winner(state)
            if winner or not available_actions(state):
                results["win" if winner == PLAYER_X else "draw" if winner == 'draw' else "loss"] += 1
                break

    return results

In [41]:
results = evaluate_agent(q_table=q_table)
print(results)

{'win': 971, 'loss': 0, 'draw': 29}
