In [None]:
import gym
import gym_maze
import numpy as np
import time
import matplotlib.pyplot as plt
from matplotlib.table import Table
from matplotlib.colors import Normalize
from matplotlib.patches import Polygon

In [None]:
# Create the maze environment. The ID is presumably registered with gym as a
# side effect of the `gym_maze` import in the first cell — TODO confirm.
env = gym.make("maze-random-10x10-plus-v0")
# Reset once so the environment starts from its initial state. The returned
# observation is stored but not read again in the visible cells.
observation = env.reset()

In [None]:
def exploration_explotation(state, epsilon, q_table):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    One uniform random number is drawn with ``np.random.rand()``. If it is
    below ``epsilon`` the action is sampled from the module-level
    ``env.action_space`` (exploration); otherwise the action with the highest
    value in ``q_table`` for this state is returned (exploitation).

    Parameters
    ----------
    state : pair of numbers
        Unpacked into two coordinates, each cast with ``int()`` before it is
        used to index ``q_table``.
    epsilon : float
        Threshold compared against the random draw.
    q_table : array indexed as ``q_table[x, y, action]``

    Returns
    -------
    The sampled action, or the ``np.argmax`` index of the state's Q-values.
    """
    # Exploration: hand the decision to the environment's action space.
    if np.random.rand() < epsilon:
        return env.action_space.sample()

    # Exploitation: np.argmax returns the first index on ties.
    pos_x, pos_y = state
    state_values = q_table[int(pos_x), int(pos_y), :]
    return np.argmax(state_values)

In [None]:
def update_q_table(q_table, cur_state, action, next_state, reward, alpha, gamma):
    """Apply one tabular Q-learning update to ``q_table`` in place.

    Update rule::

        Q(S, A) <- (1 - alpha) * Q(S, A) + alpha * (reward + gamma * max_a Q(S', a))

    Parameters
    ----------
    q_table : array indexed as ``q_table[x, y, action]``
        Modified in place.
    cur_state, next_state : pair of numbers
        State coordinates; each one is cast with ``int()`` before indexing, so
        float-valued state arrays are accepted for both states.
    action : int
        Index of the action taken in ``cur_state``.
    reward : float
        Reward received for the transition.
    alpha : float
        Learning rate.
    gamma : float
        Discount factor.

    Returns
    -------
    None
    """
    cur_x, cur_y = (int(coord) for coord in cur_state)
    next_x, next_y = (int(coord) for coord in next_state)

    # The target must bootstrap from the best next-state VALUE (np.max).
    # np.argmax would return the index of the best action (0..n-1) instead,
    # which is not a Q-value.
    best_next_value = np.max(q_table[next_x, next_y, :])
    sample = reward + gamma * best_next_value

    old_value = q_table[cur_x, cur_y, action]
    q_table[cur_x, cur_y, action] = (1 - alpha) * old_value + alpha * sample

In [None]:
def q_learning(num_episodes, q_table, epsilon, alpha, gamma, max_steps=None):
    """Train ``q_table`` with tabular Q-learning on the module-level ``env``.

    Each episode resets ``env`` and then repeatedly chooses an action with
    ``exploration_explotation``, steps the environment, and updates
    ``q_table`` with ``update_q_table``, until the environment reports
    ``done``, the fourth ``env.step`` value is truthy, or ``max_steps`` is
    reached.

    Parameters
    ----------
    num_episodes : int
        Number of episodes to run.
    q_table : array indexed as ``q_table[x, y, action]``
        Updated in place and also returned.
    epsilon : float
        Exploration threshold passed to ``exploration_explotation``.
    alpha : float
        Learning rate passed to ``update_q_table``.
    gamma : float
        Discount factor passed to ``update_q_table``.
    max_steps : int or None, optional
        Upper bound on the number of environment steps per episode. ``None``
        (the default) means no bound, so an episode only ends when the
        environment ends it.

    Returns
    -------
    q_table
        The same array object that was passed in.
    actions : float array of shape ``q_table.shape[:2]``
        Greedy action index (``argmax`` over the action axis) for every
        state. States whose Q-values are all equal get index 0.
    """
    for _ in range(num_episodes):
        # Start every episode from the initial state.
        cur_state = env.reset()
        steps_taken = 0

        while max_steps is None or steps_taken < max_steps:
            action = exploration_explotation(cur_state, epsilon, q_table)

            # NOTE(review): this unpacks a 4-tuple. With the classic Gym step
            # API the fourth item is the `info` dict rather than a truncation
            # flag, so this check would fire whenever `info` is non-empty —
            # confirm against the installed gym / gym_maze version.
            next_state, reward, done, truncated = env.step(action)
            steps_taken += 1

            if truncated:
                # The episode is abandoned without learning from this step.
                break

            update_q_table(q_table, cur_state, action, next_state, reward, alpha, gamma)
            cur_state = next_state

            if done:
                break

    # Report the learned greedy policy. Recording the action taken at each
    # visit would also capture epsilon-random exploratory moves.
    actions = np.argmax(q_table, axis=2).astype(float)

    return q_table, actions

In [None]:
# NOTE(review): show_q_table and show_actions are defined in later cells of this
# notebook. On a fresh kernel those cells must be run before this one, otherwise
# the two calls at the bottom raise NameError; consider moving this cell below them.

# Size the Q-table from the environment instead of hard-coding the action count.
num_actions = int(env.action_space.n)
num_row = env.maze_size[0]
num_column = env.maze_size[1]

# Initialize the Q-table with zeros: one value per (x, y, action).
Q_table = np.zeros((num_row, num_column, num_actions))


# Number of training episodes
NUM_EPISODES = 10000
# Exploration/exploitation trade-off: threshold for taking a random action
epsilon = 0.2
# Learning rate
alpha = 0.3
# Discount factor
gamma = 0.7

# Q-learning algorithm. q_learning updates the array it is given in place and
# returns it, so `q_table` and `Q_table` refer to the same array afterwards.
q_table, actions = q_learning(NUM_EPISODES, Q_table, epsilon, alpha, gamma)

# Show results
show_q_table(q_table)
show_actions(actions)

In [None]:
def show_q_table(q_table):
    """Draw the Q-table as a grid of squares, one per state.

    Each square is split by its diagonals into four triangles and labelled
    with the state's Q-values formatted to four decimals: action 0 at the top,
    action 1 at the bottom, action 2 on the right, and action 3 on the left.

    Parameters
    ----------
    q_table : array indexed as ``q_table[row, col, action]``
        The first two axes define the subplot grid; the first four entries of
        the last axis are displayed.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    rows, cols = q_table.shape[:2]

    # squeeze=False keeps ax_array two-dimensional even when rows or cols is 1,
    # so ax_array[i, j] below is always valid.
    fig, ax_array = plt.subplots(rows, cols, figsize=(cols, rows), squeeze=False)
    plt.subplots_adjust(wspace=0.1, hspace=0.1)

    center = (0.5, 0.5)
    top_left, top_right = (0.0, 1.0), (1.0, 1.0)
    bottom_left, bottom_right = (0.0, 0.0), (1.0, 0.0)

    # Outer edge of each triangle; the third vertex is always the cell centre.
    triangle_edges = (
        (top_left, top_right),        # top
        (top_left, bottom_left),      # left
        (bottom_left, bottom_right),  # bottom
        (bottom_right, top_right),    # right
    )

    # Label position for action index 0, 1, 2, 3 respectively.
    label_positions = (
        (center[0], center[1] + 0.25),  # action 0: top
        (center[0], center[1] - 0.25),  # action 1: bottom
        (center[0] + 0.25, center[1]),  # action 2: right
        (center[0] - 0.25, center[1]),  # action 3: left
    )

    for i in range(rows):
        for j in range(cols):
            ax = ax_array[i, j]
            values = q_table[i, j, :]

            for (text_x, text_y), value in zip(label_positions, values):
                ax.text(text_x, text_y, f'{value:.4f}', ha='center', va='center', fontsize=6)

            # A patch can belong to only one Axes, so build new ones per cell.
            for first_corner, second_corner in triangle_edges:
                ax.add_patch(Polygon(
                    [list(first_corner), list(second_corner), list(center)],
                    closed=True, edgecolor='black', facecolor='none',
                ))

            ax.set_xlim(0, 1)
            ax.set_ylim(0, 1)
            ax.axis('off')

    plt.tight_layout()
    plt.show()

In [None]:
def show_actions(actions):
    """Render a 2-D grid of action indices as a table of arrows.

    The values are cast to ``int`` and mapped as 0 -> up, 1 -> down,
    2 -> right, 3 -> left; any other value is shown as an empty cell.

    Parameters
    ----------
    actions : 2-D numpy array
        One action index per cell.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    arrow_by_action = {
        0: u'$\u2191$',   # Up arrow
        1: u'$\u2193$',   # Down arrow
        2: u'$\u2192$',   # Right arrow
        3: u'$\u2190$',   # Left arrow
    }

    grid = actions.astype(int)

    fig, ax = plt.subplots()
    table = ax.table(cellText=grid, loc='center', cellLoc='center', edges='open')
    ax.axis('off')

    # Replace the numeric text of every cell with its arrow (or nothing).
    for (i, j), value in np.ndenumerate(grid):
        table[i, j].get_text().set_text(arrow_by_action.get(int(value), ''))

    plt.show()