In [1]:
import gym
import gym_maze
import numpy as np
import time

In [2]:
# Create an environment
env = gym.make("maze-random-10x10-plus-v0")
observation = env.reset()

In [3]:
num_actions = 4
num_row = env.maze_size[0]
num_column = env.maze_size[1]

# initialize Q-Table
Q_table = np.zeros((num_row, num_column, num_actions))
print(Q_table.shape)

(10, 10, 4)


In [4]:
def exploration_explotation(state, epsilon, q_table):
    random = np.random.rand()
    if random < epsilon: # exploration
        action = env.action_space.sample()
    else: # explotation
        x, y = state
        action = np.argmax(q_table[int(x), int(y), :])
    return action

In [5]:
def update_q_table(q_table, cur_state, action, next_state, reward, alpha, gamma):
    cur_x, cur_y = cur_state
    next_x, next_y = next_state
    # Q(S, A) <- (1 - α) Q(S, A) + [α * (r + (γ * max(Q(S', A*))))]
    sample = reward + (gamma * np.argmax(q_table[next_x, next_y, :])) 
    new_q_value = (1 - alpha) * q_table[int(cur_x), int(cur_y), action] + alpha * sample
    q_table[int(cur_x), int(cur_y), action] = new_q_value

In [6]:
def q_learning(num_episodes, q_table, epsilon, alpha, gamma):
    
    for i in range(num_episodes):
        
        # start from the initial state
        cur_state = env.reset()
        game_over = False

        num_actions = 0
        total_reward = 0
        
        while not game_over:

            # env.render()
            # time.sleep(0.05)

            # choose an action
            action = exploration_explotation(cur_state, epsilon, q_table)

            num_actions += 1

            # Perform the action and receive feedback from the environment
            next_state, reward, done, truncated = env.step(action)

            if np.array_equal(env.maze_view.robot, env.maze_view.goal):
                reward = 2000

                
            if truncated:
                break

            if done:
                game_over = True

            else:
                total_reward += reward

            # calculate reward 
            # TODO: define function to calculate reward to reach the goal faster

            # update Q_table
            update_q_table(q_table, cur_state, action, next_state, reward, alpha, gamma)

            # update state
            cur_state = next_state

    return q_table

In [7]:
# Define the maximum number of iterations
NUM_EPISODES = 100
# Define explroration explotation trade-off
epsilon = 0.09
# Define learning rate
alpha = 0.3
# Define discount factor
gamma = 0.7

# Q-Learning Algorithm
q_table = q_learning(NUM_EPISODES, Q_table, epsilon, alpha, gamma)

print(q_table)

[[[ 1.96361896e+00  1.78713504e+00  2.01379085e+00  2.09696533e+00]
  [ 1.26328533e+00  1.39677817e+00  1.39905200e+00  1.39900000e+00]
  [ 1.38718886e+00  1.39821843e+00  1.39900000e+00  1.39900000e+00]
  [ 1.39900000e+00  1.39434972e+00  1.39900000e+00  1.37963189e+00]
  [ 6.99000000e-01  6.99000000e-01  6.94342242e-01  6.99000000e-01]
  [ 2.09900000e+00  1.67900000e+00  6.29082354e-01  1.67900000e+00]
  [ 6.12926295e-01  6.99000000e-01  3.56082354e-01  6.98999925e-01]
  [ 1.07048966e+00  2.09900000e+00  1.07008235e+00  2.09900000e+00]
  [ 5.08930605e-01  8.43057931e-01  6.98609576e-01  6.98999782e-01]
  [ 1.39010387e+00  1.32684887e+00  1.39881238e+00  9.90887648e-01]]

 [[ 1.32447548e+00  1.39897218e+00  1.39900000e+00  1.09827235e+00]
  [ 1.39829258e+00  9.60200100e-01  1.39900000e+00  1.14869335e+00]
  [ 6.99000000e-01  6.99000000e-01  6.99000000e-01  6.99000000e-01]
  [ 2.09493066e+00  2.09876541e+00  2.09886205e+00  2.09900000e+00]
  [ 1.39896536e+00  1.39900000e+00  1.39900000