In [1]:
import gym
import gym_maze
import numpy as np

In [2]:
# Create an environment
env = gym.make("maze-random-10x10-plus-v0")
observation = env.reset()

In [8]:
num_actions = 4
num_row = 10
num_column = 10

# initialize Q-Table
Q_table = np.zeros((num_row, num_column, num_actions))

In [48]:
def exploration_explotation(state, epsilon, q_table):
    random = np.random.rand()
    if random < epsilon: # exploration
        action = env.action_space.sample()
    else: # explotation
        x, y = state
        action = np.argmax(q_table[int(x), int(y), :])
    return action

In [49]:
def update_q_table(q_table, cur_state, action, next_state, reward, alpha, gamma):
    cur_x, cur_y = cur_state
    next_x, next_y = next_state
    # Q(S, A) <- (1 - α) Q(S, A) + [α * (r + (γ * max(Q(S', A*))))]
    sample = reward + (gamma * np.argmax(q_table[next_x, next_y, :])) 
    new_q_value = (1 - alpha) * q_table[int(cur_x), int(cur_y), action] + alpha * sample
    q_table[int(cur_x), int(cur_y), action] = new_q_value

In [59]:
def q_learning(num_episodes, q_table, epsilon, alpha, gamma):
    
    for i in range(num_episodes):
        env.render()
        # start from the initial state
        cur_state = env.reset()
        game_over = False

        while not game_over:
            # choose an action
            action = exploration_explotation(cur_state, epsilon, q_table)

            # Perform the action and receive feedback from the environment
            next_state, reward, done, truncated = env.step(action)

            if done:
                game_over = True

            # calculate reward 
            # TODO: define function to calculate reward to reach the goal faster

            # update Q_table
            update_q_table(q_table, cur_state, action, next_state, reward, alpha, gamma)

            cur_state = next_state
            
    return q_table

In [60]:
# Define the maximum number of iterations
NUM_EPISODES = 100
# Define explroration explotation trade-off
epsilon = 0.1
# Define learning rate
alpha = 0.1
# Define discount factor
gamma = 0.9

# Q-Learning Algorithm
q_table = q_learning(NUM_EPISODES, Q_table, epsilon, alpha, gamma)
print(q_table)

[[[ 8.98999566e-01  1.43990386e+00  1.05388200e+00  8.99000000e-01]
  [ 2.69899992e+00  2.69807232e+00  2.62149340e+00  2.69900000e+00]
  [ 8.99000000e-01  8.99000000e-01  8.99000000e-01  8.99000000e-01]
  [ 1.79273258e+00  1.79900000e+00  1.79900000e+00  1.79900000e+00]
  [ 2.69897737e+00  2.69881245e+00  2.69900000e+00  2.69900000e+00]
  [ 1.79899950e+00  1.79900000e+00  1.79900000e+00  1.79900000e+00]
  [ 2.69900000e+00  2.69899383e+00  2.69899595e+00  2.69900000e+00]
  [ 2.69899992e+00  2.69900000e+00  2.69892264e+00  2.69900000e+00]
  [ 1.79226846e+00  1.79637273e+00  1.79901023e+00  1.79900000e+00]
  [ 1.76945302e+00  1.79900000e+00  1.79900000e+00  1.79900000e+00]]

 [[ 8.98268789e-01  8.99000000e-01  8.73996258e-01  8.19298168e-01]
  [ 7.49055459e-01  8.99000000e-01  7.49072137e-01  8.98046056e-01]
  [ 8.98999993e-01  8.99000000e-01  8.99000000e-01  8.99000000e-01]
  [ 2.69900000e+00  2.69900000e+00  2.69897815e+00  2.69900000e+00]
  [ 1.79900000e+00  1.79900000e+00  1.79900000

In [52]:
# Define the maximum number of iterations
NUM_EPISODES = 1000

'''
for episode in range(NUM_EPISODES):

    #env.render()

    # TODO: Implement the agent policy here
    # Note: .sample() is used to sample random action from the environment's action space

    # Choose an action (Replace this random action with your agent's policy)
    action = env.action_space.sample()

    # Perform the action and receive feedback from the environment
    next_state, reward, done, truncated = env.step(action)
    print(type(next_state))
    x, y = next_state
    print(type(x))

    if done or truncated:
        observation = env.reset()

# Close the environment
env.close()
'''

"\nfor episode in range(NUM_EPISODES):\n\n    #env.render()\n\n    # TODO: Implement the agent policy here\n    # Note: .sample() is used to sample random action from the environment's action space\n\n    # Choose an action (Replace this random action with your agent's policy)\n    action = env.action_space.sample()\n\n    # Perform the action and receive feedback from the environment\n    next_state, reward, done, truncated = env.step(action)\n    print(type(next_state))\n    x, y = next_state\n    print(type(x))\n\n    if done or truncated:\n        observation = env.reset()\n\n# Close the environment\nenv.close()\n"