In [37]:
import gym
import gym_maze
import numpy as np
import random

In [38]:
# Build the maze environment from its registered id and reset it so that
# `observation` holds the starting observation.
ENV_ID = "maze-random-10x10-plus-v0"
env = gym.make(ENV_ID)
observation = env.reset()

In [39]:
# Dimensions of the Q-table: one column per discrete action and one row per
# cell of the 10 x 10 maze (states are later encoded as row * 10 + col).
n_actions = env.action_space.n
n_observations = 10 * 10

In [40]:
# Start with an all-zero Q-table: rows are encoded states, columns are actions.
q_table_shape = (n_observations, n_actions)
q_table = np.zeros(q_table_shape)

In [41]:
# Number of training episodes to run.
NUM_EPISODES = 1_000

In [42]:
# Q-learning hyperparameters:
#   alpha - learning rate used when blending the old and new Q-value estimates
#   gamma - discount factor applied to the best next-state Q-value
alpha, gamma = 0.1, 0.9

In [43]:
# Epsilon-greedy exploration schedule.
# Lower bound that the exploration probability never drops below.
min_exploration_proba = 0.01
# Rate of the exponential decay applied to the exploration probability.
exploration_decreasing_decay = 0.001
# Start fully exploratory: every action is random at first.
exploration_proba = 1

In [44]:
# Tabular Q-learning with an epsilon-greedy policy.
#
# Each episode resets the environment, then repeatedly picks an action
# (random with probability `exploration_proba`, otherwise the greedy one),
# steps the environment and applies the Q-learning update
#     Q[s, a] <- (1 - alpha) * Q[s, a] + alpha * (reward + gamma * max_a' Q[s', a'])
# until the environment reports `done` or the per-episode step cap is reached.

# States are encoded as row * MAZE_WIDTH + col, matching the 10 x 10 maze.
MAZE_WIDTH = 10
# Safety cap so a single episode cannot run (practically) forever.
MAX_STEPS_PER_EPISODE = 100 * n_observations


def _state_index(position):
    """Map a two-component maze position to its row index in ``q_table``.

    Both components are cast to ``int`` so the result is always a valid array
    index, whatever numeric dtype the environment returns.
    """
    return int(position[0]) * MAZE_WIDTH + int(position[1])


for episode in range(NUM_EPISODES):
    # The exploration probability depends only on the episode number, so it is
    # computed once per episode (exponential decay, clamped from below).
    exploration_proba = max(
        min_exploration_proba,
        np.exp(-exploration_decreasing_decay * episode),
    )

    # Reset once per episode and encode the starting position.
    state_num = _state_index(env.reset())
    done = False
    step = 0

    while not done and step < MAX_STEPS_PER_EPISODE:
        # env.render()
        if random.uniform(0, 1) < exploration_proba:
            action = env.action_space.sample()  # Explore action space
        else:
            # Exploit learned values. np.argmax returns a NumPy integer; cast to
            # a plain int so environments that check isinstance(action, int)
            # accept it. NOTE(review): confirm against gym_maze's step().
            action = int(np.argmax(q_table[state_num]))

        # 4-tuple step API: the fourth element is the info dict, not a
        # truncation flag.
        next_state, reward, done, _info = env.step(action)
        next_state_num = _state_index(next_state)

        # Q-learning update towards the bootstrapped target.
        old_value = q_table[state_num, action]
        next_max = np.max(q_table[next_state_num])
        q_table[state_num, action] = (
            (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        )

        state_num = next_state_num
        step += 1

KeyboardInterrupt: 

In [None]:
# Show the Q-table (rows: encoded states, columns: actions).
print(q_table)

In [36]:
# Close the environment now that it is no longer needed.
env.close()