In [18]:
import gym
import gym_maze
import numpy as np
import random

In [19]:
# Create the maze environment registered under the id "maze-random-10x10-plus-v0"
# (presumably registered as a side effect of `import gym_maze` -- confirm).
env = gym.make("maze-random-10x10-plus-v0")
# Reset once and keep the observation that reset() returns.
observation = env.reset()

In [20]:
# Q-table dimensions: one column per discrete action the environment exposes,
# one row per flattened grid cell (states are indexed as first * 10 + second).
n_actions = env.action_space.n
n_observations = 10 * 10

In [21]:
# Start with an all-zero Q-table: rows are states, columns are actions.
q_table = np.zeros(shape=(n_observations, n_actions), dtype=float)

In [22]:
# Number of training episodes to run (each episode starts with env.reset()).
NUM_EPISODES = 1_000

In [32]:
# Q-learning hyperparameters:
#   alpha -- weight given to the new estimate when blending it with the old Q-value
#   gamma -- discount applied to the best Q-value of the next state
alpha, gamma = 0.1, 0.9

In [33]:
# Epsilon-greedy exploration schedule
max_epsilon = 1.0     # exploration probability at the start of training
min_epsilon = 0.01    # floor the exploration probability decays towards
decay_rate = 0.01     # exponential decay rate applied per episode
epsilon = max_epsilon # current exploration probability (updated after every episode)

In [35]:
def _state_to_index(state, n_cols=10):
    """Flatten a two-component maze observation into a Q-table row index.

    Args:
        state: indexable observation whose first two entries are the grid
            coordinates (as returned by ``env.reset()`` / ``env.step()``).
        n_cols: grid width used for the flattening; 10 matches the 10x10 maze.

    Returns:
        ``int(state[0]) * n_cols + int(state[1])`` as a plain Python int, so it
        can always be used to index ``q_table`` even if the observation holds
        floats.
    """
    return int(state[0]) * n_cols + int(state[1])


# Tabular Q-learning with an epsilon-greedy policy.
for episode in range(NUM_EPISODES):

    # Each episode starts from a freshly reset environment, so no extra
    # reset is needed once the episode has finished.
    state_num = _state_to_index(env.reset())
    done = False

    while not done:
        # env.render()
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore action space
        else:
            # np.argmax yields a NumPy integer; cast so the environment always
            # receives a plain int, like the sampled branch may produce.
            # NOTE(review): some environments dispatch on isinstance(action, int)
            # -- confirm for gym_maze.
            action = int(np.argmax(q_table[state_num]))  # Exploit learned values

        # step() is unpacked as a 4-tuple here; the fourth element is not
        # used by the update, so it is deliberately ignored.
        next_state, reward, done, _info = env.step(action)
        next_state_num = _state_to_index(next_state)

        # Q-learning update: blend the old estimate with the bootstrapped
        # target reward + gamma * max_a' Q(next_state, a').
        old_value = q_table[state_num, action]
        next_max = np.max(q_table[next_state_num])
        q_table[state_num, action] = (
            (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        )

        state_num = next_state_num

    # Reduce epsilon (because we need less and less exploration): exponential
    # decay from max_epsilon towards min_epsilon as episodes progress.
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

In [36]:
# Show the learned Q-table as plain text: one row per flattened state index,
# one column per action.
print(q_table)

[[0.01765174 0.01713025 0.02205605 0.01801862]
 [0.01647644 0.02277155 0.01654099 0.01622143]
 [0.02100577 0.02224728 0.02894963 0.01910615]
 [0.0217807  0.02176953 0.02929129 0.02134372]
 [0.04426165 0.04536756 0.04284896 0.04805708]
 [0.05433645 0.05916488 0.06482453 0.0582112 ]
 [0.08589598 0.0894032  0.08952522 0.08044717]
 [0.07727887 0.07451371 0.07650812 0.07234185]
 [0.11414802 0.14130949 0.13034491 0.13304777]
 [0.14244943 0.17547176 0.19112309 0.16206062]
 [0.02421852 0.02425281 0.0288317  0.02227136]
 [0.03188638 0.03030272 0.03782003 0.03187405]
 [0.0280648  0.02661076 0.0362542  0.02641276]
 [0.02648331 0.03636031 0.0267847  0.02793778]
 [0.03560534 0.03631403 0.04589014 0.03771176]
 [0.08116409 0.0962203  0.08825402 0.07063049]
 [0.10052149 0.11808272 0.12331074 0.08785445]
 [0.07651101 0.08740735 0.080766   0.07649139]
 [0.09190168 0.10354288 0.09477471 0.10956584]
 [0.20841764 0.20749764 0.23034726 0.18091825]
 [0.0291212  0.03884268 0.03213342 0.03053603]
 [0.03901861 

In [17]:
# Close the environment once training and inspection of the Q-table are done.
env.close()