In [8]:
import numpy as np

In [9]:
from tqdm import tqdm

In [40]:
# Define the labyrinth
labyrinth = np.array([
    [0, 0, 0, 0, 0],
    [0, 1, 0, 1, 0],
    [0, 1, 0, 1, 0],
    [0, 1, 0, 1, 0],
    [0, 0, 0, 0, 0]
])

In [41]:
# Define the possible actions
actions = ["up", "down", "left", "right"]

In [42]:
# Define the Q-table
q_table = np.zeros((labyrinth.shape[0], labyrinth.shape[1], len(actions)))

In [43]:
# Define the learning rate
alpha = 0.8

# Define the discount factor
gamma = 0.95

# Define the exploration rate
epsilon = 0.1

# Define the maximum number of episodes
episodes = 10000

In [None]:
# Train the Q-learning algorithm
for episode in tqdm(range(episodes)):
    # Set the initial state
    state = (0, 0)

    # Loop until the agent reaches the goal
    while (abs(state[0]) < len(labyrinth[0])) and (abs(state[1]) < len(labyrinth[1])):
        # Select the action with the highest Q-value
        if np.random.uniform(0, 1) < epsilon:
            action = np.random.choice(actions)
        else:
            action = actions[np.argmax(q_table[state[0], state[1]])]

        # Take the action and observe the new state and reward
        if action == "up":
            new_state = (state[0] - 1, state[1])
            reward = -1 if labyrinth[new_state[0], new_state[1]] == 1 else 0
        elif action == "down":
            new_state = (state[0] + 1, state[1])
            reward = -1 if labyrinth[new_state[0], new_state[1]] == 1 else 0
        elif action == "left":
            new_state = (state[0], state[1] - 1)
            reward = -1 if labyrinth[new_state[0], new_state[1]] == 1 else 0
        elif action == "right":
            new_state = (state[0], state[1] + 1)
            reward = -1 if labyrinth[new_state[0], new_state[1]] == 1 else 0

        # Update the Q-value
        q_table[state[0], state[1], actions.index(action)] = q_table[state[0], 
                                                                     state[1], 
                                                                     actions.index(action)] + alpha * (reward + gamma * np.max(q_table[new_state[0], new_state[1]]) - q_table[state[0], state[1], actions.index(action)])

        # Update the state
        state = new_state

        # Check if the goal is reached
        if state == (labyrinth.shape[0] - 1, labyrinth.shape[1] - 1):
            break

In [46]:
state

(-1, 4)

In [52]:
# Print the Q-table
q_table

array([[[ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        , -1.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        , -1.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        , -1.        ],
        [ 0.        , -0.99999949,  0.        ,  0.        ],
        [ 0.        ,  0.        , -0.999936  , -0.99999744],
        [ 0.        , -0.992     ,  0.        ,  0.        ],
        [ 0.        ,  0.        , -1.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        , -1.        ],
        [-0.9999872 , -0.99968   ,  0.        ,  0.        ],
        [ 0.        ,  0.        , -0.9984    , -0.99999998],
        [-0.8       , -0.96      ,  0.        ,  0.        ],
        [ 0.        ,  0.        , -1.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        , -1.        ],
  