Importing libraries

In [1]:
import numpy as np
import random

Maze Environment Configuration

In [2]:
# Grid layout, indexed as maze[row][col]:
#    0 -> open cell
#   -1 -> wall (rejected by is_valid_position)
#    1 -> the cell at (0, 4), the same coordinates as `goal` below
maze = [
    [ 0, -1,  0,  0,  1],
    [ 0, -1,  0, -1, -1],
    [ 0,  0,  0,  0,  0],
    [-1, -1,  0, -1,  0],
    [ 0,  0,  0, -1,  0],
]

# (row, col) coordinates where every episode starts and where it stops.
start, goal = (0, 0), (0, 4)

Q-Learning Parameters

In [3]:
# Seed the stdlib `random` generator, which choose_action draws from, so that
# "Restart Kernel -> Run All" reproduces the same exploration sequence and
# therefore the same Q-table and path.
SEED = 42
random.seed(SEED)

# Action names; an action's position in this list is its index on the last
# axis of q_table.
actions = ['up', 'down', 'left', 'right']

# One Q-value per (row, col, action), all initialised to zero.
q_table = np.zeros((len(maze), len(maze[0]), len(actions)))

alpha = 0.1     # Learning rate
gamma = 0.9     # Discount factor
epsilon = 0.1   # Exploration rate (initial value; decayed after every episode by the training loop)
episodes = 1000 # Number of episodes

Action Mapping

In [4]:
# Each action name maps to its (row_delta, col_delta) step on the grid.
# Rows grow downwards, so 'up' decreases the row index.
action_dict = dict(
    up=(-1, 0),
    down=(1, 0),
    left=(0, -1),
    right=(0, 1),
)

Function to check if a position is valid

In [5]:
def is_valid_position(position, grid=None):
    """Return whether `position` is an in-bounds, non-wall cell.

    Args:
        position: (row, col) pair to check.
        grid: Optional list of rows to check against. When omitted, the
            notebook-level `maze` is used, so existing one-argument calls
            behave as before.

    Returns:
        True when the row and column are inside the grid and the cell value
        is not -1 (the wall marker); False otherwise.
    """
    if grid is None:
        grid = maze
    row, col = position
    # The bounds checks come first so negative or oversized indices never
    # reach the lookup (a negative index would otherwise wrap around).
    return 0 <= row < len(grid) and 0 <= col < len(grid[0]) and grid[row][col] != -1

Function to choose an action (ε-greedy)

In [6]:
def choose_action(state):
    """Pick an action name for `state` with an epsilon-greedy rule.

    With probability `epsilon` a uniformly random action is returned;
    otherwise the action with the largest Q-value stored for the
    (row, col) given by `state` is returned.
    """
    should_explore = random.uniform(0, 1) < epsilon
    if should_explore:
        return random.choice(actions)  # Exploration

    # Exploitation: index of the highest Q-value for this cell.
    r, c = state
    greedy_index = np.argmax(q_table[r, c])
    return actions[greedy_index]

Q-Learning Algorithm

In [7]:
# Tabular Q-learning: each episode walks from `start` until `goal` is reached,
# updating one Q-value per step.
for episode in range(episodes):
    state = start
    while state != goal:
        row, col = state
        action = choose_action(state)
        action_idx = actions.index(action)
        d_row, d_col = action_dict[action]
        candidate = (row + d_row, col + d_col)

        if is_valid_position(candidate):
            next_state = candidate
            # +1 for stepping onto the goal, otherwise a small per-move cost.
            reward = 1 if next_state == goal else -0.1
        else:
            # Wall or out of bounds: penalise the move and stay in place.
            next_state = state
            reward = -1

        # Move Q(state, action) towards
        # reward + gamma * max_a Q(next_state, a) by a fraction alpha.
        best_next_value = np.max(q_table[next_state[0], next_state[1]])
        old_value = q_table[row, col, action_idx]
        td_error = reward + gamma * best_next_value - old_value
        q_table[row, col, action_idx] += alpha * td_error

        # Advance to the resulting cell.
        state = next_state

    # Shrink the exploration rate after every episode, never below 0.01.
    epsilon = max(0.01, epsilon * 0.99)

Display the learned Q-Table

In [8]:
# Show the learned values; the array is indexed as [row, col, action].
print("Trained Q-Table:", q_table, sep="\n")

Trained Q-Table:
[[[-0.66834614 -0.0434062  -0.68690074 -0.65862256]
  [ 0.          0.          0.          0.        ]
  [-0.28217252  0.01544064 -0.22633692  0.8       ]
  [-0.15592538 -0.10904868  0.22995088  1.        ]
  [ 0.          0.          0.          0.        ]]

 [[-0.25253409  0.062882   -0.60845351 -0.51489876]
  [ 0.          0.          0.          0.        ]
  [ 0.62        0.02233587 -0.27946679 -0.17050279]
  [ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]]

 [[-0.24594887 -0.37915319 -0.56801193  0.18098   ]
  [-0.25619442 -0.46858785 -0.16200752  0.3122    ]
  [ 0.458      -0.14526511 -0.01069267 -0.14891004]
  [-0.199      -0.199      -0.04194172 -0.13867746]
  [-0.199      -0.13896771 -0.14110972 -0.28843942]]

 [[ 0.          0.          0.          0.        ]
  [ 0.          0.          0.          0.        ]
  [ 0.03810066 -0.12992155 -0.199      -0.199     ]
  [ 0.          0.          0.          0

Testing the agent's learned policy

In [9]:
# Follow the greedy (argmax) policy from `start` and record the visited cells.
state = start
path = [state]
visited = {state}
while state != goal:
    row, col = state
    action = actions[np.argmax(q_table[row, col])]
    move = action_dict[action]
    next_state = (row + move[0], col + move[1])
    if not is_valid_position(next_state):
        # The greedy action walks into a wall or off the grid.
        break
    if next_state in visited:
        # The greedy policy is deterministic, so returning to a cell already
        # on the path would repeat the same moves forever; stop instead.
        break
    state = next_state
    visited.add(state)
    path.append(state)

print("Path taken by the agent:", path)
if state != goal:
    # Make an incomplete rollout explicit instead of only showing a short path.
    print("Warning: the greedy policy did not reach the goal.")

Path taken by the agent: [(0, 0), (1, 0), (2, 0), (2, 1), (2, 2), (1, 2), (0, 2), (0, 3), (0, 4)]
