In [1]:
# qmaze.py — short Q-learning maze example
import numpy as np
import random

In [2]:
# Maze: 0 free, 1 wall, S=start(2), G=goal(3)
maze = np.array([
    [2, 0, 0, 1, 0],
    [1, 0, 1, 0, 0],
    [0, 0, 0, 0, 1],
    [0, 1, 0, 1, 0],
    [0, 0, 0, 0, 3],
])
rows, cols = maze.shape

# np.argwhere yields rows of NumPy integer scalars. Convert them to plain ints so
# that every position derived from start/goal (and therefore the printed path)
# is made of builtin ints rather than np.int64 values, whose repr under
# NumPy >= 2 is "np.int64(0)" instead of "0".
start = tuple(int(v) for v in np.argwhere(maze == 2)[0])
goal = tuple(int(v) for v in np.argwhere(maze == 3)[0])

# Each action is a (d_row, d_col) offset applied to the current position.
actions = [(-1, 0), (1, 0), (0, -1), (0, 1)]  # up,down,left,right
n_states = rows * cols
n_actions = len(actions)

In [3]:
def to_idx(pos):
    """Flatten a (row, col) position into a single row-major state index."""
    row, col = pos[0], pos[1]
    return row * cols + col
def valid(pos):
    """Return a truthy value when pos is inside the grid and is not a wall cell.

    The maze is only indexed after both bounds checks pass, so out-of-range
    (including negative) coordinates never reach the array lookup.
    """
    r, c = pos
    row_in_bounds = 0 <= r < rows
    col_in_bounds = 0 <= c < cols
    return row_in_bounds and col_in_bounds and maze[r, c] != 1

In [4]:
# Q-table and learning hyperparameters
# Q[s, a] holds the current value estimate for taking action a in flattened
# state s; every entry starts at zero.
Q = np.zeros(shape=(n_states, n_actions), dtype=float)
alpha = 0.6    # learning rate: scales each Q-value correction
gamma = 0.9    # discount applied to the best value of the next state
epsilon = 0.2  # probability of picking a random action instead of the greedy one

def step(pos, a):
    """Apply action index a from pos and return (next_pos, reward, done).

    A move that would land outside the grid or on a wall leaves the agent where
    it is. Arriving at the goal yields reward 1.0 and done=True; every other
    transition costs -0.01 and is not terminal.
    """
    d_row, d_col = actions[a]
    candidate = (pos[0] + d_row, pos[1] + d_col)
    landed = candidate if valid(candidate) else pos
    if landed == goal:
        return landed, 1.0, True
    return landed, -0.01, False

In [5]:
# Training: tabular Q-learning with an epsilon-greedy behaviour policy.
# Seed Python's RNG (the only source of randomness used in this loop) so that
# running the notebook from a fresh kernel reproduces the same Q-table.
seed = 0
random.seed(seed)

episodes = 2000
max_steps = 200  # per-episode step cap so an episode that never finds the goal still ends
for ep in range(episodes):
    pos = start
    for _ in range(max_steps):
        s = to_idx(pos)
        # Explore with probability epsilon, otherwise act greedily on the current estimate.
        if random.random() < epsilon:
            a = random.randrange(n_actions)
        else:
            a = int(np.argmax(Q[s]))
        new, pos_reward, done = step(pos, a)
        s2 = to_idx(new)
        # One-step Q-learning target. A terminal transition has no successor
        # value to bootstrap from, so its target is the reward alone.
        if done:
            target = pos_reward
        else:
            target = pos_reward + gamma * np.max(Q[s2])
        Q[s, a] += alpha * (target - Q[s, a])
        pos = new
        if done:
            break

In [6]:
# Derive policy and show one greedy path
# With Q fixed, the greedy action in a given cell never changes, so arriving at
# a cell that is already on the path (which includes staying put after bumping
# into a wall or the grid edge) means the rollout would repeat itself until the
# step cap. Stop at the first revisit so a failed rollout yields a short path
# instead of 200 repeated entries.
pos = start
path = [pos]
visited = {pos}
for _ in range(200):
    s = to_idx(pos)
    a = int(np.argmax(Q[s]))
    pos, _, done = step(pos, a)
    path.append(pos)
    if done or pos in visited:
        break
    visited.add(pos)

In [7]:
# Report the maze layout, the greedy trajectory, and whether it ended on the goal.
print("Maze (S=start, G=goal, 1=wall):", maze, sep="\n")
print("\nLearned path (row,col):", path, sep="\n")
if path[-1] != goal:
    print("Did not reach goal — try increasing episodes or tuning params.")
else:
    # The path includes the start cell, so the number of moves is one less than its length.
    print("Reached goal in", len(path)-1, "steps")

Maze (S=start, G=goal, 1=wall):
[[2 0 0 1 0]
 [1 0 1 0 0]
 [0 0 0 0 1]
 [0 1 0 1 0]
 [0 0 0 0 3]]

Learned path (row,col):
[(0, 0), (0, 1), (1, 1), (2, 1), (2, 2), (3, 2), (4, 2), (4, 3), (4, 4)]
Reached goal in 8 steps
