In [1]:
import numpy as np
import random


## Define Environment

In [4]:
# Grid dimensions
grid_size = 5

# Rewards: goal=10, penalty=-10, step=-1
reward_matrix = -1 * np.ones((grid_size, grid_size))
reward_matrix[4, 4] = 10  # Goal
walls = [(1,1), (2,3), (3,1)]  # Wall positions
for wall in walls:
    reward_matrix[wall] = -10

# Actions: Up, Down, Left, Right
actions = ['up', 'down', 'left', 'right']
action_map = {0: 'up', 1: 'down', 2: 'left', 3: 'right'}


## Initialize Q-Table

In [9]:
# Q-table of shape (grid_size, grid_size, num_actions)
Q = np.zeros((grid_size, grid_size, len(actions)))


## Define Helper Function to Take Action

In [12]:
def take_action(state, action):
    i, j = state
    if action == 0:  # up
        i = max(i-1, 0)
    elif action == 1:  # down
        i = min(i+1, grid_size-1)
    elif action == 2:  # left
        j = max(j-1, 0)
    elif action == 3:  # right
        j = min(j+1, grid_size-1)
    
    # Reward
    reward = reward_matrix[i, j]
    return (i, j), reward


## Q-Learning Algorithm

In [15]:
# Hyperparameters
alpha = 0.1       # Learning rate
gamma = 0.9       # Discount factor
epsilon = 0.2     # Exploration rate
episodes = 1000

for episode in range(episodes):
    state = (0, 0)  # Start state
    done = False
    
    while not done:
        i, j = state
        
        # Epsilon-greedy action selection
        if random.uniform(0, 1) < epsilon:
            action = random.randint(0, 3)  # Explore
        else:
            action = np.argmax(Q[i,j,:])   # Exploit
        
        next_state, reward = take_action(state, action)
        ni, nj = next_state
        
        # Update Q-value
        Q[i,j,action] = Q[i,j,action] + alpha * (reward + gamma * np.max(Q[ni,nj,:]) - Q[i,j,action])
        
        state = next_state
        
        if reward == 10 or reward == -10:
            done = True


## Extract Policy

In [18]:
policy = np.empty((grid_size, grid_size), dtype=str)
for i in range(grid_size):
    for j in range(grid_size):
        if (i,j) in walls:
            policy[i,j] = 'WALL'
        elif (i,j) == (4,4):
            policy[i,j] = 'GOAL'
        else:
            policy[i,j] = action_map[np.argmax(Q[i,j,:])]

print("Learned Policy:")
print(policy)


Learned Policy:
[['d' 'r' 'd' 'r' 'd']
 ['d' 'W' 'd' 'r' 'd']
 ['r' 'r' 'd' 'W' 'd']
 ['d' 'W' 'd' 'd' 'd']
 ['r' 'r' 'r' 'r' 'G']]
