In [1]:
import numpy as np
import random

In [3]:
# Create a simple 4x4 GridWorld.
# States are numbered row-major: state = row * grid_size + col.
grid_size = 4
state_space = grid_size * grid_size  # total number of cells / states

# Action indices: 0-up, 1-down, 2-left, 3-right. The arrow glyphs are what
# gets printed when the learned policy is displayed.
actions = ['↑', '↓', '←', '→']
# Derived from `actions` so the action count can never drift from the list.
action_space = len(actions)

In [5]:
# Rewards and terminal state
# The goal is the last row-major index, i.e. the bottom-right corner,
# whatever grid_size happens to be.
goal_state = state_space - 1
# Every move costs -1 except arriving at the goal, which pays +10.
# The array is indexed by the state being entered.
rewards = np.full(state_space, -1)
rewards[goal_state] = 10

# Q-table initialization: one row per state, one column per action,
# with every estimate starting at zero.
Q = np.zeros((state_space, action_space))

# Learning parameters
alpha = 0.1     # Learning rate
gamma = 0.9     # Discount factor
epsilon = 0.2   # Exploration rate (probability of taking a random action)
episodes = 500  # Number of training episodes

def get_next_state(state, action):
    """Return the state reached by taking `action` from `state`.

    States are row-major indices into a `grid_size` x `grid_size` grid.
    Actions are 0 (up), 1 (down), 2 (left) and 3 (right). A move that would
    step off the grid, or an action outside 0-3, leaves the state unchanged.
    """
    row, col = divmod(state, grid_size)
    last = grid_size - 1  # highest valid row / column index

    if action == 0:      # up
        if row > 0:
            row -= 1
    elif action == 1:    # down
        if row < last:
            row += 1
    elif action == 2:    # left
        if col > 0:
            col -= 1
    elif action == 3:    # right
        if col < last:
            col += 1

    return row * grid_size + col

# Training: tabular Q-learning with an epsilon-greedy behaviour policy.
# A dedicated, seeded generator makes the run reproducible without touching
# the global `random` state.
rng = random.Random(42)

# Episodes may start in any state except the goal, wherever the goal sits.
start_states = [s for s in range(state_space) if s != goal_state]

for ep in range(episodes):
    state = rng.choice(start_states)
    while state != goal_state:
        # Explore with probability epsilon, otherwise act greedily
        # (np.argmax breaks ties in favour of the lowest action index).
        if rng.uniform(0, 1) < epsilon:
            action = rng.randint(0, action_space - 1)
        else:
            action = np.argmax(Q[state])
        next_state = get_next_state(state, action)
        reward = rewards[next_state]  # reward is earned for the state entered
        # Move Q(s, a) a fraction alpha towards the one-step bootstrapped target.
        td_target = reward + gamma * np.max(Q[next_state])
        Q[state, action] += alpha * (td_target - Q[state, action])
        state = next_state

# Show the greedy policy as a grid of arrows, one text line per grid row.
print("\nLearned Policy:")
greedy_actions = np.argmax(Q, axis=1)  # best action index for every state
for cell in range(state_space):
    label = " G " if cell == goal_state else f" {actions[greedy_actions[cell]]} "
    # Cells are tab-terminated; the last cell of a grid row also ends the line.
    closes_row = (cell + 1) % grid_size == 0
    print(label, end="\t\n" if closes_row else "\t")


Learned Policy:
 ↓ 	 ↓ 	 → 	 ↓ 	
 → 	 ↓ 	 → 	 ↓ 	
 → 	 ↓ 	 → 	 ↓ 	
 → 	 → 	 → 	 G 	
