In [1]:
import numpy as np

# --- Gridworld layout ---
grid_size = 5, 5  # (rows, columns)
# Cells A and B carry a bonus reward; each is paired with a primed
# destination cell (A -> A', B -> B').
special_states = dict(A=(0, 1), B=(0, 3))
next_to_states = {"A'": (4, 1), "B'": (2, 3)}
special_rewards = dict(A=10, B=5)
default_reward = 0
edge_penalty = -1  # reward for an action that would leave the grid

# --- Actions ---
# Action name -> display arrow. Insertion order (north, south, east, west)
# also fixes each action's index along the last axis of Q.
action_arrows = dict(north='↑', south='↓', east='→', west='←')
actions = list(action_arrows)

# --- Q-learning hyperparameters ---
gamma = 0.9  # Discount factor
epsilon = 0.1  # Exploration rate
alpha = 0.2  # Learning rate
episodes = 5000
steps_per_episode = 5000

# One Q-value per (row, column, action), all starting at zero
Q = np.zeros((*grid_size, len(actions)))

# (row, col) offsets per action; row 0 is the top row, so north decreases row.
_ACTION_OFFSETS = {
    'north': (-1, 0),
    'south': (1, 0),
    'east': (0, 1),
    'west': (0, -1),
}


def get_next_state(state, action, shape=None):
    """Return the cell reached by taking ``action`` from ``state``.

    Args:
        state: Current ``(row, col)`` position.
        action: One of ``'north'``, ``'south'``, ``'east'`` or ``'west'``.
        shape: Optional ``(n_rows, n_cols)`` grid dimensions. When omitted,
            the module-level ``grid_size`` is used.

    Returns:
        The neighbouring ``(row, col)`` tuple, or ``state`` itself when the
        move would leave the grid.

    Raises:
        ValueError: If ``action`` is not a recognised direction.
    """
    try:
        d_row, d_col = _ACTION_OFFSETS[action]
    except KeyError:
        # Fail loudly with a clear message instead of hitting an unbound
        # local further down.
        raise ValueError(f"Unknown action: {action!r}") from None

    n_rows, n_cols = grid_size if shape is None else shape
    row, col = state
    next_row, next_col = row + d_row, col + d_col

    if 0 <= next_row < n_rows and 0 <= next_col < n_cols:
        return (next_row, next_col)
    return state  # Stay in place if off-grid

def get_reward(state, action):
    """Return the immediate reward for taking ``action`` in ``state``.

    Args:
        state: Current ``(row, col)`` position.
        action: Action name, forwarded to ``get_next_state``.

    Returns:
        The matching ``special_rewards`` entry when ``state`` is one of the
        ``special_states`` (whatever the action); otherwise ``edge_penalty``
        if the action leaves the agent where it is, else ``default_reward``.
    """
    # Special cells pay out regardless of the chosen action.
    for label in special_states:
        if state == special_states[label]:
            return special_rewards[label]

    # An unchanged position means the move was blocked by the grid edge.
    blocked = get_next_state(state, action) == state
    return edge_penalty if blocked else default_reward

def get_teleport_state(state):
    """Return the teleport destination for ``state``.

    Args:
        state: Current ``(row, col)`` position.

    Returns:
        For a cell listed in ``special_states`` under name ``X``, the
        ``next_to_states`` entry keyed ``"X'"`` (or ``state`` if that key is
        missing). Any other cell is returned unchanged.
    """
    no_match = object()  # private sentinel so any dict key can be a label
    label = next(
        (name for name, cell in special_states.items() if state == cell),
        no_match,
    )
    if label is no_match:
        return state
    return next_to_states.get(f"{label}'", state)

# Tabular Q-learning. Each episode starts in a uniformly random cell and runs
# for a fixed number of steps; the loop has no early exit.
for episode in range(episodes):
    start_row = np.random.randint(grid_size[0])
    start_col = np.random.randint(grid_size[1])
    state = (start_row, start_col)

    for step in range(steps_per_episode):
        # Epsilon-greedy: explore with probability epsilon, else act greedily.
        exploring = np.random.rand() < epsilon
        if exploring:
            action = np.random.choice(actions)
        else:
            action = actions[np.argmax(Q[state])]
        chosen = actions.index(action)

        reward = get_reward(state, action)

        # Special cells teleport the agent; every other cell moves normally.
        if state in special_states.values():
            next_state = get_teleport_state(state)
        else:
            next_state = get_next_state(state, action)

        # Move Q(s, a) a fraction alpha toward the bootstrapped greedy target.
        entry = (*state, chosen)
        td_target = reward + gamma * np.max(Q[next_state])
        td_error = td_target - Q[entry]
        Q[entry] += alpha * td_error

        state = next_state

# Greedy read-out of the learned table: the best Q-value in each cell and the
# action (by index, by name, and as an arrow) that attains it.
optimal_value_function = Q.max(axis=2)
optimal_policy_indices = Q.argmax(axis=2)
optimal_policy = [
    list(map(actions.__getitem__, index_row))
    for index_row in optimal_policy_indices
]
optimal_policy_arrows = [
    list(map(action_arrows.__getitem__, name_row))
    for name_row in optimal_policy
]

# Print results: echo the configuration, then the learned tables.
print(
    "$ python gridworld_qlearning.py",
    "Initializing Gridworld...",
    f"Grid size: {grid_size[0]}x{grid_size[1]}",
    f"Special_states = {special_states}",
    f"Next_to_states = {next_to_states}",
    f"Special_rewards = {special_rewards}",
    "Starting Q-learning with parameters:",
    f"  γ = {gamma}",
    f"  ε = {epsilon}",
    f"  α = {alpha}",
    f"  Episodes = {episodes}",
    f"  Steps = {steps_per_episode}\n",
    "Evaluating optimal value function and policy...",
    "Optimal Value Function:",
    sep="\n",
)
# Each table row is printed on one line with cells two spaces apart.
for row in optimal_value_function:
    print(*(f"{val:5.2f}" for val in row), sep="  ")
print("\nOptimal Policy:")
for row in optimal_policy:
    print(*(f"{action:<6}" for action in row), sep="  ")
print("\nOptimal Policy (arrows):")
for row in optimal_policy_arrows:
    print(*(f"{arrow:^6}" for arrow in row), sep="  ")

$ python gridworld_qlearning.py
Initializing Gridworld...
Grid size: 5x5
Special_states = {'A': (0, 1), 'B': (0, 3)}
Next_to_states = {"A'": (4, 1), "B'": (2, 3)}
Special_rewards = {'A': 10, 'B': 5}
Starting Q-learning with parameters:
  γ = 0.9
  ε = 0.1
  α = 0.2
  Episodes = 5000
  Steps = 5000

Evaluating optimal value function and policy...
Optimal Value Function:
21.98  24.42  21.98  19.42  17.48
19.78  21.98  19.78  17.80  16.02
17.80  19.78  17.80  16.02  14.42
16.02  17.80  16.02  14.42  12.98
14.42  16.02  14.42  12.98  11.68

Optimal Policy:
east    north   west    north   west  
north   north   north   west    west  
north   north   north   north   north 
north   north   north   north   north 
north   north   north   north   north 

Optimal Policy (arrows):
  →       ↑       ←       ↑       ←   
  ↑       ↑       ↑       ←       ←   
  ↑       ↑       ↑       ↑       ↑   
  ↑       ↑       ↑       ↑       ↑   
  ↑       ↑       ↑       ↑       ↑   
