In [1]:
import numpy as np

# Define the grid environment
grid_size = (5, 5)  # 5x5 grid
num_states = grid_size[0] * grid_size[1]
actions = ["up", "down", "left", "right"]
num_actions = len(actions)

# Helper functions to convert between states and coordinates
def state_to_coordinates(state):
    return divmod(state, grid_size[1])

def coordinates_to_state(x, y):
    return x * grid_size[1] + y

# Initialize rewards and transitions
rewards = np.full(num_states, -1)  # -1 reward for every step
terminal_state = coordinates_to_state(4, 4)  # Bottom-right corner is terminal
rewards[terminal_state] = 100  # High reward for reaching the goal

# Function to determine the next state based on the chosen action
def take_action(state, action):
    x, y = state_to_coordinates(state)

    if action == "up":
        x = max(0, x - 1)
    elif action == "down":
        x = min(grid_size[0] - 1, x + 1)
    elif action == "left":
        y = max(0, y - 1)
    elif action == "right":
        y = min(grid_size[1] - 1, y + 1)

    return coordinates_to_state(x, y)

# Q-learning parameters
q_table = np.zeros((num_states, num_actions))  # Initialize Q-table
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.2  # Exploration probability
num_episodes = 500  # Number of episodes

# Q-learning algorithm
for episode in range(num_episodes):
    state = np.random.randint(0, num_states)  # Start from a random state

    while state != terminal_state:
        # Choose an action: Exploration vs. Exploitation
        if np.random.rand() < epsilon:
            action = np.random.randint(0, num_actions)  # Explore
        else:
            action = np.argmax(q_table[state])  # Exploit best known action

        next_state = take_action(state, actions[action])
        reward = rewards[next_state]

        # Update Q-value using Bellman equation
        best_next_action = np.max(q_table[next_state])
        q_table[state, action] += alpha * (reward + gamma * best_next_action - q_table[state, action])

        state = next_state  # Move to the next state

# Analyze agent performance
policy = np.argmax(q_table, axis=1)  # Extract best action per state
policy_grid = np.array([actions[a] for a in policy]).reshape(grid_size)

# Display the results
print("Optimal Policy:")
print(policy_grid)
print("\nQ-Table:")
print(q_table)

Optimal Policy:
[['right' 'right' 'down' 'left' 'down']
 ['right' 'down' 'down' 'right' 'down']
 ['right' 'right' 'down' 'down' 'down']
 ['right' 'right' 'right' 'down' 'down']
 ['up' 'up' 'right' 'right' 'up']]

Q-Table:
[[ 9.04322966e-01 -7.36814469e-02  1.69017228e-01  2.34328158e+01]
 [ 3.10095717e+00  9.18346652e+00  1.98846174e+00  4.49977332e+01]
 [ 1.33039571e+01  5.48168916e+01  9.09018309e+00  9.51583983e+00]
 [-7.64079734e-01  6.91146074e+00  3.87364411e+01  9.81199402e+00]
 [-6.70787610e-01  5.94764630e+01 -4.92566003e-01 -6.79346521e-01]
 [-7.71647482e-01  1.20864282e+00  7.90496297e-01  3.30512746e+01]
 [ 3.77592436e+00  5.24039986e+01  3.51118408e+00  1.41749611e+01]
 [ 1.82863420e+01  6.21671248e+01  1.14651381e+01  7.87411088e+00]
 [ 2.60122885e+00  6.36189844e+00 -4.95276130e-01  6.06326365e+01]
 [ 7.39296784e+00  7.88551414e+01  3.52941583e+00  2.70200824e+01]
 [-5.83705682e-01  4.43780125e+00 -5.85198506e-01  3.46813928e+01]
 [ 1.26830280e+01  2.19381537e+01  3.0040