# In [3]:
import numpy as np

# Environment dimensions
num_states, num_actions = 9, 4  # 3x3 grid cells; moves: up, down, left, right

# Q-table: one row per state and one column per action, every entry starting at 0
Q = np.zeros(shape=(num_states, num_actions))

# Reward structure and transitions between states of the 3x3 grid world.
#
# Cells are numbered row by row:
#     0 1 2
#     3 4 5
#     6 7 8
# reward_matrix[s, t] describes going from cell s directly to cell t:
#     -1  -> t is not a horizontal/vertical neighbour of s (no such move)
#      0  -> legal move with no immediate reward
#     10  -> legal move that enters the goal cell 8
reward_matrix = np.array([
    [-1,  0, -1,  0, -1, -1, -1, -1, -1],  # from 0: neighbours 1, 3
    [ 0, -1,  0, -1,  0, -1, -1, -1, -1],  # from 1: neighbours 0, 2, 4
    [-1,  0, -1, -1, -1,  0, -1, -1, -1],  # from 2: neighbours 1, 5
    [ 0, -1, -1, -1,  0, -1,  0, -1, -1],  # from 3: neighbours 0, 4, 6
    [-1,  0, -1,  0, -1,  0, -1,  0, -1],  # from 4: neighbours 1, 3, 5, 7
    [-1, -1,  0, -1,  0, -1, -1, -1, 10],  # from 5: neighbours 2, 4, 8 (goal)
    [-1, -1, -1,  0, -1, -1, -1,  0, -1],  # from 6: neighbours 3, 7
    [-1, -1, -1, -1,  0, -1,  0, -1, 10],  # from 7: neighbours 4, 6, 8 (goal)
    [-1, -1, -1, -1, -1,  0, -1,  0, -1],  # from 8: neighbours 5, 7
])

# Hyperparameters for tabular Q-learning
num_episodes = 1000  # how many training episodes to run
epsilon = 0.1  # chance of picking a random (exploratory) action
gamma = 0.9  # weight given to the best future Q-value
alpha = 0.1  # step size of each Q-value update

# Function to get possible actions in a given state
def get_possible_actions(state, n_states=None):
    """Return the actions that keep the agent inside the square grid.

    States are numbered row by row starting at 0, and actions follow the
    order used for ``num_actions``: 0 = up, 1 = down, 2 = left, 3 = right.

    Args:
        state: Index of the current cell.
        n_states: Total number of cells in the grid. Defaults to the
            module-level ``num_states``.

    Returns:
        Ascending list of action indices whose move stays on the grid.

    Raises:
        ValueError: If the number of states is not a perfect square.
        IndexError: If ``state`` is outside ``0 .. n_states - 1``.
    """
    total = num_states if n_states is None else n_states
    side = int(round(total ** 0.5))
    if side * side != total:
        raise ValueError(f"number of states must be a perfect square, got {total}")
    if not 0 <= state < total:
        raise IndexError(f"state {state} is outside the grid of {total} cells")

    # Validity comes from the grid geometry. reward_matrix has one column per
    # state, so indexing it with an action number (0-3) does not describe a move.
    row, col = divmod(state, side)
    allowed = (row > 0, row < side - 1, col > 0, col < side - 1)
    return [action for action, ok in enumerate(allowed) if ok]

# Q-learning algorithm
#
# States are the cells of a square grid numbered row by row, and actions 0-3
# move the agent up, down, left and right. The action index is NOT the next
# state: treating it as one confines the agent to states 0-3, so the goal is
# never reached and an episode never ends.
grid_side = int(round(num_states ** 0.5))
goal_state = 8  # reaching this state ends an episode
# (row, column) offset applied by each action index: up, down, left, right
action_moves = ((-1, 0), (1, 0), (0, -1), (0, 1))

for episode in range(num_episodes):
    state = np.random.randint(0, num_states)  # Random starting state for each episode
    # An episode that starts on the goal is already over; stepping away from it
    # would write Q-values into the terminal state's row.
    done = state == goal_state

    while not done:
        possible_actions = get_possible_actions(state)
        if not possible_actions:
            # Nothing is flagged as possible: consider every action instead
            possible_actions = list(range(num_actions))

        # Epsilon-greedy policy: exploration vs. exploitation
        if np.random.uniform(0, 1) < epsilon:
            # Explore: choose a random possible action
            action = int(np.random.choice(possible_actions))
        else:
            # Exploit: best Q-value among the possible actions only
            best = int(np.argmax(Q[state, possible_actions]))
            action = int(possible_actions[best])

        # Transition: apply the move; a move that would leave the grid keeps
        # the agent where it is.
        row, col = divmod(state, grid_side)
        d_row, d_col = action_moves[action]
        new_row, new_col = row + d_row, col + d_col
        if 0 <= new_row < grid_side and 0 <= new_col < grid_side:
            next_state = new_row * grid_side + new_col
        else:
            next_state = state

        # reward_matrix has one row and one column per state, so the reward of
        # a transition is looked up by (state, next_state).
        reward = reward_matrix[state, next_state]

        # Check if the goal state is reached; a terminal state has no future value
        done = next_state == goal_state
        future_value = 0.0 if done else np.max(Q[next_state, :])

        # Update Q-value for the current state-action pair using the Bellman equation
        Q[state, action] = (1 - alpha) * Q[state, action] + alpha * (reward + gamma * future_value)

        state = next_state  # Move to the next state

# After training, Q-table contains learned Q-values
print("Learned Q-table:")
print(Q)