In [1]:
import numpy as np

In [2]:
# Define the environment as a 3x3 grid
# 0 represents empty, 1 represents obstacle, and 2 represents the goal
env = np.array([[0, 0, 1],
                [0, 1, 0],
                [0, 0, 2]])


In [3]:
# Initialize the Q-table with zeros
q_table = np.zeros((3, 3, 4))  # Add an extra dimension for actions


In [4]:
q_table

array([[[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]])

In [5]:
# Set hyperparameters
learning_rate = 0.1
discount_factor = 0.9
num_episodes = 1000

# Define actions: 0 = Up, 1 = Down, 2 = Left, 3 = Right
actions = [0, 1, 2, 3]

In [6]:
# Q-learning algorithm
for episode in range(num_episodes):
    # Reset the environment
    state = (0, 0)  # Starting position
    done = False

    while not done:
        # Choose an action based on epsilon-greedy policy
        if np.random.rand() < 0.1:  # Exploration
            action = np.random.choice(actions)
        else:  # Exploitation
            action = np.argmax(q_table[state])

        # Perform the action and observe the next state and reward
        if action == 0:  # Move Up
            next_state = (state[0] - 1, state[1])
        elif action == 1:  # Move Down
            next_state = (state[0] + 1, state[1])
        elif action == 2:  # Move Left
            next_state = (state[0], state[1] - 1)
        else:  # Move Right
            next_state = (state[0], state[1] + 1)

        # Check if the next state is within the grid boundaries
        if next_state[0] < 0 or next_state[0] >= env.shape[0] or \
           next_state[1] < 0 or next_state[1] >= env.shape[1]:
            # Invalid move, stay in the current state
            next_state = state

        reward = env[next_state]

        # Update the Q-value using the Bellman equation
        q_table[state][action] = (1 - learning_rate) * q_table[state][action] + \
                                 learning_rate * (reward + discount_factor * np.max(q_table[next_state]))

        # Transition to the next state
        state = next_state

        # Check if the goal is reached or the episode is finished
        if reward == 2 or reward == 1:
            done = True

# Print the learned Q-table
print("Learned Q-table:")
print(q_table)

Learned Q-table:
[[[0.68550222 0.53843436 0.78495493 0.9       ]
  [0.83898563 1.         0.76697739 0.91137062]
  [0.         0.         0.         0.        ]]

 [[0.75469095 0.         0.04872308 0.        ]
  [0.         0.         0.         0.        ]
  [0.         0.         0.         0.        ]]

 [[0.06249263 0.         0.         0.        ]
  [0.         0.         0.         0.        ]
  [0.         0.         0.         0.        ]]]


In [7]:
np.zeros((3, 3, 4)),env

(array([[[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],
 
        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]],
 
        [[0., 0., 0., 0.],
         [0., 0., 0., 0.],
         [0., 0., 0., 0.]]]),
 array([[0, 0, 1],
        [0, 1, 0],
        [0, 0, 2]]))

In [None]:
'''
For state (0, 0):

Action 0 (up) has a Q-value of 0.68550222.
Action 1 (down) has a Q-value of 0.53843436.
Action 2 (left) has a Q-value of 0.78495493.
Action 3 (right) has a Q-value of 0.9.
The highest Q-value is 0.9, which corresponds to action 3 (right). Therefore, from state (0, 0), the machine will most likely follow action 3 (right).

Similarly, we can find the actions with the highest Q-values for the other states:

For state (0, 1):

Action 0 (up) has a Q-value of 0.83898563.
Action 1 (down) has a Q-value of 1.0.
Action 2 (left) has a Q-value of 0.76697739.
Action 3 (right) has a Q-value of 0.91137062.
The highest Q-value is 1.0, which corresponds to action 1 (down). Therefore, from state (0, 1), the machine will most likely follow action 1 (down).

For state (0, 2):

All actions have Q-values of 0.0.
In this case, all the Q-values are the same (0.0), which means there is no preference for any action from state (0, 2). The machine may choose any action with equal probability, or the learning process is not complete yet.



'''