In [5]:
import numpy as np
import random

# Define the environment: a grid of cell labels. The agent starts on 'S';
# stepping onto a 'G' or 'H' cell ends the episode.
env = np.array([['S', 'F', 'H'],
                ['F', 'H', 'F'],
                ['F', 'F', 'G']])

# Define the reward table: reward for the cell the agent occupies after a move
rewards = {'G': 100, 'H': -100, 'F': -1, 'S': 0}  # Adding 'S' with reward 0

# Set hyperparameters
alpha = 0.8  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.1  # Exploration rate

# Define the available actions (their order fixes the last axis of the Q-table)
actions = ['up', 'down', 'left', 'right']

# (row, column) displacement applied by each action name
_MOVES = {'up': (-1, 0), 'down': (1, 0), 'left': (0, -1), 'right': (0, 1)}

# Cell labels that end an episode; no future value is bootstrapped from them
_TERMINAL_CELLS = frozenset({'G', 'H'})


def train_q_table(env, rewards, actions, alpha, gamma, epsilon,
                  episodes=1000, start=(0, 0), max_steps=10_000, seed=None):
    """Learn a tabular Q-function for a grid world with epsilon-greedy Q-learning.

    Args:
        env: 2-D grid (array or nested list) of cell labels. Cells labelled
            'G' or 'H' are terminal.
        rewards: Mapping from cell label to the reward for landing on it.
            Labels missing from the mapping yield a reward of 0.
        actions: Sequence of action names taken from 'up', 'down', 'left',
            'right'. Their order defines the last axis of the result.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of picking a uniformly random action.
        episodes: Number of episodes to run.
        start: (row, column) where every episode begins.
        max_steps: Upper bound on steps per episode, so a grid without a
            reachable terminal cell cannot loop forever.
        seed: Optional seed for a private random generator. When None, the
            module-level `random` state is used.

    Returns:
        Array of shape (rows, columns, len(actions)) holding the learned
        Q-values. Rows belonging to terminal cells stay at zero.

    Raises:
        ValueError: If `env` is not two-dimensional.
        KeyError: If `actions` contains an unknown action name.
    """
    grid = np.asarray(env)
    if grid.ndim != 2:
        raise ValueError("env must be a 2-D grid of cell labels")
    n_rows, n_cols = grid.shape
    cells = grid.tolist()  # plain Python labels: cheap lookups, usable as dict keys
    deltas = [_MOVES[name] for name in actions]
    n_actions = len(actions)
    rng = random if seed is None else random.Random(seed)

    q_table = np.zeros((n_rows, n_cols, n_actions))

    for _episode in range(episodes):
        row, col = start
        for _step in range(max_steps):
            if cells[row][col] in _TERMINAL_CELLS:
                break

            if rng.random() < epsilon:
                action_idx = rng.randrange(n_actions)  # Explore action space
            else:
                # Exploit; ties resolve to the first maximal action
                action_idx = int(np.argmax(q_table[row, col]))

            d_row, d_col = deltas[action_idx]
            next_row, next_col = row + d_row, col + d_col
            if not (0 <= next_row < n_rows and 0 <= next_col < n_cols):
                # Moving off the grid leaves the agent where it is
                next_row, next_col = row, col

            next_cell = cells[next_row][next_col]
            reward = rewards.get(next_cell, 0)
            if next_cell in _TERMINAL_CELLS:
                best_next = 0.0  # The episode ends here, so there is no future value
            else:
                best_next = q_table[next_row, next_col].max()

            old_value = q_table[row, col, action_idx]
            q_table[row, col, action_idx] = (
                (1 - alpha) * old_value + alpha * (reward + gamma * best_next)
            )

            row, col = next_row, next_col

    return q_table


# Implement the Q-learning algorithm
Q = train_q_table(env, rewards, actions, alpha, gamma, epsilon)  # 3x3 grid and 4 actions

print("Learned Q-Table:")
print(Q)


Learned Q-Table:
[[[  63.171        70.19         63.171        55.8539    ]
  [  -1.          -99.99998976   63.171       -99.999744  ]
  [   0.            0.            0.            0.        ]]

 [[  63.171        79.1          70.19       -100.        ]
  [   0.            0.            0.            0.        ]
  [   0.            0.            0.            0.        ]]

 [[  70.19         79.1          79.1          89.        ]
  [-100.           89.           79.1         100.        ]
  [   0.            0.            0.            0.        ]]]
