In [1]:
import numpy as np

# Define constants for the grid world
GRID_WIDTH = 3
GRID_HEIGHT = 2
NUM_EPISODES = 10

# Define states (representing positions on the grid)
STATES = ["A", "B", "C", "Food"]

# Define actions: the four movements (up, down, left, right) plus "consume"
ACTIONS = [0, 1, 2, 3, 4]  # 0: up, 1: down, 2: left, 3: right, 4: consume
ACTIONS2STR = {0: "↑", 1: "↓", 2: "←", 3: "→", 4: "Consume"}

# Derived from ACTIONS so the count cannot drift out of sync with the action
# list (it was previously hard-coded to 4 although five actions are defined).
NUM_ACTIONS = len(ACTIONS)

# Initialize Q-values for each state-action pair.
# Layout: {state name: {action label from ACTIONS2STR: Q-value}}
initial_q_values = {s: {ACTIONS2STR[a]: 0 for a in ACTIONS} for s in STATES}

# Grid layout: "#" cells are walls; every other cell holds the name of a state.
GRID = np.array([["#", "Food", "#"], ["A", "B", "C"]])


# Function to get the next state and reward for a given state-action pair
def get_next_state_and_reward(state, action):
    """Apply ``action`` in ``state`` and return ``(next_state, reward)``.

    Args:
        state: Name of the current cell; must appear in ``GRID``.
        action: 0 = up, 1 = down, 2 = left, 3 = right, 4 = consume.

    Returns:
        A ``(next_state, reward)`` tuple:

        * consume on "Food" -> ``(state, 100)``; consume elsewhere ->
          ``(state, 0)`` (consuming never moves the agent);
        * a move that leaves the grid or hits a "#" wall -> ``(state, -20)``;
        * any other move -> ``(name of the destination cell, -10)``.

    Raises:
        ValueError: If ``action`` is not one of 0-4, or ``state`` does not
            appear in ``GRID``.
    """
    found_rows, found_cols = np.where(GRID == state)
    if found_rows.size == 0:
        # Fail with a clear message instead of an IndexError from ``[0]``.
        raise ValueError(f"Unknown state: {state!r}")
    current_row, current_col = int(found_rows[0]), int(found_cols[0])

    if action == 0:    # up
        new_row, new_col = current_row - 1, current_col
    elif action == 1:  # down
        new_row, new_col = current_row + 1, current_col
    elif action == 2:  # left
        new_row, new_col = current_row, current_col - 1
    elif action == 3:  # right
        new_row, new_col = current_row, current_col + 1
    elif action == 4:  # consume: the agent stays put, only the reward varies
        return state, (100 if state == "Food" else 0)
    else:
        raise ValueError("Invalid action")

    rows, cols = GRID.shape
    in_bounds = 0 <= new_row < rows and 0 <= new_col < cols
    if not in_bounds or GRID[new_row, new_col] == "#":
        # Bumping into the border or a wall: stay in place, larger penalty.
        return state, -20

    # Convert the numpy string scalar to a plain ``str`` so every return path
    # yields the same type for the state name.
    return str(GRID[new_row, new_col]), -10

# Function to check whether a (state, action) pair ends the episode
def is_terminal_state(state, action):
    """Return a truthy value only when action 4 (consume) is taken in "Food"."""
    at_goal = state == "Food"
    if not at_goal:
        # Hand back the falsy comparison result itself; the action is not
        # inspected when the agent is not on the goal cell.
        return at_goal
    return action == 4  # Goal state reached and consumed

# Q-star Reinforcement Learning
def q_star_reinforcement_learning(A, initial_q_values, alpha, gamma, epsilon, max_episodes):
    """Train a tabular Q-function on the grid world with the Q-learning update.

    Every episode starts in state "A" and runs until the agent takes the
    consume action while standing on "Food" (see ``is_terminal_state``).

    Args:
        A: Sequence of action ids to sample from when exploring.
        initial_q_values: ``{state: {action label: value}}`` starting table.
            It is copied, so the caller's table is left untouched.
        alpha: Learning rate.
        gamma: Discount factor.
        epsilon: Probability of picking a uniformly random action from ``A``
            at each step; otherwise action 0 (up) is taken.
        max_episodes: Number of episodes to run.

    Returns:
        The trained Q-table, in the same nested-dict layout as the input.

    Side effects:
        Prints the episode number, step number and the full Q-table at every
        step; updates the module-level ``step_count``; draws from numpy's
        global random generator.

    Note:
        There is no cap on steps per episode. With ``epsilon == 0`` the agent
        keeps choosing "up" from "A" and the loop does not terminate.
    """
    global step_count

    # Copy the nested dicts as well: ``dict.copy()`` is shallow and would let
    # the updates below leak into the caller's ``initial_q_values``.
    Q = {state: dict(action_values) for state, action_values in initial_q_values.items()}

    for episode in range(1, max_episodes + 1):
        step_count = 0  # Reset step count for each episode
        print(f"Episode : {episode}")
        current_state = "A"  # Start from the bottom-left corner

        while True:
            step_count = step_count + 1
            print(f"Step : {step_count}")

            if np.random.rand() < epsilon:
                # Explore: uniformly random action (probability = epsilon).
                current_action = np.random.choice(A)
            else:
                # NOTE(review): this branch always moves up rather than taking
                # the greedy argmax over Q; kept as written - confirm intent.
                current_action = 0

            next_state, reward = get_next_state_and_reward(current_state, current_action)
            episode_done = is_terminal_state(next_state, current_action)

            # Nothing follows a terminal transition, so its target is the
            # reward alone; bootstrapping from Q[next_state] there would
            # inflate the value of consuming the food.
            future_value = 0 if episode_done else max(Q[next_state].values())

            current_action_str = ACTIONS2STR[current_action]
            Q[current_state][current_action_str] += alpha * (
                reward + gamma * future_value - Q[current_state][current_action_str]
            )
            current_state = next_state

            print(Q)

            if episode_done:
                break

    return Q  # Return the trained Q-values

# Main program
step_count = 0  # Module-level step counter; the training loop resets it at the start of every episode
print(initial_q_values)
trained_q_values = q_star_reinforcement_learning(
    ACTIONS,
    initial_q_values,
    alpha=0.1,
    gamma=0.9,
    epsilon=0.9,
    max_episodes=NUM_EPISODES,
)

# Use the trained Q-values to navigate through the grid world
# current_state = "A"
# while not is_terminal_state(current_state):
# action = np.argmax(trained_q_values[current_state[0], current_state[1], :])
# next_state, reward = get_next_state_and_reward(current_state, action)
# print(f"Move {ACTIONS2STR[action]}: {current_state} -> {next_state}")
# print(f"Reward: {reward}")
# current_state = next_state

{'A': {'↑': 0, '↓': 0, '←': 0, '→': 0, 'Consume': 0}, 'B': {'↑': 0, '↓': 0, '←': 0, '→': 0, 'Consume': 0}, 'C': {'↑': 0, '↓': 0, '←': 0, '→': 0, 'Consume': 0}, 'Food': {'↑': 0, '↓': 0, '←': 0, '→': 0, 'Consume': 0}}
Episode : 1
Step : 1
{'A': {'↑': 0, '↓': 0, '←': -2.0, '→': 0, 'Consume': 0}, 'B': {'↑': 0, '↓': 0, '←': 0, '→': 0, 'Consume': 0}, 'C': {'↑': 0, '↓': 0, '←': 0, '→': 0, 'Consume': 0}, 'Food': {'↑': 0, '↓': 0, '←': 0, '→': 0, 'Consume': 0}}
Step : 2
{'A': {'↑': -2.0, '↓': 0, '←': -2.0, '→': 0, 'Consume': 0}, 'B': {'↑': 0, '↓': 0, '←': 0, '→': 0, 'Consume': 0}, 'C': {'↑': 0, '↓': 0, '←': 0, '→': 0, 'Consume': 0}, 'Food': {'↑': 0, '↓': 0, '←': 0, '→': 0, 'Consume': 0}}
Step : 3
{'A': {'↑': -2.0, '↓': 0, '←': -2.0, '→': 0, 'Consume': 0.0}, 'B': {'↑': 0, '↓': 0, '←': 0, '→': 0, 'Consume': 0}, 'C': {'↑': 0, '↓': 0, '←': 0, '→': 0, 'Consume': 0}, 'Food': {'↑': 0, '↓': 0, '←': 0, '→': 0, 'Consume': 0}}
Step : 4
{'A': {'↑': -2.0, '↓': 0, '←': -2.0, '→': 0, 'Consume': 0.0}, 'B': {'↑'