In [None]:
import numpy as np

# Constants
# Action indices; they double as column indices into the Q-table.
LEFT = 0
RIGHT = 1

# Environment size and training hyperparameters used by the run in this file.
GRID_SIZE = 5  # Number of cells in the 1D grid
GOAL_STATE = GRID_SIZE - 1  # Rightmost cell of the default grid
EPISODES = 1000
LEARNING_RATE = 0.1
DISCOUNT_FACTOR = 0.9
EXPLORATION_PROB = 0.3  # Epsilon for epsilon-greedy action selection


# Q-learning function
def q_learning(grid_size, episodes, learning_rate, discount_factor, exploration_prob):
    """Learn a Q-table for a 1D grid world with tabular Q-learning.

    Every episode starts in cell 0. The agent moves LEFT or RIGHT; a move
    past either end of the grid leaves it where it is. Entering the
    rightmost cell (``grid_size - 1``) yields a reward of 10 and ends the
    episode; every other move yields a reward of 0.

    Args:
        grid_size: Number of cells in the grid (must be at least 1).
        episodes: Number of training episodes to run.
        learning_rate: Step size (alpha) of the Q-value update.
        discount_factor: Discount (gamma) applied to the next state's value.
        exploration_prob: Probability (epsilon) of taking a random action
            instead of the greedy one.

    Returns:
        A ``(grid_size, 2)`` NumPy array of Q-values; column ``LEFT`` holds
        the value of moving left and column ``RIGHT`` of moving right.

    Raises:
        ValueError: If ``grid_size`` is less than 1.

    Note:
        Greedy ties resolve to LEFT because ``np.argmax`` returns the first
        maximum. With ``exploration_prob`` of 0 and an all-zero table the
        agent therefore never leaves cell 0 and the episode does not end.
    """
    if grid_size < 1:
        raise ValueError("grid_size must be at least 1")

    # Derive the goal from the argument rather than the module-level
    # GOAL_STATE, so grids of any size terminate at their own last cell.
    goal_state = grid_size - 1

    # Initialize Q-table with zeros
    q_table = np.zeros((grid_size, 2))  # 2 actions: Left and Right

    for _ in range(episodes):
        state = 0  # Start from the leftmost cell
        # On a single-cell grid the start is already the goal.
        done = state == goal_state

        while not done:
            # Choose an action using epsilon-greedy strategy
            if np.random.uniform(0, 1) < exploration_prob:
                action = np.random.choice([LEFT, RIGHT])
            else:
                action = np.argmax(q_table[state, :])

            # Perform the action; moves are clamped to the grid boundaries
            if action == RIGHT:
                next_state = min(state + 1, goal_state)
            else:
                next_state = max(state - 1, 0)

            # Reaching the goal is the only rewarded, terminal transition
            done = next_state == goal_state
            reward = 10 if done else 0

            # A terminal state has no future return, so do not bootstrap
            # from its row once the episode is over.
            future_value = 0.0 if done else np.max(q_table[next_state, :])

            # Update the Q-value for the current state-action pair
            td_error = reward + discount_factor * future_value - q_table[state, action]
            q_table[state, action] += learning_rate * td_error

            state = next_state

    return q_table

# Train the agent using the module-level hyperparameters
q_table = q_learning(
    grid_size=GRID_SIZE,
    episodes=EPISODES,
    learning_rate=LEARNING_RATE,
    discount_factor=DISCOUNT_FACTOR,
    exploration_prob=EXPLORATION_PROB,
)

# Show the learned action values (one row per grid cell)
print("Learned Q-table:", q_table, sep="\n")

# Greedy policy: index of the highest-valued action in each row
best_actions = q_table.argmax(axis=1)

# Show the greedy action selected for every state
print("Best actions for each state:", best_actions, sep="\n")


Learned Q-table:
[[ 6.561       7.29      ]
 [ 6.561       8.1       ]
 [ 7.29        9.        ]
 [ 8.09999994 10.        ]
 [ 0.          0.        ]]
Best actions for each state:
[1 1 1 1 0]
