In [None]:
import numpy as np
if not hasattr(np, 'bool8'):
    np.bool8 = np.bool_

import gym
import numpy as np

# --- 1. Set up environment and parameters ---
env = gym.make("Taxi-v3")

# Q-learning hyperparameters.
num_episodes = 10000  # number of training episodes
epsilon = 0.1         # probability of taking a random (exploratory) action
alpha = 0.1           # step size applied to each Q-value update
gamma = 1.0           # weight on the next state's best value (1.0 = undiscounted)

# Tabular action-value estimates: one row per state, one column per action,
# all starting at zero.
n_states = env.observation_space.n
n_actions = env.action_space.n
Q = np.zeros((n_states, n_actions))

# --- 2. Train with Q-learning ---
# Each episode runs until the environment reports it is over. Actions are
# chosen epsilon-greedily from Q, and Q is updated in place after every step.
for episode in range(num_episodes):
    reset_result = env.reset()
    if isinstance(reset_result, tuple):
        # Newer gym API: reset() returns (observation, info).
        state, _ = reset_result
    else:
        # Older gym API: reset() returns the observation alone.
        state = reset_result
    done = False

    while not done:
        # Epsilon-greedy: explore with probability epsilon, otherwise exploit
        # the current estimates.
        explore = np.random.uniform(0, 1) < epsilon
        action = env.action_space.sample() if explore else np.argmax(Q[state])

        step_result = env.step(action)
        if len(step_result) != 5:
            # Older gym API: a single `done` flag.
            next_state, reward, done, info = step_result
        else:
            # Newer gym API: the episode is over on either flag.
            next_state, reward, terminated, truncated, info = step_result
            done = terminated or truncated

        if isinstance(next_state, tuple):
            # Defensive unwrap in case the observation arrives as a pair.
            next_state, _ = next_state

        # Move Q[state, action] a fraction alpha towards the one-step target:
        # the reward plus gamma times the best value available in next_state.
        td_target = reward + gamma * np.max(Q[next_state])
        td_error = td_target - Q[state, action]
        Q[state, action] += alpha * td_error

        state = next_state

# --- 3. Greedy policy for answering questions ---
def get_optimal_action(state):
    """Return the greedy action for *state* under the learned Q-table.

    The result is the column index of the largest entry in ``Q[state]``;
    when several entries tie, the lowest index is returned.
    """
    action_values = Q[state]
    return action_values.argmax()

# --- 4. Query optimal actions ---
# Report the greedy action for a fixed list of states first.
states_to_check = [422, 64, 8, 386, 108]
for s in states_to_check:
    print(f"Optimal action for state {s}: {get_optimal_action(s)}")

# Then answer ad-hoc queries until the user enters -1 or input runs out.
while True:
    # Only the read/parse step can fail, so keep the try body to that line.
    try:
        s = int(input("Enter a state number to find the optimal action (or -1 to exit): "))
    except ValueError:
        print("Please enter an integer.")
        continue
    except EOFError:
        # input() raises EOFError when stdin is closed or exhausted (for
        # example a piped or non-interactive run); stop instead of crashing.
        break

    if s == -1:
        break
    if 0 <= s < env.observation_space.n:
        print(f"Optimal action for state {s}: {get_optimal_action(s)}")
    else:
        print("Invalid state number. Try again.")


Optimal action for state 422: 1
Optimal action for state 64: 2
Optimal action for state 8: 0
Optimal action for state 386: 1
Optimal action for state 108: 0
