In [2]:
import gymnasium as gym
import numpy as np
import random
import time
import os

def clear_console():
    """Wipe the terminal by running the platform's screen-clearing command."""
    # `os.name` is 'nt' on Windows, where the command is `cls`;
    # every other platform is given `clear`.
    command = 'clear'
    if os.name == 'nt':
        command = 'cls'
    os.system(command)

# --- Environment Setup ---
# Create the Taxi-v3 environment.
# 'render_mode="ansi"' makes env.render() return a string, which is useful for
# non-GUI environments.
try:
    # Keep the try body to the single call that can fail.
    env = gym.make("Taxi-v3", render_mode="ansi")
except Exception as e:
    print(f"Error: Taxi-v3 environment not found. {e}")
    print("Please make sure you have 'gymnasium' and its toy_text environments installed.")
    print("You can install it using: pip install gymnasium[toy_text]")
    # `exit()` is the interactive helper injected by the `site` module and
    # reports success (status 0). Raise SystemExit with a non-zero status so
    # the failure is visible to whatever launched the script.
    raise SystemExit(1)
else:
    print("Successfully created Taxi-v3 environment using Gymnasium")

# Initial reset; every later phase resets again before it starts stepping.
env.reset()

# --- Q-Learning Algorithm Implementation ---

# Table dimensions come straight from the environment's discrete spaces
# (Taxi-v3 reports 500 states and 6 actions).
state_space_size = env.observation_space.n
action_space_size = env.action_space.n

# One row per state, one column per action; every estimate starts at zero.
q_table = np.zeros(shape=(state_space_size, action_space_size))

# Emit the four-line summary with a single print call.
print(
    "Environment Info:",
    f"State space size: {state_space_size}",
    f"Action space size: {action_space_size}",
    f"Q-table shape: {q_table.shape}",
    sep="\n",
)

# --- Hyperparameters ---
# Training budget.
num_episodes = 25_000          # how many episodes the agent trains for
max_steps_per_episode = 100    # per-episode step cap so no episode runs forever

# Q-learning update coefficients.
learning_rate = 0.1            # alpha: weight given to each new estimate
discount_rate = 0.99           # gamma: weight given to future rewards

# --- Exploration-Exploitation Parameters ---
max_epsilon = 1.0              # upper bound of the exploration rate
min_epsilon = 0.01             # lower bound of the exploration rate
epsilon = 1.0                  # current exploration rate; starts fully random
# Linear schedule: subtracting this amount once per episode moves epsilon from
# max_epsilon to min_epsilon over the whole training run.
epsilon_decay_rate = (max_epsilon - min_epsilon) / num_episodes

# --- Training the Agent ---
# Tabular Q-learning with an epsilon-greedy behaviour policy. Each episode runs
# until the environment terminates/truncates it or the per-episode step cap is
# reached; epsilon is decayed linearly once per episode.
print("\n--- Starting Training ---")
training_start_time = time.time()

for episode in range(num_episodes):
    # Reset the environment to a new random state for each episode
    state, info = env.reset()

    done = False
    truncated = False

    for step in range(max_steps_per_episode):
        # Epsilon-Greedy Policy: decide whether to explore or exploit
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore: choose a random action
        else:
            # Exploit: best known action, converted to a plain Python int
            action = int(np.argmax(q_table[state, :]))

        # Take the action and observe the outcome
        new_state, reward, done, truncated, info = env.step(action)

        # TD target. A terminated episode has no future return, so do not
        # bootstrap from the successor state in that case. Truncation is only
        # a time limit, so the successor's value is still used there.
        if done:
            td_target = reward
        else:
            td_target = reward + discount_rate * np.max(q_table[new_state, :])

        # Move the current estimate towards the target by `learning_rate`
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * td_target

        state = new_state

        if done or truncated:
            break

    # Decay epsilon after each episode, never going below the floor
    epsilon = max(min_epsilon, epsilon - epsilon_decay_rate)

    # Progress report every 5000 episodes
    if (episode + 1) % 5000 == 0:
        print(f"Episode: {episode + 1}/{num_episodes} | Epsilon: {epsilon:.4f}")

training_end_time = time.time()
print(f"--- Training Finished in {training_end_time - training_start_time:.2f} seconds ---")


# --- Evaluating the Trained Agent ---
# Roll out the greedy policy for a fixed number of episodes and report the
# average episode length and the average number of -10 rewards per episode.
print("\n--- Evaluating Trained Agent ---")
num_eval_episodes = 100
total_epochs = 0
total_penalties = 0

for _ in range(num_eval_episodes):
    state, info = env.reset()
    done = truncated = False
    epochs = 0
    penalties = 0

    while not (done or truncated):
        # Pure exploitation: always take the highest-valued action.
        action = np.argmax(q_table[state, :])
        state, reward, done, truncated, info = env.step(action)
        epochs += 1

        # A reward of -10 is the penalty for an illegal pickup/dropoff.
        if reward == -10:
            penalties += 1

        # Safeguard: stop episodes that exceed the step cap.
        if epochs >= max_steps_per_episode:
            break

    total_epochs += epochs
    total_penalties += penalties

mean_timesteps = total_epochs / num_eval_episodes
mean_penalties = total_penalties / num_eval_episodes
print(f"Results after {num_eval_episodes} evaluation episodes:")
print(f"Average timesteps per episode: {mean_timesteps:.2f}")
print(f"Average penalties per episode: {mean_penalties:.2f}")


# --- Visualizing an episode ---
# Replay one greedy episode, printing the rendered grid after every step, then
# report how the episode ended and a short training summary.
print("\n--- Visualizing a Single Episode ---")
print("Showing step-by-step agent behavior with the trained policy:")

state, info = env.reset()
done = False
truncated = False
time_step = 0
# Human-readable names for the six action indices
action_map = {0: "South", 1: "North", 2: "East", 3: "West", 4: "Pickup", 5: "Dropoff"}

print(f"\nInitial State: {state}")
print("Initial Environment:")
print(env.render())

# The loop condition alone decides when to stop: the environment ended the
# episode, or the step cap was reached.
while not done and not truncated and time_step < max_steps_per_episode:
    print(f"\n--- Step {time_step + 1} ---")

    # Choose the best action from the Q-table (plain int for the name lookup)
    action = int(np.argmax(q_table[state, :]))

    # Take action and get new state
    state, reward, done, truncated, info = env.step(action)

    print(f"Action: {action_map[action]} | New State: {state} | Reward: {reward}")
    print("Environment after action:")
    print(env.render())

    time_step += 1

# Report why the rollout stopped. The final branch covers the case where the
# step cap was hit before the environment signalled an end, which would
# otherwise finish without any message.
if done:
    print("\n🎉 Episode Completed Successfully!")
    print(f"Total Steps: {time_step}")
    print(f"Final Reward: {reward}")
    if reward == 20:
        print("✅ Passenger successfully picked up and dropped off!")
elif truncated:
    print(f"\n⚠️ Episode truncated after {time_step} steps")
else:
    print(f"\n⚠️ Step limit of {max_steps_per_episode} reached before the episode finished")

print("\n--- Simulation Complete ---")
print("The agent has been trained using Q-Learning algorithm.")
print(f"Training episodes: {num_episodes}")
print(f"Final exploration rate (epsilon): {epsilon:.4f}")

Successfully created Taxi-v3 environment using Gymnasium
Environment Info:
State space size: 500
Action space size: 6
Q-table shape: (500, 6)

--- Starting Training ---
Episode: 5000/25000 | Epsilon: 0.8020
Episode: 5000/25000 | Epsilon: 0.8020
Episode: 10000/25000 | Epsilon: 0.6040
Episode: 10000/25000 | Epsilon: 0.6040
Episode: 15000/25000 | Epsilon: 0.4060
Episode: 15000/25000 | Epsilon: 0.4060
Episode: 20000/25000 | Epsilon: 0.2080
Episode: 20000/25000 | Epsilon: 0.2080
Episode: 25000/25000 | Epsilon: 0.0100
--- Training Finished in 41.38 seconds ---

--- Evaluating Trained Agent ---
Results after 100 evaluation episodes:
Average timesteps per episode: 12.99
Average penalties per episode: 0.00

--- Visualizing a Single Episode ---
Showing step-by-step agent behavior with the trained policy:

Initial State: 371
Initial Environment:
+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : |[43m [0m: |
|[34;1mY[0m| : |[35mB[0m: |
+---------+



--- Step 1 ---
Action: North | New St