<a href="https://colab.research.google.com/github/RoopaliMalhotra/ForDemo/blob/main/Smart_Taxi_Agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gym matplotlib seaborn



In [4]:
env = gym.make("Taxi-v3", render_mode="ansi", new_step_api=True)



In [5]:
import numpy as np

# Q-table banate hain: rows = states, columns = actions
state_size = env.observation_space.n
action_size = env.action_space.n

# Sab Q-values pehle 0 se fill kar dete hain
q_table = np.zeros((state_size, action_size))

print(f"State space size: {state_size}")
print(f"Action space size: {action_size}")
print("Q-table initialized successfully ✅")


State space size: 500
Action space size: 6
Q-table initialized successfully ✅


In [6]:
# Learning rate (alpha): Kitna naya seekhna hai
alpha = 0.1

# Discount factor (gamma): Future reward ki importance
gamma = 0.6

# Exploration rate (epsilon): Random try karne ki chance
epsilon = 0.1

# Episodes: Kitne baar agent training karega
episodes = 100000

# Stats store karne ke liye lists
all_epochs = []
all_penalties = []


In [11]:
import random
from IPython.display import clear_output

# Training loop for 'episodes' times
for i in range(1, episodes + 1):
    # Correct unpacking of the step function output
    next_state, reward, terminated, truncated, info = env.step(action)

    # Done is now a combination of 'terminated' and 'truncated'
    done = terminated or truncated



    epochs, penalties, reward = 0, 0, 0

    while not done:
        # Exploration-exploitation tradeoff
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore: random action
        else:
            action = np.argmax(q_table[state])  # Exploit: best action from Q-table

        next_state, reward, done, _, _ = env.step(action)

        # Reward shaping (bonus for correct drop)
        if reward == 20:
            reward = 25

        # Q-learning update rule
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        new_value = old_value + alpha * (reward + gamma * next_max - old_value)
        q_table[state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    all_epochs.append(epochs)
    all_penalties.append(penalties)

    # Clear output for better visual experience (optional)
    if i % 10000 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("\nTraining finished! 🎉")


Episode: 100000

Training finished! 🎉


In [18]:
# Evaluation - test trained agent
total_epochs, total_penalties = 0, 0
episodes = 100  # Set episodes for evaluation

for episode in range(episodes):
    state = env.reset()  # Now returns just the state
    epochs, penalties, reward = 0, 0, 0
    done = False
    max_steps = 1000
    steps = 0

    while not done and steps < max_steps:
        action = np.argmax(q_table[state])  # Best action from Q-table
        step_result = env.step(action)  # Get the result of the action

        # Unpack the first 3 values (next_state, reward, done)
        next_state, reward, done, _, _ = step_result

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1
        steps += 1

    total_epochs += epochs
    total_penalties += penalties

    # Optional: Print progress every 10 episodes
    if episode % 10 == 0:
        print(f"Episode {episode} completed.")

# Calculate average metrics
average_penalties = total_penalties / episodes
average_epochs = total_epochs / episodes

print("\n✅ Evaluation finished!")
print(f"📉 Average penalties per episode: {average_penalties}")
print(f"⏱️ Average epochs per episode: {average_epochs}")


Episode 0 completed.
Episode 10 completed.
Episode 20 completed.
Episode 30 completed.
Episode 40 completed.
Episode 50 completed.
Episode 60 completed.
Episode 70 completed.
Episode 80 completed.
Episode 90 completed.

✅ Evaluation finished!
📉 Average penalties per episode: 0.0
⏱️ Average epochs per episode: 1000.0
