In [None]:
import gymnasium as gym
import numpy as np
import random
import time


# 1. Initialize the Environment
# Seed every random source the training loop draws from so that
# "Restart Kernel -> Run All" reproduces the same learning curve.
SEED = 42
random.seed(SEED)  # drives the epsilon-greedy coin flip (random.uniform)

env = gym.make("Taxi-v3").env
# Seeding once through reset() initialises the environment's own RNG; later
# reset() calls made without a seed keep drawing from that same generator.
env.reset(seed=SEED)
env.action_space.seed(SEED)  # drives env.action_space.sample()

# 2. Q-learning parameters
alpha = 0.1  # Learning rate
gamma = 0.9  # Discount factor
epsilon = 0.1  # Exploration rate


num_episodes = 5000
max_steps_per_episode = 100

# 3. Initialize Q-table
# Q-table dimensions: (number of states, number of actions)
q_table = np.zeros((env.observation_space.n, env.action_space.n))


def implement_bellman_equation(
    q_table,
    state,
    action,
    alpha,
    reward,
    gamma,
    new_state,
    terminated=False,
):
    """Apply one tabular Q-learning update to ``q_table`` in place.

    The entry for ``(state, action)`` is moved towards the TD target
    ``reward + gamma * max_a Q(new_state, a)`` by a fraction ``alpha``.

    Args:
        q_table: 2-D array indexed as ``q_table[state, action]``; modified
            in place.
        state: Index of the state the action was taken from.
        action: Index of the action that was taken.
        alpha: Learning rate (weight given to the new target).
        reward: Reward observed for the transition.
        gamma: Discount factor applied to the successor state's value.
        new_state: Index of the state reached after the action.
        terminated: Set to True when the transition ended the episode. The
            successor value is then treated as zero, so the target is just
            ``reward``. Defaults to False, which always bootstraps from
            ``new_state``.

    Returns:
        The same ``q_table`` object that was passed in.
    """
    # A terminal transition has no future return to bootstrap from.
    future_value = 0.0 if terminated else np.max(q_table[new_state, :])
    td_target = reward + gamma * future_value
    q_table[state, action] = (
        q_table[state, action] * (1 - alpha) + alpha * td_target
    )
    return q_table

In [None]:
def vdir(obj):
    """Return the names from ``dir(obj)`` that do not start with an underscore."""
    public_names = []
    for attr_name in dir(obj):
        if attr_name.startswith('_'):
            continue
        public_names.append(attr_name)
    return public_names


# List the environment's public attributes. A bare `vdir(env)` in this position
# would be discarded, because only a cell's final expression is displayed.
print(vdir(env))
# NOTE: `env` is created without a `render_mode`, so this call only makes
# gymnasium log a warning; pass `render_mode=...` to `gym.make` to get a rendering.
env.render()

  gym.logger.warn(


In [None]:
# 4. Training loop
# Epsilon-greedy Q-learning: run `num_episodes` episodes of at most
# `max_steps_per_episode` steps each and record every episode's total reward.
rewards_per_episode = []

print("Training started...")
for episode in range(num_episodes):
    # reset() hands back an (observation, info) pair
    state, info = env.reset()
    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        # With probability epsilon explore a random action; otherwise exploit
        # the action with the highest learned value for the current state.
        action = (
            env.action_space.sample()
            if random.uniform(0, 1) < epsilon
            else np.argmax(q_table[state, :])
        )

        # Apply the action and observe the resulting transition
        new_state, reward, done, truncated, info = env.step(action)

        # Fold the observed transition into the Q-table
        q_table = implement_bellman_equation(
            q_table=q_table,
            state=state,
            action=action,
            alpha=alpha,
            reward=reward,
            gamma=gamma,
            new_state=new_state,
        )

        rewards_current_episode += reward
        state = new_state

        # Stop the episode as soon as the environment reports it is over
        if done or truncated:
            break

    rewards_per_episode.append(rewards_current_episode)

    # Report the mean reward of the most recent 1000 episodes
    if (episode + 1) % 1000 == 0:
        print(f"Episode: {episode + 1}/{num_episodes} - Average reward: {np.mean(rewards_per_episode[-1000:])}")

print("Training finished!\n")



Training started...
Episode: 1000/5000 - Average reward: -104.811
Episode: 2000/5000 - Average reward: -11.654
Episode: 3000/5000 - Average reward: 1.539
Episode: 4000/5000 - Average reward: 2.406
Episode: 5000/5000 - Average reward: 2.303
Training finished!

