In [1]:
import gym
import wandb
import numpy as np
import random

In [2]:
# Open the Weights & Biases run that the training loop reports its metrics to.
run = wandb.init(entity="smonnier", project="Taxi-Q-learning-training")

# Build the Taxi environment with text ("ansi") rendering and keep the object
# exposed through the `.env` attribute of what gym.make() returns.
# NOTE(review): `.env` presumably strips the outer wrapper (e.g. the episode
# step limit) -- confirm against the installed gym version.
env = gym.make("Taxi-v3", render_mode="ansi").env

# One Q-value per (state, action) pair, every entry starting at zero.
q_table = np.zeros((env.observation_space.n, env.action_space.n))

# Kept as the cell's final expression so the notebook displays the initial
# (observation, info) pair returned by reset().
env.reset()

wandb: Currently logged in as: smonnier. Use `wandb login --relogin` to force relogin


(26, {'prob': 1.0, 'action_mask': array([1, 0, 0, 1, 0, 0], dtype=int8)})

In [3]:
# Integer codes of the six Taxi actions; used to index the per-episode
# action counters that are logged after every episode.
DOWN, UP, RIGHT, LEFT, PICKUP, DROP = range(6)

# Step counter; the training and evaluation loops reset it for every episode.
epochs = 0

# Hyperparameters
alpha = 0.1    # learning rate: weight given to the new estimate in a Q update
gamma = 0.6    # discount factor applied to the best next-state Q-value
epsilon = 0.1  # probability of picking a random (exploratory) action

In [4]:
# Train the Q-table with tabular Q-learning under an epsilon-greedy policy.
#
# Uses `env`, `q_table`, the action constants (DOWN..DROP), the hyperparameters
# (alpha, gamma, epsilon) and the W&B `run` created in the cells above.
# After every episode one record is logged to W&B: total reward, number of
# steps, number of -10 penalties and how often each action was chosen.

# The previous `range(1, 3000)` ran only 2999 episodes (loop index unused).
train_episodes = 3000

for episode in range(train_episodes):
    state = env.reset()[0]  # reset() returns (observation, info)

    epochs, penalties, episode_reward = 0, 0, 0
    episode_action = [0] * 6  # per-action counters, indexed by action id
    done = False

    while not done:
        if random.random() < epsilon:
            action = env.action_space.sample()  # Explore action space
        else:
            action = int(np.argmax(q_table[state]))  # Exploit learned values

        episode_action[action] += 1

        # step() returns five values: observation, reward, terminated,
        # truncated, info. The episode ends on either flag.
        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        episode_reward += reward

        # Q-learning update: blend the old estimate with the observed reward
        # plus the discounted best value reachable from the next state.
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        q_table[state, action] = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)

        # A reward of -10 is counted as a penalty.
        # NOTE(review): presumably Taxi's illegal pickup/drop-off reward -- confirm.
        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    wandb.log({"reward": episode_reward, "duration": epochs,
               "penalties": penalties,
               "down": episode_action[DOWN],
               "up": episode_action[UP],
               "right": episode_action[RIGHT],
               "left": episode_action[LEFT],
               "Pickup": episode_action[PICKUP],
               "Drop": episode_action[DROP]})

run.finish()
print("Training finished.\n")


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
Drop,█▂▃▁▁▁▁▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Pickup,█▃▃▂▂▁▁▂▁▁▁▂▁▁▁▁▁▁▂▁▁▁▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁
down,█▃▄▂▂▁▂▃▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁
duration,█▃▄▁▂▁▂▃▁▂▁▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁
left,█▃▅▁▂▁▂▃▁▂▁▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁
penalties,█▂▃▂▂▁▁▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁
reward,▁▆▅█▇█▇▇█▇█▇██████████▇██████▇██████████
right,█▃▄▁▂▁▂▃▁▂▁▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁
up,█▃▄▂▂▁▂▃▁▂▁▂▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁

0,1
Drop,1
Pickup,2
down,2
duration,15
left,0
penalties,1
reward,-3
right,5
up,5


Training finished.



In [6]:
# Evaluate agent's performance after Q-learning
#
# Rolls out the greedy policy (argmax over `q_table`) for `episodes` episodes
# in a human-rendered Taxi environment, logs per-episode metrics to a separate
# W&B project and prints the average steps and penalties per episode.
run = wandb.init(project="Taxi-Q-learning-Evaluate", entity="smonnier")
total_epochs, total_penalties = 0, 0
episodes = 20

# Upper bound on steps per evaluation episode. A purely greedy policy never
# explores, so a Q-table that has not converged for some state could otherwise
# cycle forever.
# NOTE(review): 200 is assumed to match Taxi-v3's usual step limit -- confirm.
max_eval_steps = 200

# Built once and bound to its own name, so the training `env` from the earlier
# cell stays intact and no new render window is created per episode.
eval_env = gym.make("Taxi-v3", render_mode="human").env
try:
    for episode in range(episodes):
        state = eval_env.reset()[0]  # reset() returns (observation, info)

        epochs, penalties, episode_reward = 0, 0, 0
        episode_action = [0] * 6  # per-action counters, indexed by action id

        done = False

        while not done and epochs < max_eval_steps:
            action = int(np.argmax(q_table[state]))
            episode_action[action] += 1

            # step() returns five values: observation, reward, terminated,
            # truncated, info. The episode ends on either flag.
            state, reward, terminated, truncated, info = eval_env.step(action)
            done = terminated or truncated
            episode_reward += reward

            # A reward of -10 is counted as a penalty.
            if reward == -10:
                penalties += 1

            epochs += 1
            eval_env.render()

        wandb.log({"reward": episode_reward, "duration": epochs,
                   "penalties": penalties,
                   "down": episode_action[DOWN],
                   "up": episode_action[UP],
                   "right": episode_action[RIGHT],
                   "left": episode_action[LEFT],
                   "Pickup": episode_action[PICKUP],
                   "Drop": episode_action[DROP]})

        total_penalties += penalties
        total_epochs += epochs
finally:
    # Release the render window and close the W&B run even if a rollout fails.
    eval_env.close()
    run.finish()

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333333334608, max=1.0…

0,1
Drop,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Pickup,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
down,▇▅▇▇▇▁▅▄▇▄▇▄▇▅▄█▇▄▇▇
duration,▂▃▅▅▄▁▃▇▄▆▆▆█▇▅▇▆▄▂▇
left,▁▅█▃█▃▁█▃▆▆▅█▆▆█▆▁▃█
penalties,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
reward,▇▆▅▅▅█▆▂▅▃▃▃▁▂▄▂▃▅▇▂
right,▆▁▃▆▁▃▅█▆█▅███▅█▅█▁█
up,▁▆▃▅▃▆▆▆▃▆▆█▆▆█▃▆▆▅▅

0,1
Drop,1
Pickup,1
down,4
duration,17
left,4
penalties,0
reward,4
right,4
up,3


Results after 20 episodes:
Average timesteps per episode: 13.5
Average penalties per episode: 0.0
