## Install libraries

In [None]:
!pip install gymnasium

## Import libraries

In [2]:
import gymnasium as gym
from IPython.display import clear_output
from time import sleep

## Create Taxi-v3 environment

In [3]:
# Taxi-v3 with the text ("ansi") render mode, so env.render() yields a
# printable string that can be stored per step and replayed later.
env = gym.make(
    "Taxi-v3",
    render_mode="ansi",
)

## Run the naive brute-force (random-action) agent

In [32]:
# Roll out one episode with uniformly random actions, starting from the
# illustration's state, and record every step for the animation cell.
epochs = 0
penalties, total_reward = 0, 0
frames = []  # for animation
done = False

# Reset first, then force the start state. The assignment must come AFTER
# reset() (which draws a fresh random start state) and must target the
# unwrapped environment: gym.make() returns wrapper objects, so `env.s = ...`
# would only create an attribute on the outermost wrapper.
# reset() returns an (observation, info) pair in gymnasium.
state, info = env.reset()
env.unwrapped.s = 328  # Set environment to illustration's state
state = env.unwrapped.s

while not done:
    action = env.action_space.sample()
    # step() returns (obs, reward, terminated, truncated, info); `done` holds
    # the `terminated` flag only.
    # NOTE(review): `truncated` is not part of the stop condition, so the
    # rollout keeps going past the environment's registered step limit until
    # the passenger is dropped off. This matches the original behavior --
    # confirm whether the run should instead stop at truncation.
    state, reward, done, truncated, info = env.step(action)

    # A reward of -10 is counted as a penalty
    if reward == -10:
        penalties += 1

    # Put each rendered frame into dict for animation
    frames.append({
        'frame': env.render(),
        'state': state,
        'action': action,
        'reward': reward
    })

    total_reward += reward
    epochs += 1

print(f"Timesteps taken: {epochs}")
print(f"Penalties incurred: {penalties}")
print(f"Total reward earned: {total_reward}")

Timesteps taken: 1001
Penalties incurred: 322
Total reward earned: -3878


## Print frames

In [34]:
def print_frames(frames, delay=0.1):
    """Replay recorded frames one after another in the cell's output area.

    Each frame replaces the previous output, followed by the 1-based timestep
    and the state, action and reward stored with that frame.

    Args:
        frames: Iterable of dicts with the keys 'frame', 'state', 'action'
            and 'reward'.
        delay: Seconds to pause after each frame. Defaults to 0.1, the value
            that was previously hard-coded.
    """
    for timestep, frame in enumerate(frames, start=1):
        # wait=True defers clearing until new output arrives
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {timestep}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        sleep(delay)

# Replay the steps stored in `frames`
print_frames(frames)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 1001
State: 0
Action: 5
Reward: 20


## Evaluation

In [6]:
# Evaluate the random-action agent over several episodes and report
# per-episode and per-move averages.
num_episodes = 100  # Number of episodes for evaluation
total_penalties = 0
total_timesteps = 0
total_rewards = 0

# Run episodes for evaluation
for _ in range(num_episodes):
    # reset() returns an (observation, info) pair in gymnasium
    state, info = env.reset()
    penalties = 0
    episode_reward = 0

    done = False
    while not done:
        action = env.action_space.sample()
        # step() returns (obs, reward, terminated, truncated, info); `done`
        # holds the `terminated` flag only.
        # NOTE(review): `truncated` is not part of the stop condition, so each
        # episode runs past the environment's registered step limit until the
        # passenger is dropped off. This matches the original behavior --
        # confirm whether episodes should instead stop at truncation.
        state, reward, done, truncated, info = env.step(action)

        episode_reward += reward
        total_timesteps += 1

        # A reward of -10 is counted as a penalty
        if reward == -10:
            penalties += 1

    total_penalties += penalties
    total_rewards += episode_reward

# Calculate metrics
avg_penalties_per_episode = total_penalties / num_episodes
avg_timesteps_per_episode = total_timesteps / num_episodes
avg_rewards_per_move = total_rewards / total_timesteps

print("Average number of penalties per episode:", avg_penalties_per_episode)
print("Average number of timesteps per episode:", avg_timesteps_per_episode)
print("Average rewards per move:", avg_rewards_per_move)

Average number of penalties per episode: 922.14
Average number of timesteps per episode: 2846.27
Average rewards per move: -3.908459141262073
