In [36]:
import gymnasium as gym
import numpy as np
import random

from time import sleep
from IPython.display import clear_output

In [18]:
env = gym.make('Taxi-v3', render_mode='ansi')

In [19]:
env.reset()


(8, {'prob': 1.0, 'action_mask': array([1, 0, 1, 0, 0, 0], dtype=int8)})

In [20]:
output = env.render()
print(output)

+---------+
|[35m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |B: |
+---------+




In [21]:
# 0: south, 1: north, 2: east, 3: west, 4: pickup, 5: dropoff
print(env.action_space)

Discrete(6)


In [22]:
# 5 x 5 x 5 x 4 = 500
# 5: passenger locations (R, G, Y, B, in taxi), 5: destination locations (R, G, Y, B, in taxi), 5: locations of taxi (5x5 grid), 4: locations of walls
print(env.observation_space)

Discrete(500)


In [23]:
len(env.unwrapped.P)

500

In [24]:
env.unwrapped.P[484]

{0: [(1.0, 484, -1, False)],
 1: [(1.0, 384, -1, False)],
 2: [(1.0, 484, -1, False)],
 3: [(1.0, 464, -1, False)],
 4: [(1.0, 484, -10, False)],
 5: [(1.0, 484, -10, False)]}

In [25]:
q_table = np.zeros([env.observation_space.n, env.action_space.n])
q_table.shape

(500, 6)

In [None]:
%%time

alpha = 0.1 # learning rate
gamma = 0.6 # discount factor
epsilon = 0.1 # exploration rate

# Training
for i in range(100000):
    state, _ = env.reset()
    
    penalties, reward = 0, 0
    done = False
    
    while not done:
        # Exploration
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # Explore the action space
        # Exploitation
        else:
            action = np.argmax(q_table[state])  # Explore the Q table
        
        next_state, reward, done, _, _ = env.step(action)  # Execute the action and observe the reward and next state
        
        q_prev = q_table[state, action]
        next_max = np.max(q_table[next_state])
        
        q_new = (1 - alpha) * q_prev + alpha * (reward + gamma * next_max)
        q_table[state, action] = q_new
        
        if reward == -10:
            penalties += 1
        
        state = next_state
    
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 99900
Training finished.

CPU times: total: 1min 31s
Wall time: 2min 29s


In [31]:
# 0: south, 1: north, 2: east, 3: west, 4: pickup, 5: dropoff
q_table[396]

array([-2.00566383, -1.91225302, -1.97644077, -1.870144  , -6.7207919 ,
       -4.43199096])

In [28]:
env.reset()
output = env.render()
print(output)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | :[43m [0m|
|[34;1mY[0m| : |[35mB[0m: |
+---------+




In [30]:
env.unwrapped.encode(3, 4, 3, 4)

396

In [33]:
total_penalties = 0
episodes = 50
frames = []

# Evaluation
for _ in range(episodes):
    state, _ = env.reset()
    penalties, reward = 0, 0
    done = False
    
    while not done:
        action = np.argmax(q_table[state])
        state, reward, done, _, _ = env.step(action)

        if reward == -10:
            penalties += 1
            
        frames.append({
            'frame': env.render(),
            'state': state,
            'action': action,
            'reward': reward
        })
        
    total_penalties += penalties
    
print(f"Results after {episodes} episodes:")
print(f"Average penalties per episode: {total_penalties / episodes}")

Results after 50 episodes:
Average penalties per episode: 0.0


In [39]:
# Display the frames as an animation
for frame in frames:
    clear_output(wait=True)
    print(frame['frame'])
    print(f"Timestep: {frame['state']}")
    print(f"Action: {frame['action']}")
    print(f"Reward: {frame['reward']}")
    sleep(.3)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|[35m[34;1m[43mY[0m[0m[0m| : |B: |
+---------+
  (Dropoff)

Timestep: 410
Action: 5
Reward: 20
