# Taxi

Importing libraries

In [71]:
import os
import random

import gymnasium as gym
import imageio
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

Creating Taxi-v3 using:
- render_mode = "rgb_array"

In [72]:
# Build the Taxi environment; with "rgb_array" the frames returned by
# env.render() can be collected and written out as a video.
env = gym.make(id="Taxi-v3", render_mode="rgb_array")

Let's see what the environment looks like

In [73]:
# Report how states are represented: the space definition, then one state
# drawn at random from it (so the sample differs from run to run).
print("_____OBSERVATION SPACE_____ ")
print(f"Observation Space: {env.observation_space}")
print(f"Sample Observation Space: {env.observation_space.sample()}")

_____OBSERVATION SPACE_____ 
Observation Space: Discrete(500)
Sample Observation Space: 370


- There are 500 discrete states since there are 25 taxi positions, 5 possible locations of the passenger (including the case when the passenger is in the taxi), and 4 destination locations.
- Destinations on the map are represented by the first letter of their color.
    - Passenger Locations: 
        - 0: Red
        - 1: Green
        - 2: Yellow
        - 3: Blue
        - 4: In taxi
    - Destinations:
        - 0: Red
        - 1: Green
        - 2: Yellow
        - 3: Blue
- An observation is returned as an int() that encodes the corresponding state, calculated by ((taxi_row * 5 + taxi_col) * 5 + passenger_location) * 4 + destination

In [74]:
# Report the number of discrete actions, then one action drawn at random
# (so the sample differs from run to run).
print(" _____ACTION SPACE_____ ")
print(f"Action Space: {env.action_space.n}")
print(f"Sample Action Space: {env.action_space.sample()}")

 _____ACTION SPACE_____ 
Action Space: 6
Sample Action Space: 1


The action shape is (1,), with integer values in {0, 1, …, 5}, indicating which direction to move the taxi or whether to pick up/drop off the passenger.
- 0: Move south (down)
- 1: Move north (up)
- 2: Move east (right)
- 3: Move west (left)
- 4: Pickup passenger
- 5: Drop off passenger

Reward Function:
- -1 per step unless other reward is triggered.
- +20 delivering passenger.
- -10 executing “pickup” and “drop-off” actions illegally.
<br>

An action that results in a no-op, like moving into a wall, still incurs the time-step penalty. No-ops can be avoided by sampling from the action_mask returned in info.

#### Create and Initialize Q-Table

In [75]:
# Dimensions of the Q-table: one row per state, one column per action.
state_space = env.observation_space.n
action_space = env.action_space.n

print(f"There are {state_space} possible states (rows)")
print(f"There are {action_space} possible actions (columns)")

There are 500 possible states (rows)
There are 6 possible actions (columns)


Creating Q-Table of size (state_space, action_space) and initializing values to 0

In [76]:
def initialize_q_table(state_space, action_space):
    """Return a fresh Q-table filled with zeros.

    Args:
        state_space: Number of states (rows of the table).
        action_space: Number of actions (columns of the table).

    Returns:
        A NumPy array of shape ``(state_space, action_space)`` whose entries
        are all 0.
    """
    return np.zeros(shape=(state_space, action_space))

# Start learning from an all-zero table sized to the Taxi environment.
Qtable_Taxi = initialize_q_table(
    state_space=state_space,
    action_space=action_space,
)

#### Defining the Greedy Policy

In [77]:
def greedy_policy(Qtable, state):
    """Pick the action with the largest Q-value in ``state`` (pure exploitation).

    Args:
        Qtable: Table indexed as ``Qtable[state][action]``.
        state: Index of the current state (row of the table).

    Returns:
        The index of the highest-valued action; when several actions tie,
        ``np.argmax`` returns the first of them.
    """
    action_values = Qtable[state]
    return np.argmax(action_values)

#### Defining Epsilon-Greedy Policy

In [78]:
def epsilon_greedy_policy(Qtable, state, epsilon):
    """Choose an action with the epsilon-greedy rule.

    A number is drawn uniformly from [0, 1]. If it is greater than
    ``epsilon`` the greedy action is taken (exploitation); otherwise an
    action is picked uniformly at random (exploration).

    The random action is drawn from the actions available in the Q-table row
    itself rather than from a notebook-global environment, so the function
    depends only on its arguments, and all of its randomness comes from the
    ``random`` module (``random.seed`` makes it reproducible).

    Args:
        Qtable: Table indexed as ``Qtable[state][action]``.
        state: Index of the current state (row of the table).
        epsilon: Exploration probability; 0 means (almost) always greedy,
            1 means always random.

    Returns:
        The index of the selected action.
    """
    # Exploit only when the draw exceeds epsilon.
    if random.uniform(0, 1) > epsilon:
        return greedy_policy(Qtable, state)

    # Explore: every action (column of this state's row) is equally likely.
    n_actions = len(Qtable[state])
    return random.randrange(n_actions)

#### Defining the Hyperparameters

In [79]:
# --- Training ---
n_training_episodes = 50_000    # number of episodes to train for
learning_rate = 0.7             # step size of each Q-value update

# --- Environment ---
env_id = "Taxi-v3"              # name of the environment
max_steps = 99                  # cap on the number of steps in one episode
gamma = 0.95                    # discount rate applied to future rewards

# --- Exploration (epsilon-greedy) ---
max_epsilon = 1.0               # exploration probability at the start
min_epsilon = 0.05              # floor that the exploration probability decays towards
decay_rate = 0.005              # exponential decay rate of the exploration probability

# --- Video recording ---
save_video = True               # whether to save videos at all
save_frequency = 10_000         # save a video every this many episodes
fps = 3                         # frames per second of the saved videos

#### Creating the Training Loop

In [80]:
def train(n_training_episodes, max_epsilon, min_epsilon, decay_rate, save_video, save_frequency, max_steps, Qtable, fps, env, video_dir="./Videos"):
    """Train a tabular Q-learning agent, optionally recording some episodes.

    For every episode the exploration probability is decayed exponentially,
    the environment is reset, and up to ``max_steps`` epsilon-greedy steps are
    taken, applying the Q-learning update after each one. When recording is
    enabled, the frames of the selected episodes are written to
    ``<video_dir>/training-<episode>.mp4``.

    Note: the update reads ``learning_rate`` and ``gamma`` from module-level
    variables, and actions are chosen with ``epsilon_greedy_policy``; all
    three must be defined before this function is called.

    Args:
        n_training_episodes: Number of episodes to run.
        max_epsilon: Exploration probability at episode 0.
        min_epsilon: Value the exploration probability decays towards.
        decay_rate: Rate of the exponential epsilon decay per episode.
        save_video: If true, record videos of selected episodes.
        save_frequency: Record every episode whose index is a multiple of
            this value (the last episode is always recorded as well).
        max_steps: Maximum number of steps per episode.
        Qtable: Table indexed as ``Qtable[state][action]``; updated in place.
        fps: Frames per second of the saved videos.
        env: Environment exposing ``reset()``, ``step(action)`` and
            ``render()``.
        video_dir: Folder the videos are written to; created if missing.

    Returns:
        The same ``Qtable`` object, after training.
    """
    if save_video:
        # Make sure the output folder exists before the first video is written.
        os.makedirs(video_dir, exist_ok=True)

    for episode in tqdm(range(n_training_episodes)):
        # Reduce epsilon because we need less and less exploration as we proceed
        epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

        # Record this episode if the save frequency has passed or it is the last one.
        # Evaluated once per episode instead of at every use.
        record = save_video and (episode % save_frequency == 0 or episode == n_training_episodes - 1)
        frames = []     # frames of the current episode, only filled when recording

        # Reset the environment
        state, _ = env.reset()
        if record:
            frames.append(env.render())

        for _ in range(max_steps):
            # Choose the action At using epsilon greedy policy
            action = epsilon_greedy_policy(Qtable, state, epsilon)

            # Take action At and observe state s' and reward r
            new_state, reward, terminated, truncated, _ = env.step(action)

            # Update Q(s,a) := Q(s,a) + lr * [R(s,a) + gamma * max(Q(s',a')) - Q(s,a)]
            # A terminated episode has no future return, so nothing is
            # bootstrapped from s'. A truncated one was merely cut short,
            # so its target still includes the estimate for s'.
            future_value = 0.0 if terminated else np.max(Qtable[new_state])
            td_target = reward + gamma * future_value
            Qtable[state][action] = Qtable[state][action] + learning_rate * (td_target - Qtable[state][action])

            if record:
                frames.append(env.render())

            # If terminated or truncated, finish the episode
            if terminated or truncated:
                break

            # Our next state is the new state
            state = new_state

        if record:
            # NOTE(review): render() output is appended as image frames and no
            # figure is created in this function; this close looks like a
            # leftover. Kept so behavior is unchanged - confirm and remove.
            plt.close()
            vid_name = os.path.join(video_dir, f"training-{episode}.mp4")
            imageio.mimsave(vid_name, frames, fps=fps, macro_block_size=1)      # Save the frames as a video file

    return Qtable

#### Train the Q-Learning Agent

In [81]:
# Run Q-learning with the hyperparameters defined earlier; train() updates the
# table it is given and returns it.
Qtable_Taxi = train(
    n_training_episodes=n_training_episodes,
    max_epsilon=max_epsilon,
    min_epsilon=min_epsilon,
    decay_rate=decay_rate,
    save_video=save_video,
    save_frequency=save_frequency,
    max_steps=max_steps,
    Qtable=Qtable_Taxi,
    fps=fps,
    env=env,
)

100%|██████████| 50000/50000 [00:24<00:00, 2056.30it/s]


In [82]:
# Display the learned Q-table (one row per state, one column per action).
Qtable_Taxi

array([[  0.        ,   0.        ,   0.        ,   0.        ,
          0.        ,   0.        ],
       [  2.75200369,   3.94947757,   2.75200369,   3.94947757,
          5.20997639,  -5.05052243],
       [  7.93349184,   9.40367562,   7.93349184,   9.40367562,
         10.9512375 ,   0.40367562],
       ...,
       [  6.81233243,  12.58025   ,  -2.65639999,   5.78016944,
        -11.37820285,  -2.04870545],
       [  4.48501837,  -3.06295667,  -3.03177213,   6.53681725,
         -5.38301653, -10.33515   ],
       [ 15.64639998,  -1.57815   ,  11.06      ,  18.        ,
         -7.        ,   5.3837    ]])