Import all the required libraries by running the cell below.

In [None]:
import os
import random
import time

import gym
import numpy as np
from IPython.display import clear_output
from tqdm.notebook import trange, tqdm

# Taxi-v3
There are 4 locations (labeled by different letters) and your job is to pick up the passenger at one location and drop him off in another. You receive +20 points for a successful dropoff, and lose 1 point for every timestep it takes. There is also a 10 point penalty for illegal pick-up and drop-off actions.

In [None]:
# Build the Taxi-v3 environment from Gym's registry; later cells share this `env`.
env = gym.make("Taxi-v3")

  and should_run_async(code)
  deprecation(
  deprecation(


![Taxi environment](images/taxi_env.png)

* **`env.reset`**: Resets the environment and returns a random initial state.
* **`env.step(action)`**: Step the environment by one timestep. Returns
    * **observation**: Observations of the environment
    * **reward**: If your action was beneficial or not
    * **done**: Indicates whether the episode has ended, i.e. the passenger has been successfully picked up and dropped off
    * **info**: Additional info such as performance and latency for debugging purposes
* **`env.render`**: Renders one frame of the environment (helpful in visualizing the environment)

In [None]:
# Put the environment into a new, random state and show the board as text.
env.reset()
board = env.render(mode='ansi')
print(board)

# Report the sizes of the action and state spaces.
print(f"Action Space {env.action_space}")
print(f"State Space {env.observation_space}")

+---------+
|R: | : :[34;1mG[0m|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+


Action Space Discrete(6)
State Space Discrete(500)


See here for more information: https://www.gymlibrary.ml/content/api/[0m
  deprecation(


* 0 = south
* 1 = north
* 2 = east
* 3 = west
* 4 = pickup
* 5 = dropoff

In [None]:
# Build the integer state id for a hand-picked configuration.
state = env.encode(3, 1, 2, 0) # (taxi row, taxi column, passenger index, destination index)
print("State:", state)

# Set the state on the *unwrapped* environment. `gym.make` returns the Taxi
# environment inside wrapper objects, and `env.s = state` only creates a new
# attribute on the outermost wrapper, so the rendered board would keep showing
# the previous (random) state instead of the encoded one.
env.unwrapped.s = state
print(env.render(mode='ansi'))



State: 328
+---------+
|R: | : :[34;1mG[0m|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+




In [None]:
# Transition table entry for state 328: a dict keyed by action id whose values
# are lists of (probability, next_state, reward, done) tuples.
transitions = env.P[328]
transitions

{0: [(1.0, 428, -1, False)],
 1: [(1.0, 228, -1, False)],
 2: [(1.0, 348, -1, False)],
 3: [(1.0, 328, -1, False)],
 4: [(1.0, 328, -10, False)],
 5: [(1.0, 328, -10, False)]}

In [None]:
# --- Episode budget ---------------------------------------------------------
total_ep = 15_000      # number of training episodes
total_test_ep = 500    # number of evaluation episodes
max_steps = 100        # cap on the number of steps taken in a single episode

# --- Q-learning update ------------------------------------------------------
lr = 0.1               # learning rate (alpha in the update rule)
gamma = 0.4            # discount factor applied to the best next-state value

# --- Exploration ------------------------------------------------------------
epsilon = 1            # probability threshold for taking a random action
max_epsilon = 1.0      # exploration schedule: upper bound (by name)
min_epsilon = 0.01     # exploration schedule: lower bound (by name)
decay_rate = 0.01      # exploration schedule: decay rate (by name)

$$Q(\text{state}, \text{action}) \leftarrow (1 - \alpha)\,Q(\text{state}, \text{action}) + \alpha\left(\text{reward} + \gamma \max_{a} Q(\text{next state}, a)\right)$$

Where:

- $\large\alpha$ (alpha) is the learning rate ($0<\alpha\leq 1$) - Just like in supervised learning settings, α is the extent to which our Q-values are being updated in every iteration.

- $\large\gamma$ (gamma) is the discount factor ($0\leq\gamma\leq 1$) - determines how much importance we want to give to future rewards. A high value for the discount factor (close to 1) captures the long-term effective reward, whereas a discount factor of 0 makes our agent consider only the immediate reward, hence making it greedy.



In [None]:
# Q-table: one row per state, one column per action, every entry starting at zero.
n_states = env.observation_space.n
n_actions = env.action_space.n
q_table = np.zeros((n_states, n_actions))

Start training the Q Learning model

In [None]:
# Train the agent with tabular Q-learning and an epsilon-greedy behaviour policy.
#
# The board is deliberately not rendered here: printing it on every step of
# every training episode floods the cell output, so a progress bar is shown
# instead.

for episode in trange(total_ep, desc="Training"):

  # Exponentially decay the exploration rate from max_epsilon towards
  # min_epsilon. It is derived from the episode index (not mutated
  # incrementally), so re-running this cell starts again at max_epsilon.
  epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)

  # Reset Environment:
  state = env.reset()

  for step in range(max_steps):

    # Epsilon-greedy action selection: draw a uniform number; if it exceeds
    # epsilon, exploit (best known action for this state), otherwise explore
    # (random action).
    if random.uniform(0, 1) > epsilon:
      action = np.argmax(q_table[state, :])
    else:
      action = env.action_space.sample()

    # Take the action (a) and observe the outcome state (s') and the reward (r)
    new_state, reward, done, info = env.step(action)

    # Update Q(s,a) := Q(s,a) + lr * [r + gamma * max_a' Q(s',a') - Q(s,a)]
    td_target = reward + gamma * np.max(q_table[new_state, :])
    q_table[state, action] += lr * (td_target - q_table[state, action])

    # Our new state:
    state = new_state

    # Stop as soon as the environment reports the episode is over.
    if done:
      break


print("Training finished.\n")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
+---------+
  (East)

+---------+
|R: | : :G|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (North)

+---------+
|R: | : :G|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (South)

+---------+
|R: | : :G|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (North)

+---------+
|R: | : :G|
| : | : :[43m [0m|
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (Dropoff)

+---------+
|R: | : :G|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)

+---------+
|R: | : :G|
| : | :[43m [0m: |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (Pickup)

+---------+
|R: | : :G|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+
  (West)

+---------+
|R: | : :G|
| : | :[43m [0m: |

In [None]:
"""Evaluate agent's performance after Q-learning"""
# Evaluate the trained agent by always taking the greedy action from the Q-table.

# Only the first few episodes are rendered step by step; rendering every step of
# every test episode floods the cell output.
episodes_to_render = 3

rewards = []        # total reward of every test episode
tot_penalties = []  # number of -10 rewards received in every test episode
frames = []         # final rendered frame of each episode that finished

for episode in range(total_test_ep):
  state = env.reset()
  total_rewards = 0
  penalties = 0
  show_episode = episode < episodes_to_render

  if show_episode:
    print('=========================')
    print('EPISODE: ', episode)

  for step in range(max_steps):

    if show_episode:
      print(env.render(mode='ansi'))

    # Take the action based on the Q Table:
    action = np.argmax(q_table[state, :])

    new_state, reward, done, info = env.step(action)

    # A reward of -10 is counted as a penalty.
    if reward == -10:
      penalties += 1

    total_rewards += reward

    # If episode finishes:
    if done:
      frames.append({
        'frame': env.render(mode='ansi')})
      break

    state = new_state

  # Record every episode, including those that ran out of steps without
  # finishing. The averages below divide by total_test_ep, so dropping
  # unfinished episodes would bias them upwards.
  rewards.append(total_rewards)
  tot_penalties.append(penalties)
  if show_episode:
    print('Reward: ', total_rewards)
    print('Penalty: ', penalties)

env.close()



print('Reward Over Time: {}'.format(sum(rewards)/total_test_ep))
print('Penalty Over Time: {}'.format(sum(tot_penalties)/total_test_ep))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
+---------+
  (East)

+---------+
|R: | : :G|
| : | : : |
| : : :[42m_[0m: |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (East)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : |[42m_[0m: |
|Y| : |[35mB[0m: |
+---------+
  (South)

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[42mB[0m[0m: |
+---------+
  (South)

Reward:  10
Penalty:  0
EPISODE:  460
+---------+
|[34;1mR[0m: | : :G|
| : | : : |
| : : : : |
|[43m [0m| : | : |
|Y| : |[35mB[0m: |
+---------+


+---------+
|[34;1mR[0m: | : :G|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)

+---------+
|[34;1mR[0m: | : :G|
|[43m [0m: | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)

+---------+
|[34;1m[43mR[0m[0m: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35mB[0m: |
+---------+
  (North)

+---------+
|[42mR[0m: | : :G|
| : | : : |
| : :