In [1]:
import gym

# Build the Taxi-v2 environment and keep the object exposed through `.env`.
# NOTE(review): presumably this is the unwrapped environment, i.e. without the
# wrappers gym.make adds — confirm against the installed gym version.
env = gym.make("Taxi-v2").env

# Draw the current grid as text in the cell output.
env.render()

+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |B: |
+---------+



### pickup at blue
### drop at pink

In [5]:
# Start from a fresh, randomly chosen state and draw it.
env.reset()
env.render()

# Report the action space and the state (observation) space.
for label, space in (("Action Space", env.action_space),
                     ("State Space", env.observation_space)):
    print(f"{label} {space}")

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|[34;1mY[0m| : |[35mB[0m: |
+---------+

Action Space Discrete(6)
State Space Discrete(500)


### 6 different actions (4 moves + pickup + dropoff)
### 500 different states

In [12]:
# Components of the state to encode, in the order env.encode expects them:
# taxi row, taxi column, passenger index, destination index.
taxi_row, taxi_column = 3, 1
passenger_index, destination_index = 1, 0

state = env.encode(taxi_row, taxi_column, passenger_index, destination_index)
print("State:", state)

# Put the environment into that exact state and draw it.
env.s = state
env.render()

State: 324
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : : : : |
| : : : : |
| |[43m [0m: | : |
|Y| : |B: |
+---------+



### Encode Args = taxi row, taxi column, passenger index, destination index

In [13]:
# Transition table for state 324: a dict keyed by action whose values are
# lists of (probability, next_state, reward, done) tuples.
env.P[324]

{0: [(1.0, 424, -1, False)],
 1: [(1.0, 224, -1, False)],
 2: [(1.0, 344, -1, False)],
 3: [(1.0, 324, -1, False)],
 4: [(1.0, 324, -10, False)],
 5: [(1.0, 324, -10, False)]}

### {action: [(probability, nextstate, reward, done)]}

A few things to note:

* The keys 0-5 correspond to the actions (south, north, east, west, pickup, dropoff) the taxi can perform in the current state shown in the illustration.

* In this env, probability is always 1.0.

* The nextstate is the state we would be in if we take the action at this index of the dict

* All the movement actions have a -1 reward and the pickup/dropoff actions have -10 reward in this particular state. If we are in a state where the taxi has a passenger and is on top of the right destination, we would see a reward of 20 at the dropoff action (5)

* `done` tells us when we have successfully dropped off a passenger at the right location. Each successful dropoff ends the episode.

In [38]:
# Run a single episode with a purely random policy, starting from state 328.
env.s = 328

# epochs    -> number of steps taken so far
# penalties -> number of steps whose reward was -10
# rewards   -> running total of the per-step rewards
epochs, penalties, rewards = 0, 0, 0

frames = []  # one record per step, kept so the episode can be replayed later

done = False

while not done:
    # Sample one random action from the set of possible actions.
    action = env.action_space.sample()

    # `reward` is the reward of this step only. It is kept separate from the
    # `rewards` total so the accumulator is no longer overwritten on each step.
    state, reward, done, info = env.step(action)

    if reward == -10:
        penalties += 1
    rewards += reward

    frames.append({
        'frame'   : env.render(mode='ansi'),
        'state'   : state,
        'action'  : action,
        # The key is still called 'rewards' because code that replays the
        # frames looks it up under that name; the value is the step reward.
        'rewards' : reward,
    })

    epochs += 1

print("Epochs taken =        ", epochs)
print("Number of penalties = ", penalties)

Epochs taken =         523
Number of penalties =  177


In [40]:
from time import sleep
from IPython.display import clear_output

def print_frames(frames, delay=.1):
    """Replay recorded frames one after another in the cell's output area.

    Args:
        frames: Sequence of dicts with the keys 'frame', 'state', 'action'
            and 'rewards' (one dict per recorded step).
        delay: Seconds to pause after each frame. Defaults to 0.1, the pause
            that used to be hard-coded; pass 0 to replay without pausing.
    """
    for timestep, frame in enumerate(frames, start=1):
        # Clear the previous frame before printing the next one so the output
        # reads as an animation instead of a long scroll.
        clear_output(wait=True)
        print(frame['frame'])
        print(f"Timestep: {timestep}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['rewards']}")
        sleep(delay)


print_frames(frames)

+---------+
|[35m[34;1m[43mR[0m[0m[0m: | : :G|
| : : : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Dropoff)

Timestep: 523
State: 0
Action: 5
Reward: 20


Not good. Our agent takes hundreds (sometimes thousands) of timesteps and makes lots of wrong dropoffs to deliver just one passenger to the right destination; the run above needed 523 steps and collected 177 penalties.

This is because we aren't learning from past experience. We can run this over and over, and it will never optimize. The agent has no memory of which action was best for each state, which is exactly what Reinforcement Learning will do for us.


$$Q(s_t, a_t) \leftarrow (1 - \alpha)\, Q(s_t, a_t) + \alpha \left( r_t + \gamma \max_{a} Q(s_{t+1}, a) \right)$$



Where:

- α (alpha) is the learning rate (0<α≤1) - Just like in supervised learning settings, α is the extent to which our Q-values are being updated in every iteration.

- γ (gamma) is the discount factor (0≤γ≤1) - determines how much importance we want to give to future rewards. A high value for the discount factor (close to 1) captures the long-term effective reward, whereas a discount factor of 0 makes our agent consider only the immediate reward, hence making it greedy.

In [42]:
import numpy as np
# Q-table with one row per state and one column per action, initialised to 0.
q_table = np.zeros([env.observation_space.n, env.action_space.n])

In [44]:
# Sanity check: expect (number of states, number of actions).
q_table.shape

(500, 6)

In [58]:
# Example of a single uniform random draw between 0 and 1; the training loop
# compares a draw like this against epsilon to choose explore vs. exploit.
np.random.uniform(0, 1)

0.646034618277368

There's a tradeoff between exploration (choosing a random action) and exploitation (choosing actions based on already learned Q-values). We want to prevent the agent from always taking the same route, and possibly overfitting, so we'll introduce another parameter called ϵ ("epsilon") to control this during training.

Instead of always selecting the action with the best learned Q-value, we'll sometimes favor exploring the action space further. A higher epsilon value results in episodes with more penalties (on average), which is expected because the agent explores more and therefore makes more random decisions.

In [59]:
%%time
"""Training the agent"""

import random
from IPython.display import clear_output

# Hyperparameters
alpha = 0.1    # learning rate
gamma = 0.6    # discount factor
epsilon = 0.1  # probability of taking a random (exploratory) action
training_episodes = 100000

# For plotting metrics: one entry is appended after every training episode
all_epochs = []
all_penalties = []

for i in range(1, training_episodes + 1):
    state = env.reset()

    epochs, penalties, reward = 0, 0, 0
    done = False

    while not done:
        # Epsilon-greedy action selection.
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()  # Explore action space
        else:
            action = np.argmax(q_table[state])  # Exploit learned values

        next_state, reward, done, info = env.step(action)

        # Q-learning update: blend the old estimate with the new target
        # (immediate reward plus discounted best value of the next state).
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

    # Record this episode's metrics; previously the two lists were created
    # but never filled, so there was nothing to plot afterwards.
    all_epochs.append(epochs)
    all_penalties.append(penalties)

    # Refresh the progress line every 100 episodes.
    if i % 100 == 0:
        clear_output(wait=True)
        print(f"Episode: {i}")

print("Training finished.\n")

Episode: 100000
Training finished.

Wall time: 28.4 s


In [60]:
# Learned Q-values for state 328, one entry per action.
q_table[328]


array([ -2.40481366,  -2.27325184,  -2.40300065,  -2.36180198,
       -10.51516049, -10.85869339])

In [61]:
"""Evaluate agent's performance after Q-learning"""

total_epochs, total_penalties = 0, 0
episodes = 100

# Safety cap on the length of one evaluation episode. The policy below is
# purely greedy (argmax) and the transition table shows every transition with
# probability 1.0, so a Q-table that has not converged could send the taxi
# round the same cycle of states forever. Without a bound the loop would
# never finish in that case.
max_steps_per_episode = 10000
truncated_episodes = 0

for _ in range(episodes):
    state = env.reset()
    epochs, penalties, reward = 0, 0, 0

    done = False

    while not done and epochs < max_steps_per_episode:
        # Always take the action with the highest learned Q-value.
        action = np.argmax(q_table[state])
        state, reward, done, info = env.step(action)

        if reward == -10:
            penalties += 1

        epochs += 1

    if not done:
        # The episode was cut off by the cap rather than by a dropoff.
        truncated_episodes += 1

    total_penalties += penalties
    total_epochs += epochs

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

# Only printed when at least one episode was truncated, so the normal output
# is unchanged.
if truncated_episodes:
    print(f"Warning: {truncated_episodes} episode(s) stopped at the "
          f"{max_steps_per_episode}-step cap without finishing.")

Results after 100 episodes:
Average timesteps per episode: 12.86
Average penalties per episode: 0.0
