In [2]:
import numpy as np
import pandas as pd
import gym
from IPython.display import clear_output
from time import sleep
import random

def print_frames(frames):
    """Replay recorded frames in place, one after another.

    Each element of ``frames`` is a mapping with the keys ``'frame'``,
    ``'state'``, ``'action'`` and ``'reward'``. Before each frame is printed the
    previous cell output is cleared, and the function pauses briefly after
    each one so the sequence reads as an animation.
    """
    for timestep, snapshot in enumerate(frames, start=1):
        # wait=True defers the clear until new output arrives.
        clear_output(wait=True)
        print(snapshot['frame'])
        print(f"Timestep: {timestep}")
        print(f"State: {snapshot['state']}")
        print(f"Action: {snapshot['action']}")
        print(f"Reward: {snapshot['reward']}")
        sleep(.01)

In [27]:
# Create the Taxi-v3 environment, print the value returned by reset()
# (the initial state), and render that state.
env = gym.make("Taxi-v3",)
print(env.reset())
env.render()


292
+---------+
|[35mR[0m: | : :G|
| : | : : |
| : : : :[43m [0m|
| | : | : |
|Y| : |[34;1mB[0m: |
+---------+



In [4]:
# Display the environment's observation space.
env.observation_space

Discrete(500)

# Using Brute-force

In [107]:
# Baseline: act uniformly at random until the episode ends, recording every
# step so it can be replayed with print_frames().
env.reset()
# Write through `unwrapped`: gym.make typically returns the environment inside
# one or more wrappers (e.g. a time limit), and `env.s = ...` would only create
# an attribute on the outermost wrapper, leaving the real taxi state unchanged.
env.unwrapped.s = 328  # set environment to illustration's state

epochs = 0
penalties, reward = 0, 0

frames = [] # for animation

done = False

while not done:
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)

    # The cell counts a reward of -10 as a penalty.
    if reward == -10:
        penalties += 1

    # Put each rendered frame into dict for animation
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': state,
        'action': action,
        'reward': reward
        }
    )

    epochs += 1


print("Timesteps taken: {}".format(epochs))
print("Penalties incurred: {}".format(penalties))

Timesteps taken: 200
Penalties incurred: 62


In [105]:
# Replay the frames collected in `frames`, one at a time.
print_frames(frames)

+---------+
|R: | : :[35mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y|[42m_[0m: |B: |
+---------+
  (Dropoff)

Timestep: 200
State: 437
Action: 5
Reward: -10


# Using RL

In [6]:
# Q-table initialised to zeros: one row per state, one column per action.
q_table = np.zeros([env.observation_space.n, env.action_space.n])
# q_table += 1e-6

In [13]:
def chose_action(q_table, state, use_prob=False):
    """Pick an action for ``state`` from the Q-values in ``q_table``.

    Args:
        q_table: 2-D array (or nested sequence) indexed as
            ``q_table[state][action]``.
        state: Index of the row to read.
        use_prob: When False (the default) return the greedy action, i.e. the
            index of the largest Q-value. When True, sample an action with
            probability proportional to its Q-value after shifting the row so
            that its smallest entry becomes zero.

    Returns:
        The index of the chosen action. If ``use_prob`` is True but the
        shifted row has no positive, finite total (for example all Q-values
        are equal), the greedy action is returned instead.
    """
    q_values = np.asarray(q_table[state], dtype=float)
    greedy_action = np.argmax(q_values)  # Exploit learned values
    if not use_prob:
        return greedy_action

    # Build the weights in a new array: the row may be a view into q_table,
    # and an in-place shift would silently overwrite the learned values.
    weights = q_values - q_values.min()
    total = weights.sum()
    if not (np.isfinite(total) and total > 0):
        # No usable distribution (flat row, NaN or inf): fall back to greedy.
        return greedy_action

    # Sample over the row's own length rather than a notebook-global action list.
    return np.random.choice(len(weights), p=weights / total)

In [15]:

# Hyperparameters
alpha = 0.1    # learning rate: weight of the new estimate in each update
gamma = 0.6    # discount factor applied to the best next-state value
epsilon = 0.9  # starting probability of taking a random (exploratory) action
EPSILON_MIN = 0.001     # epsilon stops decaying once it is no longer above this
EPSILON_DECAY = 0.9999  # multiplicative decay applied after every step
N_EPISODES = 100_000    # number of training episodes
LOG_EVERY = 100         # print progress every this many episodes

# For plotting metrics: one entry per episode, appended at the end of each one.
all_epochs = []
all_penalties = []

all_action = [n for n in range(env.action_space.n)]

# Episodes are numbered 1..N_EPISODES so the progress log lands on round numbers.
for i in range(1, N_EPISODES + 1):
    state = env.reset()

    epochs, penalties, reward = 0, 0, 0
    done = False

    episode_reward = []
    while not done:

        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample() # Explore action space
        else:
            action = chose_action(q_table, state) # Exploit learned values

        next_state, reward, done, info = env.step(action)
        episode_reward.append(reward)

        # Q-learning update: blend the old estimate with the observed reward
        # plus the discounted best value available from the next state.
        old_value = q_table[state, action]
        next_max = np.max(q_table[next_state])

        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state, action] = new_value

        # The cell counts a reward of -10 as a penalty.
        if reward == -10:
            penalties += 1

        state = next_state
        epochs += 1

        # Decay exploration per step (not per episode) until the floor is hit.
        if epsilon > EPSILON_MIN:
            epsilon *= EPSILON_DECAY

    # Record per-episode metrics so they can be plotted afterwards.
    all_epochs.append(epochs)
    all_penalties.append(penalties)

    if i % LOG_EVERY == 0:
        print(f"Episode: {i}, Reward: {np.unique(episode_reward, return_counts=True)}", epsilon)

print("Training finished.\n")

Episode: 100, Reward: (array([-10,  -1,  20]), array([11, 38,  1])) 0.5353757257855537
Episode: 200, Reward: (array([-10,  -1,  20]), array([ 1, 22,  1])) 0.4043798911079097
Episode: 300, Reward: (array([-10,  -1,  20]), array([ 5, 27,  1])) 0.3259825932293532
Episode: 400, Reward: (array([-10,  -1,  20]), array([ 1, 11,  1])) 0.2682539856287336
Episode: 500, Reward: (array([-10,  -1,  20]), array([ 2, 23,  1])) 0.2237714399757547
Episode: 600, Reward: (array([-10,  -1,  20]), array([1, 6, 1])) 0.18897537642789963
Episode: 700, Reward: (array([-1, 20]), array([10,  1])) 0.160101570999587
Episode: 800, Reward: (array([-10,  -1,  20]), array([ 1, 16,  1])) 0.13696160444858413
Episode: 900, Reward: (array([-1, 20]), array([5, 1])) 0.11784769984565685
Episode: 1000, Reward: (array([-1, 20]), array([15,  1])) 0.10121890078255277
Episode: 1100, Reward: (array([-1, 20]), array([15,  1])) 0.08750345038480532
Episode: 1200, Reward: (array([-1, 20]), array([10,  1])) 0.07620075037570381
Episode:

KeyboardInterrupt: 

# Evaluate agent's performance after Q-learning

In [23]:
"""Evaluate agent's performance after Q-learning"""

episodes = 100
total_epochs = 0
total_penalties = 0

for i in range(episodes):
    state = env.reset()
    epochs = penalties = reward = 0
    episode_reward = []
    done = False

    # Roll out one episode, always taking the action chose_action() returns
    # for the current state (no exploration).
    while not done:
        action = chose_action(q_table, state) # Exploit learned values
        state, reward, done, info = env.step(action)

        episode_reward.append(reward)
        epochs += 1
        # A reward of -10 is counted as a penalty.
        penalties += int(reward == -10)

    total_epochs += epochs
    total_penalties += penalties

    print(f"Episode: {i}, Reward: {np.unique(episode_reward, return_counts=True)}", epsilon)

print(f"Results after {episodes} episodes:")
print(f"Average timesteps per episode: {total_epochs / episodes}")
print(f"Average penalties per episode: {total_penalties / episodes}")

Episode: 0, Reward: (array([-1, 20]), array([14,  1])) 0.0009999546366786345
Episode: 1, Reward: (array([-1, 20]), array([12,  1])) 0.0009999546366786345
Episode: 2, Reward: (array([-1, 20]), array([16,  1])) 0.0009999546366786345
Episode: 3, Reward: (array([-1, 20]), array([11,  1])) 0.0009999546366786345
Episode: 4, Reward: (array([-1, 20]), array([10,  1])) 0.0009999546366786345
Episode: 5, Reward: (array([-1, 20]), array([13,  1])) 0.0009999546366786345
Episode: 6, Reward: (array([-1, 20]), array([10,  1])) 0.0009999546366786345
Episode: 7, Reward: (array([-1, 20]), array([11,  1])) 0.0009999546366786345
Episode: 8, Reward: (array([-1, 20]), array([14,  1])) 0.0009999546366786345
Episode: 9, Reward: (array([-1, 20]), array([13,  1])) 0.0009999546366786345
Episode: 10, Reward: (array([-1, 20]), array([8, 1])) 0.0009999546366786345
Episode: 11, Reward: (array([-1, 20]), array([17,  1])) 0.0009999546366786345
Episode: 12, Reward: (array([-1, 20]), array([16,  1])) 0.000999954636678634