In [11]:
import random

import gym
import numpy as np
import matplotlib.pyplot as plt

from IPython.display import clear_output

In [2]:
env = gym.make('Taxi-v3')

In [4]:
episodes = 10

for episode in range(1, episodes):
    state = env.reset()
    done = False
    score = 0

    while not done:
        env.render()
        state, reward, done, info = env.step(env.action_space.sample())
        score += reward

        clear_output(wait=True)

    print("Episode: {}\tScore: {}".format(episode, score))

env.close()

Episode: 9	Score: -830


#### implement RL agent in step function to create intelligent actions rather than random actions

## Q-table

In [8]:
actions = env.action_space.n
state = env.observation_space.n

q_table = np.zeros((state, actions))

In [9]:
# params for Q-learning algo
num_episodes = 10000
max_steps_per_episode = 100

learning_rate = .1
discount_rate = .99

exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = .01
exploration_decay_rate = .001

rewards_all_episodes = []

### Q-learning Algorithm

In [12]:
for episode in range(num_episodes):
    state = env.reset()
    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
         # exploration vs exploitation
        exploration_threshold = random.uniform(0, 1)
        if exploration_threshold > exploration_rate:
            action = np.argmax(q_table[state,:])
        else:
            action = env.action_space.sample()

        new_state, reward, done, info = env.step(action)

        # change q-table
        q_table[state, action] = q_table[state, action] * (1 - learning_rate) + (learning_rate * (reward + discount_rate * np.max(q_table[new_state, :])))

        state = new_state
        rewards_current_episode += reward

        if done:
            break

    # decay exploration rate
    exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)

print("*************Training Finished*****************")

*************Training Finished*****************


In [13]:
q_table

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ],
       [ 1.07942887,  1.44568165, -3.14666242,  1.29685464,  9.6220697 ,
        -6.75218339],
       [ 4.31002556,  6.12562763, -0.67840626,  6.73398885, 14.11880599,
        -1.97792153],
       ...,
       [-1.43094235,  3.14623869, -1.56490511, -1.6172498 , -5.0384282 ,
        -7.46650038],
       [-2.59530639, -2.58116041, -2.61338885,  1.78866431, -6.2250372 ,
        -8.84550139],
       [ 0.43230025, -0.51444451, -0.279118  , 17.77115802, -2.36387946,
        -2.84844764]])

#### Calculate and print average reward per 1K episodes

In [16]:
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes), num_episodes/1000)

count = 1000

print("Avg per thousand")

for r in rewards_per_thousand_episodes:
    print(count, " : ", str(sum(r/1000)))
    count += 1000

Avg per thousand
1000  :  -255.9009999999997
2000  :  -37.73500000000002
3000  :  2.304999999999992
4000  :  5.540999999999974
5000  :  6.518999999999965
6000  :  7.4679999999999644
7000  :  7.530999999999956
8000  :  7.361999999999969
9000  :  7.4419999999999495
10000  :  7.630999999999973


#### visualize agent

In [18]:
import time

for episode in range(3):
    state = env.reset()

    done = False
    print("Episode is: " + str(episode))
    time.sleep(1)

    for step in range(max_steps_per_episode):
        clear_output(wait=True)
        env.render()
        time.sleep(.4)

        action = np.argmax(q_table[state,:])

        new_state, reward, done, info = env.step(action)

        if done:
            clear_output(wait=True)
            env.render()
            if reward == 1:
                print("Reached goal")
                time.sleep(2)
                clear_output(wait=True)
            else:
                print("failed")
                time.sleep(2)
                clear_output(wait=True)

            break

        state = new_state

env.close()

+---------+
|R: | : :G|
| : | : : |
| : : : : |
| | : | : |
|Y| : |[35m[34;1m[43mB[0m[0m[0m: |
+---------+
  (Dropoff)
failed
