In [25]:
import gym
import random
import pygame

In [26]:
env = gym.make("Taxi-v3")

env.reset()
env.render()

+---------+
|R: | :[43m [0m:G|
| : | : : |
| : : : : |
| | : | : |
|[35mY[0m| : |[34;1mB[0m: |
+---------+



In [27]:
states = env.observation_space.n
print(env.observation_space.shape)
# height, width, channels = env.observation_space.shape
print("There are ", states, " possible states")
actions = env.action_space.n
print("There are ", actions, " possible actions")

()
There are  500  possible states
There are  6  possible actions


In [28]:
episodes = 10
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        action = random.choice([0,1,2,3,4,5])
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:-785
Episode:2 Score:-758
Episode:3 Score:-830
Episode:4 Score:-938
Episode:5 Score:-767
Episode:6 Score:-857
Episode:7 Score:-821
Episode:8 Score:-758
Episode:9 Score:-803
Episode:10 Score:-803


In [29]:
episodes = 10
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0

    while not done:
        pick = random.choice([4,5])
        move = random.choice([0,1,2,3])
        n_state, reward, done, info = env.step(move)
        print(n_state)
        score+=reward
    if done:
        print('Episode:{} Score:{}'.format(episode, score))
    else:
        print('Times up')

272
372
392
372
372
392
372
372
472
372
392
372
272
252
152
52
152
152
252
272
372
372
472
372
272
292
272
172
72
72
52
72
172
272
172
72
72
92
72
92
192
172
192
92
192
172
152
252
272
292
392
292
272
372
272
372
372
372
392
492
392
372
392
372
392
292
392
372
472
372
272
372
392
392
392
292
392
372
272
372
272
292
192
292
292
272
252
152
252
352
452
432
432
432
452
452
452
452
452
352
332
352
332
432
452
432
332
232
332
232
332
232
332
352
452
452
452
452
452
432
452
432
432
332
432
432
452
432
432
432
432
332
432
452
432
452
432
432
432
432
332
352
252
232
212
112
112
132
132
112
212
112
212
212
212
112
212
112
132
32
32
32
32
132
32
12
32
132
112
12
12
12
12
112
112
12
32
132
232
252
232
332
352
332
232
332
232
212
312
312
312
312
412
412
412
412
412
412
312
312
Episode:1 Score:-200
444
444
444
344
244
144
144
244
264
364
264
164
64
64
64
164
264
244
344
444
444
424
324
324
324
224
204
304
204
204
224
244
224
244
144
44
64
44
64
44
44
144
44
44
144
244
264
244
264
244
144
164
144
16

In [30]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Convolution2D, Reshape, Embedding
from tensorflow.keras.optimizers import Adam

In [31]:
def build_model(actions):
    model = Sequential()
    model.add(Embedding(500, 10, input_length=1))
    model.add(Reshape((10,)))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    return model


In [32]:
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

memory = SequentialMemory(limit=50000, window_length=1)
policy = EpsGreedyQPolicy()
dqn = DQNAgent(model=build_model(actions), nb_actions=actions, memory=memory, nb_steps_warmup=500, target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=100000, visualize=False, verbose=1, nb_max_episode_steps=99, log_interval=10000)

Training for 100000 steps ...
Interval 1 (0 steps performed)
101 episodes - episode_reward: -126.891 [-171.000, -99.000] - loss: 1.495 - mae: 16.792 - mean_q: -16.191 - prob: 1.000

Interval 2 (10000 steps performed)
101 episodes - episode_reward: -130.545 [-369.000, -99.000] - loss: 4.167 - mae: 27.178 - mean_q: -28.547 - prob: 1.000

Interval 3 (20000 steps performed)
101 episodes - episode_reward: -130.000 [-360.000, -89.000] - loss: 4.981 - mae: 28.704 - mean_q: -30.459 - prob: 1.000

Interval 4 (30000 steps performed)
102 episodes - episode_reward: -123.578 [-261.000, -18.000] - loss: 5.645 - mae: 30.605 - mean_q: -32.786 - prob: 1.000

Interval 5 (40000 steps performed)
106 episodes - episode_reward: -129.396 [-452.000, 11.000] - loss: 5.226 - mae: 29.501 - mean_q: -31.364 - prob: 1.000

Interval 6 (50000 steps performed)
121 episodes - episode_reward: -98.603 [-171.000, 13.000] - loss: 4.624 - mae: 25.447 - mean_q: -24.876 - prob: 1.000

Interval 7 (60000 steps performed)
118 ep

<tensorflow.python.keras.callbacks.History at 0x2799dfc42e0>

In [33]:
dqn.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=99)

Testing for 5 episodes ...
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
|[43m [0m: : : : |
| | : | : |
|Y| : |B: |
+---------+
  (South)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| :[43m [0m: : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : | : : |
| : :[43m [0m: : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :[34;1mG[0m|
| : |[43m [0m: : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: |[43m [0m: :[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (North)
+---------+
|[35mR[0m: | :[43m [0m:[34;1mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :[34;1m[43mG[0m[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (East)
+---------+
|[35mR[0m: | : :[42mG[0m|
| : | : : |
| : : : : |
| | : | : |
|Y| : |B: |
+---------+
  (Pickup)
+--

<tensorflow.python.keras.callbacks.History at 0x279ff69f310>

In [34]:
scores = dqn.test(env, nb_episodes=100, visualize=False)
print(np.mean(scores.history['episode_reward']))

Testing for 100 episodes ...
Episode 1: reward: 15.000, steps: 6
Episode 2: reward: -200.000, steps: 200
Episode 3: reward: -200.000, steps: 200
Episode 4: reward: 9.000, steps: 12
Episode 5: reward: -200.000, steps: 200
Episode 6: reward: 11.000, steps: 10
Episode 7: reward: -200.000, steps: 200
Episode 8: reward: -200.000, steps: 200
Episode 9: reward: -200.000, steps: 200
Episode 10: reward: -200.000, steps: 200
Episode 11: reward: -200.000, steps: 200
Episode 12: reward: -200.000, steps: 200
Episode 13: reward: -200.000, steps: 200
Episode 14: reward: 9.000, steps: 12
Episode 15: reward: -200.000, steps: 200
Episode 16: reward: -200.000, steps: 200
Episode 17: reward: 8.000, steps: 13
Episode 18: reward: 8.000, steps: 13
Episode 19: reward: -200.000, steps: 200
Episode 20: reward: -200.000, steps: 200
Episode 21: reward: -200.000, steps: 200
Episode 22: reward: 9.000, steps: 12
Episode 23: reward: 5.000, steps: 16
Episode 24: reward: -200.000, steps: 200
Episode 25: reward: -200.00