In [1]:
import keras
import numpy as np
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
import gym

Using TensorFlow backend.


In [2]:
# Fix the seeds before anything stochastic runs, so a fresh
# "Restart Kernel -> Run All" starts from the same NumPy and environment
# random state each time. Other sources of randomness (for example the
# network's weight initialisation) are not seeded here.
SEED = 123
np.random.seed(SEED)

# Create the CartPole environment, seed it, and report its spaces.
env = gym.make('CartPole-v1')
env.seed(SEED)
print(f"Action Space : {env.action_space} \n Observation Space : {env.observation_space}")

Action Space : Discrete(2) 
 Observation Space : Box(4,)


In [3]:
# Q-network, assembled layer by layer:
#   * Flatten takes an input of shape (1, <observation size>) -- the leading 1
#     presumably matches the window_length=1 used by the replay memory below.
#   * two hidden Dense layers of 24 units with ReLU activation
#   * a linear output layer with one unit per discrete action
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=(1, env.observation_space.shape[0])))
model.add(keras.layers.Dense(24, activation="relu"))
model.add(keras.layers.Dense(24, activation="relu"))
model.add(keras.layers.Dense(env.action_space.n, activation="linear"))
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_2 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Short trial run: wrap `model` in a DQN agent, compile it with a default Adam
# optimizer and a mean-absolute-error metric, then train for 100 steps with
# rendering switched on and every episode capped at 200 steps.
dqn = DQNAgent(
    model=model,
    nb_actions=env.action_space.n,
    memory=SequentialMemory(window_length=1, limit=1000000),
    policy=EpsGreedyQPolicy(),
    nb_steps_warmup=10,
)
dqn.compile(
    optimizer=keras.optimizers.Adam(),
    metrics=[keras.metrics.mean_absolute_error],
)
dqn.fit(env=env, nb_steps=100, nb_max_episode_steps=200, visualize=True)

Training for 100 steps ...
Interval 1 (0 steps performed)


In [34]:
# Full training run: rebuild the agent around the same `model`, this time with
# a 50000-entry replay memory, an explicit Adam learning rate of 1e-3 and
# target_model_update=1e-2.
memory = SequentialMemory(window_length=1, limit=50000)
policy = EpsGreedyQPolicy()
nb_actions = env.action_space.n
dqn = DQNAgent(
    model=model,
    policy=policy,
    memory=memory,
    nb_actions=nb_actions,
    nb_steps_warmup=10,
    target_model_update=1e-2,
)
dqn.compile(keras.optimizers.Adam(lr=1e-3), metrics=['mae'])

# Train for 5000 steps. Rendering is left on purely for show; it slows
# training down considerably. verbose=2 logs one line per episode.
dqn.fit(env, nb_steps=5000, verbose=2, visualize=True)

Training for 5000 steps ...
   10/5000: episode: 1, duration: 0.309s, episode steps: 10, steps per second: 32, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.112 [-1.992, 3.004], loss: --, mean_absolute_error: --, mean_q: --
   19/5000: episode: 2, duration: 1.345s, episode steps: 9, steps per second: 7, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.145 [-1.759, 2.851], loss: 0.440488, mean_absolute_error: 0.647189, mean_q: 0.040286
   29/5000: episode: 3, duration: 0.175s, episode steps: 10, steps per second: 57, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.147 [-1.950, 3.091], loss: 0.327157, mean_absolute_error: 0.576285, mean_q: 0.192498
   38/5000: episode: 4, duration: 0.172s, episode steps: 9, steps per second: 52, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean ac



<keras.callbacks.History at 0x24129270358>

In [35]:
# Evaluate the trained agent for 5 rendered episodes.
# The returned History object is kept in a variable rather than echoed as the
# cell's output, and the environment is closed afterwards (even if the test is
# interrupted) so the render window opened by visualize=True is released.
try:
    test_history = dqn.test(env, nb_episodes=5, visualize=True)
finally:
    env.close()


Testing for 5 episodes ...
Episode 1: reward: 176.000, steps: 176
Episode 2: reward: 188.000, steps: 188
Episode 3: reward: 190.000, steps: 190
Episode 4: reward: 168.000, steps: 168
Episode 5: reward: 170.000, steps: 170


<keras.callbacks.History at 0x2412a7a8c50>