pip install keras-rl

In [1]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory
import tensorflow as tf
tf.__version__

Using TensorFlow backend.


'1.14.0'

In [2]:
ENV_NAME = 'CartPole-v0'
# Single seed shared by NumPy and the environment so both are seeded consistently.
SEED = 123

# Create the environment, seed the random sources, and read the number of
# discrete actions available in the CartPole problem.
env = gym.make(ENV_NAME)
np.random.seed(SEED)
env.seed(SEED)
nb_actions = env.action_space.n

In [3]:
# Q-network: one hidden layer of 16 ReLU units, then one linear output per action.
model = Sequential()
# The input shape is the observation shape with a leading axis of size 1;
# presumably this matches window_length=1 of the replay memory (confirm).
# Flatten collapses that axis away.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [4]:
# Action-selection policy and replay memory handed to the agent below.
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)

# Build the DQN agent around the Q-network defined earlier and compile it
# with Adam, tracking mean absolute error as an extra metric.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Train for 5000 steps. Rendering is enabled for demonstration only; it slows
# training down considerably.
dqn.fit(env, nb_steps=5000, visualize=True, verbose=2)

0.000, 1.000], mean observation: 0.117 [-1.805, 2.614], loss: 1.452632, mae: 5.160711, mean_q: 9.613422
 2415/5000: episode: 232, duration: 0.230s, episode steps: 9, steps per second: 39, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.111 [0.000, 1.000], mean observation: 0.170 [-1.345, 2.323], loss: 1.546928, mae: 5.115371, mean_q: 9.510553
 2428/5000: episode: 233, duration: 0.266s, episode steps: 13, steps per second: 49, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.154 [0.000, 1.000], mean observation: 0.079 [-1.797, 2.652], loss: 2.014740, mae: 5.108748, mean_q: 9.382926
 2438/5000: episode: 234, duration: 0.207s, episode steps: 10, steps per second: 48, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.200 [0.000, 1.000], mean observation: 0.137 [-1.188, 2.046], loss: 1.331639, mae: 5.055463, mean_q: 9.469430
 2448/5000: episode: 235, duration: 0.182s, episode steps: 10, steps per second: 55, episode r

<keras.callbacks.callbacks.History at 0x1f0f53aed88>

In [5]:
# Run the trained agent for five rendered evaluation episodes.
dqn.test(
    env,
    nb_episodes=5,
    visualize=True,
)

Testing for 5 episodes ...
Episode 1: reward: 87.000, steps: 87
Episode 2: reward: 50.000, steps: 50
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 47.000, steps: 47
Episode 5: reward: 103.000, steps: 103


<keras.callbacks.callbacks.History at 0x1f0f40ff3c8>

In [6]:
# Variant of the Q-network that takes the raw observation vector directly
# (input_dim = observation size) instead of going through a Flatten layer.
# NOTE: this rebinds the name `model`; the agent `dqn` above was constructed
# from the previous network object.
# NOTE(review): this input has no leading window axis, unlike the first model;
# confirm compatibility before passing it to a DQNAgent.
model = Sequential()
# ReLU was previously applied twice (once via `activation=` on the Dense layer
# and again by the Activation layer). It is now applied once, matching the
# layer stack of the first model.
model.add(Dense(16, input_dim=env.observation_space.shape[0]))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the table and returns None; calling it directly avoids the
# stray "None" line that print() produced.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_3 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_4 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
# Display the shape of a single observation from the environment.
env.observation_space.shape

(4,)