In [2]:

import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Name of the Gym environment used by the DQN cells below.
ENV_NAME = 'CartPole-v0'

# Create the environment, seed NumPy and the environment with the same value,
# then read the number of actions from the environment's action space.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [4]:
# Network handed to the DQN agent. Flatten collapses the leading axis of size 1
# in front of the observation shape, one hidden layer of 16 ReLU units follows,
# and the linear output layer has one unit per action (nb_actions).
model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_1 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [5]:
# Action-selection policy and experience memory used by the agent.
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)

# Combine the network, policy and memory into the agent, then compile it with
# Adam and report mean absolute error as an additional metric.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Train for 50000 steps. Rendering is switched off here (visualize=False);
# turning it on slows training down quite a lot.
dqn.fit(env, nb_steps=50000, visualize=False, verbose=2)

Training for 50000 steps ...




    79/50000: episode: 1, duration: 0.976s, episode steps: 79, steps per second: 81, episode reward: 79.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.519 [0.000, 1.000], mean observation: 0.060 [-0.402, 0.722], loss: 0.426841, mean_absolute_error: 0.494516, mean_q: 0.053988
   113/50000: episode: 2, duration: 0.107s, episode steps: 34, steps per second: 318, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.529 [0.000, 1.000], mean observation: 0.151 [-0.159, 0.753], loss: 0.350344, mean_absolute_error: 0.444455, mean_q: 0.192739
   163/50000: episode: 3, duration: 0.170s, episode steps: 50, steps per second: 295, episode reward: 50.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.520 [0.000, 1.000], mean observation: 0.082 [-0.295, 0.778], loss: 0.315713, mean_absolute_error: 0.466836, mean_q: 0.319681
   197/50000: episode: 4, duration: 0.122s, episode steps: 34, steps per second: 279, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], m

   705/50000: episode: 30, duration: 0.045s, episode steps: 11, steps per second: 247, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.273 [0.000, 1.000], mean observation: 0.127 [-0.942, 1.734], loss: 0.373739, mean_absolute_error: 2.326306, mean_q: 4.484053
   720/50000: episode: 31, duration: 0.062s, episode steps: 15, steps per second: 241, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.333 [0.000, 1.000], mean observation: 0.073 [-1.015, 1.653], loss: 0.388497, mean_absolute_error: 2.357814, mean_q: 4.605469
   730/50000: episode: 32, duration: 0.034s, episode steps: 10, steps per second: 294, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.108 [-1.028, 1.700], loss: 0.352324, mean_absolute_error: 2.438974, mean_q: 4.741551
   740/50000: episode: 33, duration: 0.044s, episode steps: 10, steps per second: 230, episode reward: 10.000, mean reward: 1.000 [1.000, 1.00

  1019/50000: episode: 61, duration: 0.035s, episode steps: 10, steps per second: 282, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.300 [0.000, 1.000], mean observation: 0.100 [-0.994, 1.596], loss: 0.733594, mean_absolute_error: 3.551845, mean_q: 6.753257
  1030/50000: episode: 62, duration: 0.043s, episode steps: 11, steps per second: 254, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.273 [0.000, 1.000], mean observation: 0.111 [-1.171, 1.905], loss: 1.206318, mean_absolute_error: 3.663323, mean_q: 6.831096
  1043/50000: episode: 63, duration: 0.064s, episode steps: 13, steps per second: 204, episode reward: 13.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.308 [0.000, 1.000], mean observation: 0.094 [-1.323, 1.963], loss: 1.128772, mean_absolute_error: 3.673794, mean_q: 6.831375
  1055/50000: episode: 64, duration: 0.058s, episode steps: 12, steps per second: 209, episode reward: 12.000, mean reward: 1.000 [1.000, 1.00

  1471/50000: episode: 90, duration: 0.224s, episode steps: 62, steps per second: 277, episode reward: 62.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.516 [0.000, 1.000], mean observation: 0.078 [-0.364, 0.862], loss: 0.982383, mean_absolute_error: 4.447815, mean_q: 8.290794
  1511/50000: episode: 91, duration: 0.134s, episode steps: 40, steps per second: 297, episode reward: 40.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.093 [-0.795, 0.235], loss: 1.001549, mean_absolute_error: 4.586510, mean_q: 8.615925
  1535/50000: episode: 92, duration: 0.083s, episode steps: 24, steps per second: 288, episode reward: 24.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.095 [-0.975, 0.367], loss: 1.145196, mean_absolute_error: 4.709943, mean_q: 8.834752
  1561/50000: episode: 93, duration: 0.085s, episode steps: 26, steps per second: 304, episode reward: 26.000, mean reward: 1.000 [1.000, 1.

  2056/50000: episode: 119, duration: 0.058s, episode steps: 15, steps per second: 258, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.533 [0.000, 1.000], mean observation: -0.108 [-1.025, 0.418], loss: 2.888992, mean_absolute_error: 6.101038, mean_q: 11.159227
  2077/50000: episode: 120, duration: 0.126s, episode steps: 21, steps per second: 167, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.524 [0.000, 1.000], mean observation: -0.098 [-1.003, 0.269], loss: 1.982455, mean_absolute_error: 6.088154, mean_q: 11.346805
  2105/50000: episode: 121, duration: 0.125s, episode steps: 28, steps per second: 225, episode reward: 28.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.111 [-0.984, 0.188], loss: 2.140847, mean_absolute_error: 6.069653, mean_q: 11.350748
  2168/50000: episode: 122, duration: 0.270s, episode steps: 63, steps per second: 233, episode reward: 63.000, mean reward: 1.000 [1

  3097/50000: episode: 148, duration: 0.118s, episode steps: 48, steps per second: 406, episode reward: 48.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.041 [-0.838, 0.229], loss: 3.670322, mean_absolute_error: 8.102551, mean_q: 15.389377
  3118/50000: episode: 149, duration: 0.062s, episode steps: 21, steps per second: 336, episode reward: 21.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.476 [0.000, 1.000], mean observation: -0.096 [-0.948, 0.409], loss: 2.651317, mean_absolute_error: 8.122108, mean_q: 15.486122
  3150/50000: episode: 150, duration: 0.081s, episode steps: 32, steps per second: 395, episode reward: 32.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.097 [-0.946, 0.523], loss: 2.307862, mean_absolute_error: 8.147066, mean_q: 15.725236
  3190/50000: episode: 151, duration: 0.108s, episode steps: 40, steps per second: 371, episode reward: 40.000, mean reward: 1.000 [1

  4731/50000: episode: 178, duration: 0.252s, episode steps: 102, steps per second: 405, episode reward: 102.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: -0.017 [-1.273, 0.371], loss: 3.109821, mean_absolute_error: 10.451025, mean_q: 20.260960
  4812/50000: episode: 179, duration: 0.204s, episode steps: 81, steps per second: 398, episode reward: 81.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.481 [0.000, 1.000], mean observation: -0.025 [-0.764, 0.577], loss: 3.323465, mean_absolute_error: 10.643510, mean_q: 20.684933
  4842/50000: episode: 180, duration: 0.080s, episode steps: 30, steps per second: 376, episode reward: 30.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.115 [-0.732, 0.214], loss: 2.605819, mean_absolute_error: 10.656039, mean_q: 20.783533
  4886/50000: episode: 181, duration: 0.112s, episode steps: 44, steps per second: 392, episode reward: 44.000, mean reward: 1.0

  6511/50000: episode: 207, duration: 0.230s, episode steps: 89, steps per second: 388, episode reward: 89.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.461 [0.000, 1.000], mean observation: -0.142 [-1.293, 0.372], loss: 4.456431, mean_absolute_error: 12.620161, mean_q: 24.670036
  6569/50000: episode: 208, duration: 0.143s, episode steps: 58, steps per second: 406, episode reward: 58.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.448 [0.000, 1.000], mean observation: -0.165 [-1.095, 0.194], loss: 3.753798, mean_absolute_error: 12.687646, mean_q: 24.958586
  6628/50000: episode: 209, duration: 0.155s, episode steps: 59, steps per second: 380, episode reward: 59.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.441 [0.000, 1.000], mean observation: -0.153 [-1.231, 0.250], loss: 3.577906, mean_absolute_error: 12.799254, mean_q: 25.198744
  6706/50000: episode: 210, duration: 0.196s, episode steps: 78, steps per second: 398, episode reward: 78.000, mean reward: 1.000

  9382/50000: episode: 236, duration: 0.462s, episode steps: 156, steps per second: 337, episode reward: 156.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.468 [0.000, 1.000], mean observation: -0.209 [-1.869, 0.613], loss: 4.115394, mean_absolute_error: 15.232283, mean_q: 30.246187
  9540/50000: episode: 237, duration: 0.448s, episode steps: 158, steps per second: 353, episode reward: 158.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.468 [0.000, 1.000], mean observation: -0.181 [-1.800, 0.478], loss: 4.774404, mean_absolute_error: 15.425368, mean_q: 30.529016
  9668/50000: episode: 238, duration: 0.332s, episode steps: 128, steps per second: 386, episode reward: 128.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.231 [-1.499, 0.477], loss: 3.377115, mean_absolute_error: 15.742575, mean_q: 31.347836
  9806/50000: episode: 239, duration: 0.341s, episode steps: 138, steps per second: 405, episode reward: 138.000, mean rewar

 13384/50000: episode: 265, duration: 0.588s, episode steps: 196, steps per second: 333, episode reward: 196.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.474 [0.000, 1.000], mean observation: -0.197 [-1.869, 0.515], loss: 4.749671, mean_absolute_error: 20.595312, mean_q: 41.641850
 13529/50000: episode: 266, duration: 0.361s, episode steps: 145, steps per second: 402, episode reward: 145.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: -0.283 [-1.995, 0.302], loss: 5.860536, mean_absolute_error: 20.938406, mean_q: 42.226463
 13637/50000: episode: 267, duration: 0.268s, episode steps: 108, steps per second: 403, episode reward: 108.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.454 [0.000, 1.000], mean observation: -0.328 [-1.838, 0.334], loss: 4.798449, mean_absolute_error: 21.049585, mean_q: 42.603424
 13777/50000: episode: 268, duration: 0.354s, episode steps: 140, steps per second: 395, episode reward: 140.000, mean rewar

 17987/50000: episode: 294, duration: 0.659s, episode steps: 158, steps per second: 240, episode reward: 158.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.468 [0.000, 1.000], mean observation: -0.264 [-1.841, 0.441], loss: 6.520350, mean_absolute_error: 27.130466, mean_q: 55.135216
 18116/50000: episode: 295, duration: 0.323s, episode steps: 129, steps per second: 400, episode reward: 129.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.457 [0.000, 1.000], mean observation: -0.366 [-2.247, 0.370], loss: 6.284785, mean_absolute_error: 27.494701, mean_q: 55.851261
 18265/50000: episode: 296, duration: 0.372s, episode steps: 149, steps per second: 401, episode reward: 149.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.463 [0.000, 1.000], mean observation: -0.317 [-2.210, 0.393], loss: 5.907386, mean_absolute_error: 27.524906, mean_q: 55.976719
 18442/50000: episode: 297, duration: 0.433s, episode steps: 177, steps per second: 409, episode reward: 177.000, mean rewar

 22589/50000: episode: 323, duration: 0.448s, episode steps: 178, steps per second: 398, episode reward: 178.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.231 [-2.010, 0.450], loss: 7.331310, mean_absolute_error: 30.680487, mean_q: 62.017239
 22750/50000: episode: 324, duration: 0.420s, episode steps: 161, steps per second: 383, episode reward: 161.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.281 [-2.034, 0.641], loss: 9.077161, mean_absolute_error: 30.757631, mean_q: 62.071060
 22950/50000: episode: 325, duration: 0.492s, episode steps: 200, steps per second: 407, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.495 [0.000, 1.000], mean observation: -0.020 [-0.592, 0.500], loss: 8.728942, mean_absolute_error: 31.147163, mean_q: 62.927231
 23086/50000: episode: 326, duration: 0.335s, episode steps: 136, steps per second: 406, episode reward: 136.000, mean rewar

 27657/50000: episode: 352, duration: 0.987s, episode steps: 155, steps per second: 157, episode reward: 155.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.471 [0.000, 1.000], mean observation: -0.267 [-1.831, 0.516], loss: 5.874833, mean_absolute_error: 33.544415, mean_q: 67.470963
 27857/50000: episode: 353, duration: 1.289s, episode steps: 200, steps per second: 155, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.490 [0.000, 1.000], mean observation: -0.093 [-0.939, 0.505], loss: 9.220509, mean_absolute_error: 33.834423, mean_q: 68.120956
 28038/50000: episode: 354, duration: 1.147s, episode steps: 181, steps per second: 158, episode reward: 181.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.470 [0.000, 1.000], mean observation: -0.246 [-1.966, 0.453], loss: 6.445538, mean_absolute_error: 33.938950, mean_q: 68.347557
 28198/50000: episode: 355, duration: 1.015s, episode steps: 160, steps per second: 158, episode reward: 160.000, mean rewar

 32633/50000: episode: 381, duration: 0.774s, episode steps: 162, steps per second: 209, episode reward: 162.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.275 [-1.868, 0.537], loss: 7.188449, mean_absolute_error: 34.727104, mean_q: 69.449280
 32780/50000: episode: 382, duration: 0.713s, episode steps: 147, steps per second: 206, episode reward: 147.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.463 [0.000, 1.000], mean observation: -0.299 [-2.020, 0.353], loss: 6.973725, mean_absolute_error: 34.670589, mean_q: 69.621452
 32910/50000: episode: 383, duration: 0.614s, episode steps: 130, steps per second: 212, episode reward: 130.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.462 [0.000, 1.000], mean observation: -0.341 [-1.828, 0.466], loss: 6.140037, mean_absolute_error: 34.654854, mean_q: 69.308296
 33108/50000: episode: 384, duration: 0.946s, episode steps: 198, steps per second: 209, episode reward: 198.000, mean rewar

 37507/50000: episode: 410, duration: 0.352s, episode steps: 154, steps per second: 437, episode reward: 154.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.468 [0.000, 1.000], mean observation: -0.291 [-1.854, 0.490], loss: 6.415348, mean_absolute_error: 35.075874, mean_q: 70.157570
 37662/50000: episode: 411, duration: 0.420s, episode steps: 155, steps per second: 369, episode reward: 155.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.465 [0.000, 1.000], mean observation: -0.302 [-1.987, 0.651], loss: 5.531255, mean_absolute_error: 35.294353, mean_q: 70.656525
 37840/50000: episode: 412, duration: 0.458s, episode steps: 178, steps per second: 389, episode reward: 178.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.478 [0.000, 1.000], mean observation: -0.212 [-1.458, 0.672], loss: 8.753805, mean_absolute_error: 34.940079, mean_q: 69.804100
 38040/50000: episode: 413, duration: 0.382s, episode steps: 200, steps per second: 523, episode reward: 200.000, mean rewar

 42820/50000: episode: 439, duration: 0.326s, episode steps: 165, steps per second: 506, episode reward: 165.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.467 [0.000, 1.000], mean observation: -0.271 [-1.968, 0.505], loss: 7.374728, mean_absolute_error: 35.439671, mean_q: 70.594292
 42984/50000: episode: 440, duration: 0.318s, episode steps: 164, steps per second: 515, episode reward: 164.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.470 [0.000, 1.000], mean observation: -0.279 [-1.843, 0.889], loss: 6.155617, mean_absolute_error: 35.511452, mean_q: 70.915909
 43184/50000: episode: 441, duration: 0.389s, episode steps: 200, steps per second: 514, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.475 [0.000, 1.000], mean observation: -0.221 [-1.784, 0.742], loss: 9.353643, mean_absolute_error: 35.486382, mean_q: 70.720451
 43384/50000: episode: 442, duration: 0.393s, episode steps: 200, steps per second: 508, episode reward: 200.000, mean rewar

 48111/50000: episode: 468, duration: 0.312s, episode steps: 162, steps per second: 519, episode reward: 162.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.256 [-1.810, 0.508], loss: 6.443087, mean_absolute_error: 36.188934, mean_q: 71.995369
 48311/50000: episode: 469, duration: 0.390s, episode steps: 200, steps per second: 513, episode reward: 200.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.036 [-0.585, 0.636], loss: 6.075263, mean_absolute_error: 35.692295, mean_q: 71.093468
 48487/50000: episode: 470, duration: 0.447s, episode steps: 176, steps per second: 394, episode reward: 176.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.472 [0.000, 1.000], mean observation: -0.271 [-1.860, 0.519], loss: 6.815546, mean_absolute_error: 36.269474, mean_q: 72.201973
 48687/50000: episode: 471, duration: 0.584s, episode steps: 200, steps per second: 342, episode reward: 200.000, mean reward

<keras.callbacks.History at 0xd5f1f60>

In [6]:
# Run the trained DQN agent for 5 test episodes with rendering enabled.
dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 168.000, steps: 168
Episode 5: reward: 200.000, steps: 200


<keras.callbacks.History at 0xd5f1048>

In [None]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

ENV_NAME = 'CartPole-v0'

# Create the environment, seed NumPy and the environment with the same value,
# then read the number of actions from the environment's action space.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
nb_actions = env.action_space.n

# Network handed to the DQN agent: flatten the observation, one hidden layer of
# 16 ReLU units, and a linear output layer with one unit per action.
model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line after the table.
model.summary()

# Action-selection policy and experience memory used by the agent.
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Short training run of only 500 steps. Rendering is switched off here
# (visualize=False); turning it on slows training down quite a lot.
dqn.fit(env, nb_steps=500, visualize=False, verbose=2)

In [None]:

# Run the DQN agent for 50 test episodes with rendering enabled.
dqn.test(env, nb_episodes=50, visualize=True)

In [None]:
import numpy as np
import gym

from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.cem import CEMAgent
from rl.memory import EpisodeParameterMemory

# Name of the Gym environment used by the CEM cells below.
ENV_NAME = 'CartPole-v0'


# Get the environment and extract the number of actions.
# NumPy and the environment are seeded with the same value.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)

nb_actions = env.action_space.n
# Length of the first observation axis. NOTE(review): not referenced by any
# other cell visible in this notebook -- confirm before removing.
obs_dim = env.observation_space.shape[0]

# Option 1: simple model -- a single softmax layer with one unit per action,
# applied to the flattened observation.
model = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(nb_actions),
    Activation('softmax'),
])

# Option 2: deep network -- uncomment to replace Option 1 with three hidden
# layers of 16 ReLU units each.
# model = Sequential()
# model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(16))
# model.add(Activation('relu'))
# model.add(Dense(nb_actions))
# model.add(Activation('softmax'))


# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line after the table.
model.summary()


# Memory object required by the CEM agent.
memory = EpisodeParameterMemory(limit=1000, window_length=1)

# Configure the agent. Note that compile() is called without an optimizer or
# metrics in this cell.
cem = CEMAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    batch_size=50,
    nb_steps_warmup=2000,
    train_interval=50,
    elite_frac=0.05,
)
cem.compile()



In [None]:
# Train for 100000 steps. Rendering is switched off here (visualize=False);
# turning it on slows training down quite a lot. Training can be aborted
# early by interrupting it (Ctrl + C, or interrupting the kernel).
cem.fit(
    env,
    nb_steps=100000,
    visualize=False,
    verbose=2,
)

# Once training is done, write the agent's weights to disk, replacing any
# existing file of the same name.
cem.save_weights(
    'cem_{}_params.h5f'.format(ENV_NAME),
    overwrite=True,
)

In [None]:

# Finally, evaluate the CEM agent for 5 episodes with rendering enabled.
cem.test(env, nb_episodes=5, visualize=True)

In [2]:
import numpy as np
import gym

from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Flatten, Input, Concatenate
from keras.optimizers import Adam

from rl.agents import DDPGAgent
from rl.memory import SequentialMemory
from rl.random import OrnsteinUhlenbeckProcess


ENV_NAME = 'Pendulum-v0'
# gym.undo_logger_setup() does not exist in every Gym release; call it only
# when it is available so this cell does not fail with an AttributeError.
if hasattr(gym, 'undo_logger_setup'):
    gym.undo_logger_setup()


# Get the environment and extract the number of actions.
# NumPy and the environment are seeded with the same value.
env = gym.make(ENV_NAME)
np.random.seed(123)
env.seed(123)
# nb_actions is taken from the first (and only) axis of the action space
# shape, so the shape must be one-dimensional.
assert len(env.action_space.shape) == 1
nb_actions = env.action_space.shape[0]

# Actor network: flattened observation -> three hidden layers of 16 ReLU
# units -> linear output layer with one unit per action dimension.
actor = Sequential([
    Flatten(input_shape=(1,) + env.observation_space.shape),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(16),
    Activation('relu'),
    Dense(nb_actions),
    Activation('linear'),
])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line after the table.
actor.summary()

# Critic network: takes an action and an observation as two separate inputs,
# concatenates them, and reduces them through three hidden layers of 32 ReLU
# units to a single linear output.
action_input = Input(shape=(nb_actions,), name='action_input')
observation_input = Input(shape=(1,) + env.observation_space.shape, name='observation_input')
flattened_observation = Flatten()(observation_input)
x = Concatenate()([action_input, flattened_observation])
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(32)(x)
x = Activation('relu')(x)
x = Dense(1)(x)
x = Activation('linear')(x)
critic = Model(inputs=[action_input, observation_input], outputs=x)
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line after the table.
critic.summary()

# Memory and the random process passed to the agent below.
memory = SequentialMemory(limit=100000, window_length=1)
random_process = OrnsteinUhlenbeckProcess(
    size=nb_actions,
    theta=.15,
    mu=0.,
    sigma=.3,
)

# Assemble the DDPG agent from the actor, the critic (together with the
# critic's action input tensor), the memory and the random process, then
# compile it with a norm-clipped Adam optimizer and the 'mae' metric.
agent = DDPGAgent(
    nb_actions=nb_actions,
    actor=actor,
    critic=critic,
    critic_action_input=action_input,
    memory=memory,
    nb_steps_warmup_critic=100,
    nb_steps_warmup_actor=100,
    random_process=random_process,
    gamma=.99,
    target_model_update=1e-3,
)
agent.compile(Adam(lr=.001, clipnorm=1.), metrics=['mae'])

# Train for 50000 steps, capping every episode at 200 steps. Rendering is
# switched off here (visualize=False); turning it on slows training down quite
# a lot. Training can be aborted early by interrupting it (Ctrl + C, or
# interrupting the kernel).
agent.fit(
    env,
    nb_steps=50000,
    visualize=False,
    verbose=1,
    nb_max_episode_steps=200,
)

# Once training is done, write the final weights to disk, replacing any
# existing file of the same name.
agent.save_weights(
    'ddpg_{}_weights.h5f'.format(ENV_NAME),
    overwrite=True,
)





[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 3)                 0         
_________________________________________________________________
dense_9 (Dense)              (None, 16)                64        
_________________________________________________________________
activation_9 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 16)                272       
_________________________________________________________________
activation_10 (Activation)   (None, 16)                0         
__________________________________________________________

In [1]:
# Finally, evaluate our algorithm for 5 episodes (rendering enabled, each
# episode capped at 200 steps).
# Requires `agent` from the DDPG training cell: running this cell on a fresh
# kernel before that one raises NameError.
agent.test(env, nb_episodes=5, visualize=True, nb_max_episode_steps=200)

NameError: name 'agent' is not defined

In [None]:
# Credits
# @misc{plappert2016kerasrl,
#     author = {Matthias Plappert},
#     title = {keras-rl},
#     year = {2016},
#     publisher = {GitHub},
#     journal = {GitHub repository},
#     howpublished = {\url{https://github.com/keras-rl/keras-rl}},
# }