In [2]:
import numpy as np
import gym

In [3]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

In [1]:
from rl.agents.dqn import DQNAgent
from rl.policy import EpsGreedyQPolicy
from rl.memory import SequentialMemory

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
# Environment setup: create the Gym environment and read the size of its
# action space, then fix the random seeds.
ENV_NAME = 'CartPole-v0'
env = gym.make(ENV_NAME)
nb_actions = env.action_space.n  # used later as the network's output width

# Seed NumPy's global RNG and the environment with the same fixed value.
np.random.seed(123)
env.seed(123)

In [6]:
# Build the network: one hidden layer of 16 ReLU units followed by a linear
# output layer with one unit per action (nb_actions).
model = Sequential()
# The input shape is (1,) + the observation shape; Flatten collapses that
# leading axis. The leading 1 presumably matches the window_length=1 used by
# the agent's memory -- confirm if window_length is ever changed.
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(16))
model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line after the table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                80        
_________________________________________________________________
activation_2 (Activation)    (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 34        
_________________________________________________________________
activation_3 (Activation)    (None, 2)                 0         
Total params: 114
Trainable params: 114
Non-trainable params: 0
_________________________________________________________________
None


In [8]:
# Configure the DQN agent: epsilon-greedy action selection on top of the
# Q-network, with a sequential replay memory holding up to 50000 entries.
policy = EpsGreedyQPolicy()
memory = SequentialMemory(limit=50000, window_length=1)
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    policy=policy,
)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

# Train for 5000 environment steps; verbose=2 logs one line per episode.
# fit() returns a Keras History object. Binding it keeps the recorded
# training metrics available to later cells instead of leaving only an
# opaque "<keras.callbacks.History at 0x...>" repr as the cell output.
history = dqn.fit(env, nb_steps=5000, verbose=2)

Training for 5000 steps ...




   11/5000: episode: 1, duration: 1.126s, episode steps: 11, steps per second: 10, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.111 [-3.316, 2.196], loss: --, mean_absolute_error: --, mean_q: --
   21/5000: episode: 2, duration: 0.080s, episode steps: 10, steps per second: 126, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.167 [-1.905, 3.089], loss: 0.576204, mean_absolute_error: 0.973066, mean_q: 0.925027
   31/5000: episode: 3, duration: 0.063s, episode steps: 10, steps per second: 158, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.157 [-3.116, 1.934], loss: 0.451600, mean_absolute_error: 0.805644, mean_q: 0.875803
   39/5000: episode: 4, duration: 0.058s, episode steps: 8, steps per second: 139, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 




   48/5000: episode: 5, duration: 0.062s, episode steps: 9, steps per second: 146, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.154 [-1.753, 2.794], loss: 0.414353, mean_absolute_error: 0.788617, mean_q: 1.187130
   57/5000: episode: 6, duration: 0.053s, episode steps: 9, steps per second: 170, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.889 [0.000, 1.000], mean observation: -0.134 [-2.454, 1.608], loss: 0.355994, mean_absolute_error: 0.673336, mean_q: 1.122542
   66/5000: episode: 7, duration: 0.062s, episode steps: 9, steps per second: 146, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.138 [-2.817, 1.757], loss: 0.434367, mean_absolute_error: 0.714431, mean_q: 1.301025
   75/5000: episode: 8, duration: 0.065s, episode steps: 9, steps per second: 138, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 

  351/5000: episode: 36, duration: 0.056s, episode steps: 10, steps per second: 179, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.900 [0.000, 1.000], mean observation: -0.131 [-2.596, 1.586], loss: 0.813166, mean_absolute_error: 1.209408, mean_q: 3.258272
  361/5000: episode: 37, duration: 0.051s, episode steps: 10, steps per second: 196, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.131 [-3.004, 1.973], loss: 0.766104, mean_absolute_error: 1.250832, mean_q: 3.253578
  369/5000: episode: 38, duration: 0.053s, episode steps: 8, steps per second: 150, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 1.000 [1.000, 1.000], mean observation: -0.138 [-2.490, 1.562], loss: 0.643470, mean_absolute_error: 1.245947, mean_q: 3.205327
  381/5000: episode: 39, duration: 0.062s, episode steps: 12, steps per second: 194, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mea

  659/5000: episode: 67, duration: 0.059s, episode steps: 10, steps per second: 169, episode reward: 10.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.800 [0.000, 1.000], mean observation: -0.121 [-2.377, 1.584], loss: 0.572910, mean_absolute_error: 2.239264, mean_q: 4.702991
  668/5000: episode: 68, duration: 0.049s, episode steps: 9, steps per second: 185, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.889 [0.000, 1.000], mean observation: -0.147 [-2.257, 1.386], loss: 0.586389, mean_absolute_error: 2.246960, mean_q: 4.717199
  677/5000: episode: 69, duration: 0.050s, episode steps: 9, steps per second: 178, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.889 [0.000, 1.000], mean observation: -0.124 [-2.235, 1.387], loss: 0.542446, mean_absolute_error: 2.234447, mean_q: 4.752459
  688/5000: episode: 70, duration: 0.089s, episode steps: 11, steps per second: 123, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean 

  960/5000: episode: 96, duration: 0.049s, episode steps: 8, steps per second: 163, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.875 [0.000, 1.000], mean observation: -0.142 [-2.168, 1.356], loss: 0.382800, mean_absolute_error: 2.712770, mean_q: 5.726696
  975/5000: episode: 97, duration: 0.090s, episode steps: 15, steps per second: 166, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.061 [-2.222, 1.585], loss: 0.435596, mean_absolute_error: 2.707220, mean_q: 5.732056
  986/5000: episode: 98, duration: 0.060s, episode steps: 11, steps per second: 185, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.727 [0.000, 1.000], mean observation: -0.117 [-2.105, 1.359], loss: 0.672781, mean_absolute_error: 2.798894, mean_q: 5.789237
  995/5000: episode: 99, duration: 0.065s, episode steps: 9, steps per second: 139, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean 

 1265/5000: episode: 126, duration: 0.053s, episode steps: 9, steps per second: 169, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.120 [-1.648, 1.016], loss: 0.318958, mean_absolute_error: 3.334663, mean_q: 6.592852
 1277/5000: episode: 127, duration: 0.084s, episode steps: 12, steps per second: 143, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.092 [-1.730, 1.216], loss: 0.366692, mean_absolute_error: 3.207676, mean_q: 6.265222
 1289/5000: episode: 128, duration: 0.086s, episode steps: 12, steps per second: 140, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.667 [0.000, 1.000], mean observation: -0.113 [-1.760, 1.153], loss: 0.404219, mean_absolute_error: 3.278796, mean_q: 6.342018
 1301/5000: episode: 129, duration: 0.083s, episode steps: 12, steps per second: 145, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000],

 1712/5000: episode: 158, duration: 0.067s, episode steps: 8, steps per second: 120, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.153 [-1.538, 2.552], loss: 0.819311, mean_absolute_error: 4.340707, mean_q: 8.221566
 1727/5000: episode: 159, duration: 0.092s, episode steps: 15, steps per second: 162, episode reward: 15.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.200 [0.000, 1.000], mean observation: 0.099 [-1.706, 2.719], loss: 1.571523, mean_absolute_error: 4.511755, mean_q: 8.537303
 1735/5000: episode: 160, duration: 0.046s, episode steps: 8, steps per second: 172, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.000 [0.000, 0.000], mean observation: 0.166 [-1.578, 2.590], loss: 1.271868, mean_absolute_error: 4.577639, mean_q: 8.604052
 1743/5000: episode: 161, duration: 0.056s, episode steps: 8, steps per second: 142, episode reward: 8.000, mean reward: 1.000 [1.000, 1.000], mean a

 2061/5000: episode: 187, duration: 0.068s, episode steps: 9, steps per second: 131, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.222 [0.000, 1.000], mean observation: 0.126 [-1.409, 2.134], loss: 2.026814, mean_absolute_error: 5.331370, mean_q: 9.941029
 2070/5000: episode: 188, duration: 0.061s, episode steps: 9, steps per second: 147, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.222 [0.000, 1.000], mean observation: 0.157 [-1.197, 1.989], loss: 1.934086, mean_absolute_error: 5.314934, mean_q: 9.943865
 2079/5000: episode: 189, duration: 0.060s, episode steps: 9, steps per second: 151, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.111 [0.000, 1.000], mean observation: 0.121 [-1.348, 2.193], loss: 2.980314, mean_absolute_error: 5.467113, mean_q: 10.107325
 2088/5000: episode: 190, duration: 0.063s, episode steps: 9, steps per second: 143, episode reward: 9.000, mean reward: 1.000 [1.000, 1.000], mean ac

 2383/5000: episode: 218, duration: 0.075s, episode steps: 11, steps per second: 147, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.273 [0.000, 1.000], mean observation: 0.108 [-1.158, 1.829], loss: 3.150957, mean_absolute_error: 5.749426, mean_q: 10.501872
 2394/5000: episode: 219, duration: 0.064s, episode steps: 11, steps per second: 171, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.364 [0.000, 1.000], mean observation: 0.109 [-0.974, 1.561], loss: 3.416447, mean_absolute_error: 5.719017, mean_q: 10.337673
 2405/5000: episode: 220, duration: 0.085s, episode steps: 11, steps per second: 130, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.364 [0.000, 1.000], mean observation: 0.110 [-1.019, 1.581], loss: 2.972915, mean_absolute_error: 5.771017, mean_q: 10.521905
 2417/5000: episode: 221, duration: 0.081s, episode steps: 12, steps per second: 147, episode reward: 12.000, mean reward: 1.000 [1.000, 1.000

 2738/5000: episode: 249, duration: 0.101s, episode steps: 16, steps per second: 158, episode reward: 16.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.500 [0.000, 1.000], mean observation: 0.094 [-0.954, 1.440], loss: 1.949209, mean_absolute_error: 5.700992, mean_q: 10.550403
 2749/5000: episode: 250, duration: 0.064s, episode steps: 11, steps per second: 171, episode reward: 11.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.455 [0.000, 1.000], mean observation: 0.128 [-0.772, 1.248], loss: 2.351872, mean_absolute_error: 5.773325, mean_q: 10.736169
 2775/5000: episode: 251, duration: 0.147s, episode steps: 26, steps per second: 177, episode reward: 26.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.538 [0.000, 1.000], mean observation: 0.076 [-0.592, 0.885], loss: 2.110947, mean_absolute_error: 5.827948, mean_q: 10.873758
 2798/5000: episode: 252, duration: 0.139s, episode steps: 23, steps per second: 165, episode reward: 23.000, mean reward: 1.000 [1.000, 1.000

 4162/5000: episode: 278, duration: 0.424s, episode steps: 64, steps per second: 151, episode reward: 64.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.469 [0.000, 1.000], mean observation: -0.095 [-0.763, 0.347], loss: 1.994943, mean_absolute_error: 6.932982, mean_q: 13.010740
 4284/5000: episode: 279, duration: 0.832s, episode steps: 122, steps per second: 147, episode reward: 122.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.484 [0.000, 1.000], mean observation: -0.061 [-0.733, 0.594], loss: 2.477951, mean_absolute_error: 7.008595, mean_q: 13.104886
 4318/5000: episode: 280, duration: 0.240s, episode steps: 34, steps per second: 141, episode reward: 34.000, mean reward: 1.000 [1.000, 1.000], mean action: 0.441 [0.000, 1.000], mean observation: -0.133 [-0.756, 0.444], loss: 1.990718, mean_absolute_error: 7.177884, mean_q: 13.540406
 4389/5000: episode: 281, duration: 0.474s, episode steps: 71, steps per second: 150, episode reward: 71.000, mean reward: 1.000 [1.000, 

<keras.callbacks.History at 0x2b17e646c18>