In [1]:
import gym 
import numpy as np
import tensorflow as tf

In [2]:
from rl.agents import DQNAgent
from rl.policy import EpsGreedyQPolicy, LinearAnnealedPolicy
from rl.memory import SequentialMemory

In [3]:
# The environment id lives in its own variable because it is reused further
# down to build the file name of the saved weights.
env_name = "Acrobot-v1"
env = gym.make(env_name)

In [6]:
# Sanity check: drive the environment with uniformly random actions for 200
# rendered steps to confirm that stepping and rendering work.
env.reset()
try:
    for _ in range(200):
        env.render()
        # Classic Gym step() returns (observation, reward, done, info).
        _obs, _reward, done, _info = env.step(env.action_space.sample())  # take a random action
        if done:
            # Stepping a finished episode is undefined behaviour in Gym, so
            # start a fresh episode before continuing.
            env.reset()
finally:
    # Always release the render window, even if render()/step() raised.
    env.close()

In [7]:
# Sizes that determine the network's output and input layers.
n_actions = env.action_space.n         # number of discrete actions
n_obs = env.observation_space.shape    # observation shape, as a tuple
print(n_actions, n_obs, sep='\n')

3
(6,)


In [8]:
# Q-network: maps one observation to one Q-value per discrete action.
# The leading 1 in the input shape is the observation-window dimension and has
# to agree with the `window_length=1` of the replay memory created further
# down; Flatten collapses (1,) + n_obs into a flat feature vector.
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(1,) + n_obs),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    # Linear head: Q-values are unbounded regression targets.
    tf.keras.layers.Dense(n_actions, activation='linear')
])

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and makes the Keras optimizer constructor emit a warning.
# NOTE(review): the agent is compiled again via dqn.compile() later on, so this
# compile step may be redundant -- confirm before removing.
model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten (Flatten)           (None, 6)                 0         
                                                                 
 dense (Dense)               (None, 64)                448       
                                                                 
 dense_1 (Dense)             (None, 64)                4160      
                                                                 
 dense_2 (Dense)             (None, 64)                4160      
                                                                 
 dense_3 (Dense)             (None, 3)                 195       
                                                                 
Total params: 8,963
Trainable params: 8,963
Non-trainable params: 0
_________________________________________________________________


  super(Adam, self).__init__(name, **kwargs)


In [9]:
# Replay buffer capped at 50,000 entries; window_length=1 means each state
# handed to the network consists of a single observation.
memory = SequentialMemory(limit=50000, window_length=1)

# Epsilon-greedy exploration: the wrapped policy's `eps` attribute is annealed
# linearly from 1.0 down to 0.1 over 150,000 steps; 0.05 is the value
# configured for test runs.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.0,
    value_min=0.1,
    value_test=0.05,
    nb_steps=150000,
)

In [10]:
# Assemble the DQN agent from the Q-network, the replay memory and the
# exploration policy defined in the previous cells.
dqn = DQNAgent(
    model=model,
    nb_actions=n_actions,
    memory=memory,
    policy=policy,
    nb_steps_warmup=1000,       # warm-up steps before training updates begin
    target_model_update=1000,   # target-network update setting
    batch_size=32,
    gamma=0.99,                 # discount factor
)

In [11]:
# Compile the agent with Adam and track mean absolute error as a metric.
# `learning_rate` replaces the deprecated `lr` alias, which made the Keras
# optimizer constructor emit a warning.
dqn.compile(tf.keras.optimizers.Adam(learning_rate=0.001), metrics=['mae'])

  super(Adam, self).__init__(name, **kwargs)


In [12]:
# Train for 150,000 environment steps without rendering; verbose=1 prints
# per-interval progress. The call is left as the cell's final expression so the
# returned History object is shown as the cell output.
dqn.fit(
    env,
    nb_steps=150000,
    visualize=False,
    verbose=1,
)

Training for 150000 steps ...
Interval 1 (0 steps performed)
   79/10000 [..............................] - ETA: 12s - reward: -1.0000

  updates=self.state_updates,


20 episodes - episode_reward: -486.450 [-500.000, -310.000] - loss: 0.024 - mae: 3.335 - mean_q: -4.918 - mean_eps: 0.967

Interval 2 (10000 steps performed)
20 episodes - episode_reward: -500.000 [-500.000, -500.000] - loss: 0.144 - mae: 8.338 - mean_q: -12.316 - mean_eps: 0.910

Interval 3 (20000 steps performed)
21 episodes - episode_reward: -488.143 [-500.000, -369.000] - loss: 0.327 - mae: 12.602 - mean_q: -18.643 - mean_eps: 0.850

Interval 4 (30000 steps performed)
23 episodes - episode_reward: -423.261 [-500.000, -237.000] - loss: 0.485 - mae: 16.112 - mean_q: -23.846 - mean_eps: 0.790

Interval 5 (40000 steps performed)
27 episodes - episode_reward: -365.259 [-500.000, -204.000] - loss: 0.677 - mae: 19.024 - mean_q: -28.106 - mean_eps: 0.730

Interval 6 (50000 steps performed)
33 episodes - episode_reward: -312.061 [-500.000, -180.000] - loss: 0.777 - mae: 21.458 - mean_q: -31.618 - mean_eps: 0.670

Interval 7 (60000 steps performed)
41 episodes - episode_reward: -242.000 [-47

<keras.callbacks.History at 0x247071df4f0>

In [13]:
# Persist the trained weights under a file name derived from the environment
# id, replacing any existing file with the same name.
weights_path = 'dqn_{}_weights.h5f'.format(env_name)
dqn.save_weights(weights_path, overwrite=True)

In [17]:
# Evaluate the trained agent on five rendered episodes. The try/finally makes
# sure the render window is released even if evaluation raises.
try:
    dqn.test(env, nb_episodes=5, visualize=True)
finally:
    env.close()

Testing for 5 episodes ...
Episode 1: reward: -71.000, steps: 72
Episode 2: reward: -84.000, steps: 85
Episode 3: reward: -110.000, steps: 111
Episode 4: reward: -81.000, steps: 82
Episode 5: reward: -81.000, steps: 82
