## ATARI Asteroids DQN_gym with keras-rl

In [29]:
import numpy as no
import gym
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam

from rl.agents.dqn import DQNAgent
from rl.agents.ddpg import DDPGAgent
from rl.policy import BoltzmannQPolicy , LinearAnnealedPolicy , EpsGreedyQPolicy
from rl.memory import SequentialMemory

In [2]:
ENV_NAME_2 = 'Asteroids-v0'

In [3]:
# Get the environment and extract the number of actions
env = gym.make(ENV_NAME_2)
nb_actions = env.action_space.n
nb_actions

14

In [4]:
# Next, we build a neural network model
model = Sequential()
model.add(Flatten(input_shape=(1,) + env.observation_space.shape))
model.add(Dense(3, activation= 'tanh')) # One layer of 3 units with tanh activation function 
model.add(Dense(nb_actions))
model.add(Activation('sigmoid')) # one layer of 1 unit with sigmoid activation function
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_1 (Flatten)          (None, 100800)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 302403    
_________________________________________________________________
dense_2 (Dense)              (None, 14)                56        
_________________________________________________________________
activation_1 (Activation)    (None, 14)                0         
Total params: 302,459
Trainable params: 302,459
Non-trainable params: 0
_________________________________________________________________
None


In [5]:
#DQN -- Deep Reinforcement Learning 

#Configure and compile the agent. 
#Use every built-in Keras optimizer and metrics!
memory = SequentialMemory(limit=20000, window_length=1)
policy = BoltzmannQPolicy()
dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10,
              target_model_update=1e-2, policy=policy)
dqn.compile(Adam(lr=1e-3), metrics=['mae', 'acc'])

In [6]:
## Visualize the training during 20000 steps 
dqn.fit(env, nb_steps=20000, visualize=True, verbose=2)

Training for 20000 steps ...




   775/20000: episode: 1, duration: 25.397s, episode steps: 775, steps per second: 31, episode reward: 480.000, mean reward: 0.619 [0.000, 100.000], mean action: 6.365 [0.000, 13.000], mean observation: 2.692 [0.000, 240.000], loss: 10.828399, mean_absolute_error: 0.647379, mean_q: 0.806558
  2576/20000: episode: 2, duration: 54.979s, episode steps: 1801, steps per second: 33, episode reward: 1510.000, mean reward: 0.838 [0.000, 100.000], mean action: 6.421 [0.000, 13.000], mean observation: 2.110 [0.000, 240.000], loss: 23.661833, mean_absolute_error: 0.869386, mean_q: 0.936581
  3395/20000: episode: 3, duration: 24.540s, episode steps: 819, steps per second: 33, episode reward: 830.000, mean reward: 1.013 [0.000, 100.000], mean action: 6.560 [0.000, 13.000], mean observation: 2.360 [0.000, 240.000], loss: 27.443249, mean_absolute_error: 0.932018, mean_q: 0.973010
  4308/20000: episode: 4, duration: 27.419s, episode steps: 913, steps per second: 33, episode reward: 1180.000, mean rewa

<keras.callbacks.History at 0x121649748>

In [8]:
## Save the model 
dqn.save_weights('dqn_{}_weights.h5f'.format(ENV_NAME_2), overwrite=True)

In [9]:
# Evaluate the algorithm for 10 episodes 
dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...
Episode 1: reward: 80.000, steps: 1934
Episode 2: reward: 80.000, steps: 1935
Episode 3: reward: 80.000, steps: 1939
Episode 4: reward: 80.000, steps: 1937
Episode 5: reward: 80.000, steps: 1904
Episode 6: reward: 80.000, steps: 1912
Episode 7: reward: 80.000, steps: 1939
Episode 8: reward: 80.000, steps: 1924
Episode 9: reward: 80.000, steps: 1947
Episode 10: reward: 80.000, steps: 1928


<keras.callbacks.History at 0x141763438>

In [10]:
### Another Policy with dqn 

In [21]:
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr="eps", value_max=.8, value_min=.01,
                              value_test=.0,
                              nb_steps=100000)
dqn = DQNAgent(model=model, nb_actions=nb_actions,  nb_steps_warmup=10, 
               policy=policy, test_policy=policy, memory = memory,
               target_model_update=1e-2)

In [22]:
dqn.compile(Adam(lr=1e-3), metrics=['mae', 'acc'])

In [23]:
dqn.fit(env, nb_steps=20000, visualize=True, verbose=2)

Training for 20000 steps ...
   678/20000: episode: 1, duration: 20.570s, episode steps: 678, steps per second: 33, episode reward: 630.000, mean reward: 0.929 [0.000, 100.000], mean action: 7.835 [0.000, 13.000], mean observation: 2.334 [0.000, 240.000], loss: 33.277592, mean_absolute_error: 0.991395, acc: 0.104385, mean_q: 0.999981, mean_eps: 0.797282
  1696/20000: episode: 2, duration: 30.019s, episode steps: 1018, steps per second: 34, episode reward: 980.000, mean reward: 0.963 [0.000, 100.000], mean action: 7.811 [0.000, 13.000], mean observation: 2.143 [0.000, 240.000], loss: 29.905849, mean_absolute_error: 0.985209, acc: 0.111186, mean_q: 0.999962, mean_eps: 0.790627
  2221/20000: episode: 3, duration: 14.991s, episode steps: 525, steps per second: 35, episode reward: 780.000, mean reward: 1.486 [0.000, 100.000], mean action: 7.947 [0.000, 13.000], mean observation: 1.946 [0.000, 240.000], loss: 31.834536, mean_absolute_error: 0.989462, acc: 0.118512, mean_q: 0.999972, mean_eps

<keras.callbacks.History at 0x13400deb8>

In [24]:
dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...
Episode 1: reward: 110.000, steps: 680
Episode 2: reward: 110.000, steps: 692
Episode 3: reward: 110.000, steps: 685
Episode 4: reward: 110.000, steps: 682
Episode 5: reward: 110.000, steps: 675
Episode 6: reward: 110.000, steps: 676
Episode 7: reward: 110.000, steps: 683
Episode 8: reward: 110.000, steps: 682
Episode 9: reward: 110.000, steps: 680
Episode 10: reward: 110.000, steps: 679


<keras.callbacks.History at 0x1549c9fd0>

In [41]:
#SARSA Agent -- Deep Reinforcement Learning 
from rl.agents.sarsa import SARSAAgent
sarsa = SARSAAgent(model, nb_actions, 
                policy=None, test_policy=None, 
                gamma=0.99, nb_steps_warmup=10, 
                train_interval=1)
sarsa.compile(Adam(lr=1e-3), metrics=['mae', 'acc'])
sarsa.fit(env, nb_steps=20000, visualize=True, verbose=2)
sarsa.test(env, nb_episodes=10, visualize=True)

Training for 20000 steps ...
   533/20000: episode: 1, duration: 4.751s, episode steps: 533, steps per second: 112, episode reward: 340.000, mean reward: 0.638 [0.000, 100.000], mean action: 7.762 [0.000, 12.000], mean observation: 2.698 [0.000, 240.000], loss: 24.707138, mean_absolute_error: 0.975929, acc: 0.898467, mean_q: 1.000000
  1046/20000: episode: 2, duration: 3.683s, episode steps: 513, steps per second: 139, episode reward: 160.000, mean reward: 0.312 [0.000, 50.000], mean action: 11.986 [0.000, 13.000], mean observation: 2.752 [0.000, 240.000], loss: 6.052589, mean_absolute_error: 0.951730, acc: 0.898438, mean_q: 1.000000
  2008/20000: episode: 3, duration: 6.924s, episode steps: 962, steps per second: 139, episode reward: 830.000, mean reward: 0.863 [0.000, 100.000], mean action: 12.427 [0.000, 13.000], mean observation: 2.225 [0.000, 240.000], loss: 30.741153, mean_absolute_error: 0.991028, acc: 0.920916, mean_q: 1.000000
  2641/20000: episode: 4, duration: 4.364s, episod

 16374/20000: episode: 28, duration: 3.227s, episode steps: 468, steps per second: 145, episode reward: 260.000, mean reward: 0.556 [0.000, 150.000], mean action: 7.748 [0.000, 13.000], mean observation: 2.917 [0.000, 240.000], loss: 28.046947, mean_absolute_error: 0.969189, acc: 0.888651, mean_q: 1.000000
 16775/20000: episode: 29, duration: 2.769s, episode steps: 401, steps per second: 145, episode reward: 240.000, mean reward: 0.599 [0.000, 100.000], mean action: 7.878 [0.000, 13.000], mean observation: 3.306 [0.000, 240.000], loss: 19.745300, mean_absolute_error: 0.972301, acc: 0.900000, mean_q: 1.000000
 17502/20000: episode: 30, duration: 5.010s, episode steps: 727, steps per second: 145, episode reward: 560.000, mean reward: 0.770 [0.000, 100.000], mean action: 7.894 [0.000, 13.000], mean observation: 2.321 [0.000, 240.000], loss: 28.367681, mean_absolute_error: 0.984459, acc: 0.911846, mean_q: 1.000000
 18057/20000: episode: 31, duration: 3.828s, episode steps: 555, steps per s

<keras.callbacks.History at 0x154b79b70>