In [47]:
import os

import gym
import keras
import numpy as np
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy

In [48]:
# Fixed seed so exploration noise and environment resets are repeatable
# from run to run (GPU kernels may still add some nondeterminism).
SEED = 123

# Create the CartPole environment and seed both numpy and the environment.
env = gym.make('CartPole-v1')
np.random.seed(SEED)
env.seed(SEED)  # NOTE(review): classic gym API; newer gym/gymnasium uses env.reset(seed=...)

# Report the action and observation spaces the network has to match.
print(f"Action Space : {env.action_space} \n Observation Space : {env.observation_space}")

Action Space : Discrete(2) 
 Observation Space : Box(4,)


In [49]:
# Q-network: flattens a (1, observation_dim) input, passes it through two
# 24-unit ReLU layers, and emits one linear output per discrete action.
# The leading 1 in the input shape presumably matches window_length=1 used
# by the agent's replay memory -- confirm if the window length changes.
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=(1, env.observation_space.shape[0])))
model.add(keras.layers.Dense(24, activation='relu'))
model.add(keras.layers.Dense(24, activation='relu'))
model.add(keras.layers.Dense(env.action_space.n, activation='linear'))

# Print the layer/parameter table.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_2 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_4 (Dense)              (None, 24)                120       
_________________________________________________________________
dense_5 (Dense)              (None, 24)                600       
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 50        
Total params: 770
Trainable params: 770
Non-trainable params: 0
_________________________________________________________________


In [10]:
# Short trial run of the agent with on-screen rendering.
# The step budget and the logging interval share one constant: a logging
# interval larger than the total number of steps can never complete.
SMOKE_STEPS = 100

# Epsilon-greedy DQN agent wrapping `model`, with a 50000-transition replay
# memory and 10 warm-up steps.
dqn = DQNAgent(model=model,
               policy=EpsGreedyQPolicy(),
               memory=SequentialMemory(limit=50000, window_length=1),
               nb_actions=env.action_space.n,
               nb_steps_warmup=10)
dqn.compile(optimizer=keras.optimizers.Adam(),
            metrics=[keras.metrics.mean_absolute_error])

# NOTE(review): the recorded run of this cell ended in
# "InternalError: GPU sync failed"; that looks like a runtime/driver problem
# rather than something in this code -- confirm on a fresh kernel.
dqn.fit(env=env,
        nb_steps=SMOKE_STEPS,
        visualize=True,
        nb_max_episode_steps=200,
        verbose=1,
        log_interval=SMOKE_STEPS)

InternalError: GPU sync failed

In [54]:
# Components of the training agent: replay memory, exploration policy and
# the number of discrete actions exposed by the environment.
memory = SequentialMemory(window_length=1, limit=50000)
policy = EpsGreedyQPolicy()
nb_actions = env.action_space.n

# Plain (non-double) DQN agent wrapping the shared `model`.
dqn = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    nb_steps_warmup=10,
    target_model_update=1e-2,
    enable_double_dqn=False,
)
dqn.compile(optimizer=keras.optimizers.Adam(lr=1e-3), metrics=['mae'])

# Train for 5000 steps, logging every 200 steps. Rendering is switched off
# here (visualize=False); turning it on shows the episodes but slows
# training down considerably.
dqn.fit(env=env, nb_steps=5000, visualize=False, verbose=1, log_interval=200)

Training for 5000 steps ...
Interval 1 (0 steps performed)
1 episodes - episode_reward: 174.000 [174.000, 174.000] - loss: 0.257 - mean_absolute_error: 24.366 - mean_q: 48.748

Interval 2 (200 steps performed)
1 episodes - episode_reward: 40.000 [40.000, 40.000] - loss: 3.758 - mean_absolute_error: 24.707 - mean_q: 49.354

Interval 3 (400 steps performed)
1 episodes - episode_reward: 294.000 [294.000, 294.000] - loss: 3.007 - mean_absolute_error: 25.827 - mean_q: 51.816

Interval 4 (600 steps performed)
1 episodes - episode_reward: 245.000 [245.000, 245.000] - loss: 2.264 - mean_absolute_error: 26.484 - mean_q: 53.223

Interval 5 (800 steps performed)
1 episodes - episode_reward: 205.000 [205.000, 205.000] - loss: 3.068 - mean_absolute_error: 27.086 - mean_q: 54.376

Interval 6 (1000 steps performed)
1 episodes - episode_reward: 201.000 [201.000, 201.000] - loss: 3.639 - mean_absolute_error: 27.661 - mean_q: 55.542

Interval 7 (1200 steps performed)
1 episodes - episode_reward: 212.000

<keras.callbacks.History at 0x139c147ccc0>

In [55]:
# Evaluate the trained agent on five rendered episodes.
dqn.test(env=env, visualize=True, nb_episodes=5)


Testing for 5 episodes ...
Episode 1: reward: 216.000, steps: 216
Episode 2: reward: 235.000, steps: 235
Episode 3: reward: 236.000, steps: 236
Episode 4: reward: 208.000, steps: 208
Episode 5: reward: 213.000, steps: 213


<keras.callbacks.History at 0x139bec5da58>

In [33]:
# Create the checkpoint directory. exist_ok=True keeps this cell re-runnable:
# it does nothing if the directory is already there, whereas a shell
# `mkdir` would report an error.
os.makedirs('weights', exist_ok=True)

In [35]:
# Persist the agent's network weights. overwrite=True replaces any earlier
# checkpoint at this path, so the file always reflects the agent as it is
# now instead of keeping weights from a previous run.
dqn.save_weights(filepath='weights/6_1.h5', overwrite=True)

In [56]:
# Evaluation agent used to check that saved weights can be restored.
# It gets its own copy of the network: clone_model rebuilds the same
# architecture with freshly initialized weights. Wrapping the original
# `model` object instead would make `test` and `dqn` share one network, so
# loading a checkpoint into `test` would also overwrite `dqn`'s weights and
# would not demonstrate that the checkpoint itself carries the behaviour.
test = DQNAgent(model=keras.models.clone_model(model),
                policy=EpsGreedyQPolicy(),
                memory=SequentialMemory(limit=50000, window_length=1),
                nb_actions=env.action_space.n,
                nb_steps_warmup=10)
test.compile(optimizer=keras.optimizers.Adam(),
             metrics=[keras.metrics.mean_absolute_error])

In [57]:
# Restore the saved network weights into the `test` agent.
test.load_weights('weights/6_1.h5')

In [59]:
# Run ten rendered evaluation episodes with the restored weights.
test.test(env=env, visualize=True, nb_episodes=10)

Testing for 10 episodes ...
Episode 1: reward: 500.000, steps: 500
Episode 2: reward: 500.000, steps: 500
Episode 3: reward: 500.000, steps: 500
Episode 4: reward: 500.000, steps: 500
Episode 5: reward: 500.000, steps: 500
Episode 6: reward: 500.000, steps: 500
Episode 7: reward: 500.000, steps: 500
Episode 8: reward: 500.000, steps: 500
Episode 9: reward: 500.000, steps: 500
Episode 10: reward: 500.000, steps: 500


<keras.callbacks.History at 0x139b81e9be0>