In [None]:
!pip install keras-rl2, tensorflow

In [12]:
import gym
import rl
from rl.memory import SequentialMemory

# balance exploration and exploitation
from rl.policy import EpsGreedyQPolicy
# to decay our eps
from rl.policy import LinearAnnealedPolicy
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Activation, Flatten, Input
from tensorflow.keras.optimizers import Adam

In [14]:
env = gym.make('CartPole-v0')
state_size = env.observation_space.shape[0]  # length of one observation vector (features per state), not a count of distinct states
action_size = env.action_space.n  # number of discrete actions; becomes the width of the model's output layer

In [15]:
def build_model(state_size, num_actions, window_length=1):
    """Build the feed-forward Q-network used by the DQN agent.

    Args:
        state_size: Length of a single observation vector.
        num_actions: Number of discrete actions; the network emits one
            linear output per action.
        window_length: Number of observations stacked into one input.
            Should match the ``window_length`` given to the agent's memory.
            Defaults to 1, which reproduces the original fixed input shape.

    Returns:
        An uncompiled Keras ``Model`` mapping inputs of shape
        ``(window_length, state_size)`` to ``num_actions`` outputs.
    """
    # Named `observations` rather than `input` so the builtin is not shadowed.
    observations = Input(shape=(window_length, state_size))
    x = Flatten()(observations)
    # Three identical hidden layers of 16 ReLU units.
    for _ in range(3):
        x = Dense(16, activation='relu')(x)
    q_values = Dense(num_actions, activation='linear')(x)
    model = Model(inputs=observations, outputs=q_values)
    # summary() prints the table itself and returns None, so wrapping it in
    # print() would emit a stray "None" after the table.
    model.summary()
    return model

In [46]:
model=build_model(state_size, action_size)
# To reuse a previously saved model instead of building a fresh one, run the
# line below (it needs `import tensorflow as tf`, which this notebook does not import):
# model = tf.keras.models.load_model('./models/model.h5')

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 1, 4)]            0         
                                                                 
 flatten_4 (Flatten)         (None, 4)                 0         
                                                                 
 dense_16 (Dense)            (None, 16)                80        
                                                                 
 dense_17 (Dense)            (None, 16)                272       
                                                                 
 dense_18 (Dense)            (None, 16)                272       
                                                                 
 dense_19 (Dense)            (None, 2)                 34        
                                                                 
Total params: 658
Trainable params: 658
Non-trainable param

In [47]:
# Experience memory capped at 50000 entries. window_length=1 corresponds to the
# leading 1 in the model's Input shape (1, state_size).
memory = SequentialMemory(limit=50000, window_length=1)

In [48]:
# Epsilon-greedy exploration whose `eps` attribute is annealed linearly from
# value_max down to value_min over nb_steps steps; value_test is used when testing.
# NOTE(review): the training cell runs fit() for only 2500 steps, so within a
# single fit call eps will not reach value_min — confirm this is intended.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=1.,
    value_min=.1,
    value_test=.05,
    nb_steps=10000,
)

In [49]:
from rl.agents.dqn import DQNAgent

# Combine the Q-network, the experience memory and the exploration policy
# into a single DQN agent.
dqn = DQNAgent(
    model=model,
    nb_actions=action_size,
    memory=memory,
    policy=policy,
    nb_steps_warmup=10,        # steps taken before learning updates begin
    target_model_update=1e-2,  # presumably a soft-update rate since it is < 1; verify against keras-rl docs
)

In [50]:
# Compile the agent with Adam (learning rate 1e-3) and report mean absolute
# error as an extra metric in the training log.
dqn.compile(
    optimizer=Adam(learning_rate=1e-3),
    metrics=['mae'],
)

In [59]:
# Train for 2500 environment steps without rendering; verbose=2 logs one line
# per episode. This cell can be skipped when a previously saved model is used.
dqn.fit(env, nb_steps=2500, visualize=False, verbose=2)

Training for 2500 steps ...
   48/2500: episode: 1, duration: 0.284s, episode steps:  48, steps per second: 169, episode reward: 48.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.479 [0.000, 1.000],  loss: 6.639272, mae: 31.279091, mean_q: 64.606182, mean_eps: 0.997390
   56/2500: episode: 2, duration: 0.049s, episode steps:   8, steps per second: 162, episode reward:  8.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.125 [0.000, 1.000],  loss: 4.832373, mae: 31.562858, mean_q: 64.874635, mean_eps: 0.995365
   74/2500: episode: 3, duration: 0.093s, episode steps:  18, steps per second: 194, episode reward: 18.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.500 [0.000, 1.000],  loss: 2.981705, mae: 31.341058, mean_q: 65.177416, mean_eps: 0.994195
  105/2500: episode: 4, duration: 0.165s, episode steps:  31, steps per second: 188, episode reward: 31.000, mean reward:  1.000 [ 1.000,  1.000], mean action: 0.581 [0.000, 1.000],  loss: 5.949092, mae: 32.209021

<keras.callbacks.History at 0x7f5f54146370>

In [60]:
# Save the Keras network to the same path that the commented-out load_model
# line in the model-building cell reads from.
model.save('./models/model.h5')

In [61]:
# Run the agent for 10 test episodes with rendering enabled (visualize=True).
dqn.test(env, nb_episodes=10, visualize=True)

Testing for 10 episodes ...
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
Episode 6: reward: 200.000, steps: 200
Episode 7: reward: 200.000, steps: 200
Episode 8: reward: 200.000, steps: 200
Episode 9: reward: 180.000, steps: 180
Episode 10: reward: 200.000, steps: 200


<keras.callbacks.History at 0x7f5f541465b0>