In [1]:
import numpy as np
import gym
from keras.models import Sequential
from keras.layers import Dense, Activation, Flatten
from keras.optimizers import Adam
from rl.agents import SARSAAgent
from rl.policy import EpsGreedyQPolicy

Using TensorFlow backend.


In [2]:
# Create the environment via gym.make, using the id named below
env_id = 'CartPole-v1'
env = gym.make(env_id)

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m


In [3]:
# Seed the environment first, then NumPy's global generator, with one shared value
seed_val = 456
for seed_fn in (env.seed, np.random.seed):
    seed_fn(seed_val)

In [4]:
# states:  length of the first axis of the observation space
# actions: the `n` attribute of the action space (used below as the number of actions)
states, actions = env.observation_space.shape[0], env.action_space.n

In [5]:
# define agent
def agent(states, actions, hidden_units=(16, 16, 16)):
    """Build the feed-forward network that is handed to the agent.

    The network flattens an input of shape ``(1, states)``, applies one
    ``Dense`` + ReLU pair per entry of ``hidden_units``, and finishes with a
    linear ``Dense`` layer that has ``actions`` outputs.

    Args:
        states: Number of features in a single observation.
        actions: Number of units in the final layer (one per action).
        hidden_units: Width of each hidden ``Dense`` layer, in order. The
            default reproduces the original three 16-unit layers.

    Returns:
        An uncompiled ``Sequential`` model.
    """
    model = Sequential()
    # The leading 1 is a singleton axis that Flatten removes.
    # NOTE(review): presumably this matches keras-rl's window_length of 1 — confirm.
    model.add(Flatten(input_shape=(1, states)))
    # One Dense + ReLU pair per requested hidden layer
    for units in hidden_units:
        model.add(Dense(units))
        model.add(Activation('relu'))
    # Linear output head: one unbounded value per action
    model.add(Dense(actions))
    model.add(Activation('linear'))
    return model

# Instantiate the network from the sizes read off the environment
model = agent(states=states, actions=actions)

In [6]:
# Define the policy: epsilon-greedy, constructed with the library defaults
policy = EpsGreedyQPolicy()

# Define SARSA agent by feeding it the policy and the model
sarsa = SARSAAgent(model=model, nb_actions=actions, nb_steps_warmup=10, policy=policy)

# Compile with the Adam optimizer. 'mse' is passed as a metric to report;
# this call does not pass a loss.
sarsa.compile('adam', metrics=['mse'])

# Train the agent for 50000 steps. Keep the returned History object so the
# recorded training metrics can be inspected later without retraining
# (previously the return value was only echoed as the cell's repr).
train_history = sarsa.fit(env, nb_steps=50000, visualize=False, verbose=1)

Training for 50000 steps ...
Interval 1 (0 steps performed)
319 episodes - episode_reward: 30.984 [8.000, 500.000] - loss: 7.445 - mean_squared_error: 552.109 - mean_q: 29.475

Interval 2 (10000 steps performed)
122 episodes - episode_reward: 82.467 [9.000, 435.000] - loss: 6.987 - mean_squared_error: 831.830 - mean_q: 39.221

Interval 3 (20000 steps performed)
81 episodes - episode_reward: 122.617 [14.000, 500.000] - loss: 10.447 - mean_squared_error: 1416.096 - mean_q: 52.411

Interval 4 (30000 steps performed)
75 episodes - episode_reward: 133.933 [15.000, 500.000] - loss: 6.933 - mean_squared_error: 1527.035 - mean_q: 54.172

Interval 5 (40000 steps performed)
done, took 470.141 seconds


<keras.callbacks.History at 0x120d44dd8>

In [7]:
# Run the trained agent for 100 evaluation episodes, without rendering.
scores = sarsa.test(env, nb_episodes=100, visualize=False)

# Average the per-episode rewards stored in the returned history and report it.
episode_rewards = scores.history['episode_reward']
mean_reward = np.mean(episode_rewards)
print('Average score over 100 test games: {}'.format(mean_reward))

Testing for 100 episodes ...
Episode 1: reward: 350.000, steps: 350
Episode 2: reward: 383.000, steps: 383
Episode 3: reward: 316.000, steps: 316
Episode 4: reward: 371.000, steps: 371
Episode 5: reward: 356.000, steps: 356
Episode 6: reward: 399.000, steps: 399
Episode 7: reward: 317.000, steps: 317
Episode 8: reward: 370.000, steps: 370
Episode 9: reward: 364.000, steps: 364
Episode 10: reward: 432.000, steps: 432
Episode 11: reward: 331.000, steps: 331
Episode 12: reward: 387.000, steps: 387
Episode 13: reward: 361.000, steps: 361
Episode 14: reward: 365.000, steps: 365
Episode 15: reward: 350.000, steps: 350
Episode 16: reward: 375.000, steps: 375
Episode 17: reward: 396.000, steps: 396
Episode 18: reward: 372.000, steps: 372
Episode 19: reward: 386.000, steps: 386
Episode 20: reward: 388.000, steps: 388
Episode 21: reward: 326.000, steps: 326
Episode 22: reward: 336.000, steps: 336
Episode 23: reward: 413.000, steps: 413
Episode 24: reward: 387.000, steps: 387
Episode 25: reward: 