In [1]:
import numpy as np

In [2]:
from tensorforce.agents import PPOAgent
from tensorforce.execution import Runner, MultiRunner
from tensorforce.environments import OpenAIGym

In [3]:
from leduc import LeducEnv
from leduc.util import get_safe_action

In [4]:
# Create the Tensorforce OpenAIGym environment wrapper for the 'Leduc-v0' id.
# NOTE(review): 'Leduc-v0' is presumably registered with Gym as a side effect
# of importing `leduc` above -- confirm before reordering the imports.
environment = OpenAIGym('Leduc-v0')

In [None]:
# Create the agents
def make_ppo_agent(env):
    """Build one PPO agent configured for the given environment.

    Both players use the same hyperparameters, so the configuration is kept
    in a single place instead of being copy-pasted once per agent.

    Args:
        env: Environment whose ``states()`` and ``actions()`` specifications
            are passed to the agent.

    Returns:
        A newly constructed ``PPOAgent``.
    """
    return PPOAgent(
        states=env.states(), actions=env.actions(),
        # Automatically configured network
        network='auto',
        # Memory sampling most recent experiences, with a capacity of 6100 timesteps
        # (6100 > [30 batch episodes] * [200 max timesteps per episode])
        memory=6100,
        # Update every 10 episodes, with a batch of 30 episodes
        update_mode=dict(unit='episodes', batch_size=30, frequency=10),
        # PPO optimizer
        step_optimizer=dict(type='adam', learning_rate=1e-3),
        # PPO multi-step optimization: 10 updates, each based on a third of the batch
        subsampling_fraction=0.33, optimization_steps=10,
        # MLP baseline
        baseline_mode='states', baseline=dict(type='network', network='auto'),
        # Baseline optimizer
        baseline_optimizer=dict(
            type='multi_step', optimizer=dict(type='adam', learning_rate=1e-4), num_steps=5
        ),
        # Other parameters
        discount=0.99, entropy_regularization=1e-2, gae_lambda=None, likelihood_ratio_clipping=0.2
    )


# Two separately constructed agents that share the configuration above
agent1 = make_ppo_agent(environment)
agent2 = make_ppo_agent(environment)

In [None]:
# Set up the runner that drives both agents against the shared environment
runner = MultiRunner(
    environment=environment,
    agents=[agent1, agent2],
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [None]:
# Progress callback: reports episode statistics once every 100 episodes
def episode_finished(r):
    """Print a one-line summary on every 100th episode.

    Args:
        r: Object exposing ``episode``, ``episode_timestep`` and
            ``episode_rewards``; the last two are only read when a summary
            line is actually printed.

    Returns:
        Always ``True``.
    """
    if r.episode % 100 == 0:
        summary = "Finished episode {ep} after {ts} timesteps (reward: {reward})"
        latest_reward = r.episode_rewards[-1]
        print(summary.format(ep=r.episode, ts=r.episode_timestep, reward=latest_reward))
    return True

In [None]:
# Start learning: 500 episodes, each capped at 200 timesteps
# NOTE(review): the `episode_finished` callback defined in this notebook is
# never handed to the runner, so no per-episode progress is printed -- confirm
# whether MultiRunner.run accepts a callback argument and pass it if so.
try:
    runner.run(num_episodes=500, max_episode_timesteps=200)
finally:
    # Always close the runner, even when training raises or the cell is interrupted
    runner.close()

In [None]:
# Print statistics
print("Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}.".format(
    ep=runner.episode,
    # Average only the trailing 100 entries so the number matches the message;
    # with fewer than 100 recorded entries the slice covers all of them.
    ar=np.mean(runner.episode_rewards[-100:]))
)