In [2]:
import gym
import ray
from ray.rllib.agents.ppo import PPOTrainer, DEFAULT_CONFIG
from ray.tune.logger import pretty_print

# Reaching this line means every import above resolved without raising.
print("Successfully installed all the dependencies!")

Successfully installed all the dependencies!


# 1. Introduction to Policies with Proximal Policy Optimization
PPO alternates between two phases. In the first (sampling) phase, a large number of rollouts are performed
in parallel. In the second (optimization) phase, the rollouts are aggregated on the driver and a surrogate
optimization objective is defined based on them. We then use SGD to find the policy that maximizes that
objective, with a penalty term for diverging too much from the current policy.

In [3]:
# Start up Ray. This must be done before we instantiate any RL agents.
# NOTE(review): ignore_reinit_error=True presumably makes re-running this cell harmless
# when Ray is already up, and log_to_driver=False presumably keeps worker logs out of
# the notebook output -- confirm against the ray.init documentation.
ray.init(num_cpus=3, ignore_reinit_error=True, log_to_driver=False)

2019-12-21 16:05:25,124	INFO resource_spec.py:216 -- Starting Ray with 8.15 GiB memory available for workers and up to 4.08 GiB for objects. You can adjust these settings with ray.init(memory=<bytes>, object_store_memory=<bytes>).


{'node_ip_address': '192.168.42.189',
 'redis_address': '192.168.42.189:64105',
 'object_store_address': '/tmp/ray/session_2019-12-21_16-05-25_121690_12015/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2019-12-21_16-05-25_121690_12015/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2019-12-21_16-05-25_121690_12015'}

In [6]:
# Build the PPO configuration used for CartPole.
#
# DEFAULT_CONFIG.copy() is a *shallow* copy: the nested 'model' dict is still the
# very object stored in DEFAULT_CONFIG. Writing config['model'][...] in place would
# therefore change the defaults seen by every later DEFAULT_CONFIG.copy() in this
# notebook. The 'model' entry is replaced with a fresh dict so DEFAULT_CONFIG stays
# untouched.
config = DEFAULT_CONFIG.copy()
config['num_workers'] = 3
config['num_sgd_iter'] = 15
config['sgd_minibatch_size'] = 512
config['model'] = dict(config['model'], fcnet_hiddens=[32, 32])
# NOTE(review): presumably 0 so that three workers plus the driver fit into the
# three CPUs Ray was started with -- confirm.
config['num_cpus_per_worker'] = 0

agent = PPOTrainer(config, 'CartPole-v0')



In [7]:
# Run a fixed number of PPO training iterations and pretty-print what each one reports.
num_iterations = 25
for iteration in range(num_iterations):
    result = agent.train()
    print(pretty_print(result))

custom_metrics: {}
date: 2019-12-21_16-10-22
done: false
episode_len_mean: 21.857142857142858
episode_reward_max: 74.0
episode_reward_mean: 21.857142857142858
episode_reward_min: 10.0
episodes_this_iter: 182
episodes_total: 182
experiment_id: 2927b54e348a4ddb95bcfba414f8e925
hostname: azkaban
info:
  grad_time_ms: 829.353
  learner:
    default_policy:
      cur_kl_coeff: 0.20000000298023224
      cur_lr: 4.999999873689376e-05
      entropy: 0.6923392415046692
      entropy_coeff: 0.0
      kl: 0.0008665229543112218
      policy_loss: -0.0025005110073834658
      total_loss: 262.7922058105469
      vf_explained_var: 3.695487976074219e-05
      vf_loss: 262.7945251464844
  load_time_ms: 106.892
  num_steps_sampled: 4000
  num_steps_trained: 3584
  sample_time_ms: 2633.65
  update_time_ms: 992.936
iterations_since_restore: 1
node_ip: 192.168.42.189
num_healthy_workers: 3
off_policy_estimator: {}
perf:
  cpu_util_percent: 59.15555555555555
  ram_util_percent: 33.63333333333333
pid: 12015


In [8]:
# Save the trained agent and keep the returned checkpoint path; it is used below to restore.
checkpoint_path = agent.save()
print(checkpoint_path)

/home/syzygianinfern0/ray_results/PPO_CartPole-v0_2019-12-21_16-10-10sbwmgs0a/checkpoint_25/checkpoint-25


In [9]:
# Build a second trainer from a (shallow) copy of the training config, then load the
# checkpoint saved above into it.
trained_config = dict(config)

test_agent = PPOTrainer(trained_config, 'CartPole-v0')
test_agent.restore(checkpoint_path)

2019-12-21 16:25:41,665	INFO trainable.py:346 -- Restored from checkpoint: /home/syzygianinfern0/ray_results/PPO_CartPole-v0_2019-12-21_16-10-10sbwmgs0a/checkpoint_25/checkpoint-25
2019-12-21 16:25:41,667	INFO trainable.py:353 -- Current state after restoring: {'_iteration': 25, '_timesteps_total': 100000, '_time_total': 75.43531012535095, '_episodes_total': 1573}


In [10]:
# Play a single CartPole episode with the restored agent and report the total reward.
env = gym.make('CartPole-v0')
cumulative_reward = 0
state, done = env.reset(), False

while not done:
    # Ask the policy for an action, apply it, and accumulate the step reward.
    action = test_agent.compute_action(state)
    state, reward, done, _ = env.step(action)
    cumulative_reward = cumulative_reward + reward

print(cumulative_reward)

200.0


# 2. Custom Environments and Reward Shaping

In [14]:
from gym import spaces
import numpy as np
import test_exercises

In [11]:
# Ray was already started earlier in this notebook, so this repeated call only logs
# "Calling ray.init() again after it has already been called." (see the output below).
ray.init(ignore_reinit_error=True, log_to_driver=False)

2019-12-21 16:32:09,517	ERROR worker.py:679 -- Calling ray.init() again after it has already been called.


In [13]:
# Named gym spaces to be matched against the samples below.
action_space_map = {
    "discrete_10": spaces.Discrete(10),
    "box_1": spaces.Box(0, 1, shape=(1,)),
    "box_3x1": spaces.Box(-2, 2, shape=(3, 1)),
    "multi_discrete": spaces.MultiDiscrete([5, 2, 2, 4]),
}

# One sample per space, keyed by the name of the space it is supposed to belong to.
action_space_jumble = {
    "discrete_10": 1,
    "multi_discrete": np.array([0, 0, 0, 2]),
    "box_3x1": np.array([[-1.2657754], [-1.6528835], [0.5982418]]),
    "box_1": np.array([0.89089584]),
}

# Every sample must be contained in the space registered under the same key.
for space_id, sample in action_space_jumble.items():
    matched_space = action_space_map[space_id]
    assert matched_space.contains(sample), (
        "Looks like {} to {} is matched incorrectly.".format(space_id, sample))

print("Success!")

Success!


In [15]:
class ChainEnv(gym.Env):
    """Chain environment with ``n`` states and two actions.

    Action 1 ('backwards') jumps back to state 0 and pays ``small_reward``.
    Any other valid action ('forwards') advances one state for a reward of 0,
    or, once the last state is reached, stays there and pays ``large_reward``.
    An episode ends after ``n`` steps.
    """

    def __init__(self, env_config=None):
        """Read ``n``, ``small`` and ``large`` from ``env_config`` (defaults 20, 2, 10)."""
        settings = env_config if env_config else {}
        self.n = settings.get("n", 20)
        # Payout for the 'backwards' action.
        self.small_reward = settings.get("small", 2)
        # Payout at the end of the chain for the 'forwards' action.
        self.large_reward = settings.get("large", 10)
        # Episodes start at the beginning of the chain.
        self.state = 0
        # The step counter is compared against the horizon to terminate the episode.
        self._horizon = self.n
        self._counter = 0
        self._setup_spaces()

    def _setup_spaces(self):
        """Declare two discrete actions and one discrete observation per chain state."""
        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Discrete(self.n)

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``."""
        assert self.action_space.contains(action)
        at_chain_end = self.state >= self.n - 1
        if action == 1:
            # 'backwards': return to the start and collect the small reward.
            reward = self.small_reward
            self.state = 0
        elif at_chain_end:
            # 'forwards' at the last state: stay put and collect the large reward.
            reward = self.large_reward
        else:
            # 'forwards' elsewhere: move one state up the chain for no reward.
            reward = 0
            self.state += 1
        self._counter += 1
        episode_over = self._counter >= self._horizon
        return self.state, reward, episode_over, {}

    def reset(self):
        """Go back to state 0, clear the step counter and return the initial state."""
        self._counter = 0
        self.state = 0
        return self.state
    
# Tests here: the exercise helpers check ChainEnv's spaces and then its rewards
# (each reports success or failure in the cell output).
test_exercises.test_chain_env_spaces(ChainEnv)
test_exercises.test_chain_env_reward(ChainEnv)

Testing if spaces have been setup correctly...
Success! You've setup the spaces correctly.
Testing if reward has been setup correctly...
Success! You've setup the rewards correctly.


In [16]:
# PPO settings for the chain environments: start from a shallow copy of the defaults
# and override a handful of top-level keys.
trainer_config = {
    **DEFAULT_CONFIG,
    'num_workers': 1,
    "train_batch_size": 400,
    "sgd_minibatch_size": 64,
    "num_sgd_iter": 10,
}

In [17]:
# Train PPO on the unshaped ChainEnv, announcing each iteration as it starts.
trainer = PPOTrainer(trainer_config, ChainEnv)
num_iterations = 20
for iteration in range(num_iterations):
    print("Training iteration {}...".format(iteration))
    trainer.train()



Training iteration 0...
Training iteration 1...
Training iteration 2...
Training iteration 3...
Training iteration 4...
Training iteration 5...
Training iteration 6...
Training iteration 7...
Training iteration 8...
Training iteration 9...
Training iteration 10...
Training iteration 11...
Training iteration 12...
Training iteration 13...
Training iteration 14...
Training iteration 15...
Training iteration 16...
Training iteration 17...
Training iteration 18...
Training iteration 19...


In [18]:
# Roll out one ChainEnv episode with the trained policy, tracking the total reward
# and the furthest state reached.
env = ChainEnv({})
cumulative_reward = 0
max_state = -1
state, done = env.reset(), False

while not done:
    action = trainer.compute_action(state)
    state, reward, done, results = env.step(action)
    cumulative_reward += reward
    if state > max_state:
        max_state = state

print("Cumulative reward you've received is: {}. Congratulations!".format(cumulative_reward))
print("Max state you've visited is: {}. This is out of {} states.".format(max_state, env.n))

Cumulative reward you've received is: 40. Congratulations!
Max state you've visited is: 0. This is out of 20 states.


# 3. Shaping the rewards to encourage proper behaviour

In [21]:
class ShapedChainEnv(ChainEnv):
    """ChainEnv variant whose rewards are reshaped to favour moving forwards.

    Going backwards costs 20x the large reward, each forward move pays the
    small reward, and staying at the end of the chain pays 20x the large reward.
    """

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)`` with shaped rewards."""
        assert self.action_space.contains(action)
        scaled_large = 20 * self.large_reward
        if action == 1:
            # 'backwards': go back to the beginning and pay a heavy penalty.
            reward = -scaled_large
            self.state = 0
        elif self.state >= self.n - 1:
            # 'forwards' at the last state: stay at the end of the chain.
            reward = scaled_large
        else:
            # 'forwards' elsewhere: go up along the chain.
            reward = self.small_reward
            self.state += 1
        self._counter += 1
        episode_over = self._counter >= self._horizon
        return self.state, reward, episode_over, {}
    
# Check the reshaped environment's behaviour with the provided exercise helper.
test_exercises.test_chain_env_behavior(ShapedChainEnv)

Testing if behavior has been changed...
Success! Behavior of environment is correct.


In [22]:
# Train PPO on the reward-shaped environment.
trainer = PPOTrainer(trainer_config, ShapedChainEnv)
for i in range(20):
    print("Training iteration {}...".format(i))
    trainer.train()

# Evaluate the trained policy over several episodes, recording the furthest state
# reached in each one.
env = ShapedChainEnv({})

max_states = []

for i in range(5):
    state = env.reset()
    done = False
    max_state = -1
    cumulative_reward = 0
    while not done:
        action = trainer.compute_action(state)
        state, reward, done, results = env.step(action)
        max_state = max(max_state, state)
        cumulative_reward += reward
    max_states.append(max_state)

# Average over ALL evaluation episodes (the list), not only the last episode's
# scalar, so the report and the check reflect the whole evaluation.
mean_max_state = np.mean(max_states)

# cumulative_reward is reset at the start of each episode, so this is the final episode's total.
print("Cumulative reward you've received is: {}!".format(cumulative_reward))
print("Max state you've visited is: {}. This is out of {} states.".format(mean_max_state, env.n))
assert (env.n - mean_max_state) / env.n < 0.2, "This policy did not traverse many states."



Training iteration 0...
Training iteration 1...
Training iteration 2...
Training iteration 3...
Training iteration 4...
Training iteration 5...
Training iteration 6...
Training iteration 7...
Training iteration 8...
Training iteration 9...
Training iteration 10...
Training iteration 11...
Training iteration 12...
Training iteration 13...
Training iteration 14...
Training iteration 15...
Training iteration 16...
Training iteration 17...
Training iteration 18...
Training iteration 19...
Cumulative reward you've received is: -566!
Max state you've visited is: 7.0. This is out of 20 states.


AssertionError: This policy did not traverse many states.