In [1]:
import gym
from gym.spaces import Box, Dict, Discrete
import numpy as np
import random


class ParametricActionsCartPole(gym.Env):
    """Parametric action version of CartPole.
    In this env there are only ever two valid actions, but we pretend there are
    actually up to `max_avail_actions` actions that can be taken, and the two
    valid actions are randomly hidden among this set.
    At each step, we emit a dict of:
        - the actual cart observation
        - a mask of valid actions (e.g., [0, 0, 1, 0, 0, 1] for 6 max avail)
        - the list of action embeddings (w/ zeroes for invalid actions) (e.g.,
            [[0, 0],
             [0, 0],
             [-0.2322, -0.2569],
             [0, 0],
             [0, 0],
             [0.7878, 1.2297]] for max_avail_actions=6)
    In a real environment, the actions embeddings would be larger than two
    units of course, and also there would be a variable number of valid actions
    per step instead of always [LEFT, RIGHT].
    """

    def __init__(self, max_avail_actions):
        # Use simple random 2-unit action embeddings for [LEFT, RIGHT]
        self.left_action_embed = np.random.randn(2)
        self.right_action_embed = np.random.randn(2)
        self.action_space = Discrete(max_avail_actions)
        self.wrapped = gym.make("CartPole-v0")
        self.observation_space = Dict({
            "action_mask": Box(0, 1, shape=(max_avail_actions, )),
            "avail_actions": Box(-10, 10, shape=(max_avail_actions, 2)),
            "cart": self.wrapped.observation_space,
        })

    def update_avail_actions(self):
        self.action_assignments = np.array([[0., 0.]] * self.action_space.n)
        self.action_mask = np.array([0.] * self.action_space.n)
        self.left_idx, self.right_idx = random.sample(
            range(self.action_space.n), 2)
        self.action_assignments[self.left_idx] = self.left_action_embed
        self.action_assignments[self.right_idx] = self.right_action_embed
        self.action_mask[self.left_idx] = 1
        self.action_mask[self.right_idx] = 1

    def reset(self):
        self.update_avail_actions()
        return {
            "action_mask": self.action_mask,
            "avail_actions": self.action_assignments,
            "cart": self.wrapped.reset(),
        }

    def step(self, action):
        if action == self.left_idx:
            actual_action = 0
        elif action == self.right_idx:
            actual_action = 1
        else:
            raise ValueError(
                "Chosen action was not one of the non-zero action embeddings",
                action, self.action_assignments, self.action_mask,
                self.left_idx, self.right_idx)
        orig_obs, rew, done, info = self.wrapped.step(actual_action)
        self.update_avail_actions()
        obs = {
            "action_mask": self.action_mask,
            "avail_actions": self.action_assignments,
            "cart": orig_obs,
        }
        return obs, rew, done, info

In [2]:
import random
import numpy as np
import collections
import gym

import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
from matplotlib.offsetbox import AnchoredText

from tqdm import tqdm

from scipy import stats
from scipy.stats import norm
from math import sqrt

!pip install -q ray
!pip install -q ray[tune]
!pip install -q lz4

import ray
from ray import tune
ray.init()
import ray.rllib.agents.ppo as ppo


[K     |████████████████████████████████| 52.7 MB 1.4 MB/s 
[K     |████████████████████████████████| 4.1 MB 57.5 MB/s 
[K     |████████████████████████████████| 175 kB 41.5 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.8.0 requires tf-estimator-nightly==2.8.0.dev2021122109, which is not installed.[0m
[K     |████████████████████████████████| 125 kB 25.1 MB/s 
[K     |████████████████████████████████| 1.2 MB 24.0 MB/s 
[?25h

In [4]:
env=ParametricActionsCartPole(4)

In [3]:
env_config = 6

tune.register_env("ParametricActionsCartPole", lambda config: ParametricActionsCartPole(env_config))

In [4]:
config = ppo.DEFAULT_CONFIG.copy() 

agent = ppo.PPOTrainer(env="ParametricActionsCartPole", config=config)

# n_epochs = 100
# for n in range(n_epochs):
#     result = agent.train()
#     print(f'epoch: {n}, episode_reward_mean: {result["episode_reward_mean"]}, episode_reward_min: {result["episode_reward_min"]}')

2022-03-22 12:41:31,885	INFO trainer.py:2141 -- Your framework setting is 'tf', meaning you are using static-graph mode. Set framework='tf2' to enable eager execution with tf2.x. You may also then want to set eager_tracing=True in order to reach similar execution speed as with static-graph mode.
2022-03-22 12:41:31,895	INFO ppo.py:250 -- In multi-agent mode, policies will be optimized sequentially by the multi-GPU optimizer. Consider setting simple_optimizer=True if this doesn't work for you.
2022-03-22 12:41:31,897	INFO trainer.py:781 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
2022-03-22 12:41:44,704	INFO trainable.py:130 -- Trainable.setup took 12.820 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [5]:
n_epochs = 100
for n in range(n_epochs):
    result = agent.train()
    print(f'epoch: {n}, episode_reward_mean: {result["episode_reward_mean"]}, episode_reward_min: {result["episode_reward_min"]}')



RayTaskError(ValueError): ignored