In [1]:
# __quick_start_begin__
import gymnasium as gym
import numpy as np
from ray.rllib.algorithms.ppo import PPOConfig


# Define your problem using Python and the Gymnasium (formerly OpenAI Gym) API:
class SimpleCorridor(gym.Env):
    """Corridor in which an agent must learn to move right to reach the exit.

    ---------------------
    | S | 1 | 2 | 3 | G |   S=start; G=goal; corridor_length=5
    ---------------------

    Possible actions to choose from are: 0=left; 1=right. Any other action
    value leaves the position unchanged.
    Observations are float32 arrays of shape (1,) holding the current field
    index, e.g. [0.0] for the starting position, [1.0] for the field next to
    the starting position, etc.
    Rewards are -0.1 for all steps, except when reaching the goal (+1.0).
    The episode terminates once the position index reaches
    `config["corridor_length"]`; the env itself never truncates an episode.
    """

    def __init__(self, config):
        """Builds the corridor.

        Args:
            config: Mapping that must contain the key "corridor_length",
                the position index at which the goal is reached.
        """
        self.end_pos = config["corridor_length"]
        self.cur_pos = 0
        self.action_space = gym.spaces.Discrete(2)  # left and right
        # dtype is spelled out so that it visibly matches the observations
        # produced by `_get_obs`.
        self.observation_space = gym.spaces.Box(
            0.0, self.end_pos, shape=(1,), dtype=np.float32
        )

    def _get_obs(self):
        """Returns the current position as a float32 array of shape (1,)."""
        return np.array([self.cur_pos], dtype=np.float32)

    def reset(self, *, seed=None, options=None):
        """Resets the episode.

        Args:
            seed: Optional seed, forwarded to `gym.Env.reset` so the env's
                random number generator is (re-)seeded as the API expects.
            options: Unused; accepted for API compatibility.

        Returns:
           Initial observation of the new episode and an info dict.
        """
        # Let the base class handle seeding.
        super().reset(seed=seed)
        self.cur_pos = 0
        # Return initial observation.
        return self._get_obs(), {}

    def step(self, action):
        """Takes a single step in the episode given `action`.

        Args:
            action: 0 to walk left (ignored at the start position), 1 to walk
                right.

        Returns:
            New observation, reward, terminated-flag, truncated-flag, info-dict (empty).
        """
        # Walk left (but never past the start position).
        if action == 0 and self.cur_pos > 0:
            self.cur_pos -= 1
        # Walk right.
        elif action == 1:
            self.cur_pos += 1
        # Set `terminated` flag when end of corridor (goal) reached.
        terminated = self.cur_pos >= self.end_pos
        truncated = False
        # +1.0 when goal reached, otherwise -0.1.
        reward = 1.0 if terminated else -0.1
        return self._get_obs(), reward, terminated, truncated, {}

In [2]:
import ray

# Start a local Ray instance and tell it that one GPU is available.
# `ignore_reinit_error=True` keeps this cell re-runnable: without it, executing
# the cell a second time in the same kernel raises because Ray is already
# initialised.
ray.init(num_gpus=1, local_mode=False, ignore_reinit_error=True)

Usage stats collection is enabled by default for nightly wheels. To disable this, run the following command: `ray disable-usage-stats` before starting Ray. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details.


2023-01-07 17:35:46,004	INFO worker.py:1536 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.8.15
Ray version:,3.0.0.dev0
Dashboard:,http://127.0.0.1:8265


In [3]:
# Environment id shared by the training config (`PPOConfig.environment`) and
# the inference cell (`gym.make`).
# NOTE(review): presumably registered by importing `platoonenv` — confirm.
env_name = "Platoon-v1"

In [4]:
# Build an RLlib PPO Algorithm from a PPOConfig, one setting at a time.
# `platoonenv` is imported only for its side effects.
# NOTE(review): presumably this import registers the `env_name` id — confirm.
import platoonenv  # noqa: F401

# Start from the default PPO settings.
config = PPOConfig()
# Train on the environment identified by `env_name`. To train on the
# SimpleCorridor class instead, pass `env=SimpleCorridor` together with an
# `env_config` such as {"corridor_length": 28}.
config = config.environment(env=env_name)
# Use the PyTorch implementation of the policy.
config = config.framework("torch")
# Request one GPU for the algorithm.
config = config.resources(num_gpus=1)
# Collect environment rollouts in parallel on three workers.
config = config.rollouts(num_rollout_workers=3)

# Turn the finished config into the actual (PPO) algorithm object.
algo = config.build()

2023-01-07 17:35:58,565	INFO worker_set.py:309 -- Inferred observation/action spaces from remote worker (local worker has no env): {'default_policy': (Box(-10.0, 1.0, (20,), float32), Discrete(11)), '__env__': (Box(-10.0, 1.0, (20,), float32), Discrete(11))}
2023-01-07 17:35:58,607	INFO policy.py:1196 -- Policy (worker=local) running on 1 GPUs.
2023-01-07 17:35:58,608	INFO torch_policy_v2.py:110 -- Found 1 visible cuda devices.
[2m[36m(RolloutWorker pid=4340)[0m 2023-01-07 17:35:58,546	INFO policy.py:1196 -- Policy (worker=1) running on CPU.
[2m[36m(RolloutWorker pid=4340)[0m 2023-01-07 17:35:58,546	INFO torch_policy_v2.py:110 -- Found 1 visible cuda devices.
[2m[36m(RolloutWorker pid=20032)[0m 2023-01-07 17:35:58,546	INFO policy.py:1196 -- Policy (worker=3) running on CPU.
[2m[36m(RolloutWorker pid=20032)[0m 2023-01-07 17:35:58,546	INFO torch_policy_v2.py:110 -- Found 1 visible cuda devices.
[2m[36m(RolloutWorker pid=21264)[0m 2023-01-07 17:35:58,548	INFO policy.py:1196

In [12]:
# Train for a fixed number of iterations and report the mean episode reward
# after each one.
# NOTE(review): the quick-start's expected optimum of -0.1*18 + 1.0 = -0.8
# was derived for the SimpleCorridor env (at least 19 moves to reach the
# goal); it presumably does not carry over to the env `algo` was built with
# here — confirm what a good reward looks like for that env.
for i in range(200):
    # `results` stays at notebook scope, so after the loop it holds the
    # outcome of the most recent training iteration.
    results = algo.train()
    print(f"Iter: {i}; avg. reward={results['episode_reward_mean']}")

Iter: 0; avg. reward=-49.288220551378444
Iter: 1; avg. reward=-49.35839598997494
Iter: 2; avg. reward=-49.27
Iter: 3; avg. reward=-49.229426433915215
Iter: 4; avg. reward=-49.385
Iter: 5; avg. reward=-49.30576441102757
Iter: 6; avg. reward=-49.32418952618454
Iter: 7; avg. reward=-49.2275
Iter: 8; avg. reward=-49.37593984962406
Iter: 9; avg. reward=-49.32835820895522
Iter: 10; avg. reward=-49.38095238095238
Iter: 11; avg. reward=-49.16541353383459
Iter: 12; avg. reward=-49.34
Iter: 13; avg. reward=-49.33416458852868
Iter: 14; avg. reward=-49.52
Iter: 15; avg. reward=-49.769423558897245
Iter: 16; avg. reward=-49.6359102244389
Iter: 17; avg. reward=-49.5075
Iter: 18; avg. reward=-49.478696741854634
Iter: 19; avg. reward=-49.308457711442784
Iter: 20; avg. reward=-49.27568922305765
Iter: 21; avg. reward=-49.40852130325815
Iter: 22; avg. reward=-49.435
Iter: 23; avg. reward=-49.33665835411471
Iter: 24; avg. reward=-49.3625
Iter: 25; avg. reward=-49.230576441102755
Iter: 26; avg. reward=-49.2

In [6]:
# Display the result dict returned by the most recent `algo.train()` call.
results 

{'custom_metrics': {},
 'episode_media': {},
 'info': {'learner': {'default_policy': {'custom_metrics': {},
    'learner_stats': {'cur_kl_coeff': 0.20000000000000004,
     'cur_lr': 5.0000000000000016e-05,
     'total_loss': 9.314977171087778,
     'policy_loss': -0.0364526856590503,
     'vf_loss': 9.347446115555302,
     'vf_explained_var': -8.762523692141297e-07,
     'kl': 0.019918766150615975,
     'entropy': 1.9746735113923268,
     'entropy_coeff': 0.0},
    'model': {},
    'num_grad_updates_lifetime': 18135.5,
    'diff_num_grad_updates_vs_sampler_policy': 464.5}},
  'num_env_steps_sampled': 80000,
  'num_env_steps_trained': 80000,
  'num_agent_steps_sampled': 80000,
  'num_agent_steps_trained': 80000},
 'sampler_results': {'episode_reward_max': -53.0,
  'episode_reward_min': -157.0,
  'episode_reward_mean': -74.1592039800995,
  'episode_len_mean': 10.0,
  'episode_media': {},
  'episodes_this_iter': 402,
  'policy_reward_min': {},
  'policy_reward_max': {},
  'policy_reward_m

In [13]:

# Perform inference (action computations) based on given env observations,
# using a fresh instance of the environment identified by `env_name`.
# To evaluate on the SimpleCorridor class instead, create it directly, e.g.
# env = SimpleCorridor({"corridor_length": 10})
# NOTE(review): the recorded output of this cell is a `NameNotFound` error for
# "Platoon". Verify that the package providing `env_name` has been imported in
# this kernel and registers the id with gymnasium, which is what `gym` refers
# to in this notebook.
env = gym.make(env_name)
try:
    # Get the initial observation of a new episode.
    obs, info = env.reset()
    terminated = truncated = False
    total_reward = 0.0
    # Play one episode: stop as soon as the env reports termination or
    # truncation.
    while not (terminated or truncated):
        # Compute a single action, given the current observation
        # from the environment.
        action = algo.compute_single_action(obs)
        # Apply the computed action in the environment.
        obs, reward, terminated, truncated, info = env.step(action)
        # Sum up rewards for reporting purposes.
        total_reward += reward
        # env.render()
finally:
    # Always release the environment's resources, even if the episode
    # aborts with an exception.
    env.close()
# Report results.
print(f"Played 1 episode; total-reward={total_reward}")
# __quick_start_end__

NameNotFound: Environment Platoon doesn't exist. Did you mean: `Pooyan`?