* Ray RLlib 노트북

필요 패키지 임포트

In [None]:
import gymnasium as gym
import numpy as np
import pandas as pd

from horcrux_terrain_v1.envs import SandWorld

import ray
from ray.rllib.algorithms import ppo
from ray.rllib.algorithms.ppo import PPOConfig

from ray.tune.registry import register_env


Ray 실행 (워커 프로세스의 DeprecationWarning 무시 설정 포함)

In [None]:
# Start the local Ray runtime.
# ignore_reinit_error=True keeps this cell re-runnable in a live kernel: without it,
# a second execution raises because Ray has already been initialized.
# The runtime_env entry sets PYTHONWARNINGS so DeprecationWarning output is suppressed.
ray.init(
    ignore_reinit_error=True,
    runtime_env={"env_vars": {"PYTHONWARNINGS": "ignore::DeprecationWarning"}},
)

Gymnasium 환경을 RLlib Env로 등록

In [3]:
# Constructor keyword arguments for SandWorld. The same dict is reused later in the
# notebook when the evaluation environment is created with gym.make(..., **env_config).
env_config = dict(
    forward_reward_weight=4,
    side_cost_weight=0.2,
    unhealthy_max_steps=75,
    healthy_roll_range=(-70, 70),
    rotation_norm_cost_weight=0.1,
)


def _make_sand_world(config):
    """Create a SandWorld instance from the notebook-level ``env_config``.

    ``config`` is the argument passed in by the env registry; it is intentionally
    not read here, so every instance is built from ``env_config`` only.
    """
    return SandWorld(
        forward_reward_weight=env_config["forward_reward_weight"],
        side_cost_weight=env_config["side_cost_weight"],
        unhealthy_max_steps=env_config["unhealthy_max_steps"],
        healthy_roll_range=env_config["healthy_roll_range"],
        rotation_norm_cost_weight=env_config["rotation_norm_cost_weight"],
    )


# Make the environment available to RLlib under the name "sand-v1".
register_env("sand-v1", _make_sand_world)

학습 알고리즘 설정

In [None]:
# Assemble the PPO configuration as one fluent chain of setter calls.
config = (
    PPOConfig()
    # Activate new API stack.
    .api_stack(
        enable_rl_module_and_learner=True,
        enable_env_runner_and_connector_v2=True,
    )
    .environment("sand-v1")
    .framework("torch")
    # NOTE(review): with the new API stack enabled, GPU allocation may need to go
    # through the learner settings instead of `resources(num_gpus=...)` -- confirm
    # against the installed Ray version.
    .resources(num_gpus=1, num_cpus_for_main_process=12)
    .env_runners(num_env_runners=12)
    .training(
        gamma=0.9,
        lr=0.0001,
        # kl_coeff=0.3,
        train_batch_size_per_learner=96000,
        # NOTE(review): newer Ray releases rename this to `minibatch_size` -- confirm.
        sgd_minibatch_size=32768,
    )
    .evaluation(evaluation_interval=100)
)

# See model catalog for more options.
# https://docs.ray.io/en/latest/rllib/rllib-models.html
# Earlier, smaller variant kept for reference: fcnet_hiddens = [512] * 5, lstm_cell_size = 2048.
# NOTE(review): whether the legacy `config.model` dict is honored on the new API
# stack depends on the Ray version -- confirm the built module has these sizes.
config.model.update(
    {
        "fcnet_hiddens": [1024, 1024, 1024, 1024, 1024],
        "use_lstm": True,
        "lstm_cell_size": 4096,
    }
)

algo = config.build()


학습 시작

In [None]:
from pprint import pprint

n_iter = 150
save_iter = 0
save_name = "1024x5_4096_inf"
checkpoint_every = 60  # number of iterations between intermediate checkpoints

for i in range(n_iter):
    result = algo.train()
    print(f"{i:03d}th iteration done")
    # Drop the echoed config before printing. Use a default so a result without a
    # "config" entry does not raise KeyError and abort the whole training run.
    result.pop("config", None)
    pprint(result)

    # True for i = 0, 60, 120, ...: the first checkpoint is written right after the
    # very first iteration, then once every `checkpoint_every` iterations.
    if i % checkpoint_every == 0:
        checkpoint_dir = algo.save(f"{save_name}_{save_iter}")
        print(f"Checkpoint saved in directory {checkpoint_dir}")
        save_iter += 1

# Final checkpoint; the evaluation cell below loads it from "<save_name>_final".
algo.save(save_name + "_final")

환경에서 학습된 Policy 테스트하기

In [6]:
from ray.rllib.core.rl_module import RLModule
import pathlib
import torch
import numpy as np
import gymnasium as gym
from horcrux_terrain_v1.envs import SandWorld

# Locate the RLModule state inside the final checkpoint written by the training cell.
checkpoint_root = pathlib.Path("./1024x5_4096_inf_final")
rl_module_dir = checkpoint_root / "learner_group" / "learner" / "rl_module"

# Restore the checkpointed module(s) and keep the one stored under "default_policy".
restored_modules = RLModule.from_checkpoint(rl_module_dir)
rl_module = restored_modules["default_policy"]

In [None]:
# Build a rendering copy of the environment with the same reward/health settings
# that were used for training (env_config), plus evaluation-only options.
env = gym.make("horcrux_terrain_v1/sand-v1",
               terminate_when_unhealthy=False,
               render_mode="human",
               # render_camera_name='ceiling',
               use_gait=True,
               gait_params=(30, 30, 40, 40, 0),
               **env_config,
               )

max_steps = 6000  # upper bound on the number of environment steps to roll out
episode_return = 0
terminated = truncated = False

try:
    obs, info = env.reset()

    for _ in range(max_steps):
        # Add a leading batch dimension and cast to float32 so the observation dtype
        # matches torch's default parameter dtype even if the env emits float64.
        torch_obs_batch = torch.from_numpy(np.asarray(obs, dtype=np.float32)[None])

        # Inference only: no gradients are needed for action selection.
        with torch.no_grad():
            action_logits = rl_module.forward_inference({"obs": torch_obs_batch})[
                "action_dist_inputs"
            ]

        # NOTE(review): argmax over the distribution inputs yields a single index,
        # which is only a meaningful action for a discrete action space -- confirm
        # what SandWorld's action space is before trusting this rollout.
        action = torch.argmax(action_logits[0]).numpy()
        obs, reward, terminated, truncated, info = env.step(action)

        episode_return += reward

        # Stop at the episode boundary; stepping a finished episode without reset()
        # would mix rewards from an invalid state into the reported return.
        if terminated or truncated:
            break
finally:
    # Always release the environment (and its render window), even on error.
    env.close()

print(f"Reached episode return of {episode_return}.")