* Ray RLlib 노트북

필요한 패키지 임포트

In [None]:
import gymnasium as gym
import numpy as np
import pandas as pd

from horcrux_terrain_v1.envs import SandWorld
from horcrux_terrain_v1.envs import PlaneWorld

import ray
from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.algorithms import ppo
from ray.rllib.algorithms.ppo import PPOConfig

from ray.tune.registry import register_env


Ray 실행 (Ray 프로세스에서 DeprecationWarning을 무시하도록 PYTHONWARNINGS 환경 변수 설정)

In [None]:
# Start Ray. PYTHONWARNINGS is passed through the runtime environment's
# env_vars with the filter "ignore::DeprecationWarning".
ray.init(runtime_env={"env_vars": {"PYTHONWARNINGS": "ignore::DeprecationWarning"}})

Gym -> Rllib Env 등록

In [3]:
# Keyword arguments shared by every environment constructed in this notebook
# (both the RLlib-registered training envs and the gym.make evaluation envs).
env_config = dict(
    forward_reward_weight=4,
    side_cost_weight=2,
    unhealthy_max_steps=75,
    healthy_roll_range=(-45, 45),
    terminating_roll_range=(-85, 85),
    rotation_norm_cost_weight=0.05,
    termination_reward=0,
)


def _env_creator(env_cls):
    """Return an RLlib env-creator callable that builds ``env_cls``.

    The ``config`` argument RLlib hands to the creator is ignored; the
    environment is always constructed from the notebook-level ``env_config``,
    which is looked up when the creator is called.
    """

    def _create(config):
        return env_cls(**env_config)

    return _create


# Sand
register_env("sand-v1", _env_creator(SandWorld))

# Plane
register_env("plane-v1", _env_creator(PlaneWorld))


학습 알고리즘 설정

In [None]:
# Build the PPO algorithm used for training.
config = PPOConfig()

# Both new-API-stack switches are turned off, i.e. the new API stack is
# deliberately not used in this notebook.
config.api_stack(
    enable_rl_module_and_learner=False,
    enable_env_runner_and_connector_v2=False,
)

# Training environment: switch the registered name to "sand-v1" to train on sand.
config.environment("plane-v1")
config.framework("torch")

# Resource layout: one CPU per rollout worker and a 1/(workers + 1) GPU share
# for each rollout worker.
# NOTE(review): num_gpus=1 plus total_workers * gpu_share requests more than
# one GPU in total (1 + 12/13) - confirm this matches the target machine.
total_workers = 12
gpu_share_per_worker = 1 / (total_workers + 1)
config.resources(
    num_gpus=1,
    num_cpus_per_worker=1,
    num_gpus_per_worker=gpu_share_per_worker,
)
config.rollouts(num_rollout_workers=total_workers)

# Model options; see the model catalog for more:
# https://docs.ray.io/en/latest/rllib/rllib-models.html
model_options = {"fcnet_hiddens": [512, 512, 512, 512, 512]}
config.training(
    gamma=0.9,
    lr=0.001,
    # kl_coeff=0.3,
    model=model_options,
)
config.evaluation(evaluation_interval=100)

# Alternative model settings (larger MLP / LSTM), kept for reference:
# # config.model["fcnet_hiddens"] = [512, 512, 512, 512, 512]
# config.model["uses_new_env_runners"] = True
# config.model["fcnet_hiddens"] = [1024, 1024, 1024, 1024, 1024]
# config.model["use_lstm"] = True
# # config.model["lstm_cell_size"] = 2048
# config.model["lstm_cell_size"] = 4096
# config.model["max_seq_len"] = 200
# config.model["lstm_use_prev_action"] = True

algo = config.build()



혹시 이전 학습 결과를 로드할 경우

In [5]:
# algo = Algorithm.from_checkpoint("./agents/ModelV2_512x5_2048_v0924_2")

학습 파라미터 재조정

In [6]:
# algo.get_config().training().num_sgd_iter

# Inspect the env runner parameters.
# algo.env_runner.config["exploration_config"]
# algo.get_config().model

# algo.compute_single_action()
# algo.get_policy()

학습 시작

In [None]:
from pprint import pprint

# Train for a fixed number of iterations, writing a numbered checkpoint every
# CHECKPOINT_EVERY iterations (including iteration 0) and a final checkpoint
# once the loop is finished.
CHECKPOINT_EVERY = 60

n_iter = 1000
save_iter = 0
save_name = "ModelV2_512x5"

for i in range(n_iter):
    result = algo.train()
    print(f"{i:03d}th iteration done")
    # result.pop("config")
    # pprint(result)

    if i % CHECKPOINT_EVERY != 0:
        continue

    # Checkpoints are numbered consecutively: <save_name>_0, <save_name>_1, ...
    checkpoint_dir = algo.save(f"{save_name}_{save_iter}")
    print(f"Checkpoint saved in directory {checkpoint_dir}")
    save_iter += 1

algo.save(f"{save_name}_final")

환경에서 학습된 Policy 테스트하기 (체크포인트에서 Algorithm을 복원한 뒤 compute_single_action 사용)

In [None]:
from ray.rllib.core.rl_module import RLModule
import pathlib
import torch
import numpy as np
import gymnasium as gym
from horcrux_terrain_v1.envs import SandWorld
import time

# Restore an Algorithm from the checkpoint directory and rebind `algo` to it;
# the evaluation cells below query this object for actions.
algo = Algorithm.from_checkpoint("./ModelV2_512x5_11")


In [None]:
# env = gym.make("horcrux_terrain_v1/plane-v1", 
#                terminate_when_unhealthy = False, 
#                render_mode = "human", 
#             #    render_camera_name = 'ceiling', 
#                use_gait = True,
#                gait_params = (30,30,40,40,0),
#                **env_config,
#                ) 

# obs, info = env.reset()

# algo.get_policy().action_connectors

In [None]:
# env = gym.make("horcrux_terrain_v1/sand-v1", 
#                terminate_when_unhealthy = False, 
#                render_mode = "human", 
#             #    render_camera_name = 'ceiling', 
#                use_gait = True,
#                gait_params = (30,30,40,40,0),
#                **env_config,
#                ) 

# Roll out the restored policy (deterministic actions, explore=False) in a
# rendered plane environment and print the summed reward of each rollout.
env = gym.make(
    "horcrux_terrain_v1/plane-v1",
    terminate_when_unhealthy=False,
    render_mode="human",
    # render_camera_name='ceiling',
    use_gait=True,
    gait_params=(30, 30, 40, 40, 0),
    **env_config,
)

# try/finally guarantees the environment is closed even when a rollout raises
# or the cell is interrupted.
try:
    for j in range(5):
        episode_return = 0
        obs, info = env.reset()

        # Fixed-length rollout of 6000 steps.
        # NOTE(review): the loop keeps stepping after `terminated`/`truncated`
        # is reported and only prints a message - confirm this is intended.
        for i in range(6000):
            action = algo.compute_single_action(obs, explore=False)
            obs, reward, terminated, truncated, info = env.step(action)

            if terminated:
                print("terminated")

            episode_return += reward

        print(f"Reached episode return of {episode_return}.")
finally:
    env.close()

환경에서 학습된 Policy 테스트하기 (PPO 알고리즘 사용)

In [None]:
import pathlib
import torch
import numpy as np
import gymnasium as gym
from horcrux_terrain_v1.envs import SandWorld
from ray.rllib.algorithms.algorithm import Algorithm
import time

# Roll out the policy while feeding back the recurrent state and the previous
# action, and print the summed reward of each rollout.
env = gym.make(
    "horcrux_terrain_v1/plane-v1",
    terminate_when_unhealthy=False,
    render_mode="human",
    # render_camera_name='ceiling',
    use_gait=True,
    gait_params=(30, 30, 40, 40, 0),
    **env_config,
)

policy_id = "default_policy"
policy = algo.get_policy(policy_id)

# try/finally guarantees the environment is closed even when a rollout raises
# or the cell is interrupted.
try:
    for j in range(10):
        episode_return = 0
        obs, info = env.reset()

        # Ask the policy for its own initial recurrent state instead of
        # hand-building one; the list is empty for a non-recurrent model.
        state = policy.get_initial_state()
        prev_action = np.zeros((14), np.float32)

        # NOTE(review): the loop keeps stepping after `terminated` is reported
        # and only prints a message - confirm this is intended.
        for i in range(1000):
            if state:
                # With a state passed in, compute_single_action returns
                # (action, state_out, extra_fetches).
                a, state, _ = algo.compute_single_action(
                    observation=obs,
                    state=state,
                    prev_action=prev_action,
                    policy_id=policy_id,
                )
            else:
                # Without a state only the action is returned.
                a = algo.compute_single_action(
                    observation=obs,
                    prev_action=prev_action,
                    policy_id=policy_id,
                )

            # Step with the action computed for this observation.
            obs, reward, terminated, truncated, info = env.step(a)

            prev_action = a
            if terminated:
                print("terminated")

            episode_return += reward

        print(f"Reached episode return of {episode_return}.")
finally:
    env.close()