* Ray RLlib 노트북 (파이프 오르기 전용)

필요 패키지 임포트

In [None]:
import gymnasium as gym
import numpy as np
import pandas as pd

from horcrux_terrain_v1.envs import SandWorld
from horcrux_terrain_v1.envs import PlaneWorld
from horcrux_terrain_v1.envs import PlanePipeWorld
from horcrux_terrain_v1.envs import ClimbWorld

import ray
from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.algorithms import ppo
from ray.rllib.algorithms.ppo import PPOConfig
from ray.rllib.algorithms.sac import SACConfig

from ray.tune.registry import register_env


Ray 실행 (Warning 관련 무시 키워드)

In [None]:
# Start Ray with a runtime environment that sets PYTHONWARNINGS so that
# DeprecationWarning messages are ignored.
warning_filter_env = {"PYTHONWARNINGS": "ignore::DeprecationWarning"}
ray.init(runtime_env={"env_vars": warning_filter_env})

Gym -> RLlib Env 등록

In [None]:
# Keyword arguments shared by every environment built in this notebook.
env_config = {
    "forward_reward_weight": 6.5,
}


def _make_env_creator(env_cls):
    """Return an RLlib env-creator callable that instantiates ``env_cls``.

    The creator ignores the config object RLlib hands it and reads
    ``forward_reward_weight`` from the module-level ``env_config`` each time
    it is called.
    """

    def _create(config):
        return env_cls(forward_reward_weight=env_config["forward_reward_weight"])

    return _create


# RLlib environment id -> environment class (sand, plane, pipe, climb).
_TERRAIN_ENVS = (
    ("sand-v1", SandWorld),
    ("plane-v1", PlaneWorld),
    ("pipe-v1", PlanePipeWorld),
    ("climb-v1", ClimbWorld),
)

for _env_id, _env_cls in _TERRAIN_ENVS:
    register_env(_env_id, _make_env_creator(_env_cls))

학습 알고리즘 설정

In [None]:
# Number of rollout workers; also used to size each worker's GPU fraction.
total_workers = 12

# Replay buffer settings handed to SAC.
replay_buffer_settings = {
    "_enable_replay_buffer_api": True,
    "type": "MultiAgentPrioritizedReplayBuffer",
    "capacity": int(1e6),
    # If True prioritized replay buffer will be used.
    "prioritized_replay": False,
    "prioritized_replay_alpha": 0.6,
    "prioritized_replay_beta": 0.4,
    "prioritized_replay_eps": 1e-6,
    # Whether to compute priorities already on the remote worker side.
    "worker_side_prioritization": False,
}


def _tanh_mlp_model_config():
    """Return a fresh model-config dict for the Q-model and the policy model.

    Both networks use the same layout here. ``custom_model`` is left as None;
    set it to plug in a custom Q-model or policy model.
    See the model catalog for more options:
    https://docs.ray.io/en/latest/rllib/rllib-models.html
    """
    return {
        "fcnet_hiddens": [512, 512, 512, 512, 512, 32],
        "fcnet_activation": "tanh",
        "post_fcnet_hiddens": [],
        "post_fcnet_activation": None,
        "custom_model": None,
        "custom_model_config": {},
    }


config = SACConfig()
# The new API stack is deliberately switched off (original author's note:
# not used because it was found unsatisfactory).
config.api_stack(
    enable_rl_module_and_learner=False,
    enable_env_runner_and_connector_v2=False,
)
config.environment("climb-v1")
config.framework("torch")
config.resources(
    num_gpus=1,
    num_cpus_per_worker=1,
    num_gpus_per_worker=1 / (total_workers + 1),
)
config.rollouts(num_rollout_workers=total_workers)
config.training(
    gamma=0.95,
    # kl_coeff=0.3,
    replay_buffer_config=replay_buffer_settings,
    q_model_config=_tanh_mlp_model_config(),
    policy_model_config=_tanh_mlp_model_config(),
    train_batch_size_per_learner=8192,
    num_steps_sampled_before_learning_starts=6000,
)
config.evaluation(evaluation_interval=100)

# Disabled alternative: configure a larger LSTM model through config.model.
# See the model catalog for more options:
# https://docs.ray.io/en/latest/rllib/rllib-models.html
# # config.model["fcnet_hiddens"] = [512, 512, 512, 512, 512]
# config.model["uses_new_env_runners"] = True
# config.model["fcnet_hiddens"] = [1024, 1024, 1024, 1024, 1024]
# config.model["use_lstm"] = True
# # config.model["lstm_cell_size"] = 2048
# config.model["lstm_cell_size"] = 4096
# config.model["max_seq_len"] = 200
# config.model["lstm_use_prev_action"] = True

algo = config.build()



혹시 이전 학습 결과를 로드할 경우

In [None]:
# Replace `algo` with an algorithm restored from an earlier checkpoint directory.
previous_checkpoint = "./SAC_layer_512_5_32_slithering_low_healthy_reward2_final"
algo = Algorithm.from_checkpoint(previous_checkpoint)

학습 네트워크 가중치 확보

In [None]:
# Keep the current algorithm's weights in `trained_weights` so they can be
# handed to another algorithm instance later in the notebook.
trained_weights = algo.get_weights()
# NOTE(review): presumably releases the resources held by this algorithm
# instance now that its weights are saved — confirm against the RLlib docs.
algo.cleanup()

가중치 교환

In [None]:
# Load the previously captured weights into the current `algo`.
# NOTE(review): assumes `algo` was rebuilt (e.g. by re-running the config
# cell) after the cleanup() call in the previous cell — confirm run order.
algo.set_weights(trained_weights)

In [None]:
# Bare expression: the notebook displays the captured weights as the cell output.
trained_weights

학습 파라미터 재조정

In [None]:
# Scratch cell for inspecting the algorithm; uncomment a line to evaluate it.
# algo.get_config().training().num_sgd_iter

# Inspect the env-runner parameters.
# algo.env_runner.config["exploration_config"]
# algo.get_config().model

# algo.compute_single_action()
# Bare expression: the notebook displays the policy's model as the cell output.
algo.get_policy().model

학습 시작

In [None]:
from pprint import pprint

# Training schedule: total iterations and how often a numbered checkpoint is
# written. Iteration 0 also satisfies the interval test, so the first
# checkpoint is saved right after the first training iteration.
n_iter = 27000
checkpoint_interval = 3000
save_name = "SAC_layer_512_5_32_helix_sidecost"
save_iter = 0  # running suffix for the numbered checkpoints

for i in range(n_iter):
    result = algo.train()
    print(f"{i:03d}th iteration done")
    # To dump the full result dict:
    # result.pop("config")
    # pprint(result)

    if i % checkpoint_interval != 0:
        continue

    checkpoint_dir = algo.save(f"{save_name}_{save_iter}")
    print(f"Checkpoint saved in directory {checkpoint_dir}")
    save_iter += 1

# Always write a final checkpoint once the loop has finished.
algo.save(f"{save_name}_final")

환경에서 학습된 Policy 테스트하기 (RL Module 사용)

In [None]:
from ray.rllib.core.rl_module import RLModule
import pathlib
import torch
import numpy as np
import gymnasium as gym
from horcrux_terrain_v1.envs import SandWorld
import time

# Restore the algorithm whose policy is rolled out in the cells below.
eval_checkpoint = "./SAC_layer_512_5_32_helix2_3"
algo = Algorithm.from_checkpoint(eval_checkpoint)


In [None]:
# env = gym.make("horcrux_terrain_v1/plane-v1", 
#                terminate_when_unhealthy = False, 
#                render_mode = "human", 
#             #    render_camera_name = 'ceiling', 
#                use_gait = True,
#                gait_params = (30,30,40,40,0),
#                **env_config,
#                ) 

# obs, info = env.reset()

# algo.get_policy().action_connectors

In [None]:
import time
import mediapy as media

# Disabled alternative: roll out on the sand terrain with an on-screen viewer.
# env = gym.make("horcrux_terrain_v1/sand-v1",
#                terminate_when_unhealthy = False,
#                render_mode = "human",
#             #    render_camera_name = 'ceiling',
#                use_gait = True,
#                gait_params = (30,30,40,40,0),
#                **env_config,
#                )
# env_config['gait_params'] = (30,30,40,40,0)

# Climbing environment rendered to RGB arrays so every frame can be collected
# and shown as a video after the rollout.
climb_env_kwargs = dict(
    terminate_when_unhealthy=False,
    # render_mode="human",
    render_mode="rgb_array",
    # render_camera_name="ceiling",
    use_gait=True,
    # gait_params=(30, 30, 40, 40, 0),
    **env_config,
)
env = gym.make("horcrux_terrain_v1/climb-v1", **climb_env_kwargs)

step_starting_index, episode_index = 0, 1
video_prefix = "SAC_Slithering_"

frames = []

for _episode in range(1):
    episode_return = 0
    terminated = truncated = False

    obs, info = env.reset()

    t_now = time.time()
    for _step in range(1000):
        # Disabled: busy-wait so that steps are paced 0.1 s apart.
        # while (time.time() - t_now) < 0.1:
        #     pass
        t_now = time.time()

        # Query the policy with exploration switched off.
        action = algo.compute_single_action(obs, explore=False)
        obs, reward, terminated, truncated, info = env.step(action)

        # Record the frame rendered after this step.
        frames.append(env.render())

        prev_a = action

        # The rollout keeps stepping after termination; it is only reported.
        if terminated:
            print("terminated")

        episode_return += reward

        # print(info['com_ypr'])
        # print(f"{info['x_velocity']} || {info['y_velocity']}")

    print(f"Reached episode return of {episode_return}.")

env.close()

media.show_video(frames, fps=10)

환경에서 학습된 Policy 테스트하기 (PPO 알고리즘 사용)

In [None]:
import pathlib
import torch
import numpy as np
import gymnasium as gym
from horcrux_terrain_v1.envs import SandWorld
from ray.rllib.algorithms.algorithm import Algorithm
import time

# Roll out the policy held by `algo` on the plane terrain with an on-screen
# viewer, feeding the recurrent state and the previous action back into the
# policy on every step.
env = gym.make(
    "horcrux_terrain_v1/plane-v1",
    terminate_when_unhealthy=False,
    render_mode="human",
    # render_camera_name="ceiling",
    use_gait=True,
    gait_params=(30, 30, 40, 40, 0),
    **env_config,
)

for j in range(10):
    episode_return = 0
    terminated = truncated = False

    obs, info = env.reset()

    # Zero-initialised recurrent state and previous action for a new episode.
    # NOTE(review): 200 state arrays of size 4096 and a 14-dim action are
    # taken from the original cell — confirm they match the policy's model.
    init_state = [np.zeros([4096], np.float32) for _ in range(200)]
    prev_action = np.zeros((14), np.float32)
    for i in range(1000):
        # When a state is passed in, compute_single_action returns a 3-tuple
        # (action, state_out, extra fetches), so all three must be unpacked.
        action, init_state, _ = algo.compute_single_action(
            observation=obs,
            state=init_state,
            prev_action=prev_action,
            policy_id="default_policy",
        )

        # Step with the action just computed for this observation.
        obs, reward, terminated, truncated, info = env.step(action)

        prev_action = action
        # The rollout keeps stepping after termination; it is only reported.
        if terminated:
            print("terminated")

        episode_return += reward

    print(f"Reached episode return of {episode_return}.")

env.close()