# Horcrux Joystick 입력 학습 진행

## 필요 패키지 import

In [1]:
import gymnasium as gym
import numpy as np
import pandas as pd

# 조이스틱 환경 삽입
from horcrux_terrain_v1.envs import PlaneJoyWorld
from horcrux_terrain_v1.envs import PlaneWorld

# Ray 패키지 삽입
import ray
from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.algorithms.sac import SACConfig

from ray.tune.registry import register_env

## Ray 실행

In [2]:
# import socket
# import psutil

# conn_ip = ""
# interfaces = psutil.net_if_addrs()
# for interface_name, addresses in interfaces.items():
#     if "openvpn" in interface_name.lower() and "tap" in interface_name.lower():
#         snicaddrs = interfaces[str(interface_name)]
#         for addrfamily in snicaddrs:
#             if addrfamily.family == socket.AF_INET:
#                 conn_ip = addrfamily.address

# print(f"Connection IP: {conn_ip}")
# # 해당 init을 통해서 VPN을 통한 외부 접속 가능함.
# ray.init(dashboard_host=conn_ip, dashboard_port=8265)

ray.init()

2025-03-17 17:11:23,773	INFO worker.py:1777 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.12.9
Ray version:,2.36.1
Dashboard:,http://127.0.0.1:8265


## Gym 환경 등록하기

In [3]:
env_config = {
    "forward_reward_weight": 6.5,
    "side_cost_weight": 2.0,
    "unhealthy_max_steps": 100,
    "healthy_reward": 0.5,
    "healthy_roll_range": (-35,35),
    "terminating_roll_range": (-85,85),
    "rotation_norm_cost_weight": 0.01,
    "rotation_orientation_cost_weight": 1.2,
    "termination_reward": 0,
    "gait_params": (30, 30, 60, 60, 0),
    "use_friction_chg": True,
    "joy_input_random": True,
}

# JoyWorld
register_env("joy-v1", lambda config: PlaneJoyWorld( forward_reward_weight=env_config["forward_reward_weight"], 
                                                     side_cost_weight=env_config["side_cost_weight"], 
                                                     unhealthy_max_steps=env_config["unhealthy_max_steps"],
                                                     healthy_reward=env_config["healthy_reward"], 
                                                     healthy_roll_range=env_config["healthy_roll_range"],
                                                     terminating_roll_range=env_config["terminating_roll_range"],
                                                     rotation_norm_cost_weight=env_config["rotation_norm_cost_weight"],
                                                     rotation_orientation_cost_weight=env_config["rotation_orientation_cost_weight"],
                                                     termination_reward=env_config["termination_reward"],
                                                     gait_params=env_config["gait_params"],
                                                     use_friction_chg=env_config["use_friction_chg"],
                                                     joy_input_random=env_config["joy_input_random"],
                                                   )
            )

# Plane
register_env("plane-v1", lambda config: PlaneWorld(forward_reward_weight=env_config["forward_reward_weight"], 
                                                 side_cost_weight=env_config["side_cost_weight"], 
                                                 unhealthy_max_steps=env_config["unhealthy_max_steps"], 
                                                 healthy_reward=env_config["healthy_reward"],
                                                 healthy_roll_range=env_config["healthy_roll_range"],
                                                 terminating_roll_range=env_config["terminating_roll_range"],
                                                 rotation_norm_cost_weight=env_config["rotation_norm_cost_weight"],
                                                 rotation_orientation_cost_weight=env_config["rotation_orientation_cost_weight"],
                                                 termination_reward=env_config["termination_reward"],
                                                 use_friction_chg=env_config["use_friction_chg"],
                                                 gait_params=env_config["gait_params"]))

## 학습된 알고리즘 불러오기

In [4]:
# algo = Algorithm.from_checkpoint("./Paper_agents/good/Linear/SAC_layer_512_5_32_Linear_restart_final")

# trained_weights = algo.get_weights()
# algo.cleanup()

## 학습 알고리즘 설정하기

In [5]:
config = SACConfig()

# 구형 API 구조 사용
config.api_stack(
    enable_rl_module_and_learner=False,
    enable_env_runner_and_connector_v2=False,
)

config.environment("joy-v1")
config.framework("torch")

# 병렬 CPU 사용 설정
total_workers = 16
config.resources(num_gpus=1)
config.env_runners(num_env_runners = total_workers, num_cpus_per_env_runner = 1, num_gpus_per_env_runner = 0.4/(total_workers+1), rollout_fragment_length = 1000)
config.training(
    gamma=0.95,
    replay_buffer_config={
    "_enable_replay_buffer_api": True,
    "capacity": int(5000000),
    "type": "MultiAgentReplayBuffer",
    "replay_batch_size": 10000,
    # If True prioritized replay buffer will be used.
    # "prioritized_replay": False,
    # "prioritized_replay_alpha": 0.6,
    # "prioritized_replay_beta": 0.4,
    # "prioritized_replay_eps": 1e-6,
    # Whether to compute priorities already on the remote worker side.
    # "worker_side_prioritization": False,
    },

    q_model_config = {
            "fcnet_hiddens": [512, 512, 512, 512, 512, 32],
            "fcnet_activation": "tanh",
            "post_fcnet_hiddens": [],
            "post_fcnet_activation": None,
            "custom_model": None,  # Use this to define custom Q-model(s).
            "custom_model_config": {},
    },
    policy_model_config = {
            "fcnet_hiddens": [512, 512, 512, 512, 512, 32],
            "fcnet_activation": "tanh",
            "post_fcnet_hiddens": [],
            "post_fcnet_activation": None,
            "custom_model": None,  # Use this to define a custom policy model.
            "custom_model_config": {},
    },

    train_batch_size = 100000,
    num_steps_sampled_before_learning_starts = 200000,
)

algo = config.build()


`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2025-03-17 17:11:37,573	INFO trainable.py:161 -- Trainable.setup took 13.272 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


## 학습시작

In [None]:
from pprint import pprint

n_iter = 6300
save_iter = 0
save_name = "SAC_layer_512_5_32_linear_friction_joy_317"

for i in range(n_iter):
    result = algo.train()
    print(f"{i:03d}", end=", ")
    # result.pop("config")
    # pprint(result)

    if i%20 == 0:
        checkpoint_dir = algo.save(save_name+"_"+str(save_iter))
        print(f"Checkpoint saved in directory {checkpoint_dir}")
        save_iter += 1

algo.save(save_name+str("_final"))

000, Checkpoint saved in directory TrainingResult(checkpoint=Checkpoint(filesystem=local, path=SAC_layer_512_5_32_linear_friction_joy_317_0), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {}, 'num_env_steps_sampled': 16000, 'num_env_steps_trained': 0, 'num_agent_steps_sampled': 16000, 'num_agent_steps_trained': 0}, 'env_runners': {'episode_reward_max': np.float64(9356.54455276363), 'episode_reward_min': np.float64(387.57087287913293), 'episode_reward_mean': np.float64(3405.8655076842274), 'episode_len_mean': np.float64(368.9375), 'episode_media': {}, 'episodes_timesteps_total': 5903, 'policy_reward_min': {'default_policy': np.float64(387.57087287913293)}, 'policy_reward_max': {'default_policy': np.float64(9356.54455276363)}, 'policy_reward_mean': {'default_policy': np.float64(3405.8655076842274)}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [np.float64(4028.306012894526), np.float64(2960.0729688287474), np.float64(2751.2465512540275), np.float64(164



012, 013, 014, 015, 016, 017, 018, 019, 020, Checkpoint saved in directory TrainingResult(checkpoint=Checkpoint(filesystem=local, path=SAC_layer_512_5_32_linear_friction_joy_317_1), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'custom_metrics': {}, 'learner_stats': {'actor_loss': np.float64(-9.47246265411377), 'critic_loss': np.float64(16.388931274414062), 'alpha_loss': np.float64(-0.05626858398318291), 'alpha_value': np.float32(0.9973034), 'log_alpha_value': np.float32(-0.0027002722), 'target_entropy': np.float32(-14.0), 'policy_t': np.float64(-0.0007298057898879051), 'mean_q': np.float64(0.05425451695919037), 'max_q': np.float64(0.09943384677171707), 'min_q': np.float64(-0.09457898885011673)}, 'model': {}, 'num_grad_updates_lifetime': np.float64(9.0), 'diff_num_grad_updates_vs_sampler_policy': np.float64(8.0), 'td_error': array([22.714354 , 11.871616 , 16.39198  , ..., 19.234325 ,  8.4655285,
       15.949805 ], shape=(100000,), dtype=fl

# 학습 알고리즘 평가하기

In [None]:
algo = Algorithm.from_checkpoint("./SAC_layer_512_5_32_linear_friction_joy2_2") # 학습된 정책 로드

## 평가용 Env 생성 및 실행

In [None]:
import pathlib
import torch
import numpy as np
import gymnasium as gym
from horcrux_terrain_v1.envs import SandWorld
from ray.rllib.algorithms.algorithm import Algorithm
import time

eval_config = env_config
eval_config["use_friction_chg"] = False
eval_config["joy_input_random"] = False
eval_config["joy_input"] = (1, 0, 0)


In [None]:
env = gym.make("horcrux_terrain_v1/plane-v2", 
               terminate_when_unhealthy = False, 
               render_mode = "human", 
               render_camera_name = 'ceiling', 
               use_gait = True,               
               **eval_config,
               ) 

for j in range(3):
   episode_return = 0
   yaw_total = 0
   terminated = truncated = False

   obs, info = env.reset()

   for i in range(1000):

      action = algo.compute_single_action(observation= obs)
      
      obs, reward, terminated, truncated, info = env.step(action)
      if i == 0:
          print(f"Joy input = {info['joy_input']}")
      if terminated:
         print("terminated")

      episode_return += reward
      yaw_total += info['step_ypr'][0] / 0.1

   print(f"Reached episode return of {episode_return}.")
   print(yaw_total)

env.close()

### Pytorch 모델 바로 불러오기

In [None]:
import torch
from ray.rllib.policy.policy import Policy
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2

_algo = Algorithm.from_checkpoint("./Paper_agents/good/Linear/SAC_layer_512_5_32_Linear_restart_final")
# _policy = Policy.from_checkpoint("./SAC_layer_512_5_32_linear_friction_joy_31_0")
# _policy = Policy.from_checkpoint("./Paper_agents/good/Linear/SAC_layer_512_5_32_Linear_restart_final")


In [None]:
randobs = np.random.random(94)

# _algo.compute_single_action(randobs)
# _algo.get_policy('default_policy').dist_class.mro()
_algo.get_policy('default_policy').model.action_dist



In [None]:
randobs = np.random.random(94)

# obs = torch.tensor(randobs, dtype=torch.float32)
_policy['default_policy'].dist_class
# _policy['default_policy'].compute_single_action(torch.tensor(randobs, dtype=torch.float32), explore=False)