# Horcrux Joystick 입력 학습 진행

## 필요 패키지 import

In [1]:
import gymnasium as gym
import numpy as np
import pandas as pd

# 조이스틱 환경 삽입
import horcrux_terrain_v2
from horcrux_terrain_v2.envs import PlaneJoyWorld

# Ray 패키지 삽입
import ray
import os
from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.algorithms.sac import SACConfig

from ray.tune.registry import register_env

import mediapy as media

from scipy.ndimage import uniform_filter1d
from scipy.spatial.transform import Rotation

import matplotlib.pyplot as plt

from gymnasium.utils.save_video import save_video

from IPython.display import Video

# 사용자 구성 모델 정의

In [2]:
import torch.nn as nn
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork
from ray.rllib.models import ModelCatalog

class CustomSACModel(TorchModelV2, nn.Module):
    def __init__(self, obs_space, action_space, num_outputs, model_config, name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs, model_config, name)
        nn.Module.__init__(self)

        model_shape = model_config['fcnet_hiddens']
        print(model_config)

        # Shared actor trunk
        self.shared = FullyConnectedNetwork(
            obs_space, action_space, model_shape[-1], model_config, name + "_shared"
        )

        # Value network head 확장
        self.value_branch = nn.Sequential(
            nn.Linear(model_shape[-1], 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

        self._value_out = None

    def forward(self, input_dict, state, seq_lens):
        features, _ = self.shared(input_dict, state, seq_lens)
        self._value_out = self.value_branch(features)
        return features, state

    def value_function(self):
        return self._value_out.squeeze(1)
    
    
ModelCatalog.register_custom_model("custom_sac_model", CustomSACModel)

# 필요 함수 정의

In [3]:
def get_unique_filename(base_path, ext=".mp4"):
    """중복된 파일명이 존재하면 숫자를 증가하여 새로운 경로를 반환"""
    if not base_path.endswith(ext):
        base_path += ext  # 확장자 자동 추가

    file_name, file_ext = os.path.splitext(base_path)  # 파일명과 확장자 분리
    count = 0
    new_path = f"{file_name}-episode-0"+file_ext

    while os.path.exists(new_path):  # 파일 존재 여부 확인
        new_path = f"{file_name}{count}-episode-0{file_ext}"
        count += 1


    return f"rl-video{count-1}", new_path


def default_plot(x, y, f_name='default_plot', legends=['acc_x', 'acc_y', 'acc_z'], title=''):
    colors = plt.get_cmap("tab10").colors
    fig, ax = plt.subplots(figsize=(15/2.54, 10/2.54))
    ax.set_facecolor((0.95, 0.95, 0.95)) 

    n_column = len(np.shape(y))
    if n_column>2:
        print("The dimmension of data must be less than 3. (1D or 2D)")
        return -1
    
    n_data = np.shape(y)[1]

    for i in range(n_data):
        # **Plot**
        ax.plot(x, y[:,i], linewidth=1.5, linestyle="-", color=colors[i], label=legends[i])
        # ax.plot(x, y[:,i], linewidth=1.5, linestyle="-", color=colors[1], label=legends[1])
        # ax.plot(x, y[:,i], linewidth=1.5, linestyle="-", color=colors[2], label=legends[2])

    # **Grid 설정**
    ax.grid(True, linestyle="--", linewidth=1, color="#202020", alpha=0.7)  # 주요 그리드
    ax.minorticks_on()
    ax.grid(True, which="minor", linestyle=":", linewidth=0.5, color="#404040", alpha=0.5)  # 보조 그리드

    # **Axis 스타일 설정**
    ax.spines["top"].set_linewidth(1.0)
    ax.spines["right"].set_linewidth(1.0)
    ax.spines["left"].set_linewidth(1.0)
    ax.spines["bottom"].set_linewidth(1.0)

    ax.tick_params(axis="both", labelsize=11, width=1.0)  # 폰트 크기 및 라인 두께
    ax.xaxis.label.set_size(12)
    ax.yaxis.label.set_size(12)

    # **폰트 및 제목 설정**
    plt.rcParams["font.family"] = "Arial"
    ax.set_xlabel("X-Axis", fontsize=12, fontweight="bold")
    ax.set_ylabel("Y-Axis", fontsize=12, fontweight="bold")
    ax.set_title(title, fontsize=14, fontweight="bold")

    # **Legend (MATLAB 스타일 적용)**
    ax.legend(loc="upper right", ncol=3, fontsize=10, frameon=True)

    # **비율 설정 (MATLAB의 `pbaspect([2.1 1 1])`과 비슷한 효과)**
    fig.set_size_inches(2.1 * 5, 5)  # 비율 2.1:1 (기본 높이 5inch 기준)

    # **Save Figure (MATLAB saveas와 유사)**
    plt.savefig(f"./figs/{f_name}.png", dpi=600, bbox_inches="tight")

    plt.show()

def moving_average(data, window_size):
    kernel = np.ones(window_size) / window_size
    return np.convolve(data, kernel, mode='same')  # 'valid'는 경계 제외


def get_data_from_info(info):
    # Status info
    stat_init_rpy = np.array([_info['init_rpy'] for _info in info])
    stat_init_com = np.array([_info['init_com'] for _info in info])
    stat_xy_vel = np.array([[_info['x_velocity'], _info['y_velocity']] for _info in info])
    stat_yaw_vel = np.array([_info['yaw_velocity'] for _info in info])
    stat_quat = np.array([_info['head_quat'] for _info in info])
    stat_ang_vel = np.array([_info['head_ang_vel'] for _info in info])
    stat_lin_acc = np.array([_info['head_lin_acc'] for _info in info])
    stat_motion_vector = np.array([_info['motion_vector'] for _info in info])
    stat_com_pos = np.array([_info['com_pos'] for _info in info])
    stat_com_ypr = np.array([_info['com_ypr'] for _info in info])
    stat_step_ypr = np.array([_info['step_ypr'] for _info in info])
    

    # Rew info
    rew_linear_movement = np.array([_info['reward_linear_movement'] for _info in info])
    reward_angular_movement = np.array([_info['reward_angular_movement'] for _info in info])
    reward_efficiency = np.array([_info['reward_efficiency'] for _info in info])
    reward_healthy = np.array([_info['reward_healthy'] for _info in info])
    cost_ctrl = np.array([_info['cost_ctrl'] for _info in info])
    cost_unhealthy = np.array([_info['cost_unhealthy'] for _info in info])
    cost_orientation = np.array([_info['cost_orientation'] for _info in info])
    cost_yaw_vel = np.array([_info['cost_yaw_vel'] for _info in info])
    direction_similarity = np.array([_info['direction_similarity'] for _info in info])
    rotation_alignment = np.array([_info['rotation_alignment'] for _info in info])

    # Input info
    input_joy = np.array([_info['joy_input'] for _info in info])

    data_dict = {
        'stat_init_rpy': stat_init_rpy,
        'stat_init_com': stat_init_com,
        'stat_xy_vel': stat_xy_vel,
        'stat_yaw_vel': stat_yaw_vel,
        'stat_quat': stat_quat,
        'stat_ang_vel': stat_ang_vel,
        'stat_lin_acc': stat_lin_acc,
        'stat_motion_vector': stat_motion_vector,
        'stat_com_pos': stat_com_pos,
        'stat_com_ypr': stat_com_ypr,
        'stat_step_ypr': stat_step_ypr,

        'rew_linear_movement': rew_linear_movement,
        'reward_angular_movement': reward_angular_movement,
        'reward_efficiency': reward_efficiency,
        'reward_healthy': reward_healthy,
        'cost_ctrl': cost_ctrl,
        'cost_unhealthy': cost_unhealthy,
        'cost_orientation': cost_orientation,
        'cost_yaw_vel': cost_yaw_vel,
        'direction_similarity': direction_similarity,
        'rotation_alignment': rotation_alignment,

        'input_joy': input_joy,
    }
    
    return data_dict


## Ray 실행

In [None]:
ray.init(dashboard_host="0.0.0.0", dashboard_port=8265)

2025-03-27 16:14:48,139	INFO worker.py:1810 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://10.130.6.78:8265 [39m[22m


0,1
Python version:,3.12.9
Ray version:,2.39.0
Dashboard:,http://10.130.6.78:8265


[33m(raylet)[0m [2025-03-28 09:47:48,746 E 5316 5316] (raylet) node_manager.cc:3069: 1 Workers (tasks / actors) killed due to memory pressure (OOM), 0 Workers crashed due to other reasons at node (ID: e4a10df36715847ab63558bbfa56d4a9b0ea7bbd45032987c3bf7c8d, IP: 10.130.6.78) over the last time period. To see more information about the Workers killed on this node, use `ray logs raylet.out -ip 10.130.6.78`
[33m(raylet)[0m 
[33m(raylet)[0m Refer to the documentation on how to address the out of memory issue: https://docs.ray.io/en/latest/ray-core/scheduling/ray-oom-prevention.html. Consider provisioning more memory on this node or reducing task parallelism by requesting more CPUs per task. To adjust the kill threshold, set the environment variable `RAY_memory_usage_threshold` when starting Ray. To disable worker killing, set the environment variable `RAY_memory_monitor_refresh_ms` to zero.


## Gym 환경 등록하기

In [5]:
env_config = {
    "gait_sampling_interval": 0.01,
    "forward_reward_weight": 200.0,
    "rotation_reward_weight": 100.0,
    "unhealthy_max_steps": 80.0,
    "healthy_reward": 3.0,
    "healthy_roll_range": (-30,30),
    "terminating_roll_range": (-80,80),
    "rotation_norm_cost_weight": 8.0,
    "termination_reward": 0,
    "gait_params": (30, 30, 40, 40, 0),
    "use_friction_chg": True,
    "joy_input_random": True,
    "use_imu_window": True,
    "ctrl_cost_weight": 0.05,
}

render_env_config = {
    "forward_reward_weight": 60.0,
    "rotation_reward_weight": 40.0,
    "unhealthy_max_steps": 100.0,
    "healthy_reward": 3.0,
    "healthy_roll_range": (-35,35),
    "terminating_roll_range": (-85,85),
    "rotation_norm_cost_weight": 1.5,
    "termination_reward": 0,
    "gait_params": (30, 30, 40, 40, 0),
    "use_friction_chg": True,
    "joy_input_random": True,
    "render_mode": "rgb_array",
    "render_camera_name": 'ceiling',
    "use_imu_window": True,
    "ctrl_cost_weight": 1.5,
}

# env = gym.make("horcrux_terrain_v2/plane-v2", **render_env_config)

# JoyWorld
register_env("joy-v1", lambda config: PlaneJoyWorld( forward_reward_weight=env_config["forward_reward_weight"], 
                                                     rotation_reward_weight=env_config["rotation_reward_weight"], 
                                                     unhealthy_max_steps=env_config["unhealthy_max_steps"],
                                                     healthy_reward=env_config["healthy_reward"], 
                                                     healthy_roll_range=env_config["healthy_roll_range"],
                                                     terminating_roll_range=env_config["terminating_roll_range"],
                                                     rotation_norm_cost_weight=env_config["rotation_norm_cost_weight"],
                                                     termination_reward=env_config["termination_reward"],
                                                     gait_params=env_config["gait_params"],
                                                     use_friction_chg=env_config["use_friction_chg"],
                                                     joy_input_random=env_config["joy_input_random"],
                                                     use_imu_window=env_config["use_imu_window"],
                                                     ctrl_cost_weight=env_config["ctrl_cost_weight"],
                                                   )
            )

## 학습 알고리즘 설정하기

In [6]:
config = SACConfig()

# 구형 API 구조 사용
config.api_stack(
    enable_rl_module_and_learner=False,
    enable_env_runner_and_connector_v2=False,
)

config.environment("joy-v1")
config.framework("torch")

# 병렬 CPU 사용 설정
total_workers = 16
config.resources(num_gpus=1)
config.learners(num_learners = 1, num_gpus_per_learner=0.6)
config.env_runners(num_env_runners = total_workers, num_cpus_per_env_runner = 1, num_gpus_per_env_runner = 0.3/(total_workers), rollout_fragment_length = 5000)
config.training(
    gamma=0.95,
    replay_buffer_config={
    "_enable_replay_buffer_api": True,
    "capacity": int(1000000),
    "type": "MultiAgentPrioritizedReplayBuffer",
    "replay_batch_size": 10000,
    # If True prioritized replay buffer will be used.
    "prioritized_replay": True,
    "prioritized_replay_alpha": 0.6,
    "prioritized_replay_beta": 0.4,
    "prioritized_replay_eps": 1e-6,
    # Whether to compute priorities already on the remote worker side.
    # "worker_side_prioritization": False,
    },

    # Custom model 사용 (잘 안됨)
    # model={
    #     "custom_model": "custom_sac_model",  # ValueNetwork MLP 사용
    #     "custom_model_config": {},
    #     "fcnet_hiddens": [512, 512, 512, 512, 512, 32],
    #     "fcnet_activation": "tanh",
    #     "post_fcnet_hiddens": [],
    #     "post_fcnet_activation": None,
    #     "vf_share_layers": False,
    # },

    q_model_config = {
            "fcnet_hiddens": [512, 512, 512, 512, 512, 512],
            "fcnet_activation": "relu",
            "post_fcnet_hiddens": [],
            "post_fcnet_activation": "tanh",
            "custom_model": None,  # Use this to define custom Q-model(s).
            "custom_model_config": {},
    },
    policy_model_config = {
            "fcnet_hiddens": [512, 512, 512, 512, 512, 512],
            "fcnet_activation": "relu",
            "post_fcnet_hiddens": [],
            "post_fcnet_activation": "tanh",
            "custom_model": None,  # Use this to define custom Q-model(s).
            "custom_model_config": {},
    },

    minibatch_size = 10000,
    train_batch_size = 100000,
    num_steps_sampled_before_learning_starts = 200000,
)

algo = config.build()


`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2025-03-27 16:15:01,640	INFO trainable.py:161 -- Trainable.setup took 12.960 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [7]:
from pprint import pprint

pprint(algo.get_config().to_dict())
# algo.get_default_policy_class(config).

{'_AlgorithmConfig__prior_exploration_config': None,
 '_deterministic_loss': False,
 '_disable_action_flattening': False,
 '_disable_execution_plan_api': -1,
 '_disable_initialize_loss_from_dummy_batch': False,
 '_disable_preprocessor_api': False,
 '_dont_auto_sync_env_runner_states': False,
 '_enable_rl_module_api': -1,
 '_env_to_module_connector': None,
 '_evaluation_parallel_to_training_wo_thread': False,
 '_fake_gpus': False,
 '_is_atari': None,
 '_learner_class': None,
 '_learner_connector': None,
 '_model_config': {},
 '_module_to_env_connector': None,
 '_per_module_overrides': {},
 '_rl_module_spec': None,
 '_run_training_always_in_thread': False,
 '_tf_policy_handles_more_than_one_loss': False,
 '_torch_grad_scaler_class': None,
 '_torch_lr_scheduler_classes': None,
 '_use_beta_distribution': False,
 'action_mask_key': 'action_mask',
 'action_space': None,
 'actions_in_input_normalized': False,
 'actor_lr': 3e-05,
 'add_default_connectors_to_env_to_module_pipeline': True,
 'add

## 학습시작

In [8]:
from pprint import pprint
import datetime
from scipy.io import savemat

n_iter = 1200
save_iter = 0
save_name = "SAC_layer_Big_chordal_relu_sampling001_327"

for i in range(n_iter):
    result = algo.train()
    print(f"{i:03d}", end=", ")
    # result.pop("config")
    # pprint(result)

    if i%50 == 0:
        checkpoint_dir = algo.save(save_name+"_"+str(save_iter))
        print(f"Checkpoint saved in directory {checkpoint_dir}")
        save_iter += 1


        # Record Validation Env
        env = gym.make("horcrux_terrain_v2/plane-v2", **render_env_config)
        obs = env.reset()[0]
        env_done = False

        _video_base_name = 'rl-video'

        rew_return = 0
        frames = []
        info = []

        for i in range(3000):
            act = algo.compute_single_action(observation=obs)
            obs, _step_rew, _, env_done, env_info = env.step(act)
            pixels = env.render()
            frames.append(pixels)
            info.append(env_info)
            rew_return += _step_rew

        _f_name, _full_path = get_unique_filename(f"./video/{_video_base_name}")
        rew_dict = get_data_from_info(info)
        rew_dict['rew_return'] = rew_return

        # Save Video
        save_video(frames, "./video/", name_prefix=_f_name, fps=env.metadata['render_fps'])

        # Save Video Info
        _f_video_info = open(f"./video/joy_input.txt", 'a')
        _f_video_info.write(f'File creation time: {datetime.datetime.now()}\n')
        _f_video_info.write(f'Video file name: {_f_name}, Joy input: {info[0]["joy_input"]}, Friction: {info[0]["friction_coeff"]}\n')
        _f_video_info.close()

        # Save Reward Info mat file
        savemat(f"./data/{save_name}_{_f_name}.mat", rew_dict)

        env.close()


algo.save(save_name+str("_final"))



000, Checkpoint saved in directory TrainingResult(checkpoint=Checkpoint(filesystem=local, path=SAC_layer_Big_chordal_relu_sampling001_327_0), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {}, 'num_env_steps_sampled': 80000, 'num_env_steps_trained': 0, 'num_agent_steps_sampled': 80000, 'num_agent_steps_trained': 0}, 'env_runners': {'episode_reward_max': nan, 'episode_reward_min': nan, 'episode_reward_mean': nan, 'episode_len_mean': nan, 'episode_media': {}, 'episodes_timesteps_total': 0, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [], 'episode_lengths': []}, 'sampler_perf': {}, 'num_faulty_episodes': 0, 'connector_metrics': {}, 'num_episodes': 0, 'episode_return_max': nan, 'episode_return_min': nan, 'episode_return_mean': nan, 'episodes_this_iter': 0}, 'num_healthy_workers': 16, 'num_in_flight_async_sample_reqs': 0, 'num_remote_worker_restarts': 0, 'num_agent_steps_sampled': 



002, 003, 004, 005, 006, 007, 008, 009, 010, 011, 012, 013, 014, 015, 016, 017, 018, 019, 020, 021, 022, 023, 024, 025, 026, 027, 028, 029, 030, 031, 032, 033, 034, 035, 036, 037, 038, 039, 040, 041, 042, 043, 044, 045, 046, 047, 048, 049, 050, Checkpoint saved in directory TrainingResult(checkpoint=Checkpoint(filesystem=local, path=SAC_layer_Big_chordal_relu_sampling001_327_1), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {'default_policy': {'custom_metrics': {}, 'learner_stats': {'actor_loss': np.float64(0.7749482184648514), 'critic_loss': np.float64(0.9278041958808899), 'alpha_loss': np.float64(-3.4243703126907348), 'alpha_value': np.float32(0.864443), 'log_alpha_value': np.float32(-0.14567028), 'target_entropy': np.float32(-14.0), 'policy_t': np.float64(0.0030787271447479726), 'mean_q': np.float64(-8.72304801940918), 'max_q': np.float64(28.158403396606445), 'min_q': np.float64(-17.704904556274414)}, 'model': {}, 'num_grad_updates_lifetime': np.float64(485

  weight = (p_sample * len(self)) ** (-beta)


IndexError: list index out of range