# Horcrux Joystick 입력 학습 진행

## 필요 패키지 import

In [1]:
import gymnasium as gym
import numpy as np
import pandas as pd

# 조이스틱 환경 삽입
from horcrux_terrain_v1.envs import PlaneJoyWorld
from horcrux_terrain_v1.envs import PlaneWorld

# Ray 패키지 삽입
import ray
from ray.rllib.algorithms.algorithm import Algorithm
from ray.rllib.algorithms.sac import SACConfig

from ray.tune.registry import register_env

  "cipher": algorithms.TripleDES,
  "class": algorithms.Blowfish,
  "class": algorithms.TripleDES,


## Ray 실행

In [3]:
import socket
import psutil

conn_ip = ""
interfaces = psutil.net_if_addrs()
for interface_name, addresses in interfaces.items():
    if "openvpn" in interface_name.lower() and "tap" in interface_name.lower():
        snicaddrs = interfaces[str(interface_name)]
        for addrfamily in snicaddrs:
            if addrfamily.family == socket.AF_INET:
                conn_ip = addrfamily.address

# 해당 init을 통해서 VPN을 통한 외부 접속 가능함.
ray.init(dashboard_host=conn_ip, dashboard_port=8265)

2025-02-26 00:09:48,684	INFO worker.py:1777 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.12.5
Ray version:,2.36.1
Dashboard:,http://127.0.0.1:8265


## Gym 환경 등록하기

In [5]:
env_config = {
    "forward_reward_weight": 6.5,
    "side_cost_weight": 2.0,
    "unhealthy_max_steps": 100,
    "healthy_reward": 0.5,
    "healthy_roll_range": (-35,35),
    "terminating_roll_range": (-85,85),
    "rotation_norm_cost_weight": 0.01,
    "rotation_orientation_cost_weight": 1.2,
    "termination_reward": 0,
    "gait_params": (30, 30, 60, 60, 0),
    "use_friction_chg": True,
    "joy_input_random": True,
}

# JoyWorld
register_env("joy-v1", lambda config: PlaneJoyWorld( forward_reward_weight=env_config["forward_reward_weight"], 
                                                     side_cost_weight=env_config["side_cost_weight"], 
                                                     unhealthy_max_steps=env_config["unhealthy_max_steps"],
                                                     healthy_reward=env_config["healthy_reward"], 
                                                     healthy_roll_range=env_config["healthy_roll_range"],
                                                     terminating_roll_range=env_config["terminating_roll_range"],
                                                     rotation_norm_cost_weight=env_config["rotation_norm_cost_weight"],
                                                     rotation_orientation_cost_weight=env_config["rotation_orientation_cost_weight"],
                                                     termination_reward=env_config["termination_reward"],
                                                     gait_params=env_config["gait_params"],
                                                     use_friction_chg=env_config["use_friction_chg"],
                                                     joy_input_random=env_config["joy_input_random"],
                                                   )
            )

# Plane
register_env("plane-v1", lambda config: PlaneWorld(forward_reward_weight=env_config["forward_reward_weight"], 
                                                 side_cost_weight=env_config["side_cost_weight"], 
                                                 unhealthy_max_steps=env_config["unhealthy_max_steps"], 
                                                 healthy_reward=env_config["healthy_reward"],
                                                 healthy_roll_range=env_config["healthy_roll_range"],
                                                 terminating_roll_range=env_config["terminating_roll_range"],
                                                 rotation_norm_cost_weight=env_config["rotation_norm_cost_weight"],
                                                 rotation_orientation_cost_weight=env_config["rotation_orientation_cost_weight"],
                                                 termination_reward=env_config["termination_reward"],
                                                 use_friction_chg=env_config["use_friction_chg"],
                                                 gait_params=env_config["gait_params"]))

## 학습된 알고리즘 불러오기

In [None]:
# algo = Algorithm.from_checkpoint("./Paper_agents/good/Linear/SAC_layer_512_5_32_Linear_restart_final")

# trained_weights = algo.get_weights()
# algo.cleanup()

## 학습 알고리즘 설정하기

In [7]:
config = SACConfig()

# 구형 API 구조 사용
config.api_stack(
    enable_rl_module_and_learner=False,
    enable_env_runner_and_connector_v2=False,
)

config.environment("joy-v1")
config.framework("torch")

# 병렬 CPU 사용 설정
total_workers = 12
config.resources(num_gpus=1)
config.env_runners(num_env_runners=12)
config.training(
    gamma=0.95,
    replay_buffer_config={
    "_enable_replay_buffer_api": True,
    "capacity": int(1e6),
    # If True prioritized replay buffer will be used.
    "prioritized_replay": False,
    "prioritized_replay_alpha": 0.6,
    "prioritized_replay_beta": 0.4,
    "prioritized_replay_eps": 1e-6,
    # Whether to compute priorities already on the remote worker side.
    "worker_side_prioritization": False,
    },

    q_model_config = {
            "fcnet_hiddens": [512, 512, 512, 512, 512, 32],
            "fcnet_activation": "tanh",
            "post_fcnet_hiddens": [],
            "post_fcnet_activation": None,
            "custom_model": None,  # Use this to define custom Q-model(s).
            "custom_model_config": {},
    },
    policy_model_config = {
            "fcnet_hiddens": [512, 512, 512, 512, 512, 32],
            "fcnet_activation": "tanh",
            "post_fcnet_hiddens": [],
            "post_fcnet_activation": None,
            "custom_model": None,  # Use this to define a custom policy model.
            "custom_model_config": {},
    },

    train_batch_size_per_learner = 8192,
    num_steps_sampled_before_learning_starts = 6000,
)

algo = config.build()


`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
[36m(pid=57468)[0m   "cipher": algorithms.TripleDES,
[36m(pid=28756)[0m   "class": algorithms.Blowfish,
[36m(pid=28756)[0m   "class": algorithms.TripleDES,
2025-02-26 00:10:38,866	INFO trainable.py:161 -- Trainable.setup took 11.929 seconds. If your trainable is slow to initialize, consider setti

## 학습시작

In [None]:
from pprint import pprint

n_iter = 21000
save_iter = 0
save_name = "SAC_layer_512_5_32_linear_friction_joy2"

for i in range(n_iter):
    result = algo.train()
    print(f"{i:03d}th iteration done")
    # result.pop("config")
    # pprint(result)

    if i%3000 == 0:
        checkpoint_dir = algo.save(save_name+"_"+str(save_iter))
        print(f"Checkpoint saved in directory {checkpoint_dir}")
        save_iter += 1

algo.save(save_name+str("_final"))



000th iteration done
Checkpoint saved in directory TrainingResult(checkpoint=Checkpoint(filesystem=local, path=SAC_layer_512_5_32_linear_friction_joy2_0), metrics={'custom_metrics': {}, 'episode_media': {}, 'info': {'learner': {}, 'num_env_steps_sampled': 600, 'num_env_steps_trained': 0, 'num_agent_steps_sampled': 600, 'num_agent_steps_trained': 0}, 'env_runners': {'episode_reward_max': nan, 'episode_reward_min': nan, 'episode_reward_mean': nan, 'episode_len_mean': nan, 'episode_media': {}, 'episodes_timesteps_total': 0, 'policy_reward_min': {}, 'policy_reward_max': {}, 'policy_reward_mean': {}, 'custom_metrics': {}, 'hist_stats': {'episode_reward': [], 'episode_lengths': []}, 'sampler_perf': {}, 'num_faulty_episodes': 0, 'connector_metrics': {}, 'num_episodes': 0, 'episode_return_max': nan, 'episode_return_min': nan, 'episode_return_mean': nan, 'episodes_this_iter': 0}, 'num_healthy_workers': 12, 'num_in_flight_async_sample_reqs': 0, 'num_remote_worker_restarts': 0, 'num_agent_steps_s



008th iteration done
009th iteration done
010th iteration done
011th iteration done
012th iteration done
013th iteration done
014th iteration done
015th iteration done
016th iteration done
017th iteration done
018th iteration done
019th iteration done
020th iteration done
021th iteration done
022th iteration done
023th iteration done
024th iteration done
025th iteration done
026th iteration done
027th iteration done
028th iteration done
029th iteration done
030th iteration done
031th iteration done
032th iteration done
033th iteration done
034th iteration done
035th iteration done
036th iteration done
037th iteration done
038th iteration done
039th iteration done
040th iteration done
041th iteration done
042th iteration done
043th iteration done
044th iteration done
045th iteration done
046th iteration done
047th iteration done
048th iteration done
049th iteration done
050th iteration done
051th iteration done
052th iteration done
053th iteration done
054th iteration done
055th iterati

# 학습 알고리즘 평가하기

In [None]:
algo = Algorithm.from_checkpoint("./SAC_layer_512_5_32_linear_friction_joy_1") # 학습된 정책 로드

## 평가용 Env 생성 및 실행

In [None]:
import pathlib
import torch
import numpy as np
import gymnasium as gym
from horcrux_terrain_v1.envs import SandWorld
from ray.rllib.algorithms.algorithm import Algorithm
import time

eval_config = env_config
eval_config["use_friction_chg"] = False
eval_config["joy_input_random"] = False
eval_config["joy_input"] = (0.7, 0, 0.6)


env = gym.make("horcrux_terrain_v1/plane-v2", 
               terminate_when_unhealthy = False, 
               render_mode = "human", 
               render_camera_name = 'ceiling', 
               use_gait = True,               
               **eval_config,
               ) 

In [None]:
for j in range(3):
   episode_return = 0
   terminated = truncated = False

   obs, info = env.reset()

   for i in range(1000):

      action = algo.compute_single_action(observation= obs)
      
      obs, reward, terminated, truncated, info = env.step(action)

      if terminated:
         print("terminated")

      episode_return += reward

   print(f"Reached episode return of {episode_return}.")

env.close()