### 진짜 완전 새롭게 다시 시작

In [1]:
import gymnasium as gym
from horcrux_terrain_v2.envs import PlaneJoyWorld

import ray
from ray.tune.registry import register_env
from ray.rllib.algorithms.sac import SACConfig

### Ray 실행

In [2]:
import socket
import psutil

# conn_ip = ""
# interfaces = psutil.net_if_addrs()
# for interface_name, addresses in interfaces.items():
#     if "openvpn" in interface_name.lower() and "tap" in interface_name.lower():
#         snicaddrs = interfaces[str(interface_name)]
#         for addrfamily in snicaddrs:
#             if addrfamily.family == socket.AF_INET:
#                 conn_ip = addrfamily.address

# Start a local Ray instance. Binding the dashboard to 0.0.0.0 makes it
# reachable from other hosts (e.g. through the VPN), not only from localhost.
dashboard_options = {"dashboard_host": "0.0.0.0", "dashboard_port": 8265}
ray.init(**dashboard_options)

# Show the resources Ray can currently schedule on.
print(ray.available_resources())

2025-03-20 10:44:14,627	INFO worker.py:1841 -- Started a local Ray instance.


{'GPU': 1.0, 'accelerator_type:G': 1.0, 'memory': 28693816935.0, 'CPU': 24.0, 'object_store_memory': 14346908467.0, 'node:172.30.151.148': 1.0, 'node:__internal_head__': 1.0}


In [3]:
# Default constructor arguments for PlaneJoyWorld. The keys must match the
# constructor's keyword names exactly, because they are forwarded with `**`.
env_config = {
    "forward_reward_weight": 6.5,
    "side_cost_weight": 2.0,
    "unhealthy_max_steps": 100,
    "healthy_reward": 0.5,
    "healthy_roll_range": (-35, 35),
    "terminating_roll_range": (-85, 85),
    "rotation_norm_cost_weight": 0.01,
    "rotation_orientation_cost_weight": 1.2,
    "termination_reward": 0,
    "gait_params": (30, 30, 60, 60, 0),
    "use_friction_chg": True,
    "joy_input_random": True,
}


def make_joy_env(config):
    """Create a PlaneJoyWorld instance for RLlib.

    Args:
        config: The dict-like env config RLlib passes to the creator. It is
            empty unless `.environment(..., env_config={...})` is used; any
            entries it holds override the defaults in `env_config`.

    Returns:
        A new PlaneJoyWorld built from the merged keyword arguments.
    """
    overrides = dict(config) if config else {}
    # Later entries win, so per-run overrides take precedence over defaults.
    return PlaneJoyWorld(**{**env_config, **overrides})


# JoyWorld
register_env("joy-v1", make_joy_env)


In [None]:
def _mlp_model_config():
    """Return a fresh model-config dict for one SAC network.

    The Q-model and the policy model use the same layout; set
    "custom_model" / "custom_model_config" to plug in a custom network.
    """
    return {
        "fcnet_hiddens": [512, 512, 512, 512, 512, 32],
        "fcnet_activation": "tanh",
        "post_fcnet_hiddens": [],
        "post_fcnet_activation": None,
        "custom_model": None,
        "custom_model_config": {},
    }


replay_settings = {
    "_enable_replay_buffer_api": True,
    # Buffer "type" overrides that were tried, kept for reference:
    # "type": "MultiAgentReplayBuffer",
    # "type": "EpisodeReplayBuffer",
    "capacity": 1_000_000,
    "replay_batch_size": 10000,
}

training_kwargs = dict(
    gamma=0.9,
    actor_lr=0.001,
    critic_lr=0.002,
    train_batch_size=100_000,
    train_batch_size_per_learner=100_000,
    num_steps_sampled_before_learning_starts=200_000,
    replay_buffer_config=replay_settings,
    q_model_config=_mlp_model_config(),
    policy_model_config=_mlp_model_config(),
)

config = (
    SACConfig()
    .environment("joy-v1")
    .env_runners(num_env_runners=24)
    # Both new-API-stack switches are turned off together.
    .api_stack(
        enable_rl_module_and_learner=False,
        enable_env_runner_and_connector_v2=False,
    )
    .resources(num_gpus=1)
    .framework("torch")
    .training(**training_kwargs)
)

# Build the SAC algorithm object from the config (training happens later).
# Use config.to_dict() to inspect the resolved settings if needed.
algo = config.build_algo()

ValueError: You are using the new API stack EnvRunners (SingleAgentEnvRunner or MultiAgentEnvRunner), but have forgotten to switch on the new API stack! Try setting `config.api_stack(enable_rl_module_and_learner=True)`.
To suppress all validation errors, set `config.experimental(_validate_config=False)` at your own risk.

### 학습 시작

In [None]:
import os
from pprint import pprint

n_iter = 4000        # total number of algo.train() calls
save_interval = 200  # write a checkpoint every this many iterations
save_iter = 0        # running index appended to periodic checkpoint names

# Python does not expand "~" on its own; resolve it so checkpoints land in
# the user's home directory instead of a relative/invalid "~" path.
save_name = os.path.expanduser(
    "~/learned_policy/SAC_layer_512_5_32_linear_friction_joy_320"
)

for i in range(n_iter):
    result = algo.train()
    print(f"{i:03d}", end=", ")
    # Uncomment to inspect the metrics of each iteration:
    # result.pop("config")
    # pprint(result)

    # Periodic checkpoint; note this also fires on the very first iteration.
    if i % save_interval == 0:
        checkpoint_dir = algo.save_to_path(path=save_name + f"_{save_iter}")
        pprint(f"Checkpoint saved in directory {checkpoint_dir}")
        save_iter += 1

# Always keep the final state, regardless of the periodic schedule.
checkpoint_dir = algo.save_to_path(path=save_name + "_final")
pprint(f"Checkpoint saved in directory {checkpoint_dir}")