### 학습 전 Ray tune을 통한 하이퍼파라미터 튜닝 진행

In [1]:
import ray
from ray import tune
from ray.rllib.algorithms.ppo import PPOConfig

import horcrux_terrain_v2
from horcrux_terrain_v2.envs import PlaneJoyWorld
from ray.tune.registry import register_env

# Ray 초기화
# ray.init(dashboard_host="0.0.0.0", dashboard_port=8265)

### Env 등록하기

In [2]:
# Settings for the PlaneJoyWorld environment, looked up by key when the
# "joy-v1" environment is constructed.
# NOTE(review): "gait_sampling_interval" is defined here, but the "joy-v1"
# registration does not pass it to PlaneJoyWorld -- confirm this is intended.
env_config = dict(
    gait_sampling_interval=0.01,
    forward_reward_weight=200.0,
    rotation_reward_weight=100.0,
    unhealthy_max_steps=80.0,
    healthy_reward=3.0,
    healthy_roll_range=(-30, 30),
    terminating_roll_range=(-80, 80),
    rotation_norm_cost_weight=8.0,
    termination_reward=0,
    gait_params=(30, 30, 40, 40, 0),
    use_friction_chg=True,
    joy_input_random=True,
    use_imu_window=True,
    ctrl_cost_weight=0.05,
)

# JoyWorld
# Keys of env_config that are passed to PlaneJoyWorld as keyword arguments,
# in the order they are supplied.
_JOY_ENV_KWARGS = (
    "forward_reward_weight",
    "rotation_reward_weight",
    "unhealthy_max_steps",
    "healthy_reward",
    "healthy_roll_range",
    "terminating_roll_range",
    "rotation_norm_cost_weight",
    "termination_reward",
    "gait_params",
    "use_friction_chg",
    "joy_input_random",
    "use_imu_window",
    "ctrl_cost_weight",
)


def _make_joy_env(config):
    """Build a PlaneJoyWorld from the module-level ``env_config``.

    Args:
        config: Argument supplied by whoever invokes the registered creator.
            It is not used; every setting comes from ``env_config``.

    Returns:
        A new PlaneJoyWorld instance.
    """
    # env_config is read when the environment is created, not at registration.
    selected = {key: env_config[key] for key in _JOY_ENV_KWARGS}
    return PlaneJoyWorld(**selected)


register_env("joy-v1", _make_joy_env)

### 알고리즘 설정 진행하기

In [3]:
# Start from the default PPO configuration as a plain dict, then overwrite the
# entries below with fixed values or Ray Tune search spaces.
algo_config = PPOConfig().to_dict()
# The bare string below is not applied to algo_config; it lists a set of
# concrete values for the tunable keys (presumably an earlier hand-picked
# configuration -- confirm). Its inline note on vf_loss_coeff says that value
# still needs tuning (default: 1.0).
"""
Tunable config sets
    gamma=0.95,
    lr=0.0005,
    train_batch_size = 100000,
    minibatch_size = 10000,
    num_epochs = 10,
    entropy_coeff = 0.01,
    vf_loss_coeff = 0.7, #이 값 튜닝 진행해야함. (기본값 : 1.0)
    vf_clip_param = 7,
"""
algo_config.update(
    {
        # Fixed settings.
        "framework": "torch",
        "env": "joy-v1",
        # Search spaces.
        "lr": tune.loguniform(1e-5, 1e-2),
        "gamma": tune.uniform(0.9, 0.99),
        "num_epochs": tune.choice([30, 35, 40, 45, 50]),
        "minibatch_size": tune.choice([5000, 7000, 10000, 15000]),
        "shuffle_batch_per_epoch": False,
        "lambda": tune.uniform(0.95, 0.99),
        "kl_coeff": tune.uniform(0.1, 0.5),
        "vf_loss_coeff": tune.uniform(0.5, 1.5),
        "entropy_coeff": tune.uniform(0.01, 0.1),
        "clip_param": tune.uniform(0.1, 0.3),
        "vf_clip_param": tune.uniform(5, 10),
        "train_batch_size": tune.choice([100000, 150000, 200000]),
    }
)

In [16]:
# Experiment name; passed as ``name=`` to tune.run for this run.
ex_name = '401-ex1'

In [None]:
from ray.rllib.connectors.env_to_module import FlattenObservations


In [22]:
# 실험 실행
def short_dirname(trial):
    """Return a short directory name for a trial.

    Args:
        trial: Object exposing a ``trial_id`` attribute.

    Returns:
        The string ``"trial401_"`` followed by ``str(trial.trial_id)``.
    """
    return f"trial401_{trial.trial_id!s}"

import ray.tune

# Keyword arguments for the Tune run, collected in one place.
run_kwargs = dict(
    name=ex_name,
    config=algo_config,
    # Stop condition; change it to whatever criterion you want.
    # NOTE(review): confirm "episode_reward_mean" is a key in the reported
    # results for the installed Ray version, and that 195 suits this env's
    # reward scale.
    stop={"episode_reward_mean": 195},
    # With grid_search, choice, etc., num_samples is usually 1.
    # NOTE(review): the search spaces in algo_config look randomly sampled
    # (uniform / loguniform / choice), so num_samples=1 presumably draws only
    # one configuration -- confirm against the Ray Tune docs.
    num_samples=1,
    checkpoint_at_end=True,
    trial_dirname_creator=short_dirname,
    storage_path='~/ray_results',
    verbose=1,
)
tune.run("PPO", **run_kwargs)


0,1
Current time:,2025-04-02 01:06:58
Running for:,08:01:58.68
Memory:,22.1/63.9 GiB

Trial name,status,loc,clip_param,entropy_coeff,gamma,kl_coeff,lambda,lr,minibatch_size,num_epochs,train_batch_size,vf_clip_param,vf_loss_coeff,iter,total time (s)
PPO_joy-v1_02123_00000,RUNNING,127.0.0.1:36348,0.156614,0.0348509,0.977361,0.480443,0.988087,0.00527696,10000,40,150000,5.60548,0.657777,409,28862.3


2025-04-02 01:06:58,602	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to 'C:/Users/doore/ray_results/401-ex1' in 0.0050s.
2025-04-02 01:07:08,681	INFO tune.py:1041 -- Total run time: 28928.76 seconds (28918.67 seconds for the tuning loop).
Resume experiment with: tune.run(..., resume=True)


<ray.tune.analysis.experiment_analysis.ExperimentAnalysis at 0x1b8aa62c740>