In [2]:
import gymnasium as gym
from gymnasium.envs.registration import register
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env

In [3]:
# Register the custom environment under a Gymnasium id so gym.make() can build it.
# The membership guard keeps this cell idempotent: re-running it in a live kernel
# would otherwise register the same id again and trigger Gymnasium's
# "Overriding environment ... already in registry" warning.
ENV_ID = "TowerClimb-v0"
if ENV_ID not in gym.registry:
    register(
        id=ENV_ID,
        entry_point="tower_climb.tower_climb_env:TowerClimbEnv",
    )

In [3]:

# Tunables for this training run, kept together so they are easy to find and change.
SEED = 42                    # fixed seed so repeated runs are comparable
TOTAL_TIMESTEPS = 300_000    # total environment steps to train for

env = gym.make("TowerClimb-v0")

# Optional: check that the environment follows the API Stable-Baselines3 expects.
check_env(env, warn=True)

model = PPO(
    "MultiInputPolicy",
    env=env,
    learning_rate=3e-4,         # slightly higher learning rate for faster learning
    n_steps=2048,               # large n_steps = less noise, longer rollouts
    batch_size=64,              # OK
    n_epochs=10,                # OK
    gamma=0.99,                 # standard; good for longer episodes
    gae_lambda=0.95,            # stabilises the advantage estimates (recommended)
    ent_coef=0.02,              # encourages exploration
    clip_range=0.2,             # smaller = lower risk of unstable updates (lowered from 0.3 to 0.2)
    vf_coef=0.5,                # default is OK; can be raised to 0.7 if `value_loss` is very high
    max_grad_norm=0.5,          # standard value, for stability
    seed=SEED,                  # seeds the algorithm's random generators for reproducibility
    verbose=1,
)

try:
    model.learn(total_timesteps=TOTAL_TIMESTEPS)
    # Persist the trained policy; the evaluation cell reloads it from this path.
    model.save("ppo_tower_climb")
finally:
    # Release environment resources even if training is interrupted or fails.
    env.close()


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 51.7     |
|    ep_rew_mean     | -101     |
| time/              |          |
|    fps             | 887      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 65.9        |
|    ep_rew_mean          | -10.2       |
| time/                   |             |
|    fps                  | 802         |
|    iterations           | 2           |
|    time_elapsed         | 5           |
|    total_timesteps      | 4096        |
| train/                  |             |
|    approx_kl            | 0.010697299 |
|    clip_fraction        | 0.0853      |
|    clip_range           | 0.2         |
|    entropy_loss   

In [4]:
# Reload the saved policy and play a single episode with on-screen rendering.
model = PPO.load("ppo_tower_climb")

env = gym.make("TowerClimb-v0", render_mode="human")
try:
    obs, _ = env.reset()

    done = False
    while not done:
        # deterministic=True asks the policy for its deterministic action instead of sampling.
        action, _states = model.predict(obs, deterministic=True)
        obs, reward, terminated, truncated, info = env.step(action)
        # An episode ends either on a terminal state or on truncation.
        done = terminated or truncated
        print(f"Reward: {reward}, Done: {done}, Info: {info}, Action: {action}, Obs: {obs}")
        # NOTE(review): with render_mode="human" Gymnasium envs usually render inside
        # step(); this explicit call is kept as in the original - confirm it is needed.
        env.render()
finally:
    # Always close the environment so the render window and its resources are
    # released, even if the rollout raises part-way through.
    env.close()

Reward: 8.533719076598244, Done: False, Info: {'platform': 0}, Action: 2, Obs: {'agent_position': array([450. ,  28.5], dtype=float32), 'current_platform': array([300.,   0., 600.,   0.], dtype=float32), 'next_platform': array([679.93414, 150.     , 841.014  , 150.     ], dtype=float32), 'agent_speed': array([28.5])}


  logger.warn(


Reward: 7.406361639391981, Done: False, Info: {'platform': 0}, Action: 2, Obs: {'agent_position': array([450. ,  55.5], dtype=float32), 'current_platform': array([300.,   0., 600.,   0.], dtype=float32), 'next_platform': array([679.93414, 150.     , 841.014  , 150.     ], dtype=float32), 'agent_speed': array([27.])}
Reward: 7.460171361216591, Done: False, Info: {'platform': 0}, Action: 2, Obs: {'agent_position': array([450.,  81.], dtype=float32), 'current_platform': array([300.,   0., 600.,   0.], dtype=float32), 'next_platform': array([679.93414, 150.     , 841.014  , 150.     ], dtype=float32), 'agent_speed': array([25.5])}
Reward: 7.496863291726738, Done: False, Info: {'platform': 0}, Action: 2, Obs: {'agent_position': array([450., 105.], dtype=float32), 'current_platform': array([300.,   0., 600.,   0.], dtype=float32), 'next_platform': array([679.93414, 150.     , 841.014  , 150.     ], dtype=float32), 'agent_speed': array([24.])}
Reward: 7.518454975335233, Done: False, Info: {'p