In [None]:
import ray
from ray.rllib.algorithms.ppo import PPO, PPOConfig
from ray import tune
import pprint

In [None]:
# Tear down any Ray runtime left over from a previous run of this cell, then
# start a fresh one. `ignore_reinit_error=True` keeps `ray.init` from raising
# if a runtime is somehow still attached, so the cell is safe to re-run.
ray.shutdown()
ray.init(ignore_reinit_error=True)

In [None]:
# Assemble the PPO experiment configuration as a single fluent chain; every
# builder call hands back the config so the steps can be strung together.
# `lr` is a Tune grid-search placeholder (two candidate learning rates), so
# this config is meant to be expanded by Tune rather than built directly.
config = (
    PPOConfig()
    .training(gamma=0.9, lr=tune.grid_search([5e-05, 5e-08]), kl_coeff=0.3)
    .resources(num_gpus=0)
    .rollouts(num_rollout_workers=3)
    .environment(env='BipedalWalker-v3')
    .framework(framework='tf')
)

# Dump the fully expanded settings for inspection.
pprint.pprint(config.to_dict())

In [None]:
# Stopping criterion handed to Tune: a trial ends once its reported
# `timesteps_total` reaches this budget.
stop = dict(timesteps_total=500000)

In [None]:
# Launch the PPO grid search (one trial per `lr` candidate) and keep the
# resulting ExperimentAnalysis for the cells below.
#
# `checkpoint_freq` is counted in training iterations, not timesteps. The
# previous value of 1000 could never trigger: assuming PPO's default train
# batch of 4000 timesteps (it is not overridden in the config cell), the
# 500k-timestep budget is only ~125 iterations, so the only checkpoint was
# the one written by `checkpoint_at_end`. Checkpointing every 10 iterations
# gives `get_best_checkpoint` a real set of candidates to choose from.
analysis = tune.run(
    'PPO',
    config=config,
    stop=stop,
    checkpoint_at_end=True,
    checkpoint_freq=10,
)

In [None]:
# All trials produced by the grid search, in the order Tune reports them.
trials = analysis.trials

# Show every trial's experiment tag on one line. Iterating instead of
# indexing [0] and [1] keeps this cell working if the grid grows or shrinks
# (the output for the current two-trial grid is unchanged).
print(*(t.experiment_tag for t in trials))

In [None]:
# Work with the first trial of the grid search and ask Tune for the
# checkpoint of that trial whose recorded mean episode reward is highest.
trial = trials[0]
best_checkpoint = analysis.get_best_checkpoint(
    trial,
    metric='episode_reward_mean',
    mode='max',
)
print(f'The best checkpoint: {trial.experiment_tag}: {best_checkpoint}')

In [None]:
# The config still carries the grid-search placeholder for `lr`, which only
# Tune can expand; replace it with a concrete value before building a
# standalone algorithm. 5e-05 is the first grid value — presumably the one
# used by trials[0], whose checkpoint is restored below (TODO confirm against
# the printed experiment tags).
config = config.training(lr=5e-05)
new_trainer = PPO(config=config)

# Load the trained state from the checkpoint selected in the previous cell.
new_trainer.restore(best_checkpoint)
# Bare expression: display the restored algorithm object as the cell output.
new_trainer

In [None]:
import gymnasium as gym

# Roll out the restored policy in a rendered BipedalWalker environment for
# 1000 steps, starting a new episode whenever the current one ends.
env = gym.make("BipedalWalker-v3", render_mode="human")

try:
    # Gymnasium's reset() returns an (observation, info) pair. A single reset
    # is enough: the earlier extra resets were redundant, and one of them
    # bound the whole tuple to `observation`.
    observation, info = env.reset()

    for _ in range(1000):
        action = new_trainer.compute_single_action(observation)
        observation, reward, terminated, truncated, info = env.step(action)
        print(observation)

        # An episode ends either by reaching a terminal state or by being
        # truncated (e.g. a time limit); both require a fresh reset.
        if terminated or truncated:
            observation, info = env.reset()
finally:
    # Always release the render window, even if the rollout raises.
    env.close()