In [4]:
import itertools
import gym
from stable_baselines3 import PPO
import env.custom_hopper
from stable_baselines3.common.evaluation import evaluate_policy

def grid_search_ppo(env_name, total_timesteps, param_grid, n_eval_episodes=1000, seed=0):
    """Train and evaluate one PPO model per hyperparameter combination.

    Every combination in the Cartesian product of ``param_grid`` is trained
    for ``total_timesteps`` steps on a freshly created environment and then
    scored with ``evaluate_policy`` on that same environment.

    Args:
        env_name: Environment id passed to ``gym.make``.
        total_timesteps: Training budget passed to ``model.learn`` for each
            combination.
        param_grid: Mapping from a PPO keyword-argument name (for example
            ``'learning_rate'`` or ``'n_steps'``) to the list of values to
            try. Each combination is forwarded to the ``PPO`` constructor as
            keyword arguments, so a key that ``PPO`` does not accept raises
            ``TypeError`` instead of being silently ignored.
        n_eval_episodes: Number of evaluation episodes per combination. The
            default is deliberately large (1000) so that the reported mean
            reward is as reliable as possible.
        seed: Seed applied to both the environment and the PPO model so that
            runs are reproducible. A ``'seed'`` entry in ``param_grid``
            takes precedence over this value.

    Returns:
        A tuple ``(best_params, best_mean_reward, all_results)`` where
        ``all_results`` is a list of ``(params, mean_reward, std_reward)``
        tuples in evaluation order. When there is nothing to evaluate (the
        grid is empty, or one of its value lists is empty) the result is
        ``(None, -inf, [])``.
    """
    best_mean_reward = -float('inf')
    best_params = None
    all_results = []

    # An empty grid describes no search at all; report "nothing evaluated"
    # rather than failing while unpacking the (non-existent) keys and values.
    if not param_grid:
        return best_params, best_mean_reward, all_results

    keys = list(param_grid)

    # Stream the Cartesian product; the last key in the grid varies fastest.
    for combo in itertools.product(*param_grid.values()):
        params = dict(zip(keys, combo))
        print(f"Testing combination: {params}")

        # Grid entries come last so they can override the fixed defaults.
        ppo_kwargs = {"verbose": 0, "seed": seed, **params}

        # Create the environment and seed it consistently with the model.
        env = gym.make(env_name)
        env.seed(ppo_kwargs["seed"])

        try:
            # Create the PPO model with the current set of hyperparameters
            model = PPO("MlpPolicy", env, **ppo_kwargs)

            # Train the model
            model.learn(total_timesteps=total_timesteps)

            # Evaluate the model
            mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=n_eval_episodes)
            print(f"Mean reward: {mean_reward} +/- {std_reward}")

            # Save the result
            all_results.append((params, mean_reward, std_reward))

            # Update best parameters if this is the best mean reward so far
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                best_params = params

        finally:
            # Always release the environment, even if training or evaluation fails.
            env.close()

    return best_params, best_mean_reward, all_results

# Candidate values for each PPO hyperparameter explored by the grid search.
param_grid = dict(
    learning_rate=[1e-4, 3e-4, 1e-3],
    n_steps=[2048, 4096],
    batch_size=[64, 128],
    gamma=[0.99, 0.995],
    clip_range=[0.2, 0.3],
)

# Search every combination on the 'CustomHopper-target-v0' environment,
# training each candidate for 100k timesteps, then report the winner.
best_params, best_mean_reward, all_results = grid_search_ppo(
    'CustomHopper-target-v0',
    total_timesteps=100_000,
    param_grid=param_grid,
)
print("Best hyperparameters:", best_params)
print("Best mean reward:", best_mean_reward)


Testing combination: {'learning_rate': 0.0001, 'n_steps': 2048, 'batch_size': 64, 'gamma': 0.99, 'clip_range': 0.2}




Mean reward: 283.9247493470907 +/- 1.4900058121130269
Testing combination: {'learning_rate': 0.0001, 'n_steps': 2048, 'batch_size': 64, 'gamma': 0.99, 'clip_range': 0.3}
Mean reward: 208.5073460499048 +/- 1.3638309364741126
Testing combination: {'learning_rate': 0.0001, 'n_steps': 2048, 'batch_size': 64, 'gamma': 0.995, 'clip_range': 0.2}
Mean reward: 309.7309877593517 +/- 1.4228187268329222
Testing combination: {'learning_rate': 0.0001, 'n_steps': 2048, 'batch_size': 64, 'gamma': 0.995, 'clip_range': 0.3}
Mean reward: 198.35272253763677 +/- 1.3115714345817509
Testing combination: {'learning_rate': 0.0001, 'n_steps': 2048, 'batch_size': 128, 'gamma': 0.99, 'clip_range': 0.2}
Mean reward: 186.63160265135764 +/- 1.7858755947531058
Testing combination: {'learning_rate': 0.0001, 'n_steps': 2048, 'batch_size': 128, 'gamma': 0.99, 'clip_range': 0.3}
Mean reward: 170.29410685443878 +/- 1.6791457169127826
Testing combination: {'learning_rate': 0.0001, 'n_steps': 2048, 'batch_size': 128, 'gamma': 0.995, 'clip_range': 0.2}

KeyboardInterrupt: 