In [None]:
%pip install gym pyvirtualdisplay > /dev/null 2>&1
!apt-get install -y xvfb python-opengl ffmpeg > /dev/null 2>&1
!apt-get update > /dev/null 2>&1
!apt-get install cmake > /dev/null 2>&1
%pip install --upgrade setuptools 2>&1
%pip install ez_setup > /dev/null 2>&1
%pip install gym[classic-control] > /dev/null 2>&1
%pip install gymnasium
%pip install stable_baselines3
%pip install shimmy

In [None]:
import gymnasium as gym
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from stable_baselines3 import DDPG
from stable_baselines3.common.noise import NormalActionNoise
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.ddpg.policies import MlpPolicy
from gym import logger as gymlogger
from gym.wrappers.record_video import RecordVideo
gymlogger.set_level(40) #error only
import tensorflow as tf
import random
import math
import glob
import io
import base64
from IPython.display import HTML
from IPython import display as ipythondisplay
from stable_baselines3.common.callbacks import BaseCallback
import statistics
import itertools

In [8]:
# Instantiate the Pendulum-v1 environment, requesting the "rgb_array" render mode.
env = gym.make(
    "Pendulum-v1",
    render_mode="rgb_array",
)

In [9]:
# Number of action components, taken from the last axis of the action space's shape.
n_actions = env.action_space.shape[-1]

# Normal action noise: mean 0 and sigma 0.1 for every action component.
noise_mean = np.zeros(n_actions)
noise_sigma = 0.1 * np.ones(n_actions)
action_noise = NormalActionNoise(mean=noise_mean, sigma=noise_sigma)

In [None]:
# DDPG agent on `env` with the MLP policy and the action noise configured
# earlier in the notebook; verbose=1.
model = DDPG(
    MlpPolicy,
    env,
    action_noise=action_noise,
    verbose=1,
)

In [11]:
class RewardLoggerCallback(BaseCallback):
    """Callback that records the summed reward of every finished episode.

    On each step it reads ``self.locals["rewards"]`` and
    ``self.locals["dones"]``. Stable-Baselines3 runs environments through a
    VecEnv, so both are expected to be arrays with one entry per
    sub-environment (NOTE(review): confirm for the SB3 version in use).
    Episodes are tracked per sub-environment, so the callback also works when
    more than one environment runs in parallel.

    Attributes:
        rewards: per-step reward arrays still needed by at least one
            unfinished episode (emptied at episode end when there is a single
            environment).
        cumulative_rewards: one summed reward per finished episode, in the
            order the episodes ended.
    """

    def __init__(self, verbose=0):
        super().__init__(verbose)
        self.rewards = []
        self.cumulative_rewards = []
        # Index into ``self.rewards`` where each sub-environment's current
        # episode starts; sized lazily on the first step.
        self._episode_starts = None

    def _on_step(self) -> bool:
        """Buffer this step's rewards and log returns of episodes that ended.

        Returns:
            Always ``True``, so the callback never requests an early stop.
        """
        step_rewards = np.atleast_1d(self.locals["rewards"])
        step_dones = np.atleast_1d(self.locals["dones"])
        self.rewards.append(step_rewards)

        if self._episode_starts is None:
            self._episode_starts = np.zeros(step_dones.shape[0], dtype=int)

        # Testing the array's truth value directly raises ValueError as soon
        # as there is more than one sub-environment, so look up the finished
        # indices explicitly instead.
        finished = np.flatnonzero(step_dones)
        if finished.size:
            history = np.stack(self.rewards)  # (buffered steps, n_envs)
            for env_idx in finished:
                start = self._episode_starts[env_idx]
                self.cumulative_rewards.append(np.sum(history[start:, env_idx]))

            # The next episode of every finished environment begins after the
            # step that was just buffered.
            self._episode_starts[finished] = len(self.rewards)

            # Drop buffered steps that no running episode still needs.
            keep_from = int(self._episode_starts.min())
            self.rewards = self.rewards[keep_from:]
            self._episode_starts -= keep_from
        return True

reward_logger = RewardLoggerCallback()

In [19]:
# Hyperparameter grid: every combination of the values below is trained and
# evaluated once.
learning_rates = [0.001, 0.01, 0.1]
buffer_sizes = [1000, 10000, 100000]
batch_sizes = [32, 64, 128]
tau_values = [0.001, 0.01]
gamma_values = [0.5, 0.99]

# Per-configuration budget.
TRAIN_TIMESTEPS = 10000
N_EVAL_EPISODES = 20

# Column layout of the results table (one row per evaluation episode).
columns = ['configuration ID', 'learning_rate', 'buffer', 'batch', 'tau', 'gamma', 'Reward']

# Per-configuration frames are collected here and concatenated once after the
# loop. Growing a DataFrame with pd.concat on every iteration is quadratic,
# and starting from an empty frame degrades column dtypes to object.
config_frames = []

configuration_id = 0
for learning_rate, buffer_size, batch_size, tau, gamma in itertools.product(
    learning_rates, buffer_sizes, batch_sizes, tau_values, gamma_values):

    # Train a fresh model for this configuration.
    model = DDPG(
        MlpPolicy,
        env,
        action_noise=action_noise,
        learning_rate=learning_rate,
        verbose=0,
        batch_size=batch_size,
        buffer_size=buffer_size,
        tau=tau,
        gamma=gamma,
    )
    model.learn(total_timesteps=TRAIN_TIMESTEPS)

    # With return_episode_rewards=True the first element of the result is the
    # list of per-episode rewards.
    rewards_list = evaluate_policy(
        model,
        model.get_env(),
        n_eval_episodes=N_EVAL_EPISODES,
        return_episode_rewards=True,
    )[0]

    # One row per evaluation episode; the scalar hyperparameters are
    # broadcast to the length of rewards_list.
    temp_df = pd.DataFrame(
        {
            'configuration ID': configuration_id,
            'learning_rate': learning_rate,
            'buffer': buffer_size,
            'batch': batch_size,
            'tau': tau,
            'gamma': gamma,
            'Reward': rewards_list,
        },
        columns=columns,
    )
    config_frames.append(temp_df)

    configuration_id += 1

# Build the final table in a single concat; an empty grid still yields an
# empty frame with the expected columns.
if config_frames:
    results_df = pd.concat(config_frames, ignore_index=True)
else:
    results_df = pd.DataFrame(columns=columns)

# Show the results and persist them to CSV.
print(results_df)
results_df.to_csv('evaluation_results.csv', index=False)

     configuration ID  learning_rate  buffer batch    tau  gamma       Reward
0                   0          0.001    1000    32  0.001   0.50 -1352.338412
1                   0          0.001    1000    32  0.001   0.50 -1381.263307
2                   0          0.001    1000    32  0.001   0.50 -1383.176071
3                   0          0.001    1000    32  0.001   0.50 -1494.011277
4                   0          0.001    1000    32  0.001   0.50 -1502.690619
...               ...            ...     ...   ...    ...    ...          ...
2155              107          0.100  100000   128  0.010   0.99 -1081.014297
2156              107          0.100  100000   128  0.010   0.99 -1672.557162
2157              107          0.100  100000   128  0.010   0.99 -1608.784389
2158              107          0.100  100000   128  0.010   0.99 -1631.250004
2159              107          0.100  100000   128  0.010   0.99 -1391.510157

[2160 rows x 7 columns]
