In [5]:
import numpy as np
import gymnasium as gym
from SAC.replay_buffer import ReplayBuffer
from SAC.SAC import SAC

# Build the Pendulum task; "rgb_array" rendering lets later cells fetch frames via env.render()
env = gym.make("Pendulum-v1", render_mode="rgb_array")

# Sizes of the flat observation and action vectors
obs_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape[0]

# NOTE(review): the second argument is presumably the buffer capacity — confirm
# against ReplayBuffer.from_env.
buffer = ReplayBuffer.from_env(env, 100_000, device="cpu")

# NOTE(review): the five numeric arguments are passed positionally; they look like
# hidden size, learning rate, discount, soft-update rate and entropy coefficient,
# but verify the order against SAC.__init__ before relying on that reading.
sac = SAC(
    buffer,
    obs_dim,
    act_dim,
    128,
    0.001,
    0.99,
    0.01,
    0.2,
    "cpu",
)

# Report the spaces so the scaling applied to actions later can be sanity-checked
print(f"Observation space: {env.observation_space}")
print(f"Action space: {env.action_space}")
print(f"Action space bounds: [{env.action_space.low[0]}, {env.action_space.high[0]}]")

Observation space: Box([-1. -1. -8.], [1. 1. 8.], (3,), float32)
Action space: Box(-2.0, 2.0, (1,), float32)
Action space bounds: [-2.0, 2.0]


In [6]:
# Train the SAC agent on Pendulum-v1, rendering every step in an OpenCV window.
# Press 'q' in the window (or interrupt the kernel) to stop training early.
import cv2

WINDOW_NAME = "Pendulum-v1"
NUM_EPISODES = 1000
MAX_STEPS_PER_EPISODE = 1000
MIN_BUFFER_SIZE = 3000  # threshold handed to buffer.is_ready() before updates begin
SEED = 42


def _mean_or_nan(values):
    """Return the mean of ``values``, or NaN when ``values`` is empty.

    Calling ``np.mean`` on an empty list also yields NaN but emits a
    "Mean of empty slice" RuntimeWarning; that happened on every episode
    until the replay buffer was warm enough for updates to start.
    """
    return np.mean(values) if len(values) > 0 else float("nan")


cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
cv2.resizeWindow(WINDOW_NAME, 500, 500)

total_reward = 0        # return accumulated by the episode currently running
steps_taken = 0         # environment steps executed in the current episode
stop_requested = False  # set when the user presses 'q' in the render window

try:
    for episode in range(NUM_EPISODES):
        print(f"Episode {episode + 1}")

        # Start every episode from a fresh state so a run that hits the step cap
        # without terminating cannot leak into the next one. Only the very first
        # reset is seeded, as before.
        obs, info = env.reset(seed=SEED if episode == 0 else None)
        total_reward = 0
        steps_taken = 0
        episode_actor_loss = []
        episode_critic_loss = []
        episode_alpha_loss = []

        for step in range(MAX_STEPS_PER_EPISODE):
            # Render using OpenCV (env.render() returns an RGB array)
            frame = env.render()
            if frame is not None:
                cv2.imshow(WINDOW_NAME, cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))
                # 'q' must end the whole run, not just the current episode
                if (cv2.waitKey(1) & 0xFF) == ord('q'):
                    stop_requested = True
                    break

            # Sample an action from the SAC agent and scale it by the action bound
            action = sac.act(obs)
            env_action = action * env.action_space.high[0]

            # Take a step in the environment
            next_obs, reward, terminated, truncated, info = env.step(env_action)
            total_reward += reward
            steps_taken = step + 1

            # Only `terminated` is stored as the done flag; `truncated` is a time
            # limit, not a true terminal state.
            buffer.add(obs, action, reward, next_obs, terminated)
            obs = next_obs

            if buffer.is_ready(MIN_BUFFER_SIZE):
                actor_loss, critic_loss, alpha_loss = sac.update()
                episode_actor_loss.append(actor_loss)
                episode_critic_loss.append(critic_loss)
                episode_alpha_loss.append(alpha_loss)

            if terminated or truncated:
                break  # End the episode after termination/truncation

        if stop_requested:
            print("Stopped by user")
            break

        print(
            f"Episode {episode + 1} - "
            f"Actor Loss: {_mean_or_nan(episode_actor_loss):.4f}, "
            f"Critic Loss: {_mean_or_nan(episode_critic_loss):.4f}, "
            f"Alpha Loss: {_mean_or_nan(episode_alpha_loss):.4f} - "
            f"Total Reward: {total_reward:.2f}"
        )
finally:
    # Always release the window, including on KeyboardInterrupt
    cv2.destroyAllWindows()

# Summary of the last episode that ran (complete or interrupted by 'q')
print(f"\nEpisode finished after {steps_taken} steps")
print(f"Total reward: {total_reward:.2f}")

Episode 1


  from pkg_resources import resource_stream, resource_exists
  >>> np.mean(a)
  if is_float16_result:


Episode 1 - Actor Loss: nan, Critic Loss: nan, Alpha Loss: nan - Total Reward: -982.57
Episode 2
Episode 2 - Actor Loss: nan, Critic Loss: nan, Alpha Loss: nan - Total Reward: -1301.88
Episode 3
Episode 3 - Actor Loss: nan, Critic Loss: nan, Alpha Loss: nan - Total Reward: -1571.22
Episode 4
Episode 4 - Actor Loss: nan, Critic Loss: nan, Alpha Loss: nan - Total Reward: -1208.73
Episode 5
Episode 5 - Actor Loss: nan, Critic Loss: nan, Alpha Loss: nan - Total Reward: -1231.18
Episode 6
Episode 6 - Actor Loss: nan, Critic Loss: nan, Alpha Loss: nan - Total Reward: -1042.48
Episode 7
Episode 7 - Actor Loss: nan, Critic Loss: nan, Alpha Loss: nan - Total Reward: -962.43
Episode 8
Episode 8 - Actor Loss: nan, Critic Loss: nan, Alpha Loss: nan - Total Reward: -1053.08
Episode 9
Episode 9 - Actor Loss: nan, Critic Loss: nan, Alpha Loss: nan - Total Reward: -757.49
Episode 10
Episode 10 - Actor Loss: nan, Critic Loss: nan, Alpha Loss: nan - Total Reward: -1319.59
Episode 11
Episode 11 - Actor L

KeyboardInterrupt: 

In [None]:
# Clean up: close the environment created by gym.make() in the setup cell.
env.close()