# DDPG SIMPLE TAG PETTINGZOO

In [1]:
import ray
import time
import numpy as np
import scipy
import torch
import sklearn
import supersuit as ss
import matplotlib.pyplot as plt
import os
import cv2

%matplotlib inline

from ray import tune
from ray.rllib.algorithms.ddpg import DDPG
from ray.rllib.algorithms.ddpg import DDPGConfig
from ray.rllib.env.wrappers.pettingzoo_env import ParallelPettingZooEnv
from pettingzoo.mpe import simple_tag_v3



In [2]:
def env_creator(num_good=2, num_adversaries=4, num_obstacles=2, max_cycles=25,
                stack_size=3, render_mode="rgb_array"):
    """Build the wrapped simple_tag parallel environment used for training and rendering.

    Called without arguments it builds exactly the environment this notebook was
    written for; the keyword arguments only exist so that other scenario sizes can
    be tried without editing the function.

    Args:
        num_good: Number of good agents, forwarded to ``simple_tag_v3.parallel_env``.
        num_adversaries: Number of adversaries, forwarded likewise.
        num_obstacles: Number of obstacles, forwarded likewise.
        max_cycles: Episode length limit, forwarded likewise.
        stack_size: Number of frames handed to ``ss.frame_stack_v1``.
        render_mode: Render mode the environment is created with.

    Returns:
        The environment after observation padding, action-space padding, frame
        stacking and conversion to ``np.float32`` have been applied, in that order.
    """
    env = simple_tag_v3.parallel_env(
        num_good=num_good,
        num_adversaries=num_adversaries,
        num_obstacles=num_obstacles,
        max_cycles=max_cycles,
        continuous_actions=True,
        render_mode=render_mode,
    )
    # Padding is applied before stacking so that every agent ends up with the same
    # space (with the default arguments all six agents report Box(66,)).
    env = ss.pad_observations_v0(env)
    env = ss.pad_action_space_v0(env)
    env = ss.frame_stack_v1(env, stack_size)
    env = ss.dtype_v0(env, np.float32)  # Ensure observations are float32
    return env

# Module-level environment instance; later cells query it for the agents'
# observation and action spaces.
env = env_creator()

# ignore_reinit_error=True keeps this cell re-runnable in a live kernel: without
# it a second ray.init() call raises because Ray is already initialised.
ray.init(ignore_reinit_error=True)

2024-11-11 08:37:12,141	INFO worker.py:1673 -- Started a local Ray instance.


0,1
Python version:,3.10.13
Ray version:,2.8.0


In [3]:
# Declared observation space of every agent.
print("Observation Spaces:")
for agent in env.possible_agents:
    print(f"{agent}: {env.observation_space(agent)}")

# reset() returns a tuple; its first element maps agent id -> initial observation.
print("\nActual Observations:")
observations = env.reset()
initial_observations = observations[0]
for agent, obs in initial_observations.items():
    print(f"{agent}: shape={obs.shape}, dtype={obs.dtype}, min={obs.min()}, max={obs.max()}")

Observation Spaces:
adversary_0: Box(-inf, inf, (66,), float32)
adversary_1: Box(-inf, inf, (66,), float32)
adversary_2: Box(-inf, inf, (66,), float32)
adversary_3: Box(-inf, inf, (66,), float32)
agent_0: Box(-inf, inf, (66,), float32)
agent_1: Box(-inf, inf, (66,), float32)

Actual Observations:
adversary_0: shape=(66,), dtype=float32, min=-1.3043397665023804, max=0.691358208656311
adversary_1: shape=(66,), dtype=float32, min=-1.670365333557129, max=0.970991313457489
adversary_2: shape=(66,), dtype=float32, min=-0.505263090133667, max=1.165102243423462
adversary_3: shape=(66,), dtype=float32, min=-0.49872809648513794, max=1.4474895000457764
agent_0: shape=(66,), dtype=float32, min=-0.8476895689964294, max=1.5410290956497192
agent_1: shape=(66,), dtype=float32, min=-1.1252126693725586, max=0.5451526641845703


In [4]:
# Compact view: one line per agent with its declared observation space.
for agent_id in env.possible_agents:
    agent_space = env.observation_space(agent_id)
    print(agent_id, agent_space)

adversary_0 Box(-inf, inf, (66,), float32)
adversary_1 Box(-inf, inf, (66,), float32)
adversary_2 Box(-inf, inf, (66,), float32)
adversary_3 Box(-inf, inf, (66,), float32)
agent_0 Box(-inf, inf, (66,), float32)
agent_1 Box(-inf, inf, (66,), float32)


In [5]:
env_name = "simple_tag"


def _make_rllib_env(config):
    """Environment factory registered with Tune; the passed config is not used."""
    return ParallelPettingZooEnv(env_creator())


# Register the factory under `env_name` so the algorithm config can refer to it by name.
tune.register_env(env_name, _make_rllib_env)

In [6]:
# One policy per agent, keyed by agent id and built from that agent's own spaces.
policies = {
    agent: (None, env.observation_space(agent), env.action_space(agent), {})
    for agent in env.possible_agents
}

# Hyperparameters forwarded verbatim to DDPGConfig.training().
training_params = dict(
    actor_lr=1e-4,
    critic_lr=1e-3,
    tau=0.01,
    gamma=0.95,
    train_batch_size=1024,
    actor_hiddens=[64, 64],
    critic_hiddens=[64, 64],
    n_step=3,
)

config = (
    DDPGConfig()
    .environment(env=env_name)
    .framework("torch")
    .rollouts(num_rollout_workers=4)
    .training(**training_params)
    .multi_agent(
        policies=policies,
        # Every agent is mapped to the policy that carries its own id.
        policy_mapping_fn=lambda agent_id, *args, **kwargs: agent_id,
    )
)

## Training

In [None]:
# Stopping criteria handed to Tune; the run ends as soon as any one of them is met.
stop = {
    "training_iteration": 500,
    "timesteps_total": 2000000,
    "episode_reward_mean": 200,
}

# Store results relative to the working directory instead of a user-specific
# absolute path, so the notebook also runs on other machines.
# NOTE(review): assumes the notebook is launched from the directory that should
# contain `results/` (the checkpoint path used for visualization is `./results/...`).
results_dir = os.path.abspath("./results")

results = tune.run(
    "DDPG",
    config=config.to_dict(),
    stop=stop,
    checkpoint_freq=1,
    checkpoint_at_end=True,
    local_dir=results_dir,
    verbose=1,
)

# Get the best trial (highest episode_reward_mean in its last reported result)
best_trial = results.get_best_trial("episode_reward_mean", "max", "last")
if best_trial is None:
    # get_best_trial yields None when no trial reported the metric (e.g. the run
    # failed before the first result); avoid an AttributeError in that case.
    print("No trial reported 'episode_reward_mean'; nothing to summarize.")
else:
    print(f"Best trial config: {best_trial.config}")
    print(f"Best trial final validation reward: {best_trial.last_result['episode_reward_mean']}")

In [None]:
# Record the versions of the core libraries this notebook was run with.
version_lines = [
    f"Ray version: {ray.__version__}",
    f"NumPy version: {np.__version__}",
    f"SciPy version: {scipy.__version__}",
    f"PyTorch version: {torch.__version__}",
]
print("\n".join(version_lines))

## Visualization

In [7]:
frame_dir = './outputs/saved_frames'
video_dir = "./outputs/saved_video"

# Create each directory independently. exist_ok=True makes the cell safe to
# re-run and covers the case where only one of the two directories exists
# (previously video_dir was created only when frame_dir was missing).
for output_dir in (frame_dir, video_dir):
    os.makedirs(output_dir, exist_ok=True)

In [8]:
def load_checkpoint(checkpoint_path):
    """Return a DDPG algorithm whose state was restored from `checkpoint_path`.

    The algorithm is built from the notebook-level `config` and then restored
    from the given checkpoint.
    """
    restored_algo = DDPG(config=config)
    restored_algo.restore(checkpoint_path)
    return restored_algo

In [9]:
def save_rendered_frame(frame, frame_count):
    """Save one rendered RGB frame as ``frame_<frame_count>.png`` inside `frame_dir`.

    Args:
        frame: RGB image array as returned by the environment's render call.
        frame_count: Index used in the output file name.

    Raises:
        OSError: If OpenCV reports that the image could not be written.
    """
    # Convert frame from RGB to BGR for OpenCV compatibility
    bgr_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

    frame_file = f'{frame_dir}/frame_{frame_count}.png'
    # cv2.imwrite signals failure through its return value instead of raising, so
    # check it; otherwise a missing directory would silently drop every frame.
    if not cv2.imwrite(frame_file, bgr_frame):
        raise OSError(f"Could not write frame to {frame_file}")

In [10]:
def save_frame(obs, frame_count):
    """Write every agent's observation to `frame_dir` as an image file.

    Args:
        obs: Mapping from agent id to a NumPy observation array.
        frame_count: Index used in the output file names.

    Returns:
        ``frame_count + 1``, i.e. the index to use for the next call.
    """
    for agent, observation in obs.items():
        frame = observation

        # Ensure the observation is in the correct format (HxWxC)
        if frame.ndim == 3 and frame.shape[0] == 3:  # If CxHxW format
            frame = frame.transpose(1, 2, 0)

        # Normalize and convert to uint8 if necessary. Values are scaled as if they
        # were in [0, 1] and then saturated, because casting out-of-range floats
        # straight to uint8 wraps around instead of clamping.
        if frame.dtype != np.uint8:
            frame = np.clip(frame * 255, 0, 255).astype(np.uint8)

        # Swap RGB -> BGR for OpenCV, but only for genuine HxWx3 images: a 1-D or
        # 2-D array whose last axis happens to have length 3 is not a colour image.
        if frame.ndim == 3 and frame.shape[-1] == 3:
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

        frame_file = f'{frame_dir}/frame_{frame_count}_{agent}.png'
        cv2.imwrite(frame_file, frame)

    return frame_count + 1

In [11]:
def render_environment(env, algo, num_episodes=1, video_filename="output_video.mp4", fps=10):
    """Roll out `algo` in `env`, saving every rendered frame as a PNG and as an MP4 video.

    Args:
        env: Multi-agent environment whose ``reset()`` returns ``(obs, info)``, whose
            ``step(actions)`` returns ``(obs, rewards, terminated, truncated, info)``
            with dicts keyed by agent id, and whose ``render(mode=...)`` returns an
            RGB frame or ``None``. The environment is closed before returning,
            also when an error occurs.
        algo: Object exposing ``compute_single_action(observation, policy_id=...)``;
            each agent id is used as its own policy id.
        num_episodes: Number of episodes to roll out.
        video_filename: File name of the video, created inside `video_dir`.
        fps: Frame rate of the written video.

    Raises:
        RuntimeError: If the video file cannot be opened for writing.
    """
    frame_count = 0

    # Created lazily: the frame size is only known once the first frame is rendered.
    video_writer = None

    try:
        for episode in range(num_episodes):
            obs, _ = env.reset()
            done = {agent: False for agent in obs.keys()}
            episode_reward = 0
            step = 0

            while not all(done.values()):
                actions = {agent: algo.compute_single_action(obs[agent], policy_id=agent) for agent in obs.keys()}
                obs, rewards, terminated, truncated, _ = env.step(actions)
                done = {agent: terminated[agent] or truncated[agent] for agent in obs.keys()}
                episode_reward += sum(rewards.values())

                # Capture and save the rendered frame
                rendered_frame = env.render(mode='rgb_array')  # Get pixel array from render()

                if rendered_frame is not None:
                    # Save individual frames as images
                    save_rendered_frame(rendered_frame, frame_count)

                    # Initialize VideoWriter if it's not already initialized
                    if video_writer is None:
                        height, width, _layers = rendered_frame.shape
                        fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for MP4
                        video_path = f"{video_dir}/{video_filename}"
                        video_writer = cv2.VideoWriter(video_path, fourcc, fps, (width, height))
                        # A writer that failed to open drops every frame silently,
                        # so fail loudly instead of reporting a video that is not there.
                        if not video_writer.isOpened():
                            raise RuntimeError(f"Could not open {video_path} for writing")

                    # Write the frame to the video file (OpenCV expects BGR)
                    video_writer.write(cv2.cvtColor(rendered_frame, cv2.COLOR_RGB2BGR))

                    frame_count += 1

                time.sleep(0.1)
                step += 1

            print(f"Episode {episode + 1} finished after {step} steps. Total reward: {episode_reward}")
    finally:
        # Release resources even if the rollout fails part-way, so the video file
        # is finalized and the environment does not leak.
        try:
            if video_writer is not None:
                video_writer.release()
        finally:
            env.close()

    # Only report outputs that were actually produced.
    if frame_count:
        print(f"Frames saved in {frame_dir}")
        print(f"Video saved as {video_filename}")
    else:
        print("No frames were rendered; nothing was saved.")

In [12]:
class CustomParallelPettingZooEnv(ParallelPettingZooEnv):
    """ParallelPettingZooEnv whose ``render()`` accepts a ``mode`` argument.

    ``render_environment`` calls ``env.render(mode='rgb_array')``; this subclass
    accepts that keyword and delegates the call to the wrapped environment.
    """

    def __init__(self, env):
        super().__init__(env)
        self.env = env  # Keep a direct reference to the wrapped environment for render()

    def render(self, mode='human'):
        # `mode` is accepted for call-site compatibility only; it is NOT forwarded,
        # the wrapped environment's render() is called without arguments.
        # NOTE(review): presumably the output format is fixed by the render_mode the
        # environment was created with — confirm.
        return self.env.render()

In [13]:
# Checkpoint directory (written by the training run) that should be visualized.
checkpoint_path = "./results/DDPG_simple_tag_e2375_00000_0_2024-11-10_20-40-08/checkpoint_000015/"

# Restore the trained algorithm, then build a fresh environment that is used
# only for rendering.
loaded_algo = load_checkpoint(checkpoint_path)
render_env = CustomParallelPettingZooEnv(env_creator())

# Roll out the restored policies; frames and the video are written to disk.
render_environment(env=render_env, algo=loaded_algo)

`UnifiedLogger` will be removed in Ray 2.7.
  return UnifiedLogger(config, logdir, loggers=None)
The `JsonLogger interface is deprecated in favor of the `ray.tune.json.JsonLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `CSVLogger interface is deprecated in favor of the `ray.tune.csv.CSVLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
The `TBXLogger interface is deprecated in favor of the `ray.tune.tensorboardx.TBXLoggerCallback` interface and will be removed in Ray 2.7.
  self._loggers.append(cls(self.config, self.logdir, self.trial))
2024-11-10 18:00:33,409	INFO trainable.py:585 -- Restored on 10.145.37.19 from checkpoint: Checkpoint(filesystem=local, path=./results/DDPG_2024-11-10_14-38-02/DDPG_simple_tag_4c77a_00000_0_2024-11-10_14-38-02/checkpoint_000015/)




Episode 1 finished after 25 steps. Total reward: 90.0
Episode 2 finished after 25 steps. Total reward: -0.23013443673187584
Episode 3 finished after 25 steps. Total reward: -2.347127912987083
Episode 4 finished after 25 steps. Total reward: -16.125085627463154
Episode 5 finished after 25 steps. Total reward: 60.0
Frames saved in ./outputs/saved_frames
Video saved as output_video.mp4
