In [None]:
# Start the humanoid simulation.
# NOTE(review): `humanoidisaacsim_0` is not defined in the visible cells of this
# notebook — presumably it is provided by the GRID session; confirm before
# running on a fresh kernel.
humanoidisaacsim_0.run()

# Import necessary utilities for velocity control, logging, and timing
from grid.utils.types import Velocity
from grid.utils import log
import time

In [None]:
# Rotate the robot to get a better view of the scene, while capturing and
# logging RGB and depth images for a fixed duration.
#
# The loop is wrapped in try/finally so the stop command is always sent, even
# if an image call raises or the cell is interrupted from the notebook UI.
# Without it the robot would keep rotating at the last commanded velocity.

ROTATE_DURATION_S = 5    # total time to keep rotating and capturing, in seconds
CAPTURE_PERIOD_S = 0.2   # pause between consecutive captures, in seconds
YAW_RATE = -2            # Z component of the second Velocity passed to moveByVelocity

# time.monotonic() is used for elapsed-time measurement because, unlike
# time.time(), it cannot jump when the system clock is adjusted.
start_time = time.monotonic()
try:
    while time.monotonic() - start_time < ROTATE_DURATION_S:
        # Turn the robot by applying an angular velocity (rotating about the Z-axis)
        humanoidisaacsim_0.locomotion.moveByVelocity(Velocity(0, 0, 0), Velocity(0, 0, YAW_RATE))

        # Retrieve the RGB and Depth images from the simulation
        rgb_img = humanoidisaacsim_0.locomotion.getImage()
        depth_img = humanoidisaacsim_0.locomotion.getImage("camera_depth_0")

        # Log the RGB image if available
        if rgb_img.data is not None:
            log("rgb_img", rgb_img)
        else:
            print("RGB image is none")

        # Log the depth image if available; scale the depth data for visualization purposes
        if depth_img.data is not None:
            depth_img.data = depth_img.data * 255
            log("depth_img", depth_img)
        else:
            print("Depth image is none")

        time.sleep(CAPTURE_PERIOD_S)
finally:
    # Stop the robot's movement, whether the loop completed or was aborted
    humanoidisaacsim_0.locomotion.moveByVelocity(Velocity(0, 0, 0), Velocity(0, 0, 0))

In [None]:
# Import and initialize perception models for segmentation, depth estimation, and vision-language processing.

from grid.model.perception.segmentation.oneformer import OneFormer 
from grid.model.perception.depth.depth_anything_v2 import DepthAnything_V2 
from grid.model.perception.vlm.llava import LLaVA

# Create one instance of each perception model imported above:
# segmentation (OneFormer), depth estimation (DepthAnything_V2) and
# vision-language (LLaVA).
seg_model = OneFormer()
depth_model = DepthAnything_V2()
vlm_model = LLaVA()

# Retrieve the latest RGB image from the simulation for processing.
# `rgb` is a new capture, distinct from the `rgb_img` variable assigned inside
# the rotate-and-capture loop.
# NOTE(review): this calls getImage() directly on the robot object, whereas the
# capture loop calls humanoidisaacsim_0.locomotion.getImage() — confirm that
# both call paths are valid and return the same kind of image object.
rgb = humanoidisaacsim_0.getImage()

# Import rerun for logging the perception outputs
import rerun as rr

In [None]:
# Run the perception models on `rgb`, the frame fetched right after the models
# were created. Using `rgb` (rather than `rgb_img`, which is only a leftover
# from the rotate-and-capture loop) keeps this cell independent of how far
# that loop got and ensures the models see the most recent view.
frame = rgb.data
if frame is None:
    # The simulator can return an image object without data; fail early with a
    # clear message instead of passing None into the models.
    raise RuntimeError("RGB image is none; re-run the previous cell to capture a new frame")

# Run the segmentation model with a "panoptic" prompt and log the result.
seg = seg_model.run(frame, "panoptic")
rr.log("humanoid/segmentation", rr.SegmentationImage(seg))

# Run the depth estimation model and log the result.
depth = depth_model.run(frame)
rr.log("humanoid/depth", rr.DepthImage(depth))

# Run the vision-language model to generate a caption describing the scene.
caption = vlm_model.run(frame, "Describe what you see.")
print(caption)

# You can modify the caption prompt, or ask a question, and rerun this cell to see updated results.