In [1]:
import h5py
import numpy as np
import argparse
import os
import cv2
from pathlib import Path

import natsort

## Config

In [3]:
parent_dir = Path("../ALOHATaskCompressedData/")  # ← CHANGE THIS

In [4]:
for dataset_folder in sorted(parent_dir.iterdir()):
    print(dataset_folder)

    # Using natsort to naturally sort the episode files
    sorted_files = natsort.natsorted(dataset_folder.glob("episode*.hdf5"))

    for hdf5_file in sorted_files:
        print(hdf5_file)
        break

..\ALOHATaskCompressedData\apply_tape_closed_box
..\ALOHATaskCompressedData\apply_tape_closed_box\episode_0.hdf5
..\ALOHATaskCompressedData\pick_scraper_from_rack
..\ALOHATaskCompressedData\pick_scraper_from_rack\episode_0.hdf5
..\ALOHATaskCompressedData\usb_mark_003
..\ALOHATaskCompressedData\usb_mark_003\episode_0.hdf5


## Helper Function

In [6]:
# Function to handle datasets inside the HDF5 file
def extract_data(f, arrays):
    for name, obj in f.items():  # Iterate over items in the root group
        if isinstance(obj, h5py.Dataset):  # If it's a dataset (not a group)
            array = obj[()]  # Extract the data from the dataset
            arrays[name] = array  # Store it in the arrays dictionary
        elif isinstance(obj, h5py.Group):  # If it's a group, recurse into it
            for sub_name, sub_obj in obj.items():
                if isinstance(sub_obj, h5py.Dataset):
                    array = sub_obj[()]
                    arrays[f"{name}/{sub_name}"] = array  # Store the data with full path as key

In [7]:
def extract_data(f, arrays):
    # Iterate over items in the root group (top-level groups and datasets)
    for name, obj in f.items():
        if isinstance(obj, h5py.Dataset):  # If it's a dataset (not a group)
            # Extract data from the dataset
            array = obj[()]
            arrays[name] = array  # Store it in the arrays dictionary
        elif isinstance(obj, h5py.Group):  # If it's a group, recurse into it
            for sub_name, sub_obj in obj.items():
                if isinstance(sub_obj, h5py.Dataset):  # If it's a dataset inside the group
                    # Extract data from the sub-group dataset
                    array = sub_obj[()]
                    # Use full path (group name + dataset name) as the key in the dictionary
                    full_name = f"{name}/{sub_name}"
                    arrays[full_name] = array  # Store the dataset with full path as key
                elif isinstance(sub_obj, h5py.Group):  # If it's a subgroup, recurse further
                    # Recursively extract data from nested groups
                    extract_data(sub_obj, arrays)

In [8]:
def update_dict_keys(arrays):
    
    old_keys = ['cam_high', 'cam_left_wrist', 'cam_low', 'cam_right_wrist']
    updated_arrays = {}

    for key, value in arrays.items():
        # Check if the current key is in the old_keys list
        if key in old_keys:
            # Update key with the new path
            new_key = f'observations/images/{key}'
            updated_arrays[new_key] = value
        else:
            # Keep the existing key-value pair
            updated_arrays[key] = value

    return updated_arrays

In [19]:
for dataset_folder in sorted(parent_dir.iterdir()):
    # Using natsort to naturally sort the episode files
    sorted_files = natsort.natsorted(dataset_folder.glob("episode*.hdf5"))

    for hdf5_file in sorted_files:
        arrays = {}

        with h5py.File(hdf5_file, 'r') as f:
            extract_data(f, arrays)

        arrays = update_dict_keys(arrays)
        print(arrays.keys())
        break

dict_keys(['action', 'compress_len', 'observations/effort', 'observations/images/cam_high', 'observations/images/cam_left_wrist', 'observations/images/cam_low', 'observations/images/cam_right_wrist', 'observations/qpos', 'observations/qvel'])
dict_keys(['action', 'compress_len', 'observations/effort', 'observations/images/cam_high', 'observations/images/cam_left_wrist', 'observations/images/cam_low', 'observations/images/cam_right_wrist', 'observations/qpos', 'observations/qvel'])
dict_keys(['action', 'compress_len', 'observations/effort', 'observations/images/cam_high', 'observations/images/cam_left_wrist', 'observations/images/cam_low', 'observations/images/cam_right_wrist', 'observations/qpos', 'observations/qvel'])


In [15]:
def save_images_to_video(images, out_path, fps=30, is_depth=False):
    print(images.shape)
    if images.ndim == 4:  # [T, H, W, C]
        h, w = images.shape[1:3]
    else:  # [T, H, W]
        h, w = images.shape[1:3]
        images = np.expand_dims(images, -1)

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    writer = cv2.VideoWriter(str(out_path), fourcc, fps, (w, h), isColor=True)

    for frame in images:
        if is_depth:
            frame = np.squeeze(frame)
            frame = cv2.normalize(frame, None, 0, 255, cv2.NORM_MINMAX)
            frame = frame.astype(np.uint8)
            frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)
        else:
            frame = frame.astype(np.uint8)
        writer.write(frame)

    writer.release()

## Dataset Processing

In [12]:
# Parameters
fps = 30
is_depth = True

In [17]:
# Process each dataset folder
for dataset_folder in sorted(parent_dir.iterdir()):
    if dataset_folder.is_dir():  
        print(f"📦 Processing: {dataset_folder.name}")

        # Create necessary output directories specific to this dataset
        out_dir = Path(f"./rmb_dataset/{dataset_folder.name}")  

        # Process each .h5 file in the dataset folder
        for i, hdf5_file in enumerate(natsort.natsorted(dataset_folder.glob("episode*.hdf5"))):
            print(f"📦 Processing {hdf5_file.name}")
            episode_name = f"episode_{i:06d}.rmb"
            rmb_dir = out_dir / episode_name
            rmb_dir.mkdir(parents=True, exist_ok=True)

            with h5py.File(hdf5_file, 'r') as f:
                arrays = {}
                extract_data(f, arrays)
                arrays = update_dict_keys(arrays)

                obs_keys = [key for key in arrays.keys() if 'observations' in key]
                # print(obs_keys)
        
                # Write non-image data to main.rmb.hdf5
                with h5py.File(rmb_dir / "main.rmb.hdf5", 'w') as out_f:
                    for key, array in arrays.items():
                        if 'image' not in key:
                            if '/' in key:
                                group_name, sub_key = key.split('/', 1)
                                group = out_f.require_group(group_name)
                                group.create_dataset(sub_key, data=array)
                            else:
                                out_f.create_dataset(key, data=array)
        
                # Handle image data
                for key in arrays:
                    if key.startswith("observations/images/"):
                        is_depth = False  # your dataset only has RGB images
                        cam_name = key.split("/")[-1]  # e.g., cam_left_wrist
                        suffix = "rgb_image"
                        video_name = f"{cam_name}_{suffix}.rmb.mp4"
                        print(f"🎞️ Saving video: {video_name} from key: {key}")
                        save_images_to_video(arrays[key], rmb_dir / video_name, fps=fps, is_depth=is_depth)
                        
            print(f"✅ Done: {episode_name}")

📦 Processing: apply_tape_closed_box
📦 Processing episode_0.hdf5
🎞️ Saving video: cam_high_rgb_image.rmb.mp4 from key: observations/images/cam_high
(800, 23091)


ValueError: not enough values to unpack (expected 2, got 1)

---