In [None]:
import os
import h5py
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import json
import cv2
from pathlib import Path

import natsort

## Config

In [None]:
# Root directory that is scanned for dataset sub-folders; each sub-folder is
# searched for `episode*.hdf5` files by the cells below.
parent_dir = Path("../ALOHATaskCompressedData/")  # ← CHANGE THIS

In [None]:
# Quick sanity check: list every entry under `parent_dir` together with the
# first episode file (in natural order) found inside it.
for dataset_folder in sorted(parent_dir.iterdir()):
    print(dataset_folder)

    # Natural sorting keeps episode_2 ahead of episode_10.
    sorted_files = natsort.natsorted(dataset_folder.glob("episode*.hdf5"))

    # Only the first match is shown; entries without episodes print nothing more.
    for hdf5_file in sorted_files[:1]:
        print(hdf5_file)

In [None]:
## CONSTANTS
# Camera streams whose decoded frames are flattened and appended to the
# observation vector by the dataset-processing cell.
CAMERA_NAMES = ["cam_high", "cam_left_wrist", "cam_low", "cam_right_wrist"]
# Not referenced by any visible cell; presumably the names of depth-camera
# streams — TODO confirm.
DCAMERA_NAMES = ["dcam_high", "dcam_low"]

## Helper Functions

In [None]:
def extract_data(f, arrays):
    """Copy datasets from an HDF5 group into ``arrays``, at most one level deep.

    Datasets found directly in ``f`` are stored under their own name.
    Datasets found directly inside a group of ``f`` are stored under
    ``"<group>/<dataset>"``. Groups nested any deeper are ignored.

    NOTE: ``extract_data`` is defined again in a later cell, so on a
    top-to-bottom run that later definition replaces this one.

    Args:
        f: Open ``h5py.File`` or ``h5py.Group`` to read from.
        arrays: Dictionary that is filled in place.
    """
    for name, obj in f.items():
        if isinstance(obj, h5py.Dataset):
            # `[()]` reads the whole dataset into memory.
            arrays[name] = obj[()]
            continue
        if not isinstance(obj, h5py.Group):
            continue
        # Only the datasets that are immediate children of the group are copied.
        arrays.update(
            (f"{name}/{sub_name}", sub_obj[()])
            for sub_name, sub_obj in obj.items()
            if isinstance(sub_obj, h5py.Dataset)
        )

In [None]:
def extract_data(f, arrays, prefix=""):
    """Recursively copy every dataset below ``f`` into ``arrays``.

    Each dataset is stored under its full slash-separated path relative to
    the group the top-level call was made on, e.g.
    ``"observations/images/cam_high"``. The previous implementation dropped
    the parent path whenever it recursed, so deeply nested datasets ended up
    under partial names and same-named datasets in different groups
    overwrote each other.

    ``update_dict_keys`` leaves already-qualified keys untouched, so it can
    still be applied to the result.

    Args:
        f: Open ``h5py.File`` or ``h5py.Group`` to walk.
        arrays: Dictionary that is filled in place with ``path -> data``.
        prefix: Path of ``f`` relative to the starting group, ending in
            ``"/"`` when non-empty. Callers normally leave this at the
            default; it is used by the recursion.

    Returns:
        None. Results are written into ``arrays``.
    """
    for name, obj in f.items():
        key = f"{prefix}{name}"
        if isinstance(obj, h5py.Dataset):
            # `[()]` reads the whole dataset into memory.
            arrays[key] = obj[()]
        elif isinstance(obj, h5py.Group):
            # Carry the accumulated path down so nested keys stay unique.
            extract_data(obj, arrays, prefix=f"{key}/")

In [None]:
def update_dict_keys(arrays):
    """Return a copy of ``arrays`` with bare camera keys fully qualified.

    Keys equal to one of the four camera names are rewritten to
    ``observations/images/<camera>``; every other key is kept as is. The
    input dictionary is not modified and entry order is preserved.

    Args:
        arrays: Mapping from key to value.

    Returns:
        A new dictionary with the renamed keys.
    """
    camera_keys = ('cam_high', 'cam_left_wrist', 'cam_low', 'cam_right_wrist')
    return {
        (f'observations/images/{key}' if key in camera_keys else key): value
        for key, value in arrays.items()
    }

In [None]:
def load_compressed_hdf5(dataset_path):
    """Load one episode file and decode its compressed camera frames.

    Args:
        dataset_path: Path to the episode ``.hdf5`` file.

    Returns:
        A tuple ``(is_sim, qpos, qvel, effort, action, image_dict)`` where
        ``is_sim`` is the file's ``sim`` attribute, the next four items are
        the full contents of ``/observations/qpos``, ``/observations/qvel``,
        ``/observations/effort`` and ``/action``, and ``image_dict`` maps
        each camera name under ``/observations/images`` to a list of decoded
        frames.

    Raises:
        FileNotFoundError: If ``dataset_path`` is not an existing file.
            (Previously the function printed a message and called
            ``exit()``, which shuts down a notebook kernel and gives callers
            nothing to catch.)
    """
    if not os.path.isfile(dataset_path):
        raise FileNotFoundError(f'Dataset does not exist at \n{dataset_path}\n')

    with h5py.File(dataset_path, 'r') as root:
        is_sim = root.attrs['sim']
        qpos = root['/observations/qpos'][()]
        qvel = root['/observations/qvel'][()]
        effort = root['/observations/effort'][()]
        action = root['/action'][()]

        image_dict = {}
        images_group = root['/observations/images']
        for cam_name in images_group.keys():
            # Each entry is an encoded image buffer; decode it to a 3-channel
            # image. NOTE: cv2.imdecode returns None for a buffer it cannot
            # decode, so callers may see None entries for corrupt frames.
            encoded_frames = images_group[cam_name][()]
            image_dict[cam_name] = [
                cv2.imdecode(buf, cv2.IMREAD_COLOR) for buf in encoded_frames
            ]
    return is_sim, qpos, qvel, effort, action, image_dict

## Dataset Processing

In [None]:
# Parameters
# Frame rate written to meta/info.json.
# NOTE(review): confirm that 30 matches the rate the episodes were recorded at.
fps = 30
# Single task description shared by every episode.
task_string = "default task"
task_index = 0
# Spacing between consecutive timestamps, in seconds. Derived from `fps` so
# the timestamps written to the parquet files agree with the frame rate
# recorded in the metadata (the previous hard-coded 0.1 s implied 10 fps).
frame_time_interval = 1.0 / fps
# Starting values for the episode and frame counters.
episode_index = 0
frame_index = 0

In [None]:
# Process each dataset folder
#
# For every sub-folder of `parent_dir`, convert its episode HDF5 files into
# per-episode parquet files plus `meta/info.json`, `meta/episodes.jsonl` and
# `meta/tasks.jsonl` under `./lerobot_dataset/<folder name>/`.
#
# Changes compared with the earlier version of this cell:
#   * `frame_index` restarts at 0 for every episode instead of continuing a
#     global counter that was never reset between episodes, folders or
#     re-runs of the cell.
#   * Folders without any episode file are skipped instead of failing on (or
#     silently reusing) the `observation` / `action` arrays of a previous
#     folder when the metadata is written.
#   * The one-episode-per-folder limit (previously a bare `break`) is an
#     explicit, adjustable setting.

# Number of episodes converted per folder; set to None to convert all of them.
# Every frame's flattened images are materialised as Python lists, so a full
# run can need a lot of time and memory.
MAX_EPISODES_PER_DATASET = 1

for dataset_folder in sorted(parent_dir.iterdir()):
    if not dataset_folder.is_dir():
        continue

    print(f"📦 Processing: {dataset_folder.name}")

    episode_files = natsort.natsorted(dataset_folder.glob("episode*.hdf5"))
    if MAX_EPISODES_PER_DATASET is not None:
        episode_files = episode_files[:MAX_EPISODES_PER_DATASET]
    if not episode_files:
        print(f"⚠️ Skipping {dataset_folder.name}: no episode*.hdf5 files found.\n")
        continue

    out_dir = Path(f"./lerobot_dataset/{dataset_folder.name}")
    (out_dir / "data/chunk-000").mkdir(parents=True, exist_ok=True)
    (out_dir / "meta").mkdir(parents=True, exist_ok=True)

    episodes_meta = []
    total_frames = 0
    episode_count = episode_index

    for hdf5_file in episode_files:
        # Load data using provided function
        is_sim, qpos, qvel, effort, action, image_dict = load_compressed_hdf5(hdf5_file)
        T = action.shape[0]

        # Prepare all observation arrays; each must have T rows.
        arrays = {
            "observations.qpos": qpos,
            "observations.qvel": qvel,
            "observations.effort": effort,
        }

        # Flatten every decoded frame to one row per time step.
        for cam_name in CAMERA_NAMES:
            arrays[f'observations.images.{cam_name}'] = np.stack(
                [img.flatten() for img in image_dict[cam_name]], axis=0
            )

        # Concatenate all observation components column-wise, in insertion order.
        obs_keys = [k for k in arrays if k.startswith('observations')]
        observation = np.concatenate([arrays[k] for k in obs_keys], axis=1).astype(np.float32)
        action = np.asarray(action, dtype=np.float32)

        # Only the last frame of an episode is marked as done.
        next_done = np.zeros(T, dtype=bool)
        next_done[-1] = True

        # === Construct dataset
        new_data = {
            "observation.state": observation.tolist(),
            "action": action.tolist(),
            "episode_index": np.full(T, episode_count, dtype=np.int64),
            # Position of the frame inside its own episode.
            "frame_index": np.arange(T, dtype=np.int64),
            "timestamp": np.arange(T) * frame_time_interval,
            "next.done": next_done,
            # Position of the frame inside the whole converted dataset.
            "index": np.arange(total_frames, total_frames + T, dtype=np.int64),
            "task_index": np.full(T, task_index, dtype=np.int64),
        }

        print("SAVE!")
        # Save to parquet
        df = pd.DataFrame(new_data)
        table = pa.Table.from_pandas(df)
        pq.write_table(table, out_dir / f"data/chunk-000/episode_{episode_count:06d}.parquet")

        episodes_meta.append({
            "episode_index": episode_count,
            "length": T,
            "tasks": [task_string]
        })

        total_frames += T
        episode_count += 1

    # === Save metadata
    # Dimensions are taken from the last converted episode of this folder.
    obs_dim = observation.shape[1]
    act_dim = action.shape[1]

    features = {
        "observation.state": {"dtype": "float32", "shape": [obs_dim]},
        "action": {"dtype": "float32", "shape": [act_dim]},
        "episode_index": {"dtype": "int64", "shape": []},
        "frame_index": {"dtype": "int64", "shape": []},
        "timestamp": {"dtype": "float64", "shape": []},
        "next.done": {"dtype": "bool", "shape": []},
        "index": {"dtype": "int64", "shape": []},
        "task_index": {"dtype": "int64", "shape": []}
    }

    info = {
        "fps": fps,
        "codebase_version": "v2.1",
        "robot_type": None,
        "features": features,
        "data_path": "data/chunk-{episode_chunk:03d}/episode_{episode_index:06d}.parquet",
        "video_path": None,
        # Count the episodes actually written, independent of the start offset.
        "total_episodes": len(episodes_meta),
        "total_frames": total_frames,
        "chunks_size": 1000,
        "total_chunks": 1,
        "total_tasks": 1
    }

    # Save meta files
    with open(out_dir / "meta/info.json", "w") as f:
        json.dump(info, f, indent=2)

    with open(out_dir / "meta/episodes.jsonl", "w") as f:
        for ep in episodes_meta:
            f.write(json.dumps(ep) + "\n")

    with open(out_dir / "meta/tasks.jsonl", "w") as f:
        f.write(json.dumps({"task_index": task_index, "task": task_string}) + "\n")

    print(f"✅ Done: {dataset_folder.name} → {len(episodes_meta)} episodes.\n")

---