In [1]:
import h5py
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import json
from pathlib import Path

import natsort

## Config

In [3]:
# Root folder whose immediate sub-folders are iterated by the cells below;
# each sub-folder is searched for files matching "episode*.hdf5".
parent_dir = Path("./ALOHATaskCompressedData/")  # ← CHANGE THIS

In [4]:
# Preview: list every dataset folder together with its first episode file.
for dataset_folder in sorted(parent_dir.iterdir()):
    if not dataset_folder.is_dir():
        # The conversion cell only handles directories; skip stray files here too
        continue
    print(dataset_folder)

    # Natural sort puts episode_2 before episode_10 (plain string sort would not)
    sorted_files = natsort.natsorted(dataset_folder.glob("episode*.hdf5"))

    # Show only the first episode as a quick sanity check of the folder layout
    if sorted_files:
        print(sorted_files[0])

..\ALOHATaskCompressedData\apply_tape_closed_box
..\ALOHATaskCompressedData\apply_tape_closed_box\episode_0.hdf5
..\ALOHATaskCompressedData\pick_scraper_from_rack
..\ALOHATaskCompressedData\pick_scraper_from_rack\episode_0.hdf5
..\ALOHATaskCompressedData\usb_mark_003
..\ALOHATaskCompressedData\usb_mark_003\episode_0.hdf5


## Helper Function

In [6]:
def extract_data(f, arrays):
    """Load datasets from an HDF5 file (or group) into ``arrays``.

    Datasets found directly in ``f`` are stored under their own name.
    Datasets found directly inside a group of ``f`` are stored under
    ``"<group>/<dataset>"``. Groups nested inside those groups are not
    visited, so anything deeper than one group level is left out.

    Args:
        f: Object whose ``items()`` yields ``(name, object)`` pairs, e.g. an
            open ``h5py.File``.
        arrays: Dictionary that is updated in place with the loaded values.

    Returns:
        None. Results are written into ``arrays``.
    """
    for top_name, top_obj in f.items():
        if isinstance(top_obj, h5py.Dataset):
            # ``[()]`` reads the complete dataset into memory
            arrays[top_name] = top_obj[()]
            continue

        if not isinstance(top_obj, h5py.Group):
            continue

        # Descend exactly one level; only datasets are collected
        arrays.update(
            (f"{top_name}/{child_name}", child_obj[()])
            for child_name, child_obj in top_obj.items()
            if isinstance(child_obj, h5py.Dataset)
        )

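## Process Each Dataset Folder

Each `episode*.hdf5` file is converted into one parquet file under `lerobot_dataset/<dataset>/data/chunk-000/`, and `info.json`, `episodes.jsonl` and `tasks.jsonl` are written to `lerobot_dataset/<dataset>/meta/`.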

In [8]:
# Parameters
fps = 30  # Frame rate written to meta/info.json
task_string = "default task"  # Task description stored with every episode
task_index = 0  # Index of the task above
# Seconds between consecutive frames. Derived from fps so the per-frame
# timestamps always agree with the frame rate stored in info.json
# (a hard-coded 0.1 s would describe 10 fps, contradicting fps = 30).
frame_time_interval = 1.0 / fps
episode_index = 0  # Index assigned to the first episode of each dataset
frame_index = 0  # Starting value for the frame_index column

In [9]:
# Process each dataset folder: every episode*.hdf5 file becomes one parquet
# episode, and the meta/ folder receives info.json, episodes.jsonl and tasks.jsonl.
for dataset_folder in sorted(parent_dir.iterdir()):
    if not dataset_folder.is_dir():
        continue  # ignore stray files that sit next to the dataset folders

    # Natural sort keeps episode_2 ahead of episode_10, so the assigned episode
    # indices follow the numeric order of the source files.
    episode_files = natsort.natsorted(dataset_folder.glob("episode*.hdf5"))
    if not episode_files:
        # Without episodes there is nothing to derive the feature shapes from
        print(f"⚠️ Skipping {dataset_folder.name}: no episode*.hdf5 files found.\n")
        continue

    print(f"📦 Processing: {dataset_folder.name}")

    # Create the output directories for this dataset
    out_dir = Path(f"./lerobot_dataset/{dataset_folder.name}")
    (out_dir / "data/chunk-000").mkdir(parents=True, exist_ok=True)
    (out_dir / "meta").mkdir(parents=True, exist_ok=True)

    episodes_meta = []
    total_frames = 0
    episode_count = episode_index  # Start with the provided episode_index
    obs_dim = None
    act_dim = None

    for hdf5_file in episode_files:
        with h5py.File(hdf5_file, 'r') as f:
            # extract_data reads every dataset fully into memory, so the file
            # can be closed before the arrays are processed.
            arrays = {}
            extract_data(f, arrays)

        # Select observation and action components by substring match on the key
        obs_keys = [key for key in arrays if 'observations' in key]
        action_keys = [key for key in arrays if 'action' in key]

        # Join the components column-wise (axis=1) into one float32 array each
        observation = np.concatenate([arrays[key] for key in obs_keys], axis=1).astype(np.float32)
        action = np.concatenate([arrays[key] for key in action_keys], axis=1).astype(np.float32)

        T = action.shape[0]  # Number of time steps in this episode
        if observation.shape[0] != T:
            raise ValueError(
                f"{hdf5_file}: observation has {observation.shape[0]} steps but action has {T}"
            )
        if T == 0:
            # An empty episode has no final frame to mark as done
            print(f"⚠️ Skipping empty episode: {hdf5_file}")
            continue

        # === Construct the new data dict ===
        done_flags = [False] * T
        done_flags[-1] = True  # mark final frame as done
        new_data = {
            "observation.state": observation.tolist(),
            "action": action.tolist(),
            "episode_index": [episode_count] * T,
            # Position inside the episode: restarts for every episode, in step
            # with "timestamp". The dataset-wide position is carried by "index".
            "frame_index": list(np.arange(frame_index, frame_index + T)),
            "timestamp": list(np.arange(T) * frame_time_interval),
            "next.done": done_flags,
            "index": list(np.arange(total_frames, total_frames + T)),
            "task_index": [task_index] * T,
        }

        # === Save as .parquet ===
        df = pd.DataFrame(new_data)
        table = pa.Table.from_pandas(df)
        pq.write_table(table, out_dir / f"data/chunk-000/episode_{episode_count:06d}.parquet")

        episodes_meta.append({
            "episode_index": episode_count,
            "length": T,
            "tasks": [task_string],
        })

        obs_dim = observation.shape[1]
        act_dim = action.shape[1]
        total_frames += T
        episode_count += 1  # Next episode gets the next index

    num_episodes = len(episodes_meta)
    if num_episodes == 0:
        print(f"⚠️ Skipping metadata for {dataset_folder.name}: no non-empty episodes.\n")
        continue

    # === Save metadata ===
    features = {
        "observation.state": {"dtype": "float32", "shape": [obs_dim]},
        "action": {"dtype": "float32", "shape": [act_dim]},
        "episode_index": {"dtype": "int64", "shape": []},
        "frame_index": {"dtype": "int64", "shape": []},
        "timestamp": {"dtype": "float64", "shape": []},
        "next.done": {"dtype": "bool", "shape": []},
        "index": {"dtype": "int64", "shape": []},
        "task_index": {"dtype": "int64", "shape": []}
    }

    info = {
        "fps": fps,
        "codebase_version": "v2.1",
        "robot_type": None,
        "features": features,
        "data_path": "data/chunk-{episode_chunk:03d}/episode_{episode_index:06d}.parquet",
        "video_path": None,
        # Count the episodes actually written; episode_count would be off by
        # episode_index whenever numbering does not start at 0.
        "total_episodes": num_episodes,
        "total_frames": total_frames,
        "chunks_size": 1000,
        "total_chunks": 1,
        "total_tasks": 1
    }

    # Save the info JSON file into this dataset's meta folder
    with open(out_dir / "meta/info.json", "w", encoding="utf-8") as f:
        json.dump(info, f, indent=2)

    # Save the per-episode metadata, one JSON object per line
    with open(out_dir / "meta/episodes.jsonl", "w", encoding="utf-8") as f:
        for ep in episodes_meta:
            f.write(json.dumps(ep) + "\n")

    # Save the task metadata; uses task_string so it matches episodes.jsonl
    with open(out_dir / "meta/tasks.jsonl", "w", encoding="utf-8") as f:
        f.write(json.dumps({"task_index": task_index, "task": task_string}) + "\n")

    print(f"✅ Done: {dataset_folder.name} → {num_episodes} episodes.\n")

📦 Processing: apply_tape_closed_box
✅ Done: apply_tape_closed_box → 100 episodes.

📦 Processing: pick_scraper_from_rack
✅ Done: pick_scraper_from_rack → 50 episodes.

📦 Processing: usb_mark_003
✅ Done: usb_mark_003 → 50 episodes.



---