In [1]:
import json, pathlib, collections
# Quick survey: which source sub-datasets appear in the BABEL annotation files?
babel_root = pathlib.Path('./babel_v1.0_release')
names = collections.Counter()  # sub-dataset name -> number of sequences over all splits

for split in ['train', 'val', 'test', 'extra_train', 'extra_val']:
    # Open through a context manager so the handle is closed as soon as the
    # split is parsed, and decode as UTF-8 explicitly instead of relying on
    # the platform's default encoding.
    with open(babel_root / f'{split}.json', 'r', encoding='utf-8') as split_file:
        data = json.load(split_file)
    for seq in data.values():
        # The first path component of 'feat_p' is taken as the sub-dataset name.
        ds_name = pathlib.Path(seq['feat_p']).parts[0]   # e.g. 'ACCAD'
        names[ds_name] += 1

# Prints "<N> sub-datasets in total:" (message is in Chinese) followed by the names.
print('共 %d 个子库:'%len(names), *names)

共 17 个子库: BMLrub ACCAD CMU MPIHDM05 EyesJapanDataset KIT EKUT MPImosh TCDhandMocap DFaust67 MPILimits SFU TotalCapture HumanEva SSMsynced BMLmovi Transitionsmocap


In [2]:
import json
import pathlib
import collections

babel_root = pathlib.Path('./babel_v1.0_release')  # Make sure this path is correct
ASSUMED_FPS = 30.0  # Based on BABEL paper's normalization statement
SPLIT_NAMES = ['train', 'val', 'test', 'extra_train', 'extra_val']
UNKNOWN_DATASET = "Unknown_Dataset"  # Bucket for sequences whose 'feat_p' is unusable

sample_counts = collections.Counter()  # sub-dataset name -> number of sequences
frame_counts = collections.Counter()   # sub-dataset name -> estimated number of frames


def sub_dataset_name(seq_id, seq_info):
    """Return the sub-dataset name for one sequence entry.

    The name is the first path component of ``seq_info['feat_p']``,
    e.g. 'BMLrub/BioMotionLab_NTroje/rub055/...' -> 'BMLrub'.

    Args:
        seq_id: Key of the sequence in the split file (used in warnings only).
        seq_info: Dictionary describing the sequence.

    Returns:
        The first path component, or ``UNKNOWN_DATASET`` (after printing a
        warning) when 'feat_p' is missing, is not a non-empty string, or
        yields no path components.
    """
    feat_p = seq_info.get('feat_p')
    if not isinstance(feat_p, str) or not feat_p:
        print(f"Warning: 'feat_p' key missing or invalid for seq ID '{seq_id}'. Using '{UNKNOWN_DATASET}'.")
        return UNKNOWN_DATASET

    parts = pathlib.Path(feat_p).parts
    if not parts:  # e.g. feat_p == '.', which has no components
        print(f"Warning: Could not determine dataset name from 'feat_p': '{feat_p}' for seq ID '{seq_id}'. Using '{UNKNOWN_DATASET}'.")
        return UNKNOWN_DATASET
    return parts[0]


def estimate_frame_count(dur, fps):
    """Estimate the number of frames as ``round(float(dur) * fps)``.

    Args:
        dur: Duration value as stored in the split file; anything ``float()``
            accepts.
        fps: Frame rate to multiply the duration by.

    Returns:
        The rounded frame count as an int, or ``None`` when ``dur`` cannot be
        turned into a finite number. Infinite values are included here:
        ``round(inf)`` raises ``OverflowError``, which would otherwise escape
        to the caller and abort the whole split.
    """
    try:
        return int(round(float(dur) * fps))
    except (ValueError, TypeError, OverflowError):
        return None


def tally_sequences(data, source, samples, frames, fps=ASSUMED_FPS):
    """Add every sequence of one loaded split to the running counters.

    Args:
        data: Mapping of sequence ID -> sequence info dictionary.
        source: Where ``data`` came from (used in warnings only).
        samples: Counter updated in place with one count per sequence.
        frames: Counter updated in place with the estimated frame totals.
        fps: Frame rate used to convert 'dur' into frames.

    Entries that are not dictionaries are skipped with a warning. A sequence
    with a missing or unparseable 'dur' is still counted as a sample but
    contributes no frames.
    """
    for seq_id, seq_info in data.items():
        if not isinstance(seq_info, dict):
            print(f"Warning: Sequence entry for ID '{seq_id}' in '{source}' is not a dictionary. Skipping.")
            continue

        # 1. Extract sub-dataset name
        ds_name = sub_dataset_name(seq_id, seq_info)

        # 2. Count samples
        samples[ds_name] += 1

        # 3. Calculate and count frames
        if 'dur' not in seq_info:
            print(f"Warning: 'dur' key not found for seq ID '{seq_id}' from '{ds_name}'. Frame count not added for this sequence.")
            continue
        n_frames = estimate_frame_count(seq_info['dur'], fps)
        if n_frames is None:
            print(f"Warning: Could not parse 'dur': '{seq_info['dur']}' as float for seq ID '{seq_id}' from '{ds_name}'. Frame count not added for this sequence.")
        else:
            frames[ds_name] += n_frames


print(f"Attempting to read from BABEL root: {babel_root.resolve()}")

if not babel_root.exists():
    print(f"Error: BABEL root directory '{babel_root}' does not exist. Please check the path.")
else:
    splits_processed = 0  # number of split files that were found (not necessarily parsed)
    for split in SPLIT_NAMES:
        json_file_path = babel_root / f'{split}.json'

        if not json_file_path.exists():
            print(f"Warning: File '{json_file_path}' not found. Skipping this split.")
            continue

        print(f"Processing file: {json_file_path}")
        splits_processed += 1
        try:
            with open(json_file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if not isinstance(data, dict):
                print(f"Warning: Data in '{json_file_path}' is not a dictionary as expected. Skipping.")
                continue

            tally_sequences(data, json_file_path, sample_counts, frame_counts, ASSUMED_FPS)

        except json.JSONDecodeError as e:
            print(f"Error: Could not decode JSON from '{json_file_path}': {e}. Skipping this split.")
        except Exception as e:
            # Deliberately broad: one bad split should not stop the others
            # from being reported. Counts gathered before the failure are kept.
            print(f"An unexpected error occurred while processing '{json_file_path}': {e}")

    if splits_processed > 0:
        print(f'\n--- BABEL Dataset Statistics (FPS Assumed: {ASSUMED_FPS}) ---')
        if not sample_counts:
            print("No samples were successfully processed from any sub-dataset.")
        else:
            print(f'Found {len(sample_counts)} unique sub-datasets:')
            print(f"{'Sub-dataset':<25} | {'Sample Count':<15} | {'Total Frames (Est.)':<20}")
            print(f"{'-'*25} | {'-'*15} | {'-'*20}")
            for ds_name in sorted(sample_counts.keys()):
                samples = sample_counts[ds_name]
                # Counter returns 0 for a sub-dataset that never got a valid 'dur'.
                frames = frame_counts[ds_name]
                print(f"{ds_name:<25} | {samples:<15} | {frames:<20}")
    else:
        print("No split files ('train.json', 'val.json', etc.) were found or processed in the specified BABEL directory.")

Attempting to read from BABEL root: D:\MotionPretrain\data\babel_v1.0_release
Processing file: babel_v1.0_release\train.json
Processing file: babel_v1.0_release\val.json
Processing file: babel_v1.0_release\test.json
Processing file: babel_v1.0_release\extra_train.json
Processing file: babel_v1.0_release\extra_val.json

--- BABEL Dataset Statistics (FPS Assumed: 30.0) ---
Found 17 unique sub-datasets:
Sub-dataset               | Sample Count    | Total Frames (Est.) 
------------------------- | --------------- | --------------------
ACCAD                     | 386             | 72577               
BMLmovi                   | 2812            | 477086              
BMLrub                    | 5177            | 1597157             
CMU                       | 3319            | 1634353             
DFaust67                  | 199             | 31523               
EKUT                      | 548             | 86559               
EyesJapanDataset          | 1273            | 1111422       