In [11]:
import json
import os
from tqdm import tqdm

# 配置路径
JSONL_PATH = "/home/vcj9002/jianshu/workspace/code/ProgressLM/data/raw/visual_demo/visual_h5_franka_3rgb_train.jsonl"
IMAGE_ROOT = "/home/vcj9002/jianshu/workspace/data/robomind/data/images"

missing_images = 0
missing_samples = 0
total_samples = 0

# 可选：保存缺失详情
missing_details = []

with open(JSONL_PATH, "r") as f:
    lines = f.readlines()

for line in tqdm(lines, desc="Checking samples"):
    total_samples += 1
    sample = json.loads(line.strip())
    sample_id = sample["id"]
    image_missing_in_sample = False  # 本样本是否缺失

    # 遍历所有可能包含图片的字段
    for field in ["visual_demo", "stage_to_estimate"]:
        if field in sample and isinstance(sample[field], list):
            for img in sample[field]:
                full_path = os.path.join(IMAGE_ROOT, sample_id, img)
                if not os.path.exists(full_path):
                    missing_images += 1
                    image_missing_in_sample = True
                    missing_details.append(full_path)

    if image_missing_in_sample:
        missing_samples += 1

# ====== 总结统计 ======
print("\n=== 检查完成 ===")
print(f"总样本数          : {total_samples}")
print(f"缺失图像样本数     : {missing_samples}")
print(f"缺失图像总数量     : {missing_images}")


Checking samples:   0%|          | 0/36195 [00:00<?, ?it/s]

Checking samples: 100%|██████████| 36195/36195 [00:01<00:00, 21637.68it/s]


=== 检查完成 ===
总样本数          : 36195
缺失图像样本数     : 3009
缺失图像总数量     : 21879





In [None]:
import json
import os
from tqdm import tqdm

# === 配置区 === #
JSONL_PATH = "/home/vcj9002/jianshu/workspace/code/ProgressLM/data/raw/visual_demo/visual_h5_franka_3rgb_raw.jsonl"
IMAGE_ROOT = "/home/vcj9002/jianshu/workspace/data/robomind/data/images"
OUTPUT_JSONL = "/home/vcj9002/jianshu/workspace/code/ProgressLM/data/raw/visual_demo/visual_h5_franka_3rgb_train.jsonl"       # 输出的新 jsonl
SAVE_MISSING = False
MISSING_OUTPUT = "missing_images.txt"

# === 统计量 === #
total_samples = 0
kept_samples = 0
removed_samples = 0

missing_details = []  # 记录缺失图片路径（可选）

def iter_images_in_sample(sample):
    """统一提取需要检查的所有图片名称"""
    fields = ["visual_demo", "stage_to_estimate"]
    for field in fields:
        v = sample.get(field, [])
        if isinstance(v, str):
            v = [v]
        if not isinstance(v, list):
            continue
        for x in v:
            x = (x or "").strip()
            if x:
                yield x

# === 读取输入 JSONL === #
with open(JSONL_PATH, "r") as f:
    lines = f.readlines()

# === 输出 JSONL === #
f_out = open(OUTPUT_JSONL, "w")

for line in tqdm(lines, desc="Processing samples", unit="sample"):
    line = line.strip()
    if not line:
        continue

    try:
        sample = json.loads(line)
    except json.JSONDecodeError:
        continue

    total_samples += 1
    sample_id = (sample.get("id") or "").strip()
    if not sample_id:
        # id 缺失，无论如何很难匹配图片，直接丢弃该样本
        removed_samples += 1
        continue

    # 检查该样本是否存在图片缺失
    missing_this_sample = False
    for img_name in iter_images_in_sample(sample):
        full_path = os.path.join(IMAGE_ROOT, sample_id, img_name)
        if not os.path.exists(full_path):
            missing_this_sample = True
            if SAVE_MISSING:
                missing_details.append(full_path)
            # 发现缺失可直接跳出，无需继续检查
            break

    # 如果有缺失 => 丢弃整个样本
    if missing_this_sample:
        removed_samples += 1
    else:
        kept_samples += 1
        f_out.write(json.dumps(sample, ensure_ascii=False) + "\n")

f_out.close()

# === 输出统计信息 === #
print("\n=== 处理完成 ===")
print(f"总样本数           : {total_samples}")
print(f"保留样本数         : {kept_samples}")
print(f"删除样本数         : {removed_samples}")
print(f"输出文件           : {OUTPUT_JSONL}")

if SAVE_MISSING and missing_details:
    with open(MISSING_OUTPUT, "w") as f:
        f.write("\n".join(missing_details))
    print(f"缺失图片详情已保存 : {MISSING_OUTPUT}")

print("\n任务结束 ✅")


Processing samples:   0%|          | 0/36195 [00:00<?, ?sample/s]

Processing samples: 100%|██████████| 36195/36195 [00:00<00:00, 81799.45sample/s]


=== 处理完成 ===
总样本数           : 36195
保留样本数         : 0
删除样本数         : 36195
输出文件           : /home/vcj9002/jianshu/workspace/code/ProgressLM/data/raw/visual_demo/visual_h5_franka_3rgb_train.jsonl

任务结束 ✅



