In [5]:
import json
import os
from tqdm import tqdm

# Paths: the JSONL annotation file and the directory image paths are resolved against.
JSONL_PATH = "/home/runsheng/personal_3/qiancx/ProgressLM/data/h5_agilex_3rgb_converted.jsonl"
IMAGE_ROOT = "/home/runsheng/personal_3/qiancx/Sources/datasets/robomind/images"

missing_images = 0   # number of image paths that do not exist on disk
missing_samples = 0  # number of samples with at least one missing image
total_samples = 0    # number of non-blank JSONL records processed

# Full path of every missing image; written to a text file at the end.
missing_details = []

# JSON text is UTF-8; do not depend on the locale's default encoding.
with open(JSONL_PATH, "r", encoding="utf-8") as f:
    lines = f.readlines()

for line in tqdm(lines, desc="Checking samples"):
    line = line.strip()
    if not line:
        # A blank line (e.g. a trailing newline) is not a sample, and
        # json.loads("") would raise JSONDecodeError.
        continue

    total_samples += 1
    sample = json.loads(line)
    sample_id = sample["id"]
    image_missing_in_sample = False  # does this sample miss any image?

    # Walk every field that may hold image names.
    for field in ["visual_demo", "stage_to_estimate"]:
        images = sample.get(field)
        if isinstance(images, str):
            # A field holding a single image name is checked too instead of
            # being silently skipped.
            images = [images]
        if not isinstance(images, list):
            continue
        for img in images:
            full_path = os.path.join(IMAGE_ROOT, sample_id, img)
            if not os.path.exists(full_path):
                missing_images += 1
                image_missing_in_sample = True
                missing_details.append(full_path)

    if image_missing_in_sample:
        missing_samples += 1

# ====== Summary ======
print("\n=== 检查完成 ===")
print(f"总样本数          : {total_samples}")
print(f"缺失图像样本数     : {missing_samples}")
print(f"缺失图像总数量     : {missing_images}")

# Persist the list of missing paths, one per line.
if missing_images > 0:
    with open("missing_images.txt", "w", encoding="utf-8") as out:
        out.write("\n".join(missing_details))
    print("缺失图像路径已保存到 missing_images.txt")


Checking samples:   0%|          | 0/56952 [00:00<?, ?it/s]

Checking samples: 100%|██████████| 56952/56952 [00:02<00:00, 27793.74it/s]


=== 检查完成 ===
总样本数          : 56952
缺失图像样本数     : 56952
缺失图像总数量     : 56952
缺失图像路径已保存到 missing_images.txt





In [6]:
import json, os

# Annotation file and image root used to resolve the first sample's first image.
JSONL_PATH = "/home/runsheng/personal_3/qiancx/ProgressLM/data/h5_agilex_3rgb_converted.jsonl"
IMAGE_ROOT = "/home/runsheng/personal_3/qiancx/Sources/datasets/robomind/images"

# Only the first record of the file is read for this spot check.
with open(JSONL_PATH, "r") as f:
    first_line = f.readline().strip()
sample = json.loads(first_line)

sample_id = sample["id"]
img = sample["visual_demo"][0]

# Image path = <IMAGE_ROOT>/<sample id>/<image name>.
full_path = os.path.join(IMAGE_ROOT, sample_id, img)

# Print the path both raw and as repr (repr exposes hidden whitespace or
# escape characters), then whether it exists on disk.
print(f"RAW PATH: {full_path}")
print(f"REPR PATH: {full_path!r}")
print(f"os.path.exists: {os.path.exists(full_path)}")


RAW PATH: /home/runsheng/personal_3/qiancx/Sources/datasets/robomind/images/h5_agilex_3rgb/10_packplate_2/2024_09_28-16_27_42-172863566445571200.00/camera_front_0000.jpg
REPR PATH: '/home/runsheng/personal_3/qiancx/Sources/datasets/robomind/images/h5_agilex_3rgb/10_packplate_2/2024_09_28-16_27_42-172863566445571200.00/camera_front_0000.jpg'
os.path.exists: True


In [1]:
import json
import os
from tqdm import tqdm

# Annotation file to verify and the directory image paths are resolved against.
JSONL_PATH = "/home/runsheng/personal_3/qiancx/Sources/datasets/robomind/progrsslm/annotation/h5_tienkung_xsens_frm/h5_tienkung_xsens_frm.jsonl"
IMAGE_ROOT = "/home/runsheng/personal_3/qiancx/Sources/datasets/robomind/progrsslm/images"

# Toggle for recording missing image paths, and the file name used for them.
SAVE_MISSING = True
MISSING_OUTPUT = "missing_images.txt"

# Counters, all starting at zero.
total_samples = expected_images_total = existing_images_total = 0
missing_images = missing_samples = 0

# Collected paths of missing images.
missing_details = list()

def iter_images_in_sample(sample):
    """Yield every non-empty, whitespace-stripped image name found in a sample.

    Reads the "visual_demo" and "stage_to_estimate" fields, in that order.
    A field holding a single string is treated as a one-element list; a field
    that is neither a string nor a list is ignored. Falsy entries and entries
    that are empty after stripping are dropped.
    """
    for key in ("visual_demo", "stage_to_estimate"):
        value = sample.get(key, [])
        entries = [value] if isinstance(value, str) else value
        if isinstance(entries, list):
            for entry in entries:
                name = (entry or "").strip()
                if name:
                    yield name

# Load all lines up front so the progress bar knows the total.
with open(JSONL_PATH, "r") as f:
    lines = f.readlines()

for line in tqdm(lines, desc="Verifying images", unit="sample"):
    line = line.strip()
    if line == "":
        continue

    # -- Parse JSON -- #
    # Lines that fail to parse are skipped and not counted as samples.
    try:
        sample = json.loads(line)
    except json.JSONDecodeError:
        continue

    total_samples += 1
    sample_id = (sample.get("id") or "").strip()
    if sample_id == "":
        # Without an id no image path can be built: the sample is counted in
        # total_samples but contributes zero images to check.
        continue

    # -- Per-image bookkeeping -- #
    missing_this_sample = 0
    for img_name in iter_images_in_sample(sample):
        expected_images_total += 1
        full_path = os.path.join(IMAGE_ROOT, sample_id, img_name)
        if not os.path.exists(full_path):
            missing_images += 1
            missing_this_sample += 1
            if SAVE_MISSING:
                missing_details.append(full_path)
            continue
        existing_images_total += 1

    # A sample counts as missing once, no matter how many of its images are absent.
    if missing_this_sample:
        missing_samples += 1

# -- Summary and self-check -- #
print("\n=== 检查完成 ===")
print(f"总样本数              : {total_samples}")
print(f"应检查图像总数        : {expected_images_total}")
print(f"已存在图像总数        : {existing_images_total}")
print(f"缺失图像总数量        : {missing_images}")
print(f"缺失图像样本数        : {missing_samples}")

# Consistency check: every expected image must be either existing or missing.
if existing_images_total + missing_images != expected_images_total:
    print("\n[警告] 期望数量 != 存在 + 缺失，请检查统计逻辑或上游数据！")
    print(f"  expected_images_total = {expected_images_total}")
    print(f"  existing_images_total = {existing_images_total}")
    print(f"  missing_images        = {missing_images}")

# Write the missing paths, one per line, only when there is something to write.
if missing_details and SAVE_MISSING:
    with open(MISSING_OUTPUT, "w") as f:
        f.write("\n".join(missing_details))
    print(f"\n缺失图像路径已保存到：{MISSING_OUTPUT}")

print("\n任务结束 ✅")


Verifying images:   0%|          | 0/16359 [00:00<?, ?sample/s]

Verifying images: 100%|██████████| 16359/16359 [00:00<00:00, 34748.55sample/s]


=== 检查完成 ===
总样本数              : 16359
应检查图像总数        : 132495
已存在图像总数        : 132495
缺失图像总数量        : 0
缺失图像样本数        : 0

任务结束 ✅





In [2]:
import json
import os
from tqdm import tqdm

# === Configuration === #
JSONL_PATH = "/home/runsheng/personal_3/qiancx/ProgressLM/data/h5_agilex_3rgb_converted.jsonl"
IMAGE_ROOT = "/home/runsheng/personal_3/qiancx/Sources/datasets/robomind/images"
# Destination of the filtered JSONL.
# NOTE(review): the file name contains two full-width dashes ("——") before
# "filtered"; this looks like an input-method artifact for "_" — confirm.
OUTPUT_JSONL = "/home/runsheng/personal_3/qiancx/Sources/datasets/robomind/progrsslm/annotation/h5_agilex_3rgb——filtered.jsonl"
SAVE_MISSING = False
MISSING_OUTPUT = "missing_images.txt"

# === Counters === #
total_samples = kept_samples = removed_samples = 0

# Paths of missing images (optional; filled only when SAVE_MISSING is on).
missing_details = list()

def iter_images_in_sample(sample):
    """Generate the image names of a sample that need to be checked.

    Looks at "visual_demo" then "stage_to_estimate". A plain string is wrapped
    into a one-item list, any other non-list value is skipped, and each entry
    is stripped of surrounding whitespace; empty results are not yielded.
    """
    for field in ("visual_demo", "stage_to_estimate"):
        raw = sample.get(field, [])
        if isinstance(raw, str):
            raw = [raw]
        elif not isinstance(raw, list):
            continue
        # filter(None, ...) drops names that are empty after stripping.
        yield from filter(None, ((item or "").strip() for item in raw))

# === Read the input JSONL === #
# The input is read in full BEFORE the output is opened for writing, so a bad
# input path raises FileNotFoundError without truncating an existing output.
# JSON text is UTF-8; do not depend on the locale's default encoding.
with open(JSONL_PATH, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Non-blank lines that are not valid JSON. They are excluded from
# total_samples but reported below so they do not vanish silently.
malformed_lines = 0

# === Write the output JSONL === #
# The context manager guarantees the file is flushed and closed even if the
# loop raises part-way. An explicit UTF-8 encoding is required because the
# records are dumped with ensure_ascii=False.
with open(OUTPUT_JSONL, "w", encoding="utf-8") as f_out:
    for line in tqdm(lines, desc="Processing samples", unit="sample"):
        line = line.strip()
        if not line:
            continue

        try:
            sample = json.loads(line)
        except json.JSONDecodeError:
            malformed_lines += 1
            continue

        total_samples += 1
        sample_id = (sample.get("id") or "").strip()
        if not sample_id:
            # Without an id no image path can be built: drop the sample.
            removed_samples += 1
            continue

        # Does this sample reference any image that is not on disk?
        missing_this_sample = False
        for img_name in iter_images_in_sample(sample):
            full_path = os.path.join(IMAGE_ROOT, sample_id, img_name)
            if not os.path.exists(full_path):
                missing_this_sample = True
                if SAVE_MISSING:
                    missing_details.append(full_path)
                # One missing image is enough to drop the sample; stop early.
                # (So at most one path per sample is recorded.)
                break

        # Any missing image => the whole sample is discarded.
        if missing_this_sample:
            removed_samples += 1
        else:
            kept_samples += 1
            f_out.write(json.dumps(sample, ensure_ascii=False) + "\n")

# === Report statistics === #
print("\n=== 处理完成 ===")
print(f"总样本数           : {total_samples}")
print(f"保留样本数         : {kept_samples}")
print(f"删除样本数         : {removed_samples}")
if malformed_lines:
    print(f"解析失败行数       : {malformed_lines}")
print(f"输出文件           : {OUTPUT_JSONL}")

if SAVE_MISSING and missing_details:
    with open(MISSING_OUTPUT, "w", encoding="utf-8") as f:
        f.write("\n".join(missing_details))
    print(f"缺失图片详情已保存 : {MISSING_OUTPUT}")

print("\n任务结束 ✅")


FileNotFoundError: [Errno 2] No such file or directory: '/home/runsheng/personal_3/qiancx/ProgressLM/data/h5_agilex_3rgb_converted.jsonl'