In [5]:
import json
import os
from pathlib import Path

def check_image_exists(jsonl_file, image_root, show_missing=False):
    """
    Check whether the image referenced by each record of a JSONL file exists.

    Every non-blank line is parsed as JSON. For a record whose 'id' and
    'stage_to_estimate' values are non-empty strings, the expected location is
    ``image_root / id / stage_to_estimate``; the record is classified as
    existing or missing depending on whether that path exists on disk.
    A summary of the counts is printed at the end.

    Blank lines are skipped silently. Lines that fail to parse, that are not
    JSON objects, or whose two fields are absent/empty/non-string are reported
    via ``print`` and counted in neither list, so one bad record never aborts
    the whole scan.

    Args:
        jsonl_file: Path to the JSONL file to scan (read as UTF-8).
        image_root: Root directory under which the images are looked up.
        show_missing: When True, additionally print one entry per missing
            image (or a confirmation message when nothing is missing).
            Defaults to False, which prints the summary only.

    Returns:
        A tuple ``(existing_images, missing_images)``. Both are lists of dicts
        with the keys 'line' (1-based line number), 'id' and 'image'; entries
        of the first list also carry 'full_path', entries of the second
        'expected_path'.
    """
    missing_images = []
    existing_images = []
    # Built once: the root does not change between records.
    root = Path(image_root)

    with open(jsonl_file, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            text = line.strip()
            if not text:
                # Blank separator lines are not records; do not report them
                # as parse failures.
                continue

            # Keep the try narrow so only genuine parse errors are caught here.
            try:
                data = json.loads(text)
            except json.JSONDecodeError as e:
                print(f"错误: 第 {line_num} 行JSON解析失败: {e}")
                continue

            # Valid JSON that is not an object (list, number, string, ...)
            # has no .get(); report it instead of raising AttributeError.
            if not isinstance(data, dict):
                print(f"警告: 第 {line_num} 行不是JSON对象")
                continue

            data_id = data.get('id', '')
            stage_to_estimate = data.get('stage_to_estimate', '')

            if not data_id or not stage_to_estimate:
                print(f"警告: 第 {line_num} 行缺少 'id' 或 'stage_to_estimate' 字段")
                continue

            # Path joining only accepts str/PathLike; a numeric id would
            # otherwise raise TypeError and abort the scan.
            if not isinstance(data_id, str) or not isinstance(stage_to_estimate, str):
                print(f"警告: 第 {line_num} 行的 'id' 或 'stage_to_estimate' 不是字符串")
                continue

            # Build the full image path.
            image_path = root / data_id / stage_to_estimate

            # NOTE(review): exists() is also True for directories; switch to
            # is_file() if 'stage_to_estimate' must always name a regular file.
            if image_path.exists():
                existing_images.append({
                    'line': line_num,
                    'id': data_id,
                    'image': stage_to_estimate,
                    'full_path': str(image_path)
                })
            else:
                missing_images.append({
                    'line': line_num,
                    'id': data_id,
                    'image': stage_to_estimate,
                    'expected_path': str(image_path)
                })

    # Summary report.
    print(f"\n{'='*60}")
    print("检查结果统计")
    print(f"{'='*60}")
    print(f"总计检查: {len(existing_images) + len(missing_images)} 个条目")
    print(f"存在的图像: {len(existing_images)} 个")
    print(f"缺失的图像: {len(missing_images)} 个")

    # Optional per-entry listing of everything that was not found.
    if show_missing:
        if missing_images:
            print(f"\n{'='*60}")
            print("缺失的图像列表:")
            print(f"{'='*60}")
            for item in missing_images:
                print(f"\n第 {item['line']} 行:")
                print(f"  ID: {item['id']}")
                print(f"  图像: {item['image']}")
                print(f"  期望路径: {item['expected_path']}")
        else:
            print("\n✓ 所有图像都存在！")

    return existing_images, missing_images


# 使用示例
if __name__ == "__main__":
    # Inputs for this run -- edit both paths to match your environment.
    jsonl_file = "/projects/b1222/userdata/jianshu/chengxuan/ProgressLM/data/raw/text_demo/h5_franka_3rgb_text_all.jsonl"  # JSONL file to scan
    image_root = "/projects/b1222/userdata/jianshu/chengxuan/ProgressLM/data/images"  # root directory of the images

    # Run the check only when both inputs are present; otherwise report the
    # first one that is missing (the JSONL file takes precedence).
    if os.path.exists(jsonl_file) and os.path.exists(image_root):
        existing, missing = check_image_exists(jsonl_file, image_root)
    elif not os.path.exists(jsonl_file):
        print(f"错误: JSONL文件不存在: {jsonl_file}")
    else:
        print(f"错误: 图像根目录不存在: {image_root}")
        


检查结果统计
总计检查: 14968 个条目
存在的图像: 13947 个
缺失的图像: 1021 个


In [6]:
# import json
# import os
# from pathlib import Path
# from collections import defaultdict

# def filter_jsonl_by_missing_images(jsonl_file, image_root, output_file=None):
#     """
#     检查JSONL文件中的图像，删除所有包含缺失图像的id的样本
    
#     Args:
#         jsonl_file: 输入JSONL文件路径
#         image_root: 图像根目录路径
#         output_file: 输出JSONL文件路径（默认为原文件名_filtered.jsonl）
#     """
#     if output_file is None:
#         base_name = os.path.splitext(jsonl_file)[0]
#         output_file = f"{base_name}_filtered.jsonl"
    
#     # 第一步：读取所有数据并按id分组
#     all_data = []
#     id_to_samples = defaultdict(list)
    
#     print("正在读取JSONL文件...")
#     with open(jsonl_file, 'r', encoding='utf-8') as f:
#         for line_num, line in enumerate(f, 1):
#             try:
#                 data = json.loads(line.strip())
#                 all_data.append(data)
#                 data_id = data.get('id', '')
#                 if data_id:
#                     id_to_samples[data_id].append(line_num)
#             except json.JSONDecodeError as e:
#                 print(f"警告: 第 {line_num} 行JSON解析失败: {e}")
    
#     print(f"总计读取: {len(all_data)} 个样本")
#     print(f"涉及的唯一ID数量: {len(id_to_samples)}")
    
#     # 第二步：检查每个样本的图像是否存在，并标记有问题的id
#     invalid_ids = set()
#     missing_details = []
    
#     print("\n正在检查图像文件...")
#     for idx, data in enumerate(all_data, 1):
#         data_id = data.get('id', '')
#         stage_to_estimate = data.get('stage_to_estimate', '')
        
#         if not data_id or not stage_to_estimate:
#             print(f"警告: 样本 {idx} 缺少必要字段")
#             invalid_ids.add(data_id)
#             continue
        
#         # 构建完整的图像路径
#         image_path = Path(image_root) / data_id / stage_to_estimate
        
#         if not image_path.exists():
#             invalid_ids.add(data_id)
#             missing_details.append({
#                 'sample_idx': idx,
#                 'id': data_id,
#                 'image': stage_to_estimate,
#                 'expected_path': str(image_path)
#             })
    
#     # 第三步：过滤数据，移除所有无效id的样本
#     filtered_data = [data for data in all_data if data.get('id', '') not in invalid_ids]
    
#     # 第四步：写入新文件
#     print(f"\n正在写入过滤后的数据到: {output_file}")
#     with open(output_file, 'w', encoding='utf-8') as f:
#         for data in filtered_data:
#             f.write(json.dumps(data, ensure_ascii=False) + '\n')
    
#     # 输出统计结果
#     print(f"\n{'='*70}")
#     print(f"处理结果")
#     print(f"{'='*70}")
#     print(f"原始样本总数: {len(all_data)}")
#     print(f"有缺失图像的ID数量: {len(invalid_ids)}")
#     print(f"被删除的样本总数: {len(all_data) - len(filtered_data)}")
#     print(f"保留的样本总数: {len(filtered_data)}")
#     print(f"保留率: {len(filtered_data)/len(all_data)*100:.2f}%")
    
#     if invalid_ids:
#         print(f"\n{'='*70}")
#         print(f"被删除的ID列表 (共 {len(invalid_ids)} 个):")
#         print(f"{'='*70}")
#         for invalid_id in sorted(invalid_ids):
#             sample_count = len(id_to_samples[invalid_id])
#             print(f"  - {invalid_id} (删除了 {sample_count} 个样本)")
        
#         print(f"\n{'='*70}")
#         print(f"缺失图像详情 (共 {len(missing_details)} 处):")
#         print(f"{'='*70}")
#         for detail in missing_details[:10]:  # 只显示前10个
#             print(f"\n样本 {detail['sample_idx']}:")
#             print(f"  ID: {detail['id']}")
#             print(f"  图像: {detail['image']}")
#             print(f"  期望路径: {detail['expected_path']}")
        
#         if len(missing_details) > 10:
#             print(f"\n... 还有 {len(missing_details) - 10} 处缺失未显示")
        
#         # 保存详细报告
#         report_file = f"{os.path.splitext(output_file)[0]}_report.json"
#         with open(report_file, 'w', encoding='utf-8') as f:
#             report = {
#                 'invalid_ids': sorted(list(invalid_ids)),
#                 'missing_details': missing_details,
#                 'statistics': {
#                     'original_samples': len(all_data),
#                     'filtered_samples': len(filtered_data),
#                     'deleted_samples': len(all_data) - len(filtered_data),
#                     'invalid_ids_count': len(invalid_ids)
#                 }
#             }
#             json.dump(report, f, indent=2, ensure_ascii=False)
#         print(f"\n详细报告已保存到: {report_file}")
#     else:
#         print("\n✓ 所有图像都存在，无需过滤！")
    
#     print(f"\n过滤后的文件已保存到: {output_file}")
#     return filtered_data, invalid_ids


# # 使用示例
# if __name__ == "__main__":
#     # 请修改这些路径
#     jsonl_file = "/projects/b1222/userdata/jianshu/chengxuan/ProgressLM/data/train/text_demo/text_h5_tienkung_xsens_rl.jsonl"  # 输入JSONL文件
#     image_root = "/projects/b1222/userdata/jianshu/chengxuan/ProgressLM/data/images"  # 图像根目录
#     output_file = "/projects/b1222/userdata/jianshu/chengxuan/ProgressLM/data/train/text_demo/text_h5_tienkung_xsens_rll.jsonl"  # 输出文件（可选）
    
#     if not os.path.exists(jsonl_file):
#         print(f"错误: JSONL文件不存在: {jsonl_file}")
#     elif not os.path.exists(image_root):
#         print(f"错误: 图像根目录不存在: {image_root}")
#     else:
#         filtered_data, invalid_ids = filter_jsonl_by_missing_images(
#             jsonl_file, 
#             image_root, 
#             output_file
#         )

In [None]:
# import json
# import os
# from pathlib import Path
# from collections import defaultdict

# def filter_jsonl_by_missing_images(jsonl_file, image_root, output_file=None):
#     """
#     检查JSONL文件中的图像，删除所有包含缺失图像的id的样本
    
#     Args:
#         jsonl_file: 输入JSONL文件路径
#         image_root: 图像根目录路径
#         output_file: 输出JSONL文件路径（默认为原文件名_filtered.jsonl）
#     """
#     if output_file is None:
#         base_name = os.path.splitext(jsonl_file)[0]
#         output_file = f"{base_name}_filtered.jsonl"
    
#     # 第一步：读取所有数据并按id分组
#     all_data = []
#     id_to_samples = defaultdict(list)
    
#     print("正在读取JSONL文件...")
#     with open(jsonl_file, 'r', encoding='utf-8') as f:
#         for line_num, line in enumerate(f, 1):
#             try:
#                 data = json.loads(line.strip())
#                 all_data.append(data)
#                 data_id = data.get('id', '')
#                 if data_id:
#                     id_to_samples[data_id].append(line_num)
#             except json.JSONDecodeError as e:
#                 print(f"警告: 第 {line_num} 行JSON解析失败: {e}")
    
#     print(f"总计读取: {len(all_data)} 个样本")
#     print(f"涉及的唯一ID数量: {len(id_to_samples)}")
    
#     # 第二步：检查每个样本的图像是否存在，并标记有问题的id
#     invalid_ids = set()
#     missing_details = []
    
#     print("\n正在检查图像文件...")
#     for idx, data in enumerate(all_data, 1):
#         data_id = data.get('id', '')
#         stage_to_estimate = data.get('stage_to_estimate', '')
        
#         if not data_id or not stage_to_estimate:
#             print(f"警告: 样本 {idx} 缺少必要字段")
#             invalid_ids.add(data_id)
#             continue
        
#         # 构建完整的图像路径
#         image_path = Path(image_root) / data_id / stage_to_estimate
        
#         if not image_path.exists():
#             invalid_ids.add(data_id)
#             missing_details.append({
#                 'sample_idx': idx,
#                 'id': data_id,
#                 'image': stage_to_estimate,
#                 'expected_path': str(image_path)
#             })
    
#     # 第三步：过滤数据，移除所有无效id的样本
#     filtered_data = [data for data in all_data if data.get('id', '') not in invalid_ids]
    
#     # 第四步：写入新文件
#     print(f"\n正在写入过滤后的数据到: {output_file}")
#     with open(output_file, 'w', encoding='utf-8') as f:
#         for data in filtered_data:
#             f.write(json.dumps(data, ensure_ascii=False) + '\n')
    
#     # 输出统计结果
#     print(f"\n{'='*70}")
#     print(f"处理结果")
#     print(f"{'='*70}")
#     print(f"原始样本总数: {len(all_data)}")
#     print(f"有缺失图像的ID数量: {len(invalid_ids)}")
#     print(f"被删除的样本总数: {len(all_data) - len(filtered_data)}")
#     print(f"保留的样本总数: {len(filtered_data)}")
#     print(f"保留率: {len(filtered_data)/len(all_data)*100:.2f}%")
    
#     if invalid_ids:
#         print(f"\n{'='*70}")
#         print(f"被删除的ID列表 (共 {len(invalid_ids)} 个):")
#         print(f"{'='*70}")
#         for invalid_id in sorted(invalid_ids):
#             sample_count = len(id_to_samples[invalid_id])
#             print(f"  - {invalid_id} (删除了 {sample_count} 个样本)")
        
#         print(f"\n{'='*70}")
#         print(f"缺失图像详情 (共 {len(missing_details)} 处):")
#         print(f"{'='*70}")
#         for detail in missing_details[:10]:  # 只显示前10个
#             print(f"\n样本 {detail['sample_idx']}:")
#             print(f"  ID: {detail['id']}")
#             print(f"  图像: {detail['image']}")
#             print(f"  期望路径: {detail['expected_path']}")
        
#         if len(missing_details) > 10:
#             print(f"\n... 还有 {len(missing_details) - 10} 处缺失未显示")
        
#         # 保存详细报告
#         report_file = f"{os.path.splitext(output_file)[0]}_report.json"
#         with open(report_file, 'w', encoding='utf-8') as f:
#             report = {
#                 'invalid_ids': sorted(list(invalid_ids)),
#                 'missing_details': missing_details,
#                 'statistics': {
#                     'original_samples': len(all_data),
#                     'filtered_samples': len(filtered_data),
#                     'deleted_samples': len(all_data) - len(filtered_data),
#                     'invalid_ids_count': len(invalid_ids)
#                 }
#             }
#             json.dump(report, f, indent=2, ensure_ascii=False)
#         print(f"\n详细报告已保存到: {report_file}")
#     else:
#         print("\n✓ 所有图像都存在，无需过滤！")
    
#     print(f"\n过滤后的文件已保存到: {output_file}")
#     return filtered_data, invalid_ids


# # 使用示例
# if __name__ == "__main__":
#     # 请修改这些路径
#     jsonl_file = "/projects/b1222/userdata/jianshu/chengxuan/ProgressLM/data/raw/text_demo/h5_franka_3rgb_text_al.jsonl"  # 输入JSONL文件
#     image_root = "/projects/b1222/userdata/jianshu/chengxuan/ProgressLM/data/images"  # 图像根目录
#     output_file = "/projects/b1222/userdata/jianshu/chengxuan/ProgressLM/data/raw/text_demo/h5_franka_3rgb_text_all.jsonl"  # 输出文件（可选）
    
#     if not os.path.exists(jsonl_file):
#         print(f"错误: JSONL文件不存在: {jsonl_file}")
#     elif not os.path.exists(image_root):
#         print(f"错误: 图像根目录不存在: {image_root}")
#     else:
#         filtered_data, invalid_ids = filter_jsonl_by_missing_images(
#             jsonl_file, 
#             image_root, 
#             output_file
#         )

正在读取JSONL文件...
总计读取: 14968 个样本
涉及的唯一ID数量: 3242

正在检查图像文件...

正在写入过滤后的数据到: /projects/b1222/userdata/jianshu/chengxuan/ProgressLM/data/raw/text_demo/h5_franka_3rgb_text_all.jsonl

处理结果
原始样本总数: 14968
有缺失图像的ID数量: 214
被删除的样本总数: 1021
保留的样本总数: 13947
保留率: 93.18%

被删除的ID列表 (共 214 个):
  - h5_franka_3rgb/2024_09_20_close_cabinet/0920_142729 (删除了 4 个样本)
  - h5_franka_3rgb/2024_09_20_close_cabinet/0920_143309 (删除了 4 个样本)
  - h5_franka_3rgb/2024_09_20_close_cabinet/0920_143936 (删除了 3 个样本)
  - h5_franka_3rgb/2024_09_20_close_cabinet/0920_144321 (删除了 3 个样本)
  - h5_franka_3rgb/2024_09_20_close_cabinet/0920_144833 (删除了 4 个样本)
  - h5_franka_3rgb/2024_09_20_close_cabinet/0920_145055 (删除了 4 个样本)
  - h5_franka_3rgb/2024_09_20_close_cabinet/0920_145112 (删除了 4 个样本)
  - h5_franka_3rgb/2024_09_20_close_cabinet/0920_145703 (删除了 3 个样本)
  - h5_franka_3rgb/2024_09_20_close_cabinet/0920_145907 (删除了 6 个样本)
  - h5_franka_3rgb/2024_09_20_close_cabinet/0920_150359 (删除了 3 个样本)
  - h5_franka_3rgb/2024_09_20_close_chest/09

In [9]:
# import json
# import os

# def filter_failed_status(jsonl_file, output_file=None):
#     """
#     删除JSONL文件中所有status为"failed"的样本
    
#     Args:
#         jsonl_file: 输入JSONL文件路径
#         output_file: 输出JSONL文件路径（默认为原文件名_no_failed.jsonl）
#     """
#     if output_file is None:
#         base_name = os.path.splitext(jsonl_file)[0]
#         output_file = f"{base_name}_no_failed.jsonl"
    
#     success_samples = []
#     failed_samples = []
#     total_count = 0
    
#     print("正在读取并过滤JSONL文件...")
    
#     with open(jsonl_file, 'r', encoding='utf-8') as f:
#         for line_num, line in enumerate(f, 1):
#             total_count += 1
#             try:
#                 data = json.loads(line.strip())
                
#                 # 检查status字段（可能在meta_data中）
#                 status = None
#                 if 'status' in data:
#                     status = data['status']
#                 elif 'meta_data' in data and 'status' in data['meta_data']:
#                     status = data['meta_data']['status']
                
#                 # 如果status不是"failed"，则保留
#                 if status != "failed":
#                     success_samples.append(data)
#                 else:
#                     failed_samples.append({
#                         'line': line_num,
#                         'id': data.get('meta_data', {}).get('id', 'N/A') if 'meta_data' in data else data.get('id', 'N/A'),
#                         'status': status
#                     })
                    
#             except json.JSONDecodeError as e:
#                 print(f"警告: 第 {line_num} 行JSON解析失败: {e}")
    
#     # 写入过滤后的数据
#     print(f"\n正在写入过滤后的数据到: {output_file}")
#     with open(output_file, 'w', encoding='utf-8') as f:
#         for data in success_samples:
#             f.write(json.dumps(data, ensure_ascii=False) + '\n')
    
#     # 输出统计结果
#     print(f"\n{'='*70}")
#     print(f"处理结果")
#     print(f"{'='*70}")
#     print(f"原始样本总数: {total_count}")
#     print(f"status='failed'的样本数: {len(failed_samples)}")
#     print(f"保留的样本数: {len(success_samples)}")
#     print(f"删除率: {len(failed_samples)/total_count*100:.2f}%")
#     print(f"保留率: {len(success_samples)/total_count*100:.2f}%")
    
#     if failed_samples:
#         print(f"\n{'='*70}")
#         print(f"被删除的样本详情 (前20个):")
#         print(f"{'='*70}")
#         for sample in failed_samples[:20]:
#             print(f"第 {sample['line']} 行 - ID: {sample['id']}")
        
#         if len(failed_samples) > 20:
#             print(f"... 还有 {len(failed_samples) - 20} 个未显示")
        
#         # 保存删除详情
#         report_file = f"{os.path.splitext(output_file)[0]}_removed_failed.json"
#         with open(report_file, 'w', encoding='utf-8') as f:
#             report = {
#                 'removed_samples': failed_samples,
#                 'statistics': {
#                     'total_samples': total_count,
#                     'failed_samples': len(failed_samples),
#                     'kept_samples': len(success_samples)
#                 }
#             }
#             json.dump(report, f, indent=2, ensure_ascii=False)
#         print(f"\n删除详情已保存到: {report_file}")
#     else:
#         print("\n✓ 没有发现status='failed'的样本！")
    
#     print(f"\n过滤后的文件已保存到: {output_file}")
#     return success_samples, failed_samples


# # 使用示例
# if __name__ == "__main__":
#     # 请修改这个路径
#     jsonl_file = "/projects/b1222/userdata/jianshu/chengxuan/saved/saved_results/progresslm/text_think/tienkung/text_demo_results_20251020_132406.jsonl"  # 输入JSONL文件
#     output_file = "/projects/b1222/userdata/jianshu/chengxuan/saved/saved_results/progresslm/text_think/tienkung/tienkung_text_cold.jsonl"  # 输出文件（可选）
    
#     if not os.path.exists(jsonl_file):
#         print(f"错误: JSONL文件不存在: {jsonl_file}")
#     else:
#         success_samples, failed_samples = filter_failed_status(jsonl_file, output_file)