In [5]:
import json

def convert_dataset(input_file, output_file) -> None:
    """
    Convert a raw annotation JSON file into per-step JSONL records.

    The input is expected to be a JSON array. Each element must provide an
    ``id`` string and a ``response`` mapping holding ``task_summary`` and a
    list of ``steps``; every step must provide ``step_description`` and
    ``end_frame``. One output record is produced for every step, and the
    records are written as one JSON object per line.

    Args:
        input_file: Path to the input JSON file.
        output_file: Path to the output JSONL file (opened in write mode,
            so existing content is replaced).

    Raises:
        KeyError: If an item, response, or step lacks one of the keys
            listed above.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    converted_data = []

    for item in data:
        original_id = item['id']
        response = item['response']

        # 1. Normalise the id: collapse "/success_episodes/train/" to "/"
        #    and drop "/data".
        # NOTE(review): str.replace removes every "/data" occurrence,
        # including the start of a longer segment such as "/dataset" —
        # confirm that ids never contain one.
        new_id = original_id.replace('/success_episodes/train/', '/')
        new_id = new_id.replace('/data', '')

        # 2. The task goal is the response's summary.
        task_goal = response['task_summary']

        # 3. text_demo lists every step description, in order.
        steps = response['steps']
        text_demo = [step['step_description'] for step in steps]

        # 4. Number of steps in this episode.
        total_steps = len(steps)

        # 5. data_source is whatever precedes the first "/" of the new id.
        data_source = new_id.split('/')[0]

        # 6. Emit one record per step; closest_idx is the 1-based step index.
        for closest_idx, step in enumerate(steps, start=1):
            # Percentage of steps completed after this one, floored.
            # Integer arithmetic keeps exact percentages exact: the float
            # form int(29 / 50 * 100) evaluates to 57 instead of 58.
            progress_score = f"{closest_idx * 100 // total_steps}%"

            converted_data.append({
                "id": new_id,
                "task_goal": task_goal,
                "text_demo": text_demo,
                "total_steps": total_steps,
                # The frame to estimate is the step's end frame.
                "stage_to_estimate": step['end_frame'],
                "closest_idx": closest_idx,
                "progress_score": progress_score,
                "data_source": data_source,
            })

    # Write JSONL only after every item converted successfully, so a
    # malformed input does not leave a truncated output file behind.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in converted_data
        )

    print(f"转换完成！共生成 {len(converted_data)} 条记录")
    print(f"输出文件：{output_file}")


# Usage examples. Alternative input/output path pairs for other datasets
# (h5_tienkung_xsens, h5_ur_1rgb) are kept commented out below.
# if __name__ == "__main__":
#     input_file = "/home/vcj9002/jianshu/workspace/data/robomind/data/annotations/language_description_annotation_json/h5_tienkung_xsens.json"
#     output_file = "/home/vcj9002/jianshu/workspace/data/COCO/annotations/raw/text/h5_tienkung_xsens_text_all.jsonl"
    
# if __name__ == "__main__":
#     input_file = "/home/vcj9002/jianshu/workspace/data/robomind/data/annotations/language_description_annotation_json/h5_ur_1rgb.json"
#     output_file = "/home/vcj9002/jianshu/workspace/data/COCO/annotations/raw/text/h5_ur_1rgb_text_all.jsonl"

# Active run: convert the h5_franka_3rgb annotation file to JSONL.
if __name__ == "__main__":
    input_file = "/home/vcj9002/jianshu/workspace/data/robomind/data/annotations/language_description_annotation_json/h5_franka_3rgb.json"
    output_file = "/home/vcj9002/jianshu/workspace/data/COCO/annotations/raw/text/h5_franka_3rgb_text_all.jsonl"

    convert_dataset(input_file, output_file)

转换完成！共生成 15056 条记录
输出文件：/home/vcj9002/jianshu/workspace/data/COCO/annotations/raw/text/h5_franka_3rgb_text_all.jsonl


In [None]:
import json

def convert_dataset(input_file, output_file) -> None:
    """
    Convert a raw annotation JSON file into per-step JSONL records,
    skipping the final step of every episode.

    The input is expected to be a JSON array. Each element must provide an
    ``id`` string and a ``response`` mapping holding ``task_summary`` and a
    list of ``steps``; every step must provide ``step_description`` and
    ``end_frame``. One output record is produced for every step except the
    last one, and the records are written as one JSON object per line.

    Args:
        input_file: Path to the input JSON file.
        output_file: Path to the output JSONL file (opened in write mode,
            so existing content is replaced).

    Raises:
        KeyError: If an item, response, or step lacks one of the keys
            listed above.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    converted_data = []

    for item in data:
        original_id = item['id']
        response = item['response']

        # 1. Normalise the id: collapse "/success_episodes/train/" to "/"
        #    and drop "/data".
        # NOTE(review): str.replace removes every "/data" occurrence,
        # including the start of a longer segment such as "/dataset" —
        # confirm that ids never contain one.
        new_id = original_id.replace('/success_episodes/train/', '/')
        new_id = new_id.replace('/data', '')

        # 2. The task goal is the response's summary.
        task_goal = response['task_summary']

        # 3. text_demo lists every step description, in order.
        # NOTE(review): this still includes the final step even though
        # total_steps and the emitted records exclude it — confirm that
        # this asymmetry is intended.
        steps = response['steps']
        text_demo = [step['step_description'] for step in steps]

        # 4. Step count, ignoring the last step.
        total_steps = len(steps) - 1

        # 5. data_source is whatever precedes the first "/" of the new id.
        data_source = new_id.split('/')[0]

        # 6. Emit one record per step except the last; closest_idx is the
        #    1-based step index. With zero or one step the slice is empty,
        #    so the division below is never reached with total_steps <= 0.
        for closest_idx, step in enumerate(steps[:-1], start=1):
            # Percentage of counted steps completed after this one,
            # floored. Integer arithmetic keeps exact percentages exact:
            # the float form int(29 / 50 * 100) evaluates to 57, not 58.
            progress_score = f"{closest_idx * 100 // total_steps}%"

            converted_data.append({
                "id": new_id,
                "task_goal": task_goal,
                "text_demo": text_demo,
                "total_steps": total_steps,
                # The frame to estimate is the step's end frame.
                "stage_to_estimate": step['end_frame'],
                "closest_idx": closest_idx,
                "progress_score": progress_score,
                "data_source": data_source,
            })

    # Write JSONL only after every item converted successfully, so a
    # malformed input does not leave a truncated output file behind.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(record, ensure_ascii=False) + '\n'
            for record in converted_data
        )

    print(f"转换完成！共生成 {len(converted_data)} 条记录")
    print(f"输出文件：{output_file}")


# Usage example: convert the h5_agilex_3rgb annotation file to JSONL.
if __name__ == "__main__":
    input_file = "/home/vcj9002/jianshu/workspace/data/robomind/data/annotations/language_description_annotation_json/h5_agilex_3rgb.json"
    output_file = "/home/vcj9002/jianshu/workspace/data/COCO/annotations/raw/text/h5_agilex_3rgb_text.jsonl"
    
    convert_dataset(input_file, output_file)