# In [6]:
import json
from collections import defaultdict

def process_jsonl(input_file, output_file):
    """
    Regroup a JSONL file by ``source_id`` and write one record per step.

    Every non-blank input line is parsed as a JSON object that must provide
    the keys ``source_id``, ``step``, ``task_goal``, ``data_source``,
    ``action`` and ``end_img``. Records sharing a ``source_id`` are sorted by
    their integer ``step``; for each of them one output line is written that
    carries the group's full action list (``text_demo``), the number of
    records in the group (``total_steps``), the file name part of ``end_img``
    (``stage_to_estimate``), the step itself (``closest_idx``) and the
    progress through the group as a whole-number percentage
    (``progress_score``, e.g. ``"29%"``).

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to write (UTF-8, overwritten).

    Raises:
        KeyError: If a record lacks one of the required keys.
        ValueError: If a ``step`` value cannot be converted to ``int``
            (``json.JSONDecodeError`` for a malformed line is a subclass).
    """
    # Read everything up front and bucket the records by source_id.
    grouped_data = defaultdict(list)

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in f:
            if line.strip():
                data = json.loads(line)
                grouped_data[data['source_id']].append(data)

    results = []

    for source_id, items in grouped_data.items():
        # Order the group's records by their numeric step.
        items.sort(key=lambda x: int(x['step']))

        # total_steps is the number of records in the group, not the largest
        # step value; the shared fields come from the first sorted record.
        total_steps = len(items)
        task_goal = items[0]['task_goal']
        data_source = items[0]['data_source']

        # The same ordered action list is attached to every record of the group.
        text_demo = [item['action'] for item in items]

        for item in items:
            step = int(item['step'])

            # Keep only what follows the last slash of end_img.
            stage_to_estimate = item['end_img'].rsplit('/', 1)[-1]

            # Whole-number percentage computed with integer arithmetic only.
            # Going through float (step / total_steps * 100) under-reports
            # some exact values, e.g. 29 of 100 became 28.999... -> "28%".
            # For non-negative steps floor division equals truncation.
            progress_score = f"{step * 100 // total_steps}%"

            results.append({
                "id": f"{data_source}/{source_id}",
                "task_goal": task_goal,
                "text_demo": text_demo,
                "total_steps": str(total_steps),
                "stage_to_estimate": stage_to_estimate,
                "closest_idx": str(step),
                "progress_score": progress_score,
                "data_source": data_source,
            })

    # Write one JSON object per line, keeping non-ASCII text readable.
    with open(output_file, 'w', encoding='utf-8') as f:
        for record in results:
            f.write(json.dumps(record, ensure_ascii=False) + '\n')

    print(f"处理完成！共生成 {len(results)} 条记录")
    print(f"涉及 {len(grouped_data)} 个不同的source_id")


# Example usage
if __name__ == "__main__":
    # Converts the CrossTask annotations. For the COIN dataset, use
    # .../worldvlm/annotations/coin.jsonl as input and
    # .../robomind/codes/text_demo/coin_text.jsonl as output instead.
    annotations_path = '/home/vcj9002/jianshu/workspace/data/worldvlm/annotations/crosstask.jsonl'
    converted_path = '/home/vcj9002/jianshu/workspace/data/robomind/codes/text_demo/crosstask_text.jsonl'
    process_jsonl(annotations_path, converted_path)

# Recorded output of the run above:
# 处理完成！共生成 12114 条记录
# 涉及 1593 个不同的source_id
