# 从phase1的数据中获取错误正确步骤对

In [1]:
import json

def _extract_step_pair(step):
    """Return ``(wrong_steps, correct_step)`` for one labelled step, or ``None``.

    The correct step is the text of the completion indexed by
    ``chosen_completion`` when that field is set, otherwise the text of
    ``human_completion`` when one is present.  Wrong steps are only collected
    in the second case (no chosen completion, but a human-written one): they
    are the texts of all completions rated ``-1``.  ``wrong_steps`` is ``None``
    when no wrong step was collected.  ``None`` is returned when the step has
    no usable correct text.
    """
    chosen_idx = step.get('chosen_completion')
    human = step.get('human_completion')
    # A missing or null ``completions`` entry is treated as "no completions"
    # instead of raising.
    completions = step.get('completions') or []

    wrong_steps = []
    correct_step = None
    if chosen_idx is not None:
        if completions and len(completions) > chosen_idx:
            correct_step = completions[chosen_idx]['text']
    elif human is not None:
        wrong_steps = [c['text'] for c in completions if c.get('rating') == -1]
        correct_step = human['text']

    if correct_step is None:
        return None
    return (wrong_steps or None, correct_step)


def process_jsonl(filename):
    """Extract (wrong steps, correct step) pairs from a JSONL label file.

    Each non-blank line of *filename* is parsed as one JSON record.  A record
    with ``question.problem`` starts a new problem; a record with
    ``label.steps`` contributes one ``(wrong_steps, correct_step)`` tuple per
    step that has a correct text (see ``_extract_step_pair``).

    Args:
        filename: Path of the JSONL file to read (decoded as UTF-8).

    Returns:
        A dict mapping problem text to its list of
        ``(wrong_steps or None, correct_step)`` tuples.  Only problems with at
        least one step that has wrong steps are included.  If the same problem
        text occurs more than once, the last qualifying record wins.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    results = {}
    current_problem = None
    current_steps = []

    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) are not records.
            if not line.strip():
                continue
            data = json.loads(line)

            # A record carrying a question starts a fresh problem.
            # NOTE: a record without one keeps appending to the previous
            # problem's step list, as in the original implementation.
            if 'question' in data and 'problem' in data['question']:
                current_problem = data['question']['problem']
                current_steps = []

            if 'label' in data and 'steps' in data['label']:
                for step in data['label']['steps']:
                    pair = _extract_step_pair(step)
                    if pair is not None:
                        current_steps.append(pair)

                # Only keep problems that contain at least one wrong step.
                if current_problem and current_steps and any(
                    wrong for wrong, _ in current_steps
                ):
                    results[current_problem] = current_steps

    return results

def output_results(results, save_path):
    """Write extracted step pairs to *save_path*, one JSON object per line.

    Each entry of *results* becomes ``{"problem": ..., "steps": ...}``.
    Non-ASCII characters are written verbatim (``ensure_ascii=False``), so the
    file is always encoded as UTF-8 rather than the platform default, which
    may be unable to represent them.

    Args:
        results: Mapping of problem text to its list of steps, as returned by
            ``process_jsonl``.
        save_path: Destination file path; overwritten if it exists.

    Lines are joined with ``"\\n"`` and no trailing newline is written.
    """
    # Serialize everything before opening the file, so a serialization error
    # does not leave a truncated output file behind.
    serialized = [
        json.dumps({"problem": problem, "steps": steps}, ensure_ascii=False)
        for problem, steps in results.items()
    ]

    with open(save_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(serialized))
# Input directory holding the raw phase-1 JSONL files, and output directory
# for the extracted wrong/correct step pairs.
read_dir = '/zhuangkai/openo1/dataset/prm800k/data'
save_dir = '/zhuangkai/openo1/outputs/verifier'
# Example usage: convert both phase-1 files, keeping the same file names.
for stage in ('train', 'test'):
    read_path = f'{read_dir}/phase1_{stage}.jsonl'
    save_path = f'{save_dir}/phase1_{stage}.jsonl'
    results = process_jsonl(filename=read_path)
    output_results(results=results, save_path=save_path)

In [1]:
import random

def split_file(input_path, test_path, validation_path, *, seed=None):
    """Shuffle the lines of a file and split them into two halves.

    The first ``len(lines) // 2`` shuffled lines go to *test_path*, the rest
    to *validation_path*.  Blank lines are ignored, and every record is
    written with its own trailing newline.  The latter matters when the input
    does not end with a newline: its last line would otherwise be glued onto
    the following record once the shuffle moves it away from the end.

    Args:
        input_path: File to read, one record per line (UTF-8).
        test_path: Destination for the first half; overwritten if it exists.
        validation_path: Destination for the second half; overwritten if it
            exists.
        seed: Optional seed for a reproducible shuffle.  When ``None``
            (default) the module-level ``random`` generator is used.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        # Drop the line terminator so all records are uniform, and skip
        # blank lines so they are not counted as records.
        records = [line.rstrip('\n') for line in f if line.strip()]

    if seed is None:
        random.shuffle(records)
    else:
        # Private generator: leaves the global random state untouched.
        random.Random(seed).shuffle(records)

    split_index = len(records) // 2

    with open(test_path, 'w', encoding='utf-8') as f:
        f.writelines(record + '\n' for record in records[:split_index])

    with open(validation_path, 'w', encoding='utf-8') as f:
        f.writelines(record + '\n' for record in records[split_index:])

# Split the extracted phase-1 test file into a test half and a validation half.
save_dir = '/zhuangkai/openo1/outputs/verifier'
test_file_path = f'{save_dir}/phase1_test.jsonl'
test_output_path = f'{save_dir}/phase1_test_split.jsonl'
validation_output_path = f'{save_dir}/phase1_validation.jsonl'

split_file(
    input_path=test_file_path,
    test_path=test_output_path,
    validation_path=validation_output_path,
)

# 做进一步处理，steps只保留到最后一个wrong_step和correct_step都有的