# 首先对gt_solution每个step打上标签

In [4]:
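# Every step is labeled 1.0. Input file: outputs/sft/phase2_{stage}_gt_solution_to_step_list_capital.jsonl
# Each input line has the form:
# {
#     "problem": "question text",
#     "steps": ["step1", "step2", "step3"]
# }
# After processing, each output line has the form below. The (step, label) pairs
# are tuples in Python, but json.dumps writes them as two-element arrays:
# {
#     "problem": "question text",
#     "steps": [
#         ["step1", 1.0],
#         ["step2", 1.0],
#         ["step3", 1.0]
#     ]
# }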
import json
from tqdm import tqdm

# Input and output path templates; "{stage}" is filled in for each split below.
path = '/zhuangkai/openo1/outputs/sft/phase2_{stage}_gt_solution_to_step_list_capital.jsonl'
save_path = '/zhuangkai/openo1/outputs/rm/phase2_{stage}_gt_solution.jsonl'

for stage in ['train', 'test']:
    kept = []
    with open(path.format(stage=stage), 'r', encoding='utf-8') as src:
        for raw in tqdm(src):
            record = json.loads(raw)
            question = record['problem']
            solution_steps = record["steps"]
            # Keep a problem only when every one of its steps has at least
            # 10 characters; a single shorter step drops the whole problem.
            if all(len(text) >= 10 for text in solution_steps):
                # Ground-truth steps all receive the label 1.0.
                kept.append((question, [(text, 1.0) for text in solution_steps]))

    # Write one JSON object per line for the problems that survived the filter.
    with open(save_path.format(stage=stage), 'w', encoding='utf-8') as dst:
        for question, labeled in tqdm(kept):
            dst.write(json.dumps({"problem": question, "steps": labeled}) + "\n")
            
            


10819it [00:00, 52436.25it/s]
100%|██████████| 10791/10791 [00:00<00:00, 135538.78it/s]
458it [00:00, 120400.58it/s]
100%|██████████| 455/455 [00:00<00:00, 148560.51it/s]


# 再对pre_generated_solution每个step打上标签

In [1]:
# 文件路径为dataset/prm800k/data/phase2_{stage}.jsonl
import json
from tqdm import tqdm


def process_data(input_file, output_file):
    """Flatten step-labeled records into ``{"problem", "steps"}`` JSON lines.

    Every line of ``input_file`` is parsed as JSON. ``question.problem`` is
    copied to the output record, and for each entry of ``label.steps`` at most
    one ``(text, rating)`` pair is emitted, chosen in this order:

    1. the completion indexed by ``chosen_completion`` when that is not None;
    2. otherwise ``human_completion`` (when not None), paired with rating 1;
    3. otherwise the first entry of ``completions``, if there is one.

    Pairs whose rating is None are dropped in cases 1 and 3, so the output
    never contains an unlabeled step. A record whose steps are all dropped is
    still written, with an empty ``steps`` list.

    Args:
        input_file: Path of the JSONL file to read (UTF-8).
        output_file: Path of the JSONL file to write (UTF-8, overwritten).
    """
    result = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            data = json.loads(line)

            processed_data = {
                "problem": data["question"]["problem"],
                "steps": []
            }

            for step in data["label"]["steps"]:
                if step["chosen_completion"] is not None:
                    # A completion was selected for this step: use it.
                    candidate = step["completions"][step["chosen_completion"]]
                elif step["human_completion"] is not None:
                    # Human-written step, assumed correct (rating 1).
                    # NOTE(review): the value is written out as-is; if the
                    # source data stores it as an object rather than a string,
                    # its text field should be extracted here — confirm.
                    processed_data["steps"].append((step["human_completion"], 1))
                    continue
                elif step["completions"]:
                    # Nothing chosen and nothing human-written: fall back to
                    # the first completion.
                    candidate = step["completions"][0]
                else:
                    continue

                # Skip unrated completions in BOTH the chosen and the fallback
                # case; previously the fallback emitted (text, None) pairs.
                if candidate["rating"] is not None:
                    processed_data["steps"].append((candidate["text"], candidate["rating"]))

            result.append(processed_data)

    with open(output_file, 'w', encoding='utf-8') as f:
        for data in tqdm(result):
            f.write(json.dumps(data) + "\n")

# Example usage: convert the raw phase-2 file of every stage.
input_file = "/zhuangkai/openo1/dataset/prm800k/data/phase2_{stage}.jsonl"
output_file = "/zhuangkai/openo1/outputs/rm/phase2_{stage}_pre_generated_solution.jsonl"
for stage in ['train', 'test']:
    stage_input = input_file.format(stage=stage)
    stage_output = output_file.format(stage=stage)
    process_data(stage_input, stage_output)


0it [00:00, ?it/s]

97782it [00:03, 25756.12it/s]
100%|██████████| 97782/97782 [00:00<00:00, 109436.68it/s]
2762it [00:00, 35404.60it/s]
100%|██████████| 2762/2762 [00:00<00:00, 53316.52it/s]


# 将两个文件合并

In [2]:
#将每个stage的两个文件合并
#合并规则是直接合并，不是按相同problem合并，即f1+f2
import json
from tqdm import tqdm

def merge_files(file1, file2, output_file):
    """Concatenate two JSONL files into ``output_file``.

    All records of ``file1`` come first, followed by all records of ``file2``;
    nothing is merged or de-duplicated by content. Each line is parsed with
    ``json.loads`` and re-serialized with ``json.dumps`` on the way out.

    Args:
        file1: Path of the first JSONL input (UTF-8).
        file2: Path of the second JSONL input (UTF-8).
        output_file: Path of the JSONL file to write (UTF-8, overwritten).
    """
    # Both inputs are opened up front, then read one after the other.
    with open(file1, 'r', encoding='utf-8') as first, open(file2, 'r', encoding='utf-8') as second:
        merged = [json.loads(row) for row in first]
        merged.extend(json.loads(row) for row in second)

    with open(output_file, 'w', encoding='utf-8') as out:
        for record in tqdm(merged):
            out.write(json.dumps(record) + "\n")

# For each stage, append the pre-generated solutions to the ground-truth ones.
for stage in ['train', 'test']:
    gt_file = "/zhuangkai/openo1/outputs/rm/phase2_{stage}_gt_solution.jsonl".format(stage=stage)
    pre_generated_file = "/zhuangkai/openo1/outputs/rm/phase2_{stage}_pre_generated_solution.jsonl".format(stage=stage)
    merged_file = "/zhuangkai/openo1/outputs/rm/phase2_{stage}_gt_pre_generated_solution.jsonl".format(stage=stage)
    merge_files(gt_file, pre_generated_file, merged_file)



100%|██████████| 108573/108573 [00:00<00:00, 115763.54it/s]
100%|██████████| 3217/3217 [00:00<00:00, 81260.09it/s]


# 将test划分为test与valid

In [3]:
#将处理过的test划分为test与validation
#按照1：1的比例划分
import json
from tqdm import tqdm
import random
path = "/zhuangkai/openo1/outputs/rm/phase2_test_gt_pre_generated_solution.jsonl"
# Fixed seed so the test/validation split is identical on every run; with an
# unseeded shuffle the two output files changed each time the cell was executed.
SPLIT_SEED = 42

results = []
with open(path, 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        line = json.loads(line)
        problem = line["problem"]
        results.append((problem, line))

# Shuffle with a dedicated, seeded generator so the global `random` state is
# left untouched.
# NOTE(review): records are shuffled individually, not grouped by problem, so
# if a problem has several records it may land in both splits — confirm that
# this is acceptable.
random.Random(SPLIT_SEED).shuffle(results)

# Split 1:1; with an odd number of records the validation half gets the extra one.
test_results = results[:len(results)//2]
validation_results = results[len(results)//2:]

# Write both halves. The directory name "processsed" (three s) is kept as-is
# because the step-counting cell reads from that same path.
with open("/zhuangkai/openo1/dataset/prm800k/processsed/rm/phase2_test_gt_pre_generated_solution.jsonl", 'w', encoding='utf-8') as f:
    for problem, line in test_results:
        f.write(json.dumps({"problem": problem, "steps": line["steps"]}) + "\n")

with open("/zhuangkai/openo1/dataset/prm800k/processsed/rm/phase2_validation_gt_pre_generated_solution.jsonl", 'w', encoding='utf-8') as f:
    for problem, line in validation_results:
        f.write(json.dumps({"problem": problem, "steps": line["steps"]}) + "\n")


3217it [00:00, 60999.44it/s]


In [1]:
#统计有多少step
import json
from tqdm import tqdm

path = "/zhuangkai/openo1/dataset/prm800k/processsed/rm/phase2_{stage}_gt_pre_generated_solution.jsonl"
# Only the train and test files are counted here; the validation file is not.
for stage in ['train', 'test']:
    with open(path.format(stage=stage), 'r', encoding='utf-8') as f:
        # Total number of labeled steps over all records of this stage.
        count = sum(len(json.loads(row)["steps"]) for row in tqdm(f))
    print(f"{stage} has {count} steps")

108573it [00:01, 82251.31it/s] 


train has 709250 steps


1608it [00:00, 23965.49it/s]

test has 10361 steps



