In [1]:
import json
import os
import random
from collections import Counter

def load_json(file_path):
    """
    Read a UTF-8 encoded JSON file and return the decoded Python object.

    Args:
        file_path: path of the JSON file to read

    Returns:
        Whatever `json.load` produces for the file (dict, list, ...).
    """
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)

def save_json(data, filepath, indent=2):
    """
    Save Python object as JSON file, creating parent directories if needed.

    Args:
        data: Python object (dict, list, etc.)
        filepath: path to save the JSON file
        indent: indentation level for pretty printing (default=2)
    """
    # A bare filename such as "out.json" has an empty dirname, and
    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when the path actually has one.
    parent_dir = os.path.dirname(filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filepath, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(data, f, ensure_ascii=False, indent=indent)

def read_jsonl(file_path):
    """
    Read a JSON Lines file and return its records as a list.

    Each non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of raising `json.JSONDecodeError`.

    Args:
        file_path: path of the UTF-8 encoded .jsonl file

    Returns:
        list: one parsed object per non-blank line, in file order.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

# Preference Dataset

In [None]:
# Load the raw records, one JSON object per line.
# The path is a placeholder: point it at the PRM800K phase2_train.jsonl file before running.
data = read_jsonl("<path to prm800k phase2_train.jsonl>")

In [12]:
def generate_listwise_data_hard_negatives_v2(sample, num_negatives = 3):
    """
    Build listwise ranking rows (1 positive + `num_negatives` negatives) for one sample.

    Positives:
    1. For intermediate steps, the completion indexed by 'chosen_completion'.
    2. For the final step, every completion with 'rating' == 1 (one row each).

    Negatives (per positive) come from the other completions of the same step
    whose text differs from the positive and whose rating is not None.
    At intermediate steps, completions rated 1 are additionally excluded;
    at the final step, other rating-1 completions stay eligible.
    The pool is sorted by rating in descending order (1 > 0 > -1); the row
    uses the top `num_negatives - 1` entries plus the single lowest-rated
    entry, so that at least one weak candidate is always present.
    A positive whose pool has fewer than `num_negatives` entries yields no row.

    Args:
        sample (dict): A dictionary containing 'question' (with a 'problem'
            string) and 'label' (with a 'steps' list). Each step provides
            'completions' (dicts with 'text' and 'rating') and may provide
            'chosen_completion' (index into 'completions').
        num_negatives (int): Number of negatives per row (default 3).

    Returns:
        list[dict]: Rows with keys 'question', 'prompt' (texts of the chosen
        completions of all earlier steps), 'positive', 'pos_rating',
        'negatives' and 'neg_ratings'.

    Raises:
        ValueError: If an intermediate step has no 'chosen_completion'.
    """
    listwise_data = []
    question_str = sample['question']['problem']
    ancestor_path = []
    steps = sample['label']['steps']
    num_steps = len(steps)

    for step_idx, step in enumerate(steps):
        is_last_step = (step_idx == num_steps - 1)
        chosen_id = step.get('chosen_completion')

        # --- Determine Positive Candidate(s) ---
        if is_last_step:
            # Last step: every completion rated 1 is a positive
            positive_candidates = [c for c in step['completions'] if c.get('rating') == 1]
        elif chosen_id is not None:
            # Intermediate step: the chosen completion is the single positive
            positive_candidates = [step['completions'][chosen_id]]
        else:
            raise ValueError(f"Chosen Candidate Doesnt exist at depth {step_idx}")

        # --- Process each positive candidate ---
        for positive_completion in positive_candidates:
            # Pool of negatives: rated completions other than the current positive.
            # Rating-1 completions are filtered out at intermediate steps only.
            # NOTE(review): the earlier "V2" comment claimed the rating-1 filter
            # applied to the last step; the code has always applied it to the
            # intermediate steps instead. Behaviour is kept as-is -- confirm intent.
            negative_pool = [
                c for c in step['completions']
                if c['text'] != positive_completion['text']
                and c['rating'] is not None
                and (is_last_step or c['rating'] != 1)
            ]

            # Not enough competitors to form a row
            if len(negative_pool) < num_negatives:
                continue

            # --- Select the Hardest Negatives ---
            # Sort the pool by rating in descending order (1 > 0 > -1)
            negative_pool.sort(key=lambda x: x.get('rating', 0), reverse=True)

            # V3: strongest (num_negatives - 1) competitors plus the single
            # lowest-rated one, for class balance
            hard_negatives = negative_pool[:num_negatives-1] + negative_pool[-1:]

            listwise_data.append({
                "question": question_str,
                # Snapshot the path: ancestor_path keeps growing below, and
                # storing the list itself would retroactively add this step's
                # chosen completion (and later ones) to the row's prompt.
                "prompt": list(ancestor_path),
                "positive": positive_completion['text'],
                "pos_rating": positive_completion['rating'],
                "negatives": [c['text'] for c in hard_negatives],
                "neg_ratings": [c['rating'] for c in hard_negatives]
            })

        # --- Update Ancestor Path for the next step ---
        if chosen_id is not None:
            ancestor_path.append(step['completions'][chosen_id]['text'])

    return listwise_data

In [13]:
# Build listwise rows for every raw sample.
# Samples that fail to process are skipped (best effort), but every failure is
# recorded -- index and exception type -- so the skipped samples can be audited
# instead of disappearing behind a bare counter.
preference_data = []
num_negatives = 4
error_counter = 0
error_indices = []        # positions in `data` of the samples that raised
error_types = Counter()   # exception class name -> number of occurrences
for sidx, sample in enumerate(data):
    try:
        processed_sample = generate_listwise_data_hard_negatives_v2(sample, num_negatives)
    except Exception as e:
        error_counter += 1
        error_indices.append(sidx)
        error_types[type(e).__name__] += 1
        continue
    preference_data.extend(processed_sample)

In [14]:
# Number of raw samples that were skipped because processing raised an exception.
print(error_counter)

83


In [15]:
# Total number of listwise rows generated across all samples.
len(preference_data)

96559

In [16]:
# Spot-check a single generated row.
preference_data[6]

{'question': "Three points are chosen uniformly at random on a circle. What is the probability that no two of these points form an obtuse triangle with the circle's center?",
 'prompt': ['This is a problem about the angles subtended by arcs of the circle.',
  'For example, if we choose three points A, B, and C on the circle, then the angle subtended by arc AB at the center is twice the angle subtended by arc AB at any point on the circle.',
  'Similarly, the angle subtended by arc BC at the center is twice the angle subtended by arc BC at any point on the circle, and so on.'],
 'positive': 'So, we want to find the probability that all three angles subtended by the arcs AB, BC, and CA at the center are less than or equal to 90 degrees.',
 'pos_rating': 1,
 'negatives': ['If we want to avoid forming an obtuse triangle with the center, then we need to make sure that none of these angles at the center exceed 90 degrees.',
  'An obtuse triangle is one where one angle is greater than 90 degr

In [17]:
from collections import Counter
# Tally how often each combination of negative ratings occurs across all rows.
counter = Counter(tuple(row['neg_ratings']) for row in preference_data)

counter

Counter({(1, 1, 1, -1): 27885,
         (1, 1, -1, -1): 20696,
         (1, -1, -1, -1): 14230,
         (1, 1, 0, -1): 8799,
         (-1, -1, -1, -1): 8209,
         (1, 0, -1, -1): 6066,
         (1, 0, 0, -1): 4322,
         (0, -1, -1, -1): 2808,
         (0, 0, -1, -1): 1919,
         (0, 0, 0, -1): 1625})

In [18]:
# Persist the listwise rows (with their rating labels) built with num_negatives=4.
save_json(preference_data, "./prm800k_pref_data/listwise_data_with_labels_4negs.json")

# Format preference data

In [2]:
# Reload the saved listwise rows so the formatting stage can start from disk.
pref_data_raw = load_json("./prm800k_pref_data/listwise_data_with_labels_4negs.json")

In [3]:
# Spot-check the first reloaded row.
pref_data_raw[0]

{'question': 'The sum of the squares of three consecutive positive even numbers is $12296$. Find the product of the three numbers divided by $8$.',
 'prompt': ['I need to find three consecutive positive even numbers whose squares add up to $12296$.',
  'Let me call them $2n-2$, $2n$, and $2n+2$, where $n$ is a positive integer.',
  'Then I have the equation $(2n-2)^2+(2n)^2+(2n+2)^2=12296$.',
  'I can expand the squares and simplify the equation: $4n^2-8n+4+4n^2+4n^2+8n+4=12296$.',
  'Combining like terms, I get $12n^2+8=12296$.',
  'Subtracting $8$ from both sides, I get $12n^2=12288$.',
  'Dividing both sides by $12$, I get $n^2=1024$.',
  'Taking the square root of both sides, I get $n=\\pm 32$.',
  'Since $n$ has to be positive, I choose $n=32$.',
  'This means that the three consecutive positive even numbers are $2n-2=62$, $2n=64$, and $2n+2=66$.'],
 'positive': 'To find their product divided by $8$, I can write it as $\\frac{(2n-2)(2n)(2n+2)}{8}$.',
 'pos_rating': 1,
 'negatives'

In [None]:
import re
def format_to_boxed(text):
    """
    Replace a trailing "# Answer <content>" section with "\\boxed{<content>}".

    Everything from the first "# Answer" that is followed by whitespace up to
    the end of the text is replaced; the captured content is stripped of
    surrounding whitespace before being boxed. Text without such a section is
    returned unchanged.
    """
    # DOTALL lets the capture group run across newlines to the end of the text,
    # so a single search covers the only substitution that can ever happen.
    answer_match = re.search(r'# Answer\s+(.*)', text, flags=re.DOTALL)
    if answer_match is None:
        return text

    boxed_answer = f"\\boxed{{{answer_match.group(1).strip()}}}"
    return text[:answer_match.start()] + boxed_answer

In [None]:
from transformers import AutoTokenizer

# Tokenizer whose chat template is used to render the conversations.
# The access token is read from the HF_TOKEN environment variable rather than
# being written into the notebook; when the variable is unset, None is passed.
tokenizer = AutoTokenizer.from_pretrained(
    # 'Qwen/Qwen2.5-0.5B-Instruct',
    "microsoft/Phi-4-mini-instruct",
    token=os.environ.get("HF_TOKEN"),
    )

def format_chat(messages):
    """
    Render `messages` with the tokenizer's chat template (as text, not token ids).

    When the rendered text contains no "\\boxed", the trailing
    "<|end|><|endoftext|>" suffix is stripped; otherwise the rendered text
    is returned untouched.
    """
    rendered = tokenizer.apply_chat_template(messages, tokenize=False)
    if "\\boxed" in rendered:
        return rendered

    # Suffixes previously used here for other chat templates:
    #   "<|im_end|>\n", "<|eot_id|>", "<end_of_turn>\n"
    return rendered.removesuffix("<|end|><|endoftext|>")

In [None]:
def format_label(label):
    """
    Map a raw rating to a training label.

    1 is returned as-is, -1 becomes 0 and 0 becomes -100.

    Raises:
        ValueError: for any other value.
    """
    if label == 1:
        return label

    for raw_value, mapped_value in ((-1, 0), (0, -100)):
        if label == raw_value:
            return mapped_value

    raise ValueError(f"Unknown Label {label}")
# SEPERATOR = "<extra_0>"
# SEPERATOR = "ки" # Math-Shepherd
def format_final_step(step):
    """
    Return `step` with its "# Answer" section rewritten by `format_to_boxed`.

    Steps that do not contain "# Answer" are returned unchanged.
    """
    return format_to_boxed(step) if "# Answer" in step else step

def format_qwen3_steps(previous_steps, current):
    """
    Join the previous steps and the current step into one numbered string.

    Each step is rendered as "\\n\\n### Step <k>: <text>" with k starting at 1;
    `current` always receives the highest number.

    Args:
        previous_steps: list of earlier step texts
        current: text of the step being appended

    Returns:
        str: the concatenated, numbered steps.
    """
    all_steps = previous_steps + [current]
    return "".join(
        f"\n\n### Step {number}: {text}"
        for number, text in enumerate(all_steps, start=1)
    )

def format_prompt(sample):
    """
    Turn one listwise row into chat-formatted chosen/rejected sequences with labels.

    The positive and every negative are each appended to the shared previous
    steps, rendered as a user/assistant conversation and passed through
    `format_chat`.

    Args:
        sample: dict with 'question', 'prompt', 'positive', 'negatives',
            'pos_rating' and 'neg_ratings'

    Returns:
        dict: 'chosen' (str), 'chosen_label', 'rejected' (list of str) and
        'rejected_labels' (list, same order as 'rejected').
    """
    question = sample['question']
    previous_steps = sample['prompt']
    chosen = sample['positive']
    rejected_samples = sample['negatives']

    def render_candidate(candidate_step):
        # Same conversation for every candidate; only the final step differs.
        assistant_output = format_qwen3_steps(
            previous_steps, format_final_step(candidate_step)
        )
        conversation = [
            {"role": "user", "content": question},
            {"role": "assistant", "content": assistant_output},
        ]
        return format_chat(conversation)

    chosen_sample = render_candidate(chosen)
    rejected_messages = [render_candidate(rej_step) for rej_step in rejected_samples]

    return {
        "chosen": chosen_sample,
        "chosen_label": format_label(sample['pos_rating']),
        "rejected": rejected_messages,
        "rejected_labels": [format_label(neg_rating) for neg_rating in sample['neg_ratings']]
    }

In [17]:
# Chat-format every listwise row.
new_samples = [format_prompt(sample) for sample in pref_data_raw]

In [18]:
# Spot-check the first chat-formatted row.
new_samples[0]

{'chosen': '<|user|>The sum of the squares of three consecutive positive even numbers is $12296$. Find the product of the three numbers divided by $8$.<|end|><|assistant|>\n\n### Step 1: I need to find three consecutive positive even numbers whose squares add up to $12296$.\n\n### Step 2: Let me call them $2n-2$, $2n$, and $2n+2$, where $n$ is a positive integer.\n\n### Step 3: Then I have the equation $(2n-2)^2+(2n)^2+(2n+2)^2=12296$.\n\n### Step 4: I can expand the squares and simplify the equation: $4n^2-8n+4+4n^2+4n^2+8n+4=12296$.\n\n### Step 5: Combining like terms, I get $12n^2+8=12296$.\n\n### Step 6: Subtracting $8$ from both sides, I get $12n^2=12288$.\n\n### Step 7: Dividing both sides by $12$, I get $n^2=1024$.\n\n### Step 8: Taking the square root of both sides, I get $n=\\pm 32$.\n\n### Step 9: Since $n$ has to be positive, I choose $n=32$.\n\n### Step 10: This means that the three consecutive positive even numbers are $2n-2=62$, $2n=64$, and $2n+2=66$.\n\n### Step 11: To 

In [19]:
# Print the first chosen sequence that contains a boxed final answer, if any.
first_boxed_chosen = next(
    (row['chosen'] for row in new_samples if "\\boxed" in row['chosen']),
    None,
)
if first_boxed_chosen is not None:
    print(first_boxed_chosen)

<|user|>When each edge of a cube is increased by $50\%$, by what percent is the surface area of the cube increased?<|end|><|assistant|>

### Step 1: So the surface area of the original cube is 6 times the area of each face.

### Step 2: And each face is a square, so its area is the length of each edge squared.

### Step 3: Let's call the length of each edge x. So the surface area of the original cube is 6*$x^2$.

### Step 4: And if we increase the length of each edge by 50%, we get 1.5*x.

### Step 5: So the area of each face is $(1.5*x)^2$ which is $2.25*x^2$.

### Step 6: So the surface area of the new cube is 6*$2.25*x^2$ which is $13.5*x^2$.

### Step 7: And if we divide that by the original surface area, we get $13.5*x^2$/$6*x^2$ which is 2.25.

### Step 8: So the surface area of the new cube is 2.25 times the original surface area, which means it has increased by 125%.

\boxed{125}<|end|><|endoftext|>


In [20]:
# Save the chat-formatted rows. The file name encodes the chat template used;
# the commented lines are the file names used for other templates.
# save_json(new_samples, "./prm800k_pref_data/listwise_data_chat_with_labels_4negs_qwen3.json")
# save_json(new_samples, "./prm800k_pref_data/listwise_data_chat_with_labels_4negs_llama32.json")
# save_json(new_samples, "./prm800k_pref_data/listwise_data_chat_with_labels_4negs_gemma3.json")
save_json(new_samples, "./prm800k_pref_data/listwise_data_chat_with_labels_4negs_phi4.json")

In [21]:
from datasets import Dataset

hf_pref_data_chat = Dataset.from_list(new_samples)

# 95% train / 5% test, split at row level with a fixed seed.
# NOTE(review): rows are split individually; if several rows can come from the
# same question, related rows may land on both sides of the split -- confirm
# whether a per-question split is needed.
split_dataset = hf_pref_data_chat.train_test_split(test_size=0.05, seed=42)

print(split_dataset)

for split_title, split_key in (("Training Set", "train"), ("Test Set", "test")):
    print(f"\n--- {split_title} ---")
    print(split_dataset[split_key])

save_path = './prm800k_pref_data/listwise_data_chat_with_labels_4negs_phi4'
split_dataset.save_to_disk(save_path)

DatasetDict({
    train: Dataset({
        features: ['chosen', 'chosen_label', 'rejected', 'rejected_labels'],
        num_rows: 91731
    })
    test: Dataset({
        features: ['chosen', 'chosen_label', 'rejected', 'rejected_labels'],
        num_rows: 4828
    })
})

--- Training Set ---
Dataset({
    features: ['chosen', 'chosen_label', 'rejected', 'rejected_labels'],
    num_rows: 91731
})

--- Test Set ---
Dataset({
    features: ['chosen', 'chosen_label', 'rejected', 'rejected_labels'],
    num_rows: 4828
})


Saving the dataset (0/1 shards):   0%|          | 0/91731 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4828 [00:00<?, ? examples/s]

In [22]:
from datasets import DatasetDict

# Row counts for a 10% subsample of each split (rounded down).
train_size_10_percent, test_size_10_percent = (
    int(len(split_dataset[split_key]) * 0.1) for split_key in ('train', 'test')
)

# Shuffle with a fixed seed, then keep the first N rows of each split.
mini_train = split_dataset['train'].shuffle(seed=42).select(range(train_size_10_percent))
mini_test = split_dataset['test'].shuffle(seed=42).select(range(test_size_10_percent))

# Bundle both subsamples under the same split names as the full dataset.
mini_dataset = DatasetDict(train=mini_train, test=mini_test)

print("\n--- Mini Dataset (10%) via Shuffle/Select ---")
print(mini_dataset)

mini_save_path = './prm800k_pref_data/listwise_data_chat_with_labels_4negs_phi4_mini'
mini_dataset.save_to_disk(mini_save_path)


--- Mini Dataset (10%) via Shuffle/Select ---
DatasetDict({
    train: Dataset({
        features: ['chosen', 'chosen_label', 'rejected', 'rejected_labels'],
        num_rows: 9173
    })
    test: Dataset({
        features: ['chosen', 'chosen_label', 'rejected', 'rejected_labels'],
        num_rows: 482
    })
})


Saving the dataset (0/1 shards):   0%|          | 0/9173 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/482 [00:00<?, ? examples/s]