In [1]:
import json, os
from collections import defaultdict
import glob

In [25]:
# Shared filename prefix of the raw JSON dumps; every file matching it is loaded.
raw_data_prefix = "release-32b-it1-trainlite-temp_1.0-fp8_"
# glob returns matches in arbitrary (filesystem-dependent) order; sorting keeps
# the load order stable so the seeded shuffles further down are reproducible.
raw_data_files = sorted(glob.glob(raw_data_prefix + "*"))


def _load_json(path):
    """Parse one JSON file, closing the file handle as soon as it has been read."""
    with open(path) as fh:
        return json.load(fh)


# One parsed JSON document per matching file, in the same order as raw_data_files.
raw_datasets = [_load_json(path) for path in raw_data_files]

In [26]:
from random import seed, shuffle

# Full dataset layout: D[instance_id][True/False] = [conversation1, conversation2, ...]
# where each conversation is the message list stored under that instance id in one raw dump.
D = defaultdict(lambda: defaultdict(list))
for raw_dataset in raw_datasets:
    for instance_id, messages in raw_dataset.items():
        # The success label is a substring match on the third message's content.
        # NOTE(review): assumes every conversation has at least 3 messages and that
        # "True" appears there only for successful ones — confirm against the dump format.
        is_success = "True" in messages[2]['content']
        D[instance_id][is_success].append(messages)


def _sample_capped(by_instance, outcome, cap):
    """Collect at most ``cap`` randomly chosen conversations per instance for one outcome.

    Args:
        by_instance: mapping instance_id -> {True/False: [conversation, ...]}.
        outcome: which label bucket (True or False) to draw from.
        cap: maximum number of conversations kept per instance.

    Returns:
        A flat list of the kept conversations, in instance iteration order.

    Note:
        Each bucket is shuffled in place with the module-level ``random`` state, so
        the result depends on the current seed and on the order of calls.
    """
    sampled = []
    for per_outcome in by_instance.values():
        conversations = per_outcome.get(outcome)
        if not conversations:
            continue
        shuffle(conversations)
        sampled.extend(conversations[:cap])
    return sampled


seed(42)

# Cap each instance at CAP_PER_INSTANCE conversations for both the positive and
# the negative bucket. Positives are sampled before negatives; keep this order,
# because it determines how the seeded random stream is consumed.
CAP_PER_INSTANCE = 2
success_msgs = _sample_capped(D, True, CAP_PER_INSTANCE)
fail_msgs = _sample_capped(D, False, CAP_PER_INSTANCE)

# Randomly subsample negatives to match the positive count (1:1 ratio).
# If there are fewer negatives than positives, all negatives are kept and the
# resulting set is not balanced.
shuffle(fail_msgs)
complete_msgs = success_msgs + fail_msgs[:len(success_msgs)]
shuffle(complete_msgs)

In [None]:
# Sanity check: number of capped positive and capped negative conversations
# (fail_msgs is counted here in full, before it is truncated for complete_msgs).
len(success_msgs), len(fail_msgs)

In [None]:
# Size of the combined, shuffled positive + subsampled-negative set.
len(complete_msgs)

In [29]:
# Wrap every conversation in a one-key record, {"messages": <conversation>},
# preserving the order of complete_msgs.
openai_dataset = [{"messages": conversation} for conversation in complete_msgs]

In [30]:
from transformers import AutoTokenizer
# Tokenizer for Qwen/Qwen2.5-Coder-32B-Instruct; msg_len below uses its chat
# template to measure how many tokens each record occupies.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-32B-Instruct")

In [31]:
def msg_len(msg):
    """Return the token count of ``msg['messages']`` after chat templating.

    The conversation is rendered and tokenized by the module-level ``tokenizer``
    via ``apply_chat_template(..., tokenize=True)``; the length of that result
    is returned.
    """
    token_ids = tokenizer.apply_chat_template(msg['messages'], tokenize=True)
    return len(token_ids)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Token count of every record, in dataset order, and the (length, record) pairs
# that the filtering step consumes later.
lens = [msg_len(record) for record in openai_dataset]
lens_msg_pair = list(zip(lens, openai_dataset))

# Empirical CDF: the i-th smallest length is paired with the fraction i / n.
sorted_lens = np.sort(lens)
n_points = len(sorted_lens)
cumulative_probs = np.arange(1, n_points + 1) / n_points

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(sorted_lens, cumulative_probs)
ax.grid(True)
ax.set_xlabel('Message Length (tokens)')
ax.set_ylabel('Cumulative Proportion')
ax.set_title('CDF of Message Lengths')
plt.show()

In [33]:
def filter_pairs(lens_msg_pair, max_len=10240):
    """Keep the records whose token count does not exceed ``max_len``.

    Args:
        lens_msg_pair: iterable of ``(length, record)`` tuples.
        max_len: inclusive upper bound on ``length``.

    Returns:
        A list with only the ``record`` halves of the surviving pairs, in input order.
    """
    kept = []
    for n_tokens, record in lens_msg_pair:
        if n_tokens > max_len:
            continue
        kept.append(record)
    return kept
# Drop every record longer than 10240 tokens. Despite its name, the result
# holds the records themselves, not (length, record) pairs.
filtered_pairs = filter_pairs(lens_msg_pair, max_len=10240)

In [None]:
# Preview only the first few kept records; echoing the whole list would flood
# the notebook output and bloat the saved file.
filtered_pairs[:3]

In [35]:
# Export the kept records as JSONL: one JSON-serialized record per line.
with open("release_orm_32b-cap2.openai.jsonl", "w", encoding="utf-8") as f:
    f.writelines(json.dumps(msg) + "\n" for msg in filtered_pairs)

# Must be the last expression of the cell for the notebook to display it;
# as the first statement its value was silently discarded.
len(filtered_pairs)