In [1]:
from datasets import load_dataset

# Load the "train" split of the Hub dataset named below; cache_dir points the
# datasets cache at ./Musique.
# Login using e.g. `huggingface-cli login` to access this dataset
ds = load_dataset("jerry128/Musique-Ans-Train-CL-Sorted-By-Hops", cache_dir="./Musique", split="train")

# Bare expression so the notebook renders the dataset summary (features / num_rows).
ds

Using the latest cached version of the dataset since jerry128/Musique-Ans-Train-CL-Sorted-By-Hops couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at Musique/jerry128___musique-ans-train-cl-sorted-by-hops/default/0.0.0/b2917a388aa4a35c355e71782630d3f1b18ae5ef (last modified on Mon Jul 14 05:56:42 2025).


Dataset({
    features: ['id', 'paragraphs', 'question', 'answer', 'answer_aliases', 'context', 'citations'],
    num_rows: 19938
})

In [None]:
# Flatten the dataset into one record per paragraph, in the shape wanted for the
# later embedding step.
from datasets import Dataset

# Built with a comprehension so the loop variables stay out of the kernel's
# global namespace; in particular nothing here is named `id`, so the builtin
# id() remains usable in later cells.
# Record fields:
#   id    - "<sample id>_<paragraph idx>", intended as a unique paragraph ID
#   title - paragraph title
#   text  - paragraph content
#   hop   - first character of the sample id parsed as an int
#           (assumes ids start with a single-digit hop count - TODO confirm)
paragraph_records = [
    {
        "id": f"{sample_id}_{para['idx']}",
        "title": para["title"],
        "text": para["paragraph_text"],
        "hop": int(sample_id[0]),
    }
    for sample_id, sample_paragraphs in zip(ds["id"], ds["paragraphs"])
    for para in sample_paragraphs
]

# `flatten_ds` is only ever bound to the Dataset (never to the intermediate list).
flatten_ds = Dataset.from_list(paragraph_records)
flatten_ds

18123


In [None]:
# Optional: persist the flattened dataset (requires `flatten_ds` from the cell above).
# First saves it to the ./Musique_flattened directory, then also exports a Parquet
# copy into that same directory.
flatten_ds.save_to_disk("./Musique_flattened")
flatten_ds.to_parquet("./Musique_flattened/Musique_flattened.parquet")

In [2]:
def make_prefix(dp):
    """Return the user prompt for one data item.

    Args:
        dp: Mapping with a "question" entry; its value is interpolated into
            the last line of the prompt.

    Returns:
        The instruction text followed by ``Question: <question>``, as a
        single newline-separated string.
    """
    # NOTE (from the original author): reward_score/countdown.py also needs
    # to be changed when this template changes.
    # The two indented lines carry four leading spaces as part of the prompt
    # text itself; they must be kept exactly.
    prompt_lines = (
        "You are an efficient research assistant. Your goal is to answer the question by using tool to find information. You can search and scrape as many times as your want.",
        "    If you need to use tool, you should first search for relevant documents, and then scrape the content of the documents to answer the question.",
        "    If you find no further external knowledge needed, you can directly provide the answer inside <answer> and </answer>, without detailed illustrations. For example, <answer> Beijing </answer>.",
        "Now, answer the following question.",
        f"Question: {dp['question']}",
    )
    return "\n".join(prompt_lines)


# Build the map function that converts each data item into a training record and tags it with its split name and row index
def make_map_fn(split):
    """Build the per-example converter passed to ``Dataset.map(..., with_indices=True)``.

    Args:
        split: Label stored under ``extra_info["split"]`` of every record
            produced by the returned function.

    Returns:
        A function ``process_fn(example, idx)`` that returns the record dict
        for one example.
    """

    def process_fn(example, idx):
        """Convert one example into a record with prompt, ground truth and metadata.

        Args:
            example: Mapping with "question" and "answer" entries. Its
                "question" entry is overwritten with the normalised text.
            idx: Index of the example; stored under ``extra_info["index"]``.

        Returns:
            Dict with the keys "data_source", "prompt", "ability",
            "reward_model" and "extra_info".
        """
        # Normalise the question: trim whitespace and make sure it ends with "?".
        # endswith() is used instead of indexing [-1] so that an empty or
        # whitespace-only question yields "?" instead of raising IndexError.
        normalized_question = example["question"].strip()
        if not normalized_question.endswith("?"):
            normalized_question += "?"
        # Written back because make_prefix reads example["question"].
        example["question"] = normalized_question

        return {
            # Spelling kept exactly as-is: this value is emitted into the data,
            # and downstream code presumably matches on it - verify before
            # correcting the typo.
            "data_source": "oringinal_dataset(Musique)",
            "prompt": [
                {
                    "role": "user",
                    "content": make_prefix(example),
                }
            ],
            "ability": "fact-reasoning",
            "reward_model": {
                "style": "rule",
                "ground_truth": {"target": example["answer"]},
            },
            "extra_info": {
                "split": split,
                "index": idx,
            },
        }

    return process_fn


In [3]:
# Convert every example with the "train" converter; with_indices=True makes map
# pass each example's index as the second argument (the `idx` of process_fn).
# Expects `ds` to be the training split loaded at the top of the notebook.
train_dataset = ds.map(function=make_map_fn("train"), with_indices=True)

# Export to Parquet; left as the last expression so the cell displays the call's return value.
train_dataset.to_parquet("./Musique_RL/train.parquet")

Map:   0%|          | 0/19938 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/20 [00:00<?, ?ba/s]

341407293

In [4]:
from datasets import load_dataset

# Build the test set from the evaluation dataset.
# Login using e.g. `huggingface-cli login` to access this dataset
# The evaluation data gets its own names (`eval_ds`, `test_dataset`) instead of
# rebinding `ds` / `train_dataset`: earlier cells read `ds` as the training
# split, so re-running them after this cell would otherwise silently process
# the evaluation data (e.g. overwrite train.parquet with it).
# split="train" here is the split name on the Hub dataset, not the role of the
# data in this notebook.
eval_ds = load_dataset("jerry128/Musique-Ans-Eval-1000", cache_dir="./Musique", split="train")

# Records are tagged with split "test" in extra_info.
test_dataset = eval_ds.map(function=make_map_fn("test"), with_indices=True)

# Export to Parquet; left as the last expression so the cell displays the call's return value.
test_dataset.to_parquet("./Musique_RL/test.parquet")


Using the latest cached version of the dataset since jerry128/Musique-Ans-Eval-1000 couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at Musique/jerry128___musique-ans-eval-1000/default/0.0.0/70b39609f807b16b9d6bfdbfcc07efb701d3e7c8 (last modified on Mon Jul 14 05:56:42 2025).


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

21820819