In [20]:
import datasets
# Local locations: the cache directory handed to `load_dataset` and the
# on-disk copy of the AIME 2025 dataset.
cache_path = "/data/kcl/lpy/hf"
data_path = "/data/kcl/lpy/data/AIME_2025"

# Load the "default" configuration from the local dataset directory.
dataset = datasets.load_dataset(
    path=data_path,
    name="default",
    cache_dir=cache_path,
)

In [21]:
# Display the loaded DatasetDict (splits, features and row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'problem', 'answer', 'solution', 'url', 'year', '__index_level_0__'],
        num_rows: 30
    })
})

In [22]:
# Instruction prefix placed in front of every problem statement. It ends with a
# blank line so the problem text starts on its own paragraph.
system_prompt = (
    "Solve the following math problem step by step. "
    "The last line of your response should be of the form "
    "Answer: $Answer (without quotes) where $Answer is the answer to the problem."
    "\n\n"
)

In [26]:
    # Take the "train" split of the loaded DatasetDict as the starting dataset.
    train_dataset = dataset["train"]
    def make_map_fn(split):
        """Build the per-example function handed to ``Dataset.map``.

        Args:
            split: Split label supplied by the caller. It is not written into
                the record; ``extra_info['split']`` is always ``None``.
                NOTE(review): confirm that ignoring ``split`` is intentional.

        Returns:
            ``process_fn(example, idx)``, which adds the ``extra_info``,
            ``data_source``, ``ability``, ``reward_model`` and ``prompt``
            fields to ``example`` in place and returns it.
        """
        def process_fn(example, idx):
            """Attach prompt, reward and tool metadata to one dataset row.

            Args:
                example: Row mapping; ``id``, ``problem`` and ``answer`` are read.
                idx: Row position supplied by ``with_indices=True``; unused,
                    the row's own ``id`` is stored as the index instead.

            Returns:
                The same ``example`` mapping with the extra fields added.
            """
            # Hoist the lookups so the f-string below needs no nested quotes;
            # reusing the enclosing quote character inside an f-string
            # replacement field is a SyntaxError before Python 3.12.
            problem = example['problem']
            answer = example['answer']

            extra_info = {'index': example['id']}
            extra_info["need_tools_kwargs"] = True
            extra_info["tools_kwargs"] = {
                "code_interpreter": {
                    "create_kwargs": {
                        "ground_truth": answer,
                    },
                },
            }
            extra_info['split'] = None
            extra_info['raw_problem'] = problem
            example["extra_info"] = extra_info
            example['data_source'] = 'aime_2025'
            example['ability'] = 'MATH'
            example['reward_model'] = {
                'ground_truth': answer,
                'style': 'rule-lighteval/MATH_v2'
            }
            # Single user turn: instruction prefix, the problem, then a reminder
            # about the answer line format.
            example['prompt'] = [{
                'content': f'{system_prompt}{problem}\n\nRemember to put your answer on its own line after "Answer:".',
                'role': 'user'
            }]
            return example
        return process_fn

    # Apply the per-row transform, then drop the leftover index column.
    mapped_rows = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    train_dataset = mapped_rows.remove_columns("__index_level_0__")

    # Repeat the whole set 64 times and shuffle with a fixed seed so the order
    # is reproducible.
    repeated_rows = datasets.concatenate_datasets([train_dataset] * 64)
    train_dataset = repeated_rows.shuffle(seed=7)

Map: 100%|██████████| 30/30 [00:00<00:00, 2352.12 examples/s]


In [29]:
# Inspect the first record of the processed dataset.
train_dataset[0]

{'id': 1,
 'problem': 'On $\\triangle ABC$ points $A,D,E$, and $B$ lie that order on side $\\overline{AB}$ with $AD=4, DE=16$, and $EB=8$. Points $A,F,G$, and $C$ lie in that order on side $\\overline{AC}$ with $AF=13, FG=52$, and $GC=26$. Let $M$ be the reflection of $D$ through $F$, and let $N$ be the reflection of $G$ through $E$. Quadrilateral $DEGF$ has area 288. Find the area of heptagon $AFNBCEM$.',
 'answer': '588',
 'solution': '588',
 'url': 'https://artofproblemsolving.com/wiki/index.php/2025_AIME_I_Problems/Problem_2',
 'year': 2025,
 'extra_info': {'index': 1,
  'need_tools_kwargs': True,
  'raw_problem': 'On $\\triangle ABC$ points $A,D,E$, and $B$ lie that order on side $\\overline{AB}$ with $AD=4, DE=16$, and $EB=8$. Points $A,F,G$, and $C$ lie in that order on side $\\overline{AC}$ with $AF=13, FG=52$, and $GC=26$. Let $M$ be the reflection of $D$ through $F$, and let $N$ be the reflection of $G$ through $E$. Quadrilateral $DEGF$ has area 288. Find the area of heptagon

In [28]:
    import os
    # Write the processed dataset next to the source data. The call stays the
    # final expression of the cell so its return value is still displayed.
    parquet_file = os.path.join(data_path, "test_verl.parquet")
    train_dataset.to_parquet(parquet_file)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  7.05ba/s]


3485760

In [18]:
# Collect the '__index_level_0__' value of every row.
# NOTE(review): the mapping cell calls remove_columns("__index_level_0__"), so
# this only works on a dataset that still carries that column — confirm the
# intended cell order.
__index_level_0__list = [row['__index_level_0__'] for row in train_dataset]

In [23]:
# Number of distinct values among the collected '__index_level_0__' entries.
len(set(__index_level_0__list))

960

In [21]:
# Total number of rows in train_dataset.
len(train_dataset)

960

In [13]:
# Record the installed `datasets` library version.
datasets.__version__

'4.5.0'