In [1]:
import os

# Show where the notebook is running; the relative paths used below resolve against it.
print(f"Current working directory: {os.getcwd()}")

# Point Hugging Face tooling at a mirror endpoint.
# NOTE(review): presumably this has to be set before `datasets` is imported — confirm.
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"

# Relative path to the OpenR1-Math-220k dataset directory.
dataset = "../hf_datasets/open-r1/OpenR1-Math-220k"

Current working directory: /inspire/ssd/ws-ba572160-47f8-4ca1-984e-d6bcdeb95dbb/a100-maybe/public/xushuyao/reft-private/experiments_trl/data_preprocess


In [None]:
from datasets import load_dataset_builder

# Build the dataset builder so its metadata can be inspected.
ds_builder = load_dataset_builder(dataset)

# Only the last expression of a cell is echoed, so the description has to be
# printed explicitly; as a bare mid-cell expression it was silently discarded.
print(ds_builder.info.description)

# Last expression: rendered as the cell output.
ds_builder.info.features

In [3]:
from datasets import load_dataset

# Load the "default" configuration's train split from the directory stored in
# `dataset`; reusing the variable keeps the path defined in exactly one place.
ds = load_dataset(dataset, "default", split="train")

Generating train split: 100%|██████████| 93733/93733 [00:08<00:00, 10635.47 examples/s]


In [4]:
# Peek at the chat-style "messages" field of the first example.
print(ds[0]["messages"])

[{'content': '## Task B-1.3.\n\nA ship traveling along a river has covered $24 \\mathrm{~km}$ upstream and $28 \\mathrm{~km}$ downstream. For this journey, it took half an hour less than for traveling $30 \\mathrm{~km}$ upstream and $21 \\mathrm{~km}$ downstream, or half an hour more than for traveling $15 \\mathrm{~km}$ upstream and $42 \\mathrm{~km}$ downstream, assuming that both the ship and the river move uniformly.\n\nDetermine the speed of the ship in still water and the speed of the river.', 'role': 'user'}, {'content': '<think>\nOkay, so I need to find the speed of the ship in still water and the speed of the river. Let me start by recalling that when a ship is moving upstream, its effective speed is the speed of the ship minus the speed of the river. Conversely, when moving downstream, its effective speed is the ship\'s speed plus the river\'s speed. \n\nLet me denote the speed of the ship in still water as \\( v \\) (in km/h) and the speed of the river as \\( r \\) (also in 

In [5]:
# Tally how many entries each example has in its "generations" field.
generations = ds["generations"]
length_counts = {}

for gen in generations:
    n_gens = len(gen)
    length_counts[n_gens] = length_counts.get(n_gens, 0) + 1

# Report the tallies in ascending order of length.
for length in sorted(length_counts):
    print(f"Length {length}: {length_counts[length]} occurrences")

# Finish with the overall number of examples.
print(f"\nTotal number of items: {len(generations)}")


Length 1: 1446 occurrences
Length 2: 88456 occurrences
Length 3: 49 occurrences
Length 4: 3706 occurrences
Length 5: 18 occurrences
Length 6: 58 occurrences

Total number of items: 93733


In [6]:
from collections import defaultdict

def analyze_dataset(dataset, filter_func=None):
    """
    Print the source distribution and a correctness-based difficulty summary.

    An item's "difficulty" is the mean of its "correctness_math_verify" list
    (i.e. the share of correct generations when the entries are booleans).
    Items whose list is missing or empty cannot be scored; they still count
    towards the totals and the source distribution but are left out of the
    difficulty statistics instead of raising ZeroDivisionError.

    Parameters:
    dataset -- sized iterable of items exposing "source" and
               "correctness_math_verify"
    filter_func -- optional predicate; only items for which it returns a
                   truthy value are analyzed (None keeps every item)

    Returns:
    None; the report is printed.
    """
    # Filter the dataset
    if filter_func is None:
        filtered_data = dataset
    else:
        filtered_data = [item for item in dataset if filter_func(item)]

    total_items = len(filtered_data)

    # Gather both statistics in a single pass over the data
    source_counter = defaultdict(int)
    difficulty_distribution = defaultdict(int)
    total_difficulty = 0
    scored_items = 0

    for item in filtered_data:
        source_counter[item["source"]] += 1

        correctness = item["correctness_math_verify"]
        if not correctness:
            # Nothing to average; skip the difficulty part for this item
            continue

        difficulty = sum(correctness) / len(correctness)
        total_difficulty += difficulty
        scored_items += 1

        # Bucket to one decimal. round() rounds halves to even, so 0.25 lands
        # in the 0.2 bucket while 0.75 lands in the 0.8 bucket.
        rounded_difficulty = round(difficulty * 10) / 10
        difficulty_distribution[rounded_difficulty] += 1

    # 1. Source distribution
    print(f"Total entries in filtered dataset: {total_items}")
    print("\nSource Distribution:")
    print("-" * 40)

    for source, count in sorted(source_counter.items()):
        percentage = count / total_items * 100
        print(f"Source {source}: {count} occurrences ({percentage:.2f}% of the dataset)")

    # 2. Difficulty based on correctness_math_verify (averaged over scored items)
    avg_difficulty = total_difficulty / scored_items if scored_items > 0 else 0

    print("\nDifficulty Analysis:")
    print("-" * 40)
    print(f"Average difficulty: {avg_difficulty:.4f}")

    skipped_items = total_items - scored_items
    if skipped_items > 0:
        print(f"Skipped {skipped_items} entries without correctness data")

    print("\nDifficulty Distribution:")

    for diff, count in sorted(difficulty_distribution.items()):
        percentage = count / total_items * 100
        print(f"Difficulty {diff:.2f}: {count} occurrences ({percentage:.2f}% of the dataset)")
    

In [7]:
def keep_all(item):
    """Pass-through filter: accept every item."""
    return True

# The helper is deliberately not called `all`: that name would shadow the builtin,
# and rebinding it to the call's result (None) would break later `all(...)` calls.
# analyze_dataset only prints its report, so there is nothing worth storing.
analyze_dataset(ds, keep_all)

Total entries in filtered dataset: 93733

Source Distribution:
----------------------------------------
Source amc_aime: 3403 occurrences (3.63% of the dataset)
Source aops_forum: 8345 occurrences (8.90% of the dataset)
Source cn_contest: 12046 occurrences (12.85% of the dataset)
Source inequalities: 806 occurrences (0.86% of the dataset)
Source number_theory: 380 occurrences (0.41% of the dataset)
Source olympiads: 68089 occurrences (72.64% of the dataset)
Source olympiads_ref: 664 occurrences (0.71% of the dataset)

Difficulty Analysis:
----------------------------------------
Average difficulty: 0.6312

Difficulty Distribution:
Difficulty 0.00: 28627 occurrences (30.54% of the dataset)
Difficulty 0.20: 592 occurrences (0.63% of the dataset)
Difficulty 0.30: 38 occurrences (0.04% of the dataset)
Difficulty 0.50: 10682 occurrences (11.40% of the dataset)
Difficulty 0.60: 1 occurrences (0.00% of the dataset)
Difficulty 0.70: 16 occurrences (0.02% of the dataset)
Difficulty 0.80: 494 oc

In [8]:
def length_2_fn(item):
    """Return True when the item's "generations" field holds exactly two entries."""
    n_generations = len(item["generations"])
    return n_generations == 2

def length_4_fn(item):
    """Return True when the item's "generations" field holds exactly four entries."""
    n_generations = len(item["generations"])
    return n_generations == 4

# Report on the items that have exactly two generations. analyze_dataset prints
# its findings and has no return value, so the call is not bound to a name.
analyze_dataset(ds, length_2_fn)

Total entries in filtered dataset: 88456

Source Distribution:
----------------------------------------
Source amc_aime: 3168 occurrences (3.58% of the dataset)
Source aops_forum: 7003 occurrences (7.92% of the dataset)
Source cn_contest: 9866 occurrences (11.15% of the dataset)
Source inequalities: 686 occurrences (0.78% of the dataset)
Source number_theory: 322 occurrences (0.36% of the dataset)
Source olympiads: 66755 occurrences (75.47% of the dataset)
Source olympiads_ref: 656 occurrences (0.74% of the dataset)

Difficulty Analysis:
----------------------------------------
Average difficulty: 0.6345

Difficulty Distribution:
Difficulty 0.00: 27181 occurrences (30.73% of the dataset)
Difficulty 0.50: 10292 occurrences (11.64% of the dataset)
Difficulty 1.00: 50983 occurrences (57.64% of the dataset)


In [9]:
def analyze_difficulty_by_source(dataset, filter_func=None):
    """
    Report the mean difficulty per source and return it as a mapping.

    An item's difficulty is the mean of its "correctness_math_verify" list;
    items whose list is missing or empty are left out of the statistics.

    Parameters:
    dataset -- the dataset to analyze
    filter_func -- optional predicate; when given, only items it accepts are used

    Returns:
    A dictionary with source as key and average difficulty as value
    """
    # Keep only the items accepted by the filter, when one is supplied
    filtered_data = (
        [item for item in dataset if filter_func(item)] if filter_func else dataset
    )

    # Collect every item's difficulty under its source
    source_difficulties = defaultdict(list)
    for item in filtered_data:
        source = item["source"]
        correctness = item["correctness_math_verify"]

        # Items without correctness data cannot be scored
        if not correctness or len(correctness) <= 0:
            continue

        source_difficulties[source].append(sum(correctness) / len(correctness))

    # Reduce each source's list to its mean
    avg_difficulties = {
        source: sum(values) / len(values)
        for source, values in source_difficulties.items()
        if values
    }

    # Print the sources ordered by ascending average
    print("Average Difficulty by Source:")
    print("-" * 40)

    ranked = sorted(avg_difficulties.items(), key=lambda pair: pair[1])
    for source, avg_diff in ranked:
        count = len(source_difficulties[source])
        print(f"Source {source}: {avg_diff:.4f} (from {count} items)")

    return avg_difficulties

In [10]:
# Per-source average difficulty over the full dataset (no filter).
analyze_difficulty_by_source(dataset=ds)

Average Difficulty by Source:
----------------------------------------
Source amc_aime: 0.2820 (from 3403 items)
Source number_theory: 0.4779 (from 380 items)
Source olympiads: 0.6079 (from 68089 items)
Source olympiads_ref: 0.6627 (from 664 items)
Source aops_forum: 0.6905 (from 8345 items)
Source inequalities: 0.7260 (from 806 items)
Source cn_contest: 0.8173 (from 12046 items)


{'olympiads': 0.6078845334782417,
 'aops_forum': 0.6905352506490914,
 'cn_contest': 0.8173017322486025,
 'inequalities': 0.725951199338296,
 'amc_aime': 0.2820060730727789,
 'number_theory': 0.47785087719298247,
 'olympiads_ref': 0.6626506024096386}

In [11]:
# Same breakdown, restricted to items with exactly two generations.
analyze_difficulty_by_source(ds, filter_func=length_2_fn)

Average Difficulty by Source:
----------------------------------------
Source amc_aime: 0.2563 (from 3168 items)
Source number_theory: 0.4286 (from 322 items)
Source olympiads: 0.6185 (from 66755 items)
Source olympiads_ref: 0.6707 (from 656 items)
Source aops_forum: 0.6918 (from 7003 items)
Source inequalities: 0.7369 (from 686 items)
Source cn_contest: 0.8211 (from 9866 items)


{'olympiads': 0.6185004868549172,
 'aops_forum': 0.6918463515636156,
 'cn_contest': 0.8210520981147374,
 'amc_aime': 0.2563131313131313,
 'inequalities': 0.7368804664723032,
 'number_theory': 0.42857142857142855,
 'olympiads_ref': 0.6707317073170732}

In [12]:
# Same breakdown, restricted to items with exactly four generations.
analyze_difficulty_by_source(ds, filter_func=length_4_fn)

Average Difficulty by Source:
----------------------------------------
Source olympiads: 0.4094 (from 240 items)
Source inequalities: 0.7525 (from 102 items)
Source aops_forum: 0.7923 (from 1123 items)
Source number_theory: 0.8284 (from 51 items)
Source cn_contest: 0.8500 (from 2018 items)
Source amc_aime: 0.8547 (from 172 items)


{'aops_forum': 0.7922974176313446,
 'olympiads': 0.409375,
 'inequalities': 0.7524509803921569,
 'cn_contest': 0.8499752229930624,
 'amc_aime': 0.8546511627906976,
 'number_theory': 0.8284313725490197}

In [13]:
import random
import numpy as np

def sample_dataset(dataset, num_samples=5, seed=42):
    """
    Sample random examples from the dataset with reproducibility.

    The draw uses a private random.Random(seed) generator, so the same seed
    always selects the same rows while the module-level `random` and
    `numpy.random` states are left untouched. A generator seeded this way
    produces the same indices as seeding the module-level `random` functions
    with that seed.

    Parameters:
    - dataset: The Huggingface dataset to sample from (needs len() and .select())
    - num_samples: Number of samples to return (default: 5); capped at the
      dataset size
    - seed: Random seed for reproducibility (default: 42)

    Returns:
    - A new dataset containing the sampled examples

    Raises:
    - ValueError: if num_samples is negative
    """
    # Dedicated generator: reproducible without clobbering global RNG state
    rng = random.Random(seed)

    # Never ask for more rows than the dataset holds
    dataset_size = len(dataset)
    sample_size = min(num_samples, dataset_size)

    # Draw distinct row indices
    indices = rng.sample(range(dataset_size), sample_size)

    # Use Huggingface's built-in select method to get samples
    return dataset.select(indices)


In [14]:
# Draw a reproducible 10k-example subset (seed spelled out; 42 is also the default).
ds_10k = sample_dataset(ds, num_samples=10_000, seed=42)

In [15]:
# Check that the subset resembles the full dataset: overall report first,
# then the per-source averages (echoed as the cell result).
analyze_dataset(dataset=ds_10k)
analyze_difficulty_by_source(dataset=ds_10k)

Total entries in filtered dataset: 10000

Source Distribution:
----------------------------------------
Source amc_aime: 382 occurrences (3.82% of the dataset)
Source aops_forum: 901 occurrences (9.01% of the dataset)
Source cn_contest: 1320 occurrences (13.20% of the dataset)
Source inequalities: 79 occurrences (0.79% of the dataset)
Source number_theory: 43 occurrences (0.43% of the dataset)
Source olympiads: 7206 occurrences (72.06% of the dataset)
Source olympiads_ref: 69 occurrences (0.69% of the dataset)

Difficulty Analysis:
----------------------------------------
Average difficulty: 0.6323

Difficulty Distribution:
Difficulty 0.00: 3040 occurrences (30.40% of the dataset)
Difficulty 0.20: 64 occurrences (0.64% of the dataset)
Difficulty 0.30: 4 occurrences (0.04% of the dataset)
Difficulty 0.50: 1143 occurrences (11.43% of the dataset)
Difficulty 0.70: 1 occurrences (0.01% of the dataset)
Difficulty 0.80: 57 occurrences (0.57% of the dataset)
Difficulty 1.00: 5691 occurrences 

{'olympiads': 0.6086247571468221,
 'cn_contest': 0.8396464646464648,
 'olympiads_ref': 0.6376811594202898,
 'aops_forum': 0.6761931187569368,
 'amc_aime': 0.25392670157068065,
 'number_theory': 0.6104651162790697,
 'inequalities': 0.6656118143459915}

In [18]:
from datasets import Dataset, DatasetDict

def save_dataset(dataset, output_dir, dataset_name="openr1_math_subset"):
    """
    Write a dataset to disk as a DatasetDict whose only split is 'train'.

    Parameters:
    - dataset: The Huggingface dataset to save
    - output_dir: Parent directory for the saved dataset
    - dataset_name: Sub-directory of output_dir that receives the files

    Returns:
    - Path to the saved dataset (output_dir joined with dataset_name)
    """
    # Resolve the target directory and make sure it exists
    full_output_path = os.path.join(output_dir, dataset_name)
    os.makedirs(full_output_path, exist_ok=True)

    # Wrap the data in a single-split DatasetDict and persist it
    DatasetDict({"train": dataset}).save_to_disk(full_output_path)

    # Tell the user where the data went and how to read it back
    report_lines = (
        f"Dataset saved to: {full_output_path}",
        "To load it back, use: from datasets import load_from_disk",
        f"                       dataset_dict = load_from_disk('{full_output_path}')",
        "                       ds = dataset_dict['train']",
    )
    for line in report_lines:
        print(line)

    return full_output_path

In [19]:
# Persist the 10k subset next to the other processed datasets.
save_dataset(
    ds_10k,
    output_dir="../processed_datasets",
    dataset_name="openr1_math_subset_10k",
)

Saving the dataset (0/2 shards):   0%|          | 0/10000 [00:00<?, ? examples/s]

Saving the dataset (2/2 shards): 100%|██████████| 10000/10000 [00:00<00:00, 10587.11 examples/s]

Dataset saved to: ../processed_datasets/openr1_math_subset_10k
To load it back, use: from datasets import load_from_disk
                       dataset_dict = load_from_disk('../processed_datasets/openr1_math_subset_10k')
                       ds = dataset_dict['train']





'../processed_datasets/openr1_math_subset_10k'