In [5]:
import datasets
from datasets import load_dataset

import os
import torch
from tqdm import tqdm
import numpy as np
import tiktoken
# Shared tiktoken encoder (the "gpt2" encoding); tokenize_with_pad reads it as a global.
tknzr = tiktoken.get_encoding("gpt2")
# Directory where get_kodcode caches the tokenized splits (<split>.bin / <split>.len).
# NOTE(review): absolute, user-specific path — consider making this configurable.
KODCODE_DATA_PATH = os.path.join("/scratch/homes/sfan/multi_doge/src/data/datasets", "kodcode/")


In [6]:
def tokenize_with_pad(text, pad_to_multiple=512, tokenizer=None):
    """
    Tokenize ``text``, append the end-of-text token, and right-pad the result
    with that same token up to the next multiple of ``pad_to_multiple``.

    Args:
        text: String to encode.
        pad_to_multiple: Positive integer; the returned length is the smallest
            multiple of this value that fits the tokens plus one EOT token.
        tokenizer: Optional encoder exposing ``encode_ordinary(text)`` and an
            ``eot_token`` attribute. Defaults to the module-level ``tknzr``.

    Returns:
        1-D ``np.uint16`` array of token ids, padded with the EOT token id.

    Raises:
        ValueError: If ``pad_to_multiple`` is smaller than 1, or if any token
            id does not fit in ``uint16``.
    """
    if pad_to_multiple < 1:
        raise ValueError(f"pad_to_multiple must be >= 1, got {pad_to_multiple}")

    enc = tknzr if tokenizer is None else tokenizer
    pad_token_id = enc.eot_token

    ids = list(enc.encode_ordinary(text))
    ids.append(pad_token_id)
    # Stage the ids in a wide dtype so the range check below is exact.
    token_arr = np.asarray(ids, dtype=np.int64)

    # The output is stored as uint16; refuse ids that would wrap or overflow.
    uint16_max = np.iinfo(np.uint16).max
    if token_arr.min() < 0 or token_arr.max() > uint16_max:
        raise ValueError(
            f"token ids must lie in [0, {uint16_max}] to be stored as uint16"
        )

    # Round the length up to the next multiple of pad_to_multiple.
    padded_length = ((token_arr.size + pad_to_multiple - 1) // pad_to_multiple) * pad_to_multiple
    # Pre-fill with the pad token (not zeros), then overwrite the prefix.
    padded_tokens = np.full(padded_length, pad_token_id, dtype=np.uint16)
    padded_tokens[:token_arr.size] = token_arr
    return padded_tokens

def get_kodcode(num_proc=10, return_torch=False):
    """
    Load and process the KodCode (for code reasoning) dataset.

    On first use the "KodCode/KodCode-V1-SFT-R1" train split is downloaded,
    split 90/10 into train/val (seed 2357), tokenized with ``tokenize_with_pad``
    and written to ``KODCODE_DATA_PATH`` as flat ``uint16`` binary files
    (``<split>.bin``) plus per-example padded lengths (``<split>.len``).
    Later calls reuse those files.

    Args:
        num_proc: Number of worker processes used for tokenization.
        return_torch: If True, return the token streams as ``torch`` tensors
            instead of read-only ``np.memmap`` arrays.

    Returns:
        dict with keys ``'train'`` / ``'val'`` (1-D token streams) and
        ``'train_len'`` / ``'val_len'`` (per-example padded lengths as loaded
        from the ``.len`` files).
    """
    # The cache is only trusted when every artifact is present. Each .len file
    # is written after its .bin, so an interrupted run leaves it missing and
    # the cache is rebuilt instead of serving a partially written .bin.
    required_files = [
        os.path.join(KODCODE_DATA_PATH, f'{split}.{ext}')
        for split in ('train', 'val')
        for ext in ('bin', 'len')
    ]
    if not all(os.path.exists(path) for path in required_files):
        os.makedirs(KODCODE_DATA_PATH, exist_ok=True)

        # Load the KodCode dataset from Hugging Face Datasets
        dataset = load_dataset("KodCode/KodCode-V1-SFT-R1", trust_remote_code=True)
        dataset = dataset["train"].train_test_split(test_size=0.1, seed=2357, shuffle=True)
        data_dict = {
            'train': dataset["train"],
            'val': dataset["test"],
        }

        def process(example, pad_to_multiple=512):
            """
            Tokenize one example: question and solution joined by a newline,
            padded to a multiple of ``pad_to_multiple`` tokens.
            """
            question = example['question']
            answer = example['solution']

            concatenated_text = f"{question}\n{answer}"
            ids = tokenize_with_pad(text=concatenated_text,
                                    pad_to_multiple=pad_to_multiple)
            return {'ids': ids, 'len': len(ids)}

        # Tokenize and map the dataset, keeping only the new 'ids'/'len' columns
        tokenized = {}
        for split, dset in data_dict.items():
            tokenized[split] = dset.map(
                process,
                remove_columns=dset.column_names,
                desc=f"Tokenizing {split} split",
                num_proc=num_proc
            )

        # Concatenate all the token IDs into one large binary file per split
        for split, dset in tokenized.items():
            # Lengths are padded token counts; keep them in a wide dtype because
            # any example of 65536+ tokens does not fit in uint16.
            len_arr = np.asarray(dset['len'], dtype=np.int64)
            # Total number of tokens
            arr_len = int(len_arr.sum())
            filename = os.path.join(KODCODE_DATA_PATH, f'{split}.bin')
            arr = np.memmap(filename, dtype=np.uint16, mode='w+', shape=(arr_len,))
            total_batches = 10

            idx = 0
            for batch_idx in tqdm(range(total_batches), desc=f'Writing {filename}'):
                batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy').to_dict()
                # Shards can be empty when the split has fewer rows than shards.
                if len(batch['ids']) == 0:
                    continue
                arr_batch = np.concatenate(batch['ids'])
                arr[idx: idx + len(arr_batch)] = arr_batch
                idx += len(arr_batch)
            arr.flush()

            # Written last: acts as the completion marker for this split.
            with open(os.path.join(KODCODE_DATA_PATH, f'{split}.len'), 'wb') as f:
                np.save(f, len_arr)

    # Load tokenized binary files for training, validation
    train_data = np.memmap(os.path.join(KODCODE_DATA_PATH, 'train.bin'), dtype=np.uint16, mode='r')
    val_data = np.memmap(os.path.join(KODCODE_DATA_PATH, 'val.bin'), dtype=np.uint16, mode='r')

    if return_torch:
        # NOTE(review): this needs a PyTorch build that accepts uint16 arrays — confirm.
        train_data = torch.tensor(np.array(train_data, dtype=np.uint16))
        val_data = torch.tensor(np.array(val_data, dtype=np.uint16))
    print(f'Benchmark KodCode: train[{len(train_data)}] | val[{len(val_data)}]')
    return {
        'train': train_data,
        'train_len': np.load(os.path.join(KODCODE_DATA_PATH, 'train.len')),
        'val': val_data,
        'val_len': np.load(os.path.join(KODCODE_DATA_PATH, 'val.len')),
    }

In [7]:
# Build the tokenized KodCode splits, or load them from KODCODE_DATA_PATH if already cached;
# with return_torch=False the token streams come back as read-only np.memmap arrays.
kodcode = get_kodcode(num_proc=10, return_torch=False)

incorrect-00009-of-00010.parquet:   0%|          | 0.00/173M [00:00<?, ?B/s]

use_with_caution-00000-of-00001.parquet:   0%|          | 0.00/39.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/245937 [00:00<?, ? examples/s]

Generating incorrect split:   0%|          | 0/192557 [00:00<?, ? examples/s]

Generating use_with_caution split:   0%|          | 0/4439 [00:00<?, ? examples/s]

Tokenizing train split (num_proc=10):   0%|          | 0/221343 [00:00<?, ? examples/s]

Tokenizing val split (num_proc=10):   0%|          | 0/24594 [00:00<?, ? examples/s]

Writing /scratch/homes/sfan/multi_doge/src/data/datasets/kodcode/train.bin: 100%|██████████| 10/10 [00:51<00:00,  5.14s/it]
Writing /scratch/homes/sfan/multi_doge/src/data/datasets/kodcode/val.bin: 100%|██████████| 10/10 [00:05<00:00,  1.86it/s]

Benchmark KodCode: train[193297920] | val[21355520]



