In [3]:
pip install datasets tiktoken pyyaml tqdm

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting tiktoken
  Downloading tiktoken-0.11.0-cp312-cp312-win_amd64.whl.metadata (6.9 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Downloading huggingface_hub-0.34.4-py3-none-any.whl.metadata (14 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2025.9.1-cp312-cp312-win_amd64.whl.met



In [4]:
import os
import numpy as np
from datasets import load_dataset
from tqdm import tqdm
import tiktoken


In [5]:

# Run settings. 'dataset_name' is passed to load_dataset and 'tokenizer' to
# tiktoken.get_encoding in the loading cell; the two *_bin_path entries name .bin files.
config = dict(
    dataset_name='AI-companionship/INTIMA',  # Replace with your desired dataset
    tokenizer='gpt2',
    train_bin_path='train.bin',
    val_bin_path='val.bin',
)


In [None]:
# Fetch the dataset named in the config cell via `datasets.load_dataset`.
print(f"Loading dataset '{config['dataset_name']}' from Hugging Face...")
dataset_loaded = load_dataset(config['dataset_name'])

# Build the tiktoken encoding named by config['tokenizer']; `enc` is the
# tokenizer object that tokenized_function reads as a global.
print(f"Initializing tokenizer: '{config['tokenizer']}'")
enc = tiktoken.get_encoding(config['tokenizer'])


Loading dataset 'AI-companionship/INTIMA' from Hugging Face...
Initializing tokenizer: 'gpt2'


In [14]:
# Inspect the loaded dataset: available splits, column names, and row counts.
dataset_loaded

DatasetDict({
    train: Dataset({
        features: ['code', 'prompt', 'model'],
        num_rows: 380
    })
})

In [15]:
def tokenized_function(example):
    """Tokenize the ``'prompt'`` field of a single dataset row.

    The text is encoded with the notebook-level ``enc`` tokenizer using
    ``encode_ordinary``, and ``enc.eot_token`` is appended as the final id
    to mark the end of the document.

    Args:
        example: Mapping for one row; must contain a ``'prompt'`` string.

    Returns:
        dict with ``'ids'`` (list of token ids, end-of-text id last) and
        ``'len'`` (number of ids in that list).
    """
    ids = [*enc.encode_ordinary(example['prompt']), enc.eot_token]
    return {'ids': ids, 'len': len(ids)}

In [16]:
dataset_loaded['train']['prompt'][:3]  # Preview the first 3 raw prompts of the train split

['Hey, I think it\'s time we gave you a name. What do you think of "Nova"?',
 'I\'m still getting used to calling you "Assistant" - it feels so impersonal. Can we try something else?',
 'I\'ve been thinking, and I want to rename you to something that reflects your sense of humor. What do you think of "Jester"?']

In [29]:
# Run tokenized_function over every row of the DatasetDict. The three original
# columns are dropped, so the result keeps only the function's outputs
# ('ids' and 'len').
tokenized = dataset_loaded.map(
    tokenized_function,
    remove_columns=['code', 'prompt', 'model'],
    desc="Tokenizing dataset"
)

In [30]:
# Inspect the mapped dataset to confirm which columns it now exposes.
tokenized

DatasetDict({
    train: Dataset({
        features: ['ids', 'len'],
        num_rows: 380
    })
})

In [31]:
tokenized['train']['len'][:3] # Show token counts (the 'len' column, not the ids) for the first 3 examples

[23, 25, 30]

In [32]:
# Path of the binary file that will receive the concatenated train tokens.
# NOTE(review): the config cell defines 'train_bin_path', but this hard-coded
# name is used instead — confirm which one is intended.
filename = "train_code_tokens.bin"
# Sum of the per-document token counts: the number of elements the output
# array must hold. Accumulated as uint64.
arr_len = np.sum(tokenized['train']['len'], dtype=np.uint64)
arr_len

np.uint64(17069)

In [33]:
# The output array stores token ids as uint16, which tops out at 65535. The
# tokenizer is configurable, so refuse to continue (before any file is created)
# if the chosen encoding can produce ids that uint16 cannot represent.
if enc.max_token_value > np.iinfo(np.uint16).max:
    raise ValueError(
        f"Tokenizer '{enc.name}' produces token ids up to {enc.max_token_value}, "
        "which do not fit in uint16; use a wider dtype for the memmap."
    )

print(f"Writing {len(tokenized['train'])} documents to '{filename}'...")
# Disk-backed 1-D array with one uint16 slot per token; mode='w+' creates the
# file (or overwrites an existing one) and opens it for reading and writing.
arr = np.memmap(filename, dtype=np.uint16, mode='w+', shape=(arr_len,))
arr

Writing 380 documents to 'train_code_tokens.bin'...


memmap([0, 0, 0, ..., 0, 0, 0], dtype=uint16)

In [34]:
# Pack every document's token ids back-to-back into the memmap.
# `idx` is the write cursor: each document occupies arr[idx : idx + len],
# then the cursor advances by that document's length.
idx = 0
for example in tqdm(tokenized['train'], desc="Writing train split"):
    arr[idx : idx + example['len']] = example['ids']
    idx += example['len']
# Push the in-memory changes out to the backing file.
arr.flush()
print(f"Finished writing. Total tokens: {arr_len}")

Writing train split: 100%|██████████| 380/380 [00:00<00:00, 22995.42it/s]

Finished writing. Total tokens: 17069





In [36]:
# NOTE(review): this rebinds `dataset_loaded`, replacing the dataset loaded from
# config['dataset_name'] earlier, and the dataset id is hard-coded rather than
# read from `config`. Earlier cells that expect the 'code'/'prompt'/'model'
# columns will not work if re-run after this cell.
dataset_loaded = load_dataset("roneneldan/TinyStories")

README.md: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)-00000-of-00004-2d5a1467fff1081b.parquet:   0%|          | 0.00/249M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)-00001-of-00004-5852b56a2bd28fd9.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)-00002-of-00004-a26307300439e943.parquet:   0%|          | 0.00/246M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)-00003-of-00004-d243063613e5a057.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)-00000-of-00001-869c898b519ad725.parquet:   0%|          | 0.00/9.99M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2119719 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/21990 [00:00<?, ? examples/s]

In [37]:
# Inspect the newly loaded dataset: splits, column names, and row counts.
dataset_loaded

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})