# Preparing the OpenWebText dataset

This notebook mirrors `prepare.py` and walks through the steps that turn the OpenWebText corpus into the binary token files (`train.bin` and `val.bin`) used to train nanoGPT.

In [None]:
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset

### Load the dataset and create train/val splits

In [None]:
# Number of worker processes used by the later `.map()` tokenization call.
num_proc = 8

# Number of worker processes used by `load_dataset()`; kept as a separate
# knob so it can be tuned independently of `num_proc`.
num_proc_load_dataset = num_proc

dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)

# Carve a small held-out set out of the 'train' split. The fixed seed keeps
# the shuffle (and therefore the split) reproducible across runs.
train_split = dataset["train"]
split_dataset = train_split.train_test_split(
    test_size=0.0005,
    seed=2357,
    shuffle=True,
)

# `train_test_split` names the held-out part 'test'; store it under 'val'.
held_out = split_dataset.pop('test')
split_dataset['val'] = held_out

### Tokenize using GPT-2 BPE

In [None]:
# GPT-2 byte-pair encoder shared by every `process` call.
enc = tiktoken.get_encoding("gpt2")


def process(example):
    """Tokenize one record's text and terminate it with the end-of-text token.

    Args:
        example: A dataset record; only its 'text' field is read.

    Returns:
        A dict with 'ids' (the encoded token ids followed by
        `enc.eot_token`) and 'len' (the number of entries in 'ids').
    """
    token_ids = [*enc.encode_ordinary(example['text']), enc.eot_token]
    return {'ids': token_ids, 'len': len(token_ids)}


# Run `process` over every split with `num_proc` workers, dropping the raw
# 'text' column so only 'ids' and 'len' remain.
map_options = {
    'remove_columns': ['text'],
    'desc': "tokenizing the splits",
    'num_proc': num_proc,
}
tokenized = split_dataset.map(process, **map_options)

### Write the tokens to binary files

In [None]:
# Write each tokenized split to `<split>.bin` as one flat array of token ids.
#
# `__file__` only exists when this code runs as a script; inside a notebook
# cell it is undefined and would raise NameError. Fall back to the current
# working directory in that case. Script behaviour is unchanged.
try:
    output_dir = os.path.dirname(__file__)
except NameError:
    output_dir = os.getcwd()

for split, dset in tokenized.items():
    # Total number of tokens in this split, i.e. the length of the output array.
    arr_len = np.sum(dset['len'], dtype=np.uint64)
    filename = os.path.join(output_dir, f'{split}.bin')
    # Tokens are stored as uint16; this assumes every token id fits in 16 bits
    # (expected for the GPT-2 BPE vocabulary -- confirm if the encoder changes).
    arr = np.memmap(filename, dtype=np.uint16, mode='w+', shape=(arr_len,))

    # Write in shards so the whole split never has to sit in memory at once.
    # Cap the shard count at the number of rows: with more shards than rows
    # some shards are empty and `np.concatenate` raises on an empty list.
    total_batches = min(1024, len(dset))

    idx = 0  # write cursor into `arr`
    for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
        # Contiguous sharding preserves row order in the output file.
        batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
        arr_batch = np.concatenate(batch['ids'])
        arr[idx : idx + len(arr_batch)] = arr_batch
        idx += len(arr_batch)
    # Push any pending writes of the memory map out to disk.
    arr.flush()

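The resulting `train.bin` is about 17GB and `val.bin` is around 8.5MB. Both files are flat arrays of `uint16` token ids, so they can be read back later with `np.memmap(filename, dtype=np.uint16, mode='r')`.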