In [1]:
import json
import random
from datasets import load_dataset

import warnings
warnings.filterwarnings("ignore")

# Dataset identifier, configuration name and split passed to datasets.load_dataset().
DATASET = "wikimedia/wikipedia"
SUBSET = "20231101.en"               # default subset
SPLIT = "train"
STREAMING = True                     # forwarded as load_dataset(streaming=...)

# JSON Lines file the kept samples are written to.
OUTPUT_PATH = "pretrain_wikipedia_en.jsonl"

MAX_SAMPLES = 500_000       # stop after this many kept samples (500k, i.e. 5 lakh)
MAX_CHARS = 512             # strict limit: texts with len(text) >= MAX_CHARS are skipped

# Number of kept samples buffered before each shuffle-and-flush. Samples are
# shuffled only within a chunk; the order of chunks follows the stream, so this
# is a local shuffle, not a global one.
CHUNK_SIZE = 50_000


def write_chunk(chunk, path):
    """Shuffle ``chunk`` in place and append it to ``path`` as JSON Lines.

    Every item is serialized with ``json.dumps(..., ensure_ascii=False)`` and
    written on its own line. The file is opened in append mode with UTF-8
    encoding, so existing content is kept. The caller's list is reordered.
    """
    random.shuffle(chunk)
    serialized = (
        json.dumps(record, ensure_ascii=False) + "\n" for record in chunk
    )
    with open(path, mode="a", encoding="utf-8") as out:
        out.writelines(serialized)


def main():
    """Stream the configured dataset and export short articles as JSON Lines.

    Iterates the dataset returned by ``load_dataset(DATASET, name=SUBSET,
    split=SPLIT, streaming=STREAMING)``. A sample is kept when its stripped
    ``text`` field is non-empty and strictly shorter than ``MAX_CHARS``.
    Each kept text is wrapped as ``<|im_start|>...<|im_end|>`` and buffered.
    Once ``CHUNK_SIZE`` samples are buffered they are shuffled and appended
    to ``OUTPUT_PATH`` via ``write_chunk``. Streaming stops once
    ``MAX_SAMPLES`` samples have been kept; any remainder is flushed as a
    final chunk.

    ``OUTPUT_PATH`` is truncated before the first chunk is written, so each
    run produces a fresh file instead of appending to a previous export.

    Prints progress and a summary, including a rough token estimate that
    assumes about 4 characters per token.
    """
    print(f"ðŸ”„ Streaming dataset: {DATASET} ({SUBSET} subset)")

    ds = load_dataset(
        DATASET,
        name=SUBSET,
        split=SPLIT,
        streaming=STREAMING
    )

    # write_chunk() only ever appends, so start from an empty file; otherwise
    # re-running would stack a second export onto the previous one while the
    # summary below still reports a single run's count. Done after
    # load_dataset() so a failed load does not wipe an existing export.
    with open(OUTPUT_PATH, "w", encoding="utf-8"):
        pass

    buffer = []
    written = 0
    total_chars = 0
    chunk_id = 1

    for sample in ds:
        if written >= MAX_SAMPLES:
            break

        # A missing or None "text" field is treated as empty.
        text = (sample.get("text") or "").strip()
        if not text:
            continue

        # strict length: over-long texts are skipped, not truncated
        if len(text) >= MAX_CHARS:
            continue

        # Counts only the raw text, not the added <|im_start|>/<|im_end|> markers.
        total_chars += len(text)

        buffer.append({
            "text": f"<|im_start|>{text}<|im_end|>"
        })
        written += 1

        # chunk flushing
        if len(buffer) >= CHUNK_SIZE:
            print(f"ðŸ§¹ Writing chunk {chunk_id} ({len(buffer)}) â€” Total: {written}")
            write_chunk(buffer, OUTPUT_PATH)
            buffer.clear()
            chunk_id += 1

    # final chunk (fewer than CHUNK_SIZE samples left over)
    if buffer:
        print(f"ðŸ§¹ Writing final chunk ({len(buffer)})")
        write_chunk(buffer, OUTPUT_PATH)

    # summary
    print("\nðŸŽ‰ DONE!")
    print(f"ðŸ“¦ Total samples written: {written}")
    print(f"ðŸ”¢ Estimated tokens: {total_chars / 4:.0f}")  # approx 4 chars/token
    print(f"ðŸ”¢ Tokens (millions): {total_chars / 4 / 1e6:.3f}M")
    print(f"ðŸ“„ Saved to: {OUTPUT_PATH}")


# Run the export only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

ðŸ”„ Streaming dataset: wikimedia/wikipedia (20231101.en subset)


Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

ðŸ§¹ Writing chunk 1 (50000) â€” Total: 50000
ðŸ§¹ Writing chunk 2 (50000) â€” Total: 100000
ðŸ§¹ Writing chunk 3 (50000) â€” Total: 150000
ðŸ§¹ Writing chunk 4 (50000) â€” Total: 200000
ðŸ§¹ Writing chunk 5 (50000) â€” Total: 250000
ðŸ§¹ Writing chunk 6 (50000) â€” Total: 300000
ðŸ§¹ Writing chunk 7 (50000) â€” Total: 350000
ðŸ§¹ Writing chunk 8 (50000) â€” Total: 400000
ðŸ§¹ Writing chunk 9 (50000) â€” Total: 450000
ðŸ§¹ Writing chunk 10 (50000) â€” Total: 500000

ðŸŽ‰ DONE!
ðŸ“¦ Total samples written: 500000
ðŸ”¢ Estimated tokens: 36951168
ðŸ”¢ Tokens (millions): 36.951M
ðŸ“„ Saved to: pretrain_wikipedia_en.jsonl
