In [1]:
import json
import random
from datasets import load_dataset

import warnings
warnings.filterwarnings("ignore")

# ---- Source: which Hugging Face dataset to read, and how ----
DATASET = "allenai/c4"
SUBSET = "en"        # configuration name handed to load_dataset(name=...)
SPLIT = "train"
STREAMING = True     # handed to load_dataset(streaming=...)

# ---- Destination: JSON Lines file the records are appended to ----
OUTPUT_PATH = "pretrain_c4.jsonl"

# ---- Filtering and sizing ----
MAX_SAMPLES = 500_000   # stop once this many records are kept (500k, "5L" = 5 lakh)
MAX_CHARS = 512         # a text is kept only if it is strictly shorter than this

# Number of records buffered before each shuffle-and-flush. Records are
# shuffled within a chunk only, so the output order is locally (not globally)
# randomised.
CHUNK_SIZE = 40_000


def write_chunk(chunk, path):
    """Randomise the order of ``chunk`` in place and append it to ``path`` as JSON Lines.

    Every element is serialised with ``json.dumps(..., ensure_ascii=False)`` and
    written on its own line using UTF-8. The file is opened in append mode, so
    whatever it already contains is kept.
    """
    # In-place shuffle: the caller's list is reordered as well.
    random.shuffle(chunk)

    # Lazily serialise one record per line; writelines consumes the generator.
    serialized = (json.dumps(record, ensure_ascii=False) + "\n" for record in chunk)
    with open(path, mode="a", encoding="utf-8") as sink:
        sink.writelines(serialized)


def main():
    """Stream the configured dataset and write short texts to ``OUTPUT_PATH`` as JSONL.

    For every streamed sample the ``text`` field is stripped; empty texts and
    texts of ``MAX_CHARS`` characters or more are skipped. Each kept text is
    wrapped as ``<|im_start|>{text}<|im_end|>`` and buffered. Whenever the
    buffer reaches ``CHUNK_SIZE`` records it is shuffled and appended to the
    output file via ``write_chunk``. Processing stops once ``MAX_SAMPLES``
    records have been kept, and a summary is printed.

    The output file is truncated once per call, so re-running the cell
    replaces the previous result instead of appending duplicate records to it.

    The token figures in the summary are a rough estimate (character count of
    the raw texts, excluding the wrapper markers, divided by 4).
    """
    print(f"🔄 Streaming dataset: {DATASET} ({SUBSET} subset)")

    ds = load_dataset(
        DATASET,
        name=SUBSET,
        split=SPLIT,
        streaming=STREAMING
    )

    # write_chunk only ever appends, so start from an empty file. This is done
    # after load_dataset() so a failure to open the dataset does not wipe an
    # existing output file.
    with open(OUTPUT_PATH, "w", encoding="utf-8"):
        pass

    buffer = []
    written = 0        # records accepted so far (buffered or already flushed)
    total_chars = 0    # characters of raw text accepted, for the token estimate
    chunk_id = 1

    for sample in ds:
        if written >= MAX_SAMPLES:
            break

        text = (sample.get("text") or "").strip()
        if not text:
            continue

        # strict length: only texts shorter than MAX_CHARS are kept
        if len(text) >= MAX_CHARS:
            continue

        total_chars += len(text)

        buffer.append({
            "text": f"<|im_start|>{text}<|im_end|>"
        })
        written += 1

        # flush a full chunk to disk (shuffled within the chunk)
        if len(buffer) >= CHUNK_SIZE:
            print(f"🧹 Writing chunk {chunk_id} ({len(buffer)}) — Total: {written}")
            write_chunk(buffer, OUTPUT_PATH)
            buffer.clear()
            chunk_id += 1

    # final, possibly partial chunk
    if buffer:
        print(f"🧹 Writing final chunk ({len(buffer)})")
        write_chunk(buffer, OUTPUT_PATH)

    # summary
    print("\n🎉 DONE!")
    print(f"📦 Total samples written: {written}")
    print(f"🔢 Estimated tokens: {total_chars / 4:.0f}")  # approx 4 chars/token
    print(f"🔢 Tokens (millions): {total_chars / 4 / 1e6:.3f}M")
    print(f"📄 Saved to: {OUTPUT_PATH}")


if __name__ == "__main__":
    main()

🔄 Streaming dataset: allenai/c4 (en subset)


Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1024 [00:00<?, ?it/s]

🧹 Writing chunk 1 (40000) — Total: 40000
🧹 Writing chunk 2 (40000) — Total: 80000
🧹 Writing chunk 3 (40000) — Total: 120000
🧹 Writing chunk 4 (40000) — Total: 160000
🧹 Writing chunk 5 (40000) — Total: 200000
🧹 Writing chunk 6 (40000) — Total: 240000
🧹 Writing chunk 7 (40000) — Total: 280000
🧹 Writing chunk 8 (40000) — Total: 320000
🧹 Writing chunk 9 (40000) — Total: 360000
🧹 Writing chunk 10 (40000) — Total: 400000
🧹 Writing chunk 11 (40000) — Total: 440000
🧹 Writing chunk 12 (40000) — Total: 480000
🧹 Writing final chunk (20000)

🎉 DONE!
📦 Total samples written: 500000
🔢 Estimated tokens: 39842906
🔢 Tokens (millions): 39.843M
📄 Saved to: pretrain_c4.jsonl
