# ChiniDataset — uint32 Tokenized Write + Read

1. Setup
2. Tokenize + Write
3. Read

## 1. Setup

Download one shard of English Wikipedia (the first of 41 parquet files from the 20231101 dump, about 156k articles) and load the GPT-2 tokenizer.

In [None]:
!uv pip install git+https://github.com/Scicom-AI-Enterprise-Organization/ChiniDataset.git
!uv pip install datasets transformers

In [None]:
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

# Shard 00000 of 00041 from the 20231101 English Wikipedia dump, addressed
# through an hf:// URL.
WIKI_SHARD = "hf://datasets/wikimedia/wikipedia/20231101.en/train-00000-of-00041.parquet"

ds = load_dataset("parquet", data_files=WIKI_SHARD, split="train")
N = len(ds)
print(f"Loaded {N:,} articles")

# Module-level tokenizer; tokenize() looks it up by name at call time.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

def tokenize(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    Special tokens are not added (``add_special_tokens=False``).

    Args:
        text: The string to encode.

    Returns:
        The ``"input_ids"`` entry of the tokenizer output, converted to a
        NumPy array with dtype ``uint32``.
    """
    encoded = tokenizer(text, add_special_tokens=False)
    return np.asarray(encoded["input_ids"], dtype=np.uint32)

## 2. Tokenize + Write (inline)

Tokenize each article and write the uint32 token arrays in the same loop — no separate tokenization pass.

In [None]:
from chinidataset import ParquetWriter

# Output directory handed to ParquetWriter (and reused by the read section).
chini_out = "./bench_chinidataset"

# Column schema: both columns are declared with the "uint32[]" type string.
col = {
    "input_ids": "uint32[]",
    "labels": "uint32[]",
}

# Single pass over the shard: tokenize each article and write it immediately.
# The same token array is stored under both column names.
with ParquetWriter(out=chini_out, columns=col, exist_ok=True) as writer:
    for article in ds:
        token_ids = tokenize(article["text"])
        writer.write({"input_ids": token_ids, "labels": token_ids})

## 3. Read

In [None]:
from chinidataset import StreamingDataset

# Open the dataset from the local directory used by the write step.
read_ds = StreamingDataset(local=chini_out)

# Iterate over every sample once, touching its "input_ids" field; the value
# is discarded.
for record in read_ds:
    _ = record["input_ids"]