# ChiniDataset vs MosaicML — uint32 Tokenized Write + Read

1. Setup
2. Tokenize + Write (inline)
3. Read
4. Summary

## 1. Setup

Download one Wikipedia EN shard (156k articles) and build a simple word-level tokenizer: each article is split on whitespace and every word is mapped to an integer ID with an O(1) dict lookup.

In [None]:
!uv pip install git+https://github.com/Scicom-AI-Enterprise-Organization/ChiniDataset.git
!uv pip install mosaicml-streaming

In [None]:
import numpy as np
import pyarrow.parquet as pq
import shutil
import time
from pathlib import Path
from tqdm import tqdm
from huggingface_hub import hf_hub_download

# Fetch one parquet shard of the English Wikipedia dataset; the returned
# path is handed straight to pyarrow below.
print("Downloading Wikipedia EN shard...")
parquet_path = hf_hub_download(
    repo_id="wikimedia/wikipedia",
    filename="20231101.en/train-00000-of-00041.parquet",
    repo_type="dataset",
)

# Only the "text" column is used in this notebook, so read just that column
# instead of materialising every column of the shard in memory.
table = pq.read_table(parquet_path, columns=["text"])
texts = table["text"].to_pylist()
N = len(texts)  # number of articles; used as the row count in every benchmark
print(f"Loaded {N:,} articles")

# Build word-level vocab — O(1) dict lookup per word.
# IDs are assigned in first-seen order starting at 0, so every whitespace-
# separated word that occurs in `texts` ends up with an ID.
print("Building vocab...")
vocab = {}
for t in tqdm(texts, desc="Building vocab"):
    for word in t.split():
        if word not in vocab:
            vocab[word] = len(vocab)
print(f"Vocab size: {len(vocab):,} words")

def tokenize(text):
    """Map whitespace-separated words of ``text`` to a uint32 array of vocab IDs.

    Words missing from the module-level ``vocab`` fall back to ID 0. Note that
    0 is also the ID of the first word added to ``vocab``, so an unknown word
    is indistinguishable from that word.
    """
    words = text.split()
    ids = [vocab.get(word, 0) for word in words]
    return np.array(ids, dtype=np.uint32)

## 2. Tokenize + Write (inline)

Tokenize each article and write the uint32 token arrays in the same loop — there is no separate tokenization pass, so the write timings below include tokenization.

In [None]:
from chinidataset import ParquetWriter

# Start from an empty output directory so files left by an earlier run are
# not mixed into this one.
chini_out = "./bench_chinidataset"
if Path(chini_out).exists():
    shutil.rmtree(chini_out)

chini_columns = {"input_ids": "uint32[]", "labels": "uint32[]"}

# The timed region covers writer construction, tokenization, every write()
# call, and the writer's context-manager exit.
started = time.perf_counter()
with ParquetWriter(out=chini_out, columns=chini_columns) as writer:
    for article in tqdm(texts, desc="ChiniDataset"):
        ids = tokenize(article)
        # The same array is stored under both column names.
        writer.write({"input_ids": ids, "labels": ids})
chini_write = time.perf_counter() - started

print(f"ChiniDataset: {chini_write:.2f}s | {N / chini_write:,.0f} rows/s")

In [None]:
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings

class UInt32(Encoding):
    """Codec that stores a NumPy array as its raw bytes and reads it back as uint32.

    Neither dtype nor shape is written: ``decode`` always interprets the bytes
    as a flat uint32 array, and ``encode`` does not check the dtype of what it
    is given, so callers must pass uint32 data.
    """

    def encode(self, obj):
        """Return the raw bytes of the array ``obj``."""
        raw = obj.tobytes()
        return raw

    def decode(self, data):
        """Return ``data`` viewed as a one-dimensional uint32 array."""
        return np.frombuffer(data, dtype=np.uint32)


# Register the codec under the name "uint32" in streaming's private
# `_encodings` mapping; presumably this is what lets the "uint32" column type
# passed to MDSWriter in this notebook resolve to the class above.
_encodings["uint32"] = UInt32

# Start from an empty output directory so files left by an earlier run are
# not mixed into this one.
mds_out = "./bench_mosaicml"
if Path(mds_out).exists():
    shutil.rmtree(mds_out)

mds_columns = {"input_ids": "uint32", "labels": "uint32"}

# The timed region covers writer construction, tokenization, every write()
# call, and the writer's context-manager exit.
started = time.perf_counter()
with MDSWriter(out=mds_out, columns=mds_columns) as writer:
    for article in tqdm(texts, desc="MosaicML"):
        ids = tokenize(article)
        # The same array is stored under both column names.
        writer.write({"input_ids": ids, "labels": ids})
mds_write = time.perf_counter() - started

print(f"MosaicML: {mds_write:.2f}s | {N / mds_write:,.0f} rows/s")

## 3. Read

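Read every row back, access its `input_ids` field, and measure throughput. The dataset object is constructed before the timer starts, so only iteration is timed.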

In [None]:
from chinidataset import StreamingDataset

# Dataset construction is deliberately outside the timed region; only the
# iteration over rows is measured.
ds = StreamingDataset(local=chini_out)

started = time.perf_counter()
count = 0  # stays 0 if the dataset yields nothing
for count, row in enumerate(tqdm(ds, desc="ChiniDataset read", total=N), start=1):
    # Touch the field so the row's token array is actually retrieved.
    _ = row["input_ids"]
chini_read = time.perf_counter() - started

print(f"ChiniDataset: {count:,} rows | {chini_read:.2f}s | {count / chini_read:,.0f} rows/s")

In [None]:
from streaming import StreamingDataset as MosaicDS

# Dataset construction is deliberately outside the timed region; only the
# iteration over rows is measured.
mds_ds = MosaicDS(local=mds_out, shuffle=False, batch_size=1)

started = time.perf_counter()
count = 0  # stays 0 if the dataset yields nothing
for count, row in enumerate(tqdm(mds_ds, desc="MosaicML read", total=N), start=1):
    # Touch the field so the row's token array is actually retrieved.
    _ = row["input_ids"]
mds_read = time.perf_counter() - started

print(f"MosaicML: {count:,} rows | {mds_read:.2f}s | {count / mds_read:,.0f} rows/s")

## 4. Summary

In [None]:
# Print a side-by-side throughput table for the write and read benchmarks.
# "Speedup" is MosaicML time divided by ChiniDataset time, i.e. values above
# 1.0x mean ChiniDataset was faster.
print(f"Dataset: Wikipedia EN ({N:,} articles), simple word tokenizer, uint32")
print()

header = f"{'Metric':<25} {'MosaicML':>15} {'ChiniDataset':>15} {'Speedup':>10}"
print(header)
# Derive the rule from the header so the two can never drift apart.
print("-" * len(header))

# Each rate cell is 13 digits plus the "/s" suffix = 15 characters, matching
# the 15-character header columns; the speedup cell is 9 + "x" = 10.
print(f"{'Tokenize+Write (rows/s)':<25} {N / mds_write:>13,.0f}/s {N / chini_write:>13,.0f}/s {mds_write / chini_write:>9.1f}x")
# NOTE: `count` holds the row count from whichever read cell ran last; both
# rates below use that single value.
print(f"{'Read (rows/s)':<25} {count / mds_read:>13,.0f}/s {count / chini_read:>13,.0f}/s {mds_read / chini_read:>9.1f}x")