# `write_mp` — Parallel Write from HuggingFace Dataset

Load a multi-shard Hugging Face dataset, tokenize each row with a transform function, and write the result in parallel.

In [None]:
!uv pip install git+https://github.com/Scicom-AI-Enterprise-Organization/ChiniDataset.git
!uv pip install datasets transformers

## 1. Load dataset

Wikipedia EN has 41 parquet shards. `load_dataset` handles all of them and returns a single indexable dataset.

In [None]:
from datasets import load_dataset

# Load the "train" split of the "20231101.en" configuration of wikimedia/wikipedia.
ds = load_dataset("wikimedia/wikipedia", "20231101.en", split="train")
# Report the number of rows, formatted with thousands separators.
print(f"Loaded {len(ds):,} articles")

## 2. Define transform

The transform runs inside each worker — this is where you do per-row processing.

It must be a **top-level function** (not a lambda) so that it works with multiprocessing.

In [None]:
import numpy as np
from transformers import AutoTokenizer

# Module-level tokenizer loaded from the pretrained "gpt2" name; transform() reads this global.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

def transform(row):
    """Convert one dataset row into a tokenized sample.

    The row's "text" field is stripped and lower-cased, then prefixed with a
    "title: ..." header built from the row's "title" field (the title itself
    is used as-is). The combined string is tokenized with the module-level
    ``tokenizer`` with ``add_special_tokens=False``.

    Args:
        row: Mapping that provides the "text" and "title" keys.

    Returns:
        A dict with a single key, "input_ids", holding the token ids as a
        ``np.uint32`` NumPy array.
    """
    # Normalise the article body first, then prepend the title header.
    text = row["text"].strip().lower()
    document = f"title: {row['title']}\n\n{text}"

    # Only the ids are kept from the tokenizer output.
    encoding = tokenizer(document, add_special_tokens=False)
    token_ids = encoding["input_ids"]

    return {"input_ids": np.array(token_ids, dtype=np.uint32)}

## 3. `write_mp`

`write_mp` partitions the dataset across 4 workers. Each worker iterates its chunk, applies `transform`, and writes to its own subdirectory. Index files are merged automatically.

In [None]:
from chinidataset import ParquetWriter

# Output directory; the verification cell later reads index.json from here.
OUT = "./wiki_tokenized"
# Column schema handed to the writer: one column, named after the key that
# transform() returns.
columns = {"input_ids": "uint32[]"}

# The writer is used as a context manager, so its __exit__ runs when the block ends.
# NOTE(review): exist_ok=True presumably permits an already-existing output
# directory -- confirm against the ChiniDataset documentation.
with ParquetWriter(out=OUT, columns=columns, exist_ok=True) as writer:
    # Write `ds` using 4 workers, with `transform` applied to the rows.
    writer.write_mp(ds, num_workers=4, transform=transform)

## 4. Verify

In [None]:
import json, os

# Load the index file located at the root of the output directory.
with open(os.path.join(OUT, "index.json")) as f:
    idx = json.load(f)

# Total sample count across all shard entries listed in the index.
total = sum(shard["samples"] for shard in idx["shards"])
# Number of shard entries.
shards = len(idx["shards"])
# Number of distinct leading path components among the shard basenames.
partitions = len(
    {shard["raw_data"]["basename"].split("/", 1)[0] for shard in idx["shards"]}
)

# One print call, one summary line per value.
print(
    f"Total samples: {total:,}",
    f"Shards: {shards}",
    f"Partitions: {partitions}",
    sep="\n",
)