# ChiniDataset Example

1. Write from HuggingFace dataset
2. PyTorch DataLoader
3. Custom DataLoader
4. Inspect with pandas

In [None]:
# Install ChiniDataset directly from its GitHub repository.
!uv pip install git+https://github.com/Scicom-AI-Enterprise-Organization/ChiniDataset.git
# `datasets` provides `load_dataset`, which section 1 uses to fetch the source data.
!uv pip install datasets

## 1. Write from HuggingFace Dataset

In [None]:
import shutil
from pathlib import Path
from datasets import load_dataset
from chinidataset import ParquetWriter

# Load the test split of the stanfordnlp/imdb dataset; section 1 iterates over it row by row.
hf_ds = load_dataset(
    path="stanfordnlp/imdb",
    split="test",
)

In [None]:
# Output directory for the written dataset; later cells read from it.
OUT_DIR = "./dataset"

# Remove whatever a previous run left behind so this cell can be re-run.
out_path = Path(OUT_DIR)
if out_path.exists():
    shutil.rmtree(out_path)

# Column name -> type string handed to the writer.
columns = dict(text="str", label="int32")

# Used as a context manager so the writer is cleaned up on exit
# (presumably flushing the last shard -- confirm against ChiniDataset docs).
with ParquetWriter(out=OUT_DIR, columns=columns) as writer:
    for record in hf_ds:
        writer.write(record)

## 2. PyTorch DataLoader

In [None]:
from chinidataset import StreamingDataset
from torch.utils.data import DataLoader

# Open the dataset that section 1 wrote to OUT_DIR.
ds = StreamingDataset(local=OUT_DIR)

In [None]:
# No collate_fn is given, so PyTorch's default collation builds each batch.
loader = DataLoader(ds, batch_size=32)

# Each batch is indexed by column name.
for batch in loader:
    text, label = batch["text"], batch["label"]

## 3. Custom DataLoader

1. **`DatasetFixed`** — wraps `StreamingDataset` in a map-style `Dataset`, does per-sample preprocessing in `__getitem__`
2. **Custom `collator`** — batches text + labels (in a real training script, you'd tokenize here)
3. **`DataLoader`** — wires it together with `collate_fn=collator`

In [None]:
import torch
from chinidataset import StreamingDataset
from torch.utils.data import DataLoader


# 1. Dataset wrapper (same pattern as DatasetFixed in qwen3_adamw.py)

class DatasetFixed(torch.utils.data.Dataset):
    """Map-style ``Dataset`` facade over a ``StreamingDataset``.

    ``len()`` and indexing are forwarded to the wrapped dataset.
    ``__getitem__`` is the hook for per-sample preprocessing.
    """

    def __init__(self, local):
        # ``local`` is passed through unchanged as StreamingDataset's ``local`` argument.
        self.dataset = StreamingDataset(local=local)

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        # Hook for per-sample preprocessing (cast dtypes, drop columns, filter, ...).
        # In qwen3_adamw.py this casts to int64 and drops audio/text columns.
        # Here the sample is returned untouched.
        return self.dataset[idx]


# 2. Custom collator
#    In a packed-training script, this is where cu_seq_lens for flash attention would be built.
#    Here we show the same structural pattern with text + labels.

def collator(batch):
    """Collate a list of samples into a single batch dict.

    Args:
        batch: list of samples; each is either ``None`` or a mapping with
            ``'text'`` and ``'label'`` keys.

    Returns:
        dict with
        ``'text'``: list of the kept samples' ``'text'`` values,
        ``'labels'``: ``torch.long`` tensor of their ``'label'`` values,
        ``'num_items_in_batch'``: 0-dim tensor holding the number of kept samples.
    """
    # Skip None entries before collating.
    kept = []
    for sample in batch:
        if sample is not None:
            kept.append(sample)

    texts = [sample['text'] for sample in kept]
    label_values = [sample['label'] for sample in kept]
    labels = torch.tensor(label_values, dtype=torch.long)
    count = torch.tensor(len(kept))

    # In a real training script you'd tokenize here:
    #   encoded = tokenizer(texts, padding=True, return_tensors='pt')
    #   return {'input_ids': encoded.input_ids, 'labels': labels, ...}

    return {'text': texts, 'labels': labels, 'num_items_in_batch': count}


# 3. Wire it up
dataset = DatasetFixed(OUT_DIR)
print(f'Dataset: {len(dataset)} samples')
print(f'Sample: {dataset[0]}')

# Pass the custom collator to the DataLoader so batches come out in the
# {'text', 'labels', 'num_items_in_batch'} layout built by collator().
# Distinct names keep the `loader` / `batch` variables from section 2 intact.
custom_loader = DataLoader(dataset, batch_size=32, collate_fn=collator)

# Pull one batch to confirm the pipeline runs end to end.
first_batch = next(iter(custom_loader))
print(
    f"Batch: {len(first_batch['text'])} texts, "
    f"labels shape {tuple(first_batch['labels'].shape)}, "
    f"num_items_in_batch {first_batch['num_items_in_batch'].item()}"
)

## 4. Inspect with pandas

In [None]:
import pandas as pd

# Read the first shard file from OUT_DIR straight into a DataFrame.
first_shard = Path(OUT_DIR) / "shard.00000.parquet"
df = pd.read_parquet(first_shard)
df.head()