In [None]:
import os
from glob import glob
from datasets import load_dataset, Features, Value

# Switch the working directory; the relative paths below are resolved against it.
os.chdir("/n/home08/zkong/mufan/tmp/moebench/")

# Dataset splits to convert.
splits = ["train", "validation", "test"]

# Schema passed to load_dataset: a single string column named "text".
features = Features({"text": Value("string")})

# Input directory (parquet shards) and output directory (JSON Lines shards).
input_dir = "OLMo/data/minipile"
output_dir = "OLMo/data/minipile-jsonl"
os.makedirs(output_dir, exist_ok=True)

# Convert every split from parquet to JSON Lines, writing one JSONL file per
# input parquet shard.
for split in splits:
    # Glob pattern that matches this split's parquet shards.
    data_files = {split: os.path.join(input_dir, f"data/{split}-*.parquet")}
    print(f"Data files: {data_files}")

    # Count the input shards up front and fail fast when nothing matches;
    # otherwise the shard-size division below would divide by zero.
    num_shards = len(glob(data_files[split]))
    if num_shards == 0:
        raise FileNotFoundError(f"No parquet files match {data_files[split]!r}")

    # Load the whole split with the fixed single-column schema.
    dataset = load_dataset("parquet", data_files=data_files, split=split, features=features)
    print(f"Split: {split}, num_shards: {num_shards}")

    # Number of examples per output file (floor division; see remainder handling below).
    shard_size = len(dataset) // num_shards

    # Write the split out as `num_shards` contiguous JSON Lines files.
    for shard_id in range(num_shards):
        start_idx = shard_id * shard_size
        # The last shard absorbs the remainder left over by the floor division.
        end_idx = (shard_id + 1) * shard_size if shard_id < num_shards - 1 else len(dataset)
        shard_dataset = dataset.select(range(start_idx, end_idx))
        shard_path = os.path.join(output_dir, f"{split}-{shard_id:05d}-of-{num_shards:05d}.jsonl")
        shard_dataset.to_json(shard_path)

Data files: {'train': 'OLMo/data/minipile/data/train-*.parquet'}


Generating train split: 0 examples [00:00, ? examples/s]

Split: train, num_shards: 12


Creating json from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Data files: {'validation': 'OLMo/data/minipile/data/validation-*.parquet'}


Generating validation split: 0 examples [00:00, ? examples/s]

Split: validation, num_shards: 1


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Data files: {'test': 'OLMo/data/minipile/data/test-*.parquet'}


Generating test split: 0 examples [00:00, ? examples/s]

Split: test, num_shards: 1


Creating json from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

In [None]:
from datasets import load_dataset

# Load the JSON Lines shards back with the "json" builder; the relative
# data_dir is resolved against the current working directory.
dataset = load_dataset(path="json", data_dir="OLMo/data/minipile-jsonl")

In [9]:
from tqdm.auto import tqdm
import json

# Rewrite every JSONL shard in place, adding an "id" field to each record.
data_dir = "OLMo/data/minipile-jsonl"
for filename in tqdm(glob(f"{data_dir}/*.jsonl"), desc="Checking files"):
    print(f"Checking file: {filename}")
    # Shard files are named "<split>-<shard>-of-<total>.jsonl"; the second
    # dash-separated field of the base name is the shard number.
    # NOTE(review): the id omits the split name, so the same id appears in
    # several splits (e.g. test-00000 and train-00000 both start at
    # "minipile-00000-0") — confirm ids only need to be unique within a split.
    index = os.path.basename(filename).split("-")[1]

    # Stream records into a temporary sibling file and swap it in only after
    # every line has been written, so an interrupted run never leaves a
    # truncated shard behind. The ".tmp" suffix keeps it out of the glob above.
    tmp_filename = f"{filename}.tmp"
    try:
        with open(filename, "r", encoding="utf-8") as src, \
                open(tmp_filename, "w", encoding="utf-8") as dst:
            for lineno, line in enumerate(src):
                data = json.loads(line)
                # The id combines the shard number with the 0-based line number.
                data["id"] = f"minipile-{index}-{lineno}"
                dst.write(json.dumps(data) + "\n")
        os.replace(tmp_filename, filename)
    finally:
        # After a successful replace the temp file is gone; this only cleans
        # up when something failed part-way through.
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)

Checking files:   0%|          | 0/14 [00:00<?, ?it/s]

Checking file: OLMo/data/minipile-jsonl/test-00000-of-00001.jsonl
Checking file: OLMo/data/minipile-jsonl/train-00000-of-00012.jsonl
Checking file: OLMo/data/minipile-jsonl/train-00001-of-00012.jsonl
Checking file: OLMo/data/minipile-jsonl/train-00002-of-00012.jsonl
Checking file: OLMo/data/minipile-jsonl/train-00003-of-00012.jsonl
Checking file: OLMo/data/minipile-jsonl/train-00004-of-00012.jsonl
Checking file: OLMo/data/minipile-jsonl/train-00005-of-00012.jsonl
Checking file: OLMo/data/minipile-jsonl/train-00006-of-00012.jsonl
Checking file: OLMo/data/minipile-jsonl/train-00007-of-00012.jsonl
Checking file: OLMo/data/minipile-jsonl/train-00008-of-00012.jsonl
Checking file: OLMo/data/minipile-jsonl/train-00009-of-00012.jsonl
Checking file: OLMo/data/minipile-jsonl/train-00010-of-00012.jsonl
Checking file: OLMo/data/minipile-jsonl/train-00011-of-00012.jsonl
Checking file: OLMo/data/minipile-jsonl/validation-00000-of-00001.jsonl


In [10]:
# Load all JSON Lines shards (now carrying the "id" field) from the output directory.
dataset = load_dataset(path="json", data_dir="OLMo/data/minipile-jsonl")
# Upload to the Hugging Face Hub; the returned commit info is the cell's output.
dataset.push_to_hub(repo_id="WhenceFade/minipile_olmoe")

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/12 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/84 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/WhenceFade/minipile_olmoe/commit/936cbea66db52408b3847f497a724a57b310e502', commit_message='Upload dataset', commit_description='', oid='936cbea66db52408b3847f497a724a57b310e502', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/WhenceFade/minipile_olmoe', endpoint='https://huggingface.co', repo_type='dataset', repo_id='WhenceFade/minipile_olmoe'), pr_revision=None, pr_num=None)