### Split up the dataset into 3 chunks

We want to compare training with OSFT on the full dataset against training on the same data split into 3 chunks that are presented one after another, in sequence.

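This notebook handles the data conversion: it splits the training JSONL file into 3 ordered, contiguous chunks, writes each chunk to its own JSONL file, and verifies that reloading the chunks in order reproduces the original dataset.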

In [5]:
# confirm the source training file exists and show its size and timestamp
!ls -al /mnt/nvme1n1/experiments/os-cl-scenario-1-experiment-0/bmo/train_p07.jsonl

-rw-r--r--. 1 oleg oleg 44522306 Jul 31 02:34 /mnt/nvme1n1/experiments/os-cl-scenario-1-experiment-0/bmo/train_p07.jsonl


In [6]:
# create the output directory for the chunk files
# (-p creates missing parents and does not fail if the directory already exists)
!mkdir -p /mnt/nvme1n1/experiments/os-cl-scenario-1-experiment-0/bmo/3-chunks

In [26]:
import datasets
import math


# load the dataset
# local_dataset = "/mnt/nvme1n1/experiments/os-cl-scenario-1-experiment-0/bmo/train_p07.jsonl"
# ds = datasets.load_dataset("json", data_files=local_dataset, split="train")

def split_dataset(ds: datasets.Dataset, num_chunks: int) -> list[datasets.Dataset]:
    """Split ``ds`` into exactly ``num_chunks`` contiguous, order-preserving chunks.

    Rows are distributed as evenly as possible: every chunk receives
    ``len(ds) // num_chunks`` rows and the first ``len(ds) % num_chunks``
    chunks receive one extra row, so chunk sizes differ by at most one.

    Args:
        ds: Dataset to split. Only ``len(ds)`` and ``ds.select(range(...))``
            are used.
        num_chunks: Number of chunks to produce. Must satisfy
            ``1 <= num_chunks <= len(ds)``.

    Returns:
        A list of ``num_chunks`` datasets. Concatenating them in list order
        yields every row of ``ds`` exactly once, in the original order.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1 or larger than
            ``len(ds)`` (which includes the empty-dataset case).
    """
    total_rows = len(ds)
    if num_chunks < 1:
        raise ValueError(f"num_chunks must be at least 1, got {num_chunks}")
    if num_chunks > total_rows:
        raise ValueError(
            f"cannot split {total_rows} rows into {num_chunks} non-empty chunks"
        )

    # Stepping through the rows with a rounded stride (round(len / num_chunks))
    # does not guarantee the requested chunk count: 17 rows / 4 chunks would
    # give 5 chunks and 17 rows / 11 chunks would give 9. Computing explicit
    # boundaries from divmod always yields exactly `num_chunks` chunks.
    base_size, remainder = divmod(total_rows, num_chunks)

    ordered_chunks = []
    start = 0
    for chunk_idx in range(num_chunks):
        # the first `remainder` chunks absorb the rows left over by the floor division
        stop = start + base_size + (1 if chunk_idx < remainder else 0)
        ordered_chunks.append(ds.select(range(start, stop)))
        start = stop

    # sanity checks: requested number of chunks, and every row assigned exactly once
    assert len(ordered_chunks) == num_chunks
    assert sum(len(c) for c in ordered_chunks) == len(ds)
    return ordered_chunks

# Sanity-check split_dataset on synthetic data over a grid of
# (number of chunks, dataset size) combinations.
chunk_range = range(2, 12)
ds_range = range(2, 235)
for N_chunks in chunk_range:
    for ds_size in ds_range:
        if ds_size < N_chunks:
            # fewer rows than requested chunks: no split into non-empty chunks exists
            continue

        # for debugging: a synthetic dataset with `ds_size` rows, each row
        # recording its own index (previously the size was fixed at 17, so
        # the inner loop re-tested the same dataset on every iteration)
        nums = [{"num": k} for k in range(ds_size)]
        fake_ds = datasets.Dataset.from_list(nums)
        ds = fake_ds

        ordered_chunks = split_dataset(fake_ds, N_chunks)

        # every row must be assigned to some chunk ...
        assert sum(len(c) for c in ordered_chunks) == len(ds)
        # ... and reading the chunks back to back must reproduce the original order
        flattened = [num for c in ordered_chunks for num in list(c["num"])]
        assert flattened == list(range(ds_size)), (
            f"order not preserved for N_chunks={N_chunks}, ds_size={ds_size}"
        )

In [None]:
# list the contents of the chunk output directory
!ls -al /mnt/nvme1n1/experiments/os-cl-scenario-1-experiment-0/bmo/3-chunks/

total 8
drwxr-xr-x. 2 oleg oleg 4096 Jul 31 03:57 .
drwxr-xr-x. 3 oleg oleg 4096 Jul 31 03:57 ..


In [28]:
import os

# Split the real training set into N_chunks ordered pieces, write each piece
# to its own JSONL file, then reload the files and verify the round trip.
N_chunks = 3
local_dataset = "/mnt/nvme1n1/experiments/os-cl-scenario-1-experiment-0/bmo/train_p07.jsonl"
ds = datasets.load_dataset("json", data_files=local_dataset, split="train")
ds_chunks = split_dataset(ds, N_chunks)

# The reload loop below reads exactly N_chunks files, so fail here, before
# anything is written, if the splitter returned a different number of chunks.
assert len(ds_chunks) == N_chunks, f"expected {N_chunks} chunks, got {len(ds_chunks)}"
assert sum(len(chunk) for chunk in ds_chunks) == len(ds)

# save the dataset
output_dir = "/mnt/nvme1n1/experiments/os-cl-scenario-1-experiment-0/bmo/3-chunks"
# create the directory here as well so this cell does not depend on the
# earlier `!mkdir` cell having been run
os.makedirs(output_dir, exist_ok=True)

# write out the datasets, one JSONL file per chunk, numbered in chunk order
for i, ds_chunk in enumerate(ds_chunks):
    filename = f"chunk_{i}.jsonl"
    output_path = os.path.join(output_dir, filename)
    print(f"writing ds to {output_path}...")
    ds_chunk.to_json(output_path)

# now load the chunks back in order and validate
loaded_chunks = []
for i in range(N_chunks):
    chunk_path = os.path.join(output_dir, f"chunk_{i}.jsonl")
    print(f"loading chunk from {chunk_path}...")
    chunk_ds = datasets.load_dataset("json", data_files=chunk_path, split="train")
    loaded_chunks.append(chunk_ds)

# concatenate all chunks
reconstructed_ds = datasets.concatenate_datasets(loaded_chunks)

# validate length matches
assert len(reconstructed_ds) == len(ds), f"Length mismatch: {len(reconstructed_ds)} != {len(ds)}"

# validate contents match exactly; iterate both datasets in lockstep rather
# than indexing row by row (lengths were already checked to be equal above)
for i, (expected_row, actual_row) in enumerate(zip(ds, reconstructed_ds)):
    assert expected_row == actual_row, f"Mismatch at index {i}"

print("Successfully validated that chunks can be loaded back in same order")



writing ds to /mnt/nvme1n1/experiments/os-cl-scenario-1-experiment-0/bmo/3-chunks/chunk_0.jsonl...


Creating json from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

writing ds to /mnt/nvme1n1/experiments/os-cl-scenario-1-experiment-0/bmo/3-chunks/chunk_1.jsonl...


Creating json from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

writing ds to /mnt/nvme1n1/experiments/os-cl-scenario-1-experiment-0/bmo/3-chunks/chunk_2.jsonl...


Creating json from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

loading chunk from /mnt/nvme1n1/experiments/os-cl-scenario-1-experiment-0/bmo/3-chunks/chunk_0.jsonl...


Generating train split: 0 examples [00:00, ? examples/s]

loading chunk from /mnt/nvme1n1/experiments/os-cl-scenario-1-experiment-0/bmo/3-chunks/chunk_1.jsonl...


Generating train split: 0 examples [00:00, ? examples/s]

loading chunk from /mnt/nvme1n1/experiments/os-cl-scenario-1-experiment-0/bmo/3-chunks/chunk_2.jsonl...


Generating train split: 0 examples [00:00, ? examples/s]

Successfully validated that chunks can be loaded back in same order
