# Speed testing large dataset

1. Pure Hugging Face implementation (baseline)
2. S3 with datasets package
3. PelicanFS with datasets package

## HF baseline

[the-stack-dedup](https://huggingface.co/datasets/bigcode/the-stack-dedup)

In [None]:
import os
from datasets import load_dataset
from random import sample

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Baseline: open the train split in streaming mode and print its first two records.
ds = load_dataset(
    "bigcode/the-stack-dedup",
    split="train",
    streaming=True,
)
hf_head = ds.with_format("torch").take(2)
for x in hf_head:
    print(x)

# Timing note: disregarding the download time, time to first sample is around 5s

{'hexsha': 'd66b6e8d1802ed0a290dd994b9af0da47fc99e83', 'size': tensor(475), 'ext': 'abap', 'lang': 'ABAP', 'max_stars_repo_path': 'src/ixml/if_ixml_node_list.intf.abap', 'max_stars_repo_name': 'FreHu/deps', 'max_stars_repo_head_hexsha': 'cace18b54b325d99e4c54293624c1d2811a68ddd', 'max_stars_repo_licenses': ['MIT'], 'max_stars_count': tensor(nan), 'max_stars_repo_stars_event_min_datetime': None, 'max_stars_repo_stars_event_max_datetime': None, 'max_issues_repo_path': 'src/ixml/if_ixml_node_list.intf.abap', 'max_issues_repo_name': 'FreHu/deps', 'max_issues_repo_head_hexsha': 'cace18b54b325d99e4c54293624c1d2811a68ddd', 'max_issues_repo_licenses': ['MIT'], 'max_issues_count': tensor(nan), 'max_issues_repo_issues_event_min_datetime': None, 'max_issues_repo_issues_event_max_datetime': None, 'max_forks_repo_path': 'src/ixml/if_ixml_node_list.intf.abap', 'max_forks_repo_name': 'FreHu/deps', 'max_forks_repo_head_hexsha': 'cace18b54b325d99e4c54293624c1d2811a68ddd', 'max_forks_repo_licenses': ['M

### S3 + datasets

In [None]:
# S3 credentials come from the environment so that no secret is stored in the notebook.
# os.getenv yields None for a variable that is not set.
storage_options = dict(
    key=os.getenv("S3_ACCESS_KEY_ID"),
    secret=os.getenv("S3_SECRET_ACCESS_KEY"),
    client_kwargs=dict(endpoint_url="https://web.s3.wisc.edu"),
)

In [4]:
# Glob pattern covering every parquet file of the dataset copy in the S3 bucket.
s3_parquet_glob = "s3://pelican-data-loader/data/datasets--bigcode--the-stack-dedup/**/*.parquet"

# Same two-record read as the baseline, this time through the generic parquet builder.
ds = load_dataset(
    "parquet",
    data_files=s3_parquet_glob,
    storage_options=storage_options,
    streaming=True,
)
s3_head = ds.with_format("torch")["train"].take(2)
for x in s3_head:
    print(x)

{'hexsha': 'd66b6e8d1802ed0a290dd994b9af0da47fc99e83', 'size': tensor(475), 'ext': 'abap', 'lang': 'ABAP', 'max_stars_repo_path': 'src/ixml/if_ixml_node_list.intf.abap', 'max_stars_repo_name': 'FreHu/deps', 'max_stars_repo_head_hexsha': 'cace18b54b325d99e4c54293624c1d2811a68ddd', 'max_stars_repo_licenses': ['MIT'], 'max_stars_count': tensor(nan), 'max_stars_repo_stars_event_min_datetime': None, 'max_stars_repo_stars_event_max_datetime': None, 'max_issues_repo_path': 'src/ixml/if_ixml_node_list.intf.abap', 'max_issues_repo_name': 'FreHu/deps', 'max_issues_repo_head_hexsha': 'cace18b54b325d99e4c54293624c1d2811a68ddd', 'max_issues_repo_licenses': ['MIT'], 'max_issues_count': tensor(nan), 'max_issues_repo_issues_event_min_datetime': None, 'max_issues_repo_issues_event_max_datetime': None, 'max_forks_repo_path': 'src/ixml/if_ixml_node_list.intf.abap', 'max_forks_repo_name': 'FreHu/deps', 'max_forks_repo_head_hexsha': 'cace18b54b325d99e4c54293624c1d2811a68ddd', 'max_forks_repo_licenses': ['M

Time to first sample (TTFS) is around 13s, which is still acceptable, especially with prefetching.

### Pelicanfs

In [8]:
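# Intuitively, passing the pelican:// glob straight to load_dataset should work, but it fails for an unknown reason.
# NOTE(review): the path below has no `pytorch/` segment, unlike the working glob in the next cell - TODO confirm whether that is the cause.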
# dataset = load_dataset("parquet", data_files="pelican://uwdf-director.chtc.wisc.edu/wisc.edu/dsi/data/datasets--bigcode--the-stack-dedup/**/*.parquet", streaming=True)

In [27]:
from pelicanfs.core import PelicanFileSystem

# Expand the dataset glob through PelicanFS and turn each returned path into a full pelican:// URL.
pelfs = PelicanFileSystem("pelican://uwdf-director.chtc.wisc.edu")
parquet_files = [
    f"pelican://uwdf-director.chtc.wisc.edu{path}"  # prepend the pelican prefix
    for path in pelfs.glob("/wisc.edu/dsi/pytorch/data/datasets--bigcode--the-stack-dedup/**/*.parquet")
]
parquet_files[:3]

['pelican://uwdf-director.chtc.wisc.edu/wisc.edu/dsi/pytorch/data/datasets--bigcode--the-stack-dedup/snapshots/17cad72c886a2858e08d4c349a00d6466f54df63/data/abap/data-00000-of-00001.parquet',
 'pelican://uwdf-director.chtc.wisc.edu/wisc.edu/dsi/pytorch/data/datasets--bigcode--the-stack-dedup/snapshots/17cad72c886a2858e08d4c349a00d6466f54df63/data/actionscript/data-00000-of-00002.parquet',
 'pelican://uwdf-director.chtc.wisc.edu/wisc.edu/dsi/pytorch/data/datasets--bigcode--the-stack-dedup/snapshots/17cad72c886a2858e08d4c349a00d6466f54df63/data/actionscript/data-00001-of-00002.parquet']

Globbing the parquet files adds about 30s.

In [None]:
# 10 files run fine
def test_pelican(parquet_files: list[str], n: int | None = None) -> None:
    if n is None:
        data_files = parquet_files
    else:
        data_files = sample(parquet_files, n)

    ds = load_dataset("parquet", data_files=data_files, streaming=True)
    for x in ds.with_format("torch")["train"].take(2):
        print(x)


# Smoke run: draw 10 of the globbed parquet files at random and stream from them.
test_pelican(parquet_files=parquet_files, n=10)

{'hexsha': '6f6776872e7864ef2bfc0b7f65ce83c62e010ef0', 'size': tensor(788), 'ext': 'svg', 'lang': 'SVG', 'max_stars_repo_path': 'assets/svg/roundrightfill.svg', 'max_stars_repo_name': 'hzb1/happy-mobile', 'max_stars_repo_head_hexsha': 'ef3e59a50947bfe999ba5474713c1712a0ac0696', 'max_stars_repo_licenses': ['MIT'], 'max_stars_count': tensor(5), 'max_stars_repo_stars_event_min_datetime': '2019-03-10T03:12:43.000Z', 'max_stars_repo_stars_event_max_datetime': '2022-02-08T02:20:11.000Z', 'max_issues_repo_path': 'assets/svg/roundrightfill.svg', 'max_issues_repo_name': 'hzb1/happy-mobile', 'max_issues_repo_head_hexsha': 'ef3e59a50947bfe999ba5474713c1712a0ac0696', 'max_issues_repo_licenses': ['MIT'], 'max_issues_count': tensor(7), 'max_issues_repo_issues_event_min_datetime': '2019-06-11T05:05:12.000Z', 'max_issues_repo_issues_event_max_datetime': '2022-02-26T11:31:58.000Z', 'max_forks_repo_path': 'assets/svg/roundrightfill.svg', 'max_forks_repo_name': 'hzb1/happy-mobile', 'max_forks_repo_head_h

In [None]:
# Same check with a random draw of 100 files.
test_pelican(parquet_files=parquet_files, n=100)

# With 100 files the run occasionally fails.
# Note that the file named in the file-not-found error varies between runs.

{'hexsha': '634a410ddc38e17420f95e7e29e5daf4fc827f10', 'size': tensor(162), 'ext': 'md', 'lang': 'Markdown', 'max_stars_repo_path': 'SystemCommands-PackageCommands.package/SycRenamePackageCommand.class/README.md', 'max_stars_repo_name': 'macta/SystemCommands', 'max_stars_repo_head_hexsha': '71f943ce069777f6708a05eb420ea75dfc5f7717', 'max_stars_repo_licenses': ['MIT'], 'max_stars_count': tensor(nan), 'max_stars_repo_stars_event_min_datetime': None, 'max_stars_repo_stars_event_max_datetime': None, 'max_issues_repo_path': 'SystemCommands-PackageCommands.package/SycRenamePackageCommand.class/README.md', 'max_issues_repo_name': 'macta/SystemCommands', 'max_issues_repo_head_hexsha': '71f943ce069777f6708a05eb420ea75dfc5f7717', 'max_issues_repo_licenses': ['MIT'], 'max_issues_count': tensor(nan), 'max_issues_repo_issues_event_min_datetime': None, 'max_issues_repo_issues_event_max_datetime': None, 'max_forks_repo_path': 'SystemCommands-PackageCommands.package/SycRenamePackageCommand.class/READM

In [40]:
# Passing every globbed file (n left at None) always fails; see the FileNotFoundError in the output.
test_pelican(parquet_files=parquet_files)


FileNotFoundError: Unable to find 'pelican://uwdf-director.chtc.wisc.edu/wisc.edu/dsi/pytorch/data/datasets--bigcode--the-stack-dedup/snapshots/17cad72c886a2858e08d4c349a00d6466f54df63/data/coldfusion/data-00000-of-00001.parquet'