In [3]:
# --- 1. list all repos whose ID contains a keyword ------------
from datasets import get_dataset_infos
from huggingface_hub import list_datasets

def search(keyword, limit=20):
    """Print the IDs of Hub datasets returned for a keyword search.

    Args:
        keyword: Search string forwarded to ``list_datasets(search=...)``.
        limit: Maximum number of results requested from ``list_datasets``.

    Returns:
        None. A header line is printed, followed by one indented ID per result.
    """
    print(f"\n🔍 searching '{keyword}' …")
    matches = list_datasets(search=keyword, limit=limit)
    # Keep iteration lazy so each ID is printed as soon as it is produced.
    for dataset_id in (info.id for info in matches):
        print(" ", dataset_id)

# Run the same search once per keyword, in the original order.
for keyword in (
    "refinedweb",
    "refined-web",
    "refined_web",
    "slimpajama",
    "dolma",
    "redpajama",
):
    search(keyword)



🔍 searching 'refinedweb' …
  tiiuae/falcon-refinedweb
  crumb/flan-t5-small-embed-refinedweb
  crumb/flan-t5-base-embed-refinedweb
  crumb/flan-t5-large-embed-refinedweb
  crumb/flan-t5-xl-embed-refinedweb
  mattymchen/refinedweb-3m
  andersonbcdefg/falcon-refinedweb-labeled
  crumb/refinedweb-2mil-128clusters
  crumb/refinedweb-22mil-128clusters
  mponty/refinedweb_docs
  BEE-spoke-data/falcon-refinedweb-100k_en_med-sample
  BEE-spoke-data/falcon-refinedweb-1M_en_medium
  BEE-spoke-data/falcon-refinedweb-100k_en-long
  BEE-spoke-data/falcon-refinedweb-100k_en-xlong
  pinecone/refinedweb-generated-questions
  vilm/refinedweb-1m-medium
  kenhktsui/refinedweb-3m_quality_score_v1
  chaoscodes/refinedweb-500
  orionweller/refinedweb_mds_incremental
  mlfoundations/refinedweb_banned_domains_curated

🔍 searching 'refined-web' …
  tiiuae/falcon-refinedweb
  crumb/flan-t5-small-embed-refinedweb
  crumb/flan-t5-base-embed-refinedweb
  crumb/flan-t5-large-embed-refinedweb
  crumb/flan-t5-xl-emb

In [2]:
# --- 2. show configs for a candidate repo --------------------
from datasets import get_dataset_config_names

repo = "allenai/dolma"          # edit and re-run
# Fetch the config names first so the print call only formats the result.
config_names = get_dataset_config_names(repo)
print(repo, "configs →", config_names)


allenai/dolma configs → ['v1', 'v1_5', 'v1_5-sample', 'v1_6', 'v1_6-sample', 'v1_7']


In [5]:
from huggingface_hub import (
    list_repo_files,           # enumerate every object in the repo
    hf_hub_url,                # build its download URL
    get_hf_file_metadata,      # fetch the HEAD metadata
)

# Imported here so this block stays runnable on its own; both names come from
# the `huggingface_hub` package this cell already depends on.
from huggingface_hub import list_repo_tree
from huggingface_hub.hf_api import RepoFile

repo_id = "tiiuae/falcon-refinedweb"
repo_type = "dataset"

safe_files = []
unsafe_files = []

# Why this does not use `get_hf_file_metadata()`: the object it returns
# (`HfFileMetadata`) has no `virus_scan_result` field, so looking that name up
# with a `None` default always produced `None` and every file was reported as
# safe.  The Hub's scan verdict is exposed on `RepoFile.security`, which is
# only populated when the tree is listed with `expand=True`.
for entry in list_repo_tree(
    repo_id,
    recursive=True,            # walk sub-folders too, like `list_repo_files`
    expand=True,               # required to receive the `security` field
    repo_type=repo_type,
):
    if not isinstance(entry, RepoFile):       # folder entries carry no scan info
        continue

    fp = entry.path
    security = entry.security                 # None = no scan info reported
    scan_result = None if security is None else security.get("status")

    # Policy kept from the original cell: a file without scan information is
    # treated as safe ("not scanned yet").  Tighten this branch if unscanned
    # files must be excluded.
    if security is None or security.get("safe"):
        safe_files.append(fp)
    elif scan_result == "unsafe":             # NOTE(review): confirm status string against Hub docs
        unsafe_files.append(fp)
    else:
        # Any other status is surfaced rather than silently classified.
        print(f"[warn] unexpected value {scan_result!r} on {fp}")
