In [1]:
# Upload a local folder of .tif files to a Hugging Face *dataset* repo (no sharding)

# If not already installed in this env:
# !pip install -U datasets huggingface_hub pillow

import os
from datasets import load_dataset, Image
from datasets.utils.logging import disable_progress_bar, set_verbosity_error

# ── Silence logs/progress ────────────────────────────────────────────────
disable_progress_bar()
set_verbosity_error()
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# ── Auth (choose ONE of these) ───────────────────────────────────────────
# Option A: If you've already run `huggingface-cli login` in this environment, skip this.
# Option B: Use an access token (recommended for notebooks). Paste your write token:
# from huggingface_hub import login
# login("<YOUR_WRITE_TOKEN>")  # e.g. 'hf_xxx...'  ← your access code with write perms

# ── Local data & target repo ─────────────────────────────────────────────
DATA_DIR = r"C:\Users\Work\Desktop\NucleusNet-10K"   # 10,000 .tif files in a flat folder
REPO_ID  = "RussellBarkley/msa-em-figures"          # target HF *dataset* repo

# ── Build the dataset from a folder of images ────────────────────────────
# Note: For a flat folder (no class subfolders), "imagefolder" will still create a "label" column.
# We'll drop it so the dataset has just {"image", "filename"}.
ds = load_dataset(
    "imagefolder",
    data_dir=DATA_DIR,
    split="train",
    keep_in_memory=False,
)

# Remove dummy label column if present
if "label" in ds.column_names:
    ds = ds.remove_columns("label")

# Attach the filename while avoiding image decoding during the map
ds = ds.cast_column("image", Image(decode=False))

def add_filename(ex):
    return {"filename": os.path.basename(ex["image"]["path"])}

ds = ds.map(add_filename, batched=False)

# Re-enable decode for sanity checks / downstream usage
ds = ds.cast_column("image", Image())

print(ds)
print(ds.features)  # {'image': Image(...), 'filename': Value('string')}

# ── Push to hub without sharding ─────────────────────────────────────────
# Set max_shard_size to something larger than your dataset size so it stays as ONE .arrow file.
# (If your 10K TIFFs total, say, < 8–10 GB, "50GB" is plenty.)
ds.push_to_hub(
    REPO_ID,
    private=True,            # set False if you want it public
    max_shard_size="50GB",   # force a single shard for this small dataset
    # revision="main",       # optional: target branch
)


Dataset({
    features: ['image', 'filename'],
    num_rows: 10000
})
{'image': Image(mode=None, decode=True), 'filename': Value('string')}


README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


CommitInfo(commit_url='https://huggingface.co/datasets/RussellBarkley/msa-em-figures/commit/654b7f771e0e4948390ad0bb16334e3480217339', commit_message='Upload dataset', commit_description='', oid='654b7f771e0e4948390ad0bb16334e3480217339', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/RussellBarkley/msa-em-figures', endpoint='https://huggingface.co', repo_type='dataset', repo_id='RussellBarkley/msa-em-figures'), pr_revision=None, pr_num=None)