In [1]:
import os
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from PIL import Image
from tqdm import tqdm
from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor
from datasets import Dataset, DatasetDict, Features, Value, Image
import tempfile
from huggingface_hub import HfApi


In [2]:
def _list_files_with_suffix(directory, suffix):
    """Return the sorted full paths of the entries of `directory` whose name ends with `suffix`."""
    return sorted(
        os.path.join(directory, name)
        for name in os.listdir(directory)
        if name.endswith(suffix)
    )


def _file_stem(path):
    """Return the base name of `path` with its last extension removed."""
    return os.path.splitext(os.path.basename(path))[0]


def create_dataset(train_image_dir, train_mask_dir, test_image_dir):
    """Build a DatasetDict with a "train" split (image + mask) and a "test" split (image only).

    Every split has the columns "image", "mask" and "ImageId". "ImageId" is the
    image file name without directory or extension. In the test split the
    "mask" column is None for every row.

    Args:
        train_image_dir: Directory whose `.jpg` files are the training images.
        train_mask_dir: Directory whose `.png` files are the training masks.
            Each mask must have the same file stem as the image it belongs to.
        test_image_dir: Directory whose `.jpg` files are the test images.

    Returns:
        A DatasetDict with the keys "train" and "test".

    Raises:
        AssertionError: If the number of training images and masks differs.
        ValueError: If a training image has no mask with the same file stem.
    """
    # --- Train split ---
    train_image_files = _list_files_with_suffix(train_image_dir, '.jpg')
    train_mask_files = _list_files_with_suffix(train_mask_dir, '.png')

    assert len(train_image_files) == len(train_mask_files), (
        f"found {len(train_image_files)} train images but {len(train_mask_files)} masks"
    )

    train_ids = [_file_stem(f) for f in train_image_files]

    # Pair masks to images by file stem rather than by position in two
    # independently sorted lists: equal counts do not guarantee that the n-th
    # mask belongs to the n-th image, and the extension takes part in the sort
    # order (e.g. "a.jpg" < "a.k.jpg" but "a.k.png" < "a.png").
    mask_by_id = {_file_stem(f): f for f in train_mask_files}
    missing_ids = [image_id for image_id in train_ids if image_id not in mask_by_id]
    if missing_ids:
        raise ValueError(
            f"{len(missing_ids)} train image(s) have no mask with the same file stem "
            f"in {train_mask_dir!r}, e.g. {missing_ids[0]!r}"
        )
    train_mask_files = [mask_by_id[image_id] for image_id in train_ids]

    train_df = pd.DataFrame({
        "image": train_image_files,
        "mask": train_mask_files,
        "ImageId": train_ids
    })

    # --- Test split (no mask) ---
    test_image_files = _list_files_with_suffix(test_image_dir, '.jpg')
    test_ids = [_file_stem(f) for f in test_image_files]
    test_df = pd.DataFrame({
        "image": test_image_files,
        "mask": [None] * len(test_image_files),
        "ImageId": test_ids
    })

    # --- Features ---
    # NOTE: `Image` and `Dataset` here are the `datasets` classes; the import
    # cell imports them after the PIL / torch names of the same spelling, so
    # the later import is the one in scope.
    features = Features({
        "image": Image(decode=True),
        "mask": Image(decode=True),
        "ImageId": Value("string"),
    })

    # Convert to Hugging Face datasets; both splits share one schema so the
    # test split keeps a (null) "mask" column.
    train_dataset = Dataset.from_pandas(train_df, features=features)
    test_dataset = Dataset.from_pandas(test_df, features=features)

    dataset = DatasetDict({
        "train": train_dataset,
        "test": test_dataset
    })

    return dataset

In [4]:
# Dataset locations and Hub target. Each default is the value this notebook was
# originally run with; set the matching environment variable to run the cell on
# another machine (or push to another repository) without editing the code.
train_image_dir = os.environ.get(
    "IMATERIALIST_TRAIN_IMAGE_DIR",
    "/home/sergio/datasets/imaterialist_processed/images",
)
train_mask_dir = os.environ.get(
    "IMATERIALIST_TRAIN_MASK_DIR",
    "/home/sergio/datasets/imaterialist_processed/masks",
)
test_image_dir = os.environ.get(
    "IMATERIALIST_TEST_IMAGE_DIR",
    "/media/sergio/6A4A30C94A3093B3/Users/sergi/Desktop/datasets/imaterialist/images/test",
)
hub_repo_id = os.environ.get("IMATERIALIST_HUB_REPO", "sergiomadrid/imaterialist")

dataset = create_dataset(train_image_dir, train_mask_dir, test_image_dir)

# Preview
print(dataset)
print(dataset["train"][0])
test_sample = dataset["test"][0]
print(test_sample)
# The test split is built without masks; check that before uploading rather
# than relying on someone reading the printed row.
assert test_sample["mask"] is None, "test split unexpectedly contains a mask"

# Push to Hub (kept as the last expression so the returned commit info is displayed)
dataset.push_to_hub(hub_repo_id)


DatasetDict({
    train: Dataset({
        features: ['image', 'mask', 'ImageId'],
        num_rows: 45623
    })
    test: Dataset({
        features: ['image', 'mask', 'ImageId'],
        num_rows: 3200
    })
})
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512 at 0x726BB7D6F290>, 'mask': <PIL.PngImagePlugin.PngImageFile image mode=L size=512x512 at 0x726BB7D6EEA0>, 'ImageId': '00000663ed1ff0c4e0132b9b9ac53f6e'}
{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=852x1024 at 0x726BB7D6CD40>, 'mask': None, 'ImageId': '003d41dd20f271d27219fe7ee6de727d'}


Uploading the dataset shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/11406 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Map:   0%|          | 0/11406 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Map:   0%|          | 0/11406 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Map:   0%|          | 0/11405 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/115 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/32 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/463 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/sergiomadrid/imaterialist/commit/85a220d0a0e10dfb8b20c141f775be4f03e23c41', commit_message='Upload dataset', commit_description='', oid='85a220d0a0e10dfb8b20c141f775be4f03e23c41', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/sergiomadrid/imaterialist', endpoint='https://huggingface.co', repo_type='dataset', repo_id='sergiomadrid/imaterialist'), pr_revision=None, pr_num=None)