In [1]:
from rich import print
from pydantic import BaseModel
import xml.etree.ElementTree as ET

import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (16, 10)  # (w, h)

In [2]:
import PIL.Image
import numpy as np
from pathlib import Path


class Row(BaseModel):  # , arbitrary_types_allowed=True):
    #
    image_path: str  # PIL.Image.Image

    # ...
    industry_name: str
    company_name: str

    # bounding box (xmin, ymin, xmax, ymax)
    bbox: tuple[int, int, int, int]

    # @property
    # def image(self) -> np.ndarray:
    #     img = Image.open(self.image_path)
    #     return np.array(img.convert("RGB"))


rows = []
data_path = Path("../data")
dataset_name = "LogoDet-3K"
dataset_path = data_path / dataset_name

for industry_path in dataset_path.iterdir():
    if not industry_path.is_dir():
        continue

    for company_path in industry_path.iterdir():
        if not company_path.is_dir():
            continue

        for file_path in company_path.iterdir():
            if not file_path.is_file():
                continue

            if file_path.suffix == ".xml":
                # make sure for each xml file there exist corresponding jpg
                image_path = file_path.with_suffix(".jpg")
                assert image_path.exists()

                # parse xml
                tree = ET.parse(file_path)
                root = tree.getroot()

                xmin = int(root[-1][-1][0].text)
                ymin = int(root[-1][-1][1].text)
                xmax = int(root[-1][-1][2].text)
                ymax = int(root[-1][-1][3].text)

                row = Row(
                    # image related
                    image_path=str(image_path.relative_to(data_path)),
                    # meta
                    industry_name=industry_path.name,
                    company_name=company_path.name,
                    # bounding box
                    bbox=(xmin, ymin, xmax, ymax),
                )
                rows.append(row.model_dump())

In [3]:
import datasets

dataset = datasets.Dataset.from_list(rows).cast_column("image_path", datasets.Image())
dataset = dataset.class_encode_column("company_name")  # to be able to split
dataset = dataset.train_test_split(0.2, seed=42, stratify_by_column="company_name")

  from .autonotebook import tqdm as notebook_tqdm
Casting to class labels: 100%|██████████| 158654/158654 [00:00<00:00, 471789.21 examples/s]


In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['image_path', 'industry_name', 'company_name', 'bbox'],
        num_rows: 126923
    })
    test: Dataset({
        features: ['image_path', 'industry_name', 'company_name', 'bbox'],
        num_rows: 31731
    })
})

In [8]:
# test = datasets.load_from_disk( data_dir=data_path)

dataset['train'][200]

{'image_path': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=521x382>,
 'industry_name': 'Others',
 'company_name': 2063,
 'bbox': [282, 146, 512, 258]}

In [None]:
dataset.push_to_hub(
    repo_id="PodYapolsky/LogoDet-3K",
    commit_message="",
)

Map: 100%|██████████| 21154/21154 [00:08<00:00, 2525.43 examples/s]
Creating parquet from Arrow format: 100%|██████████| 212/212 [00:00<00:00, 228.37ba/s]
Map: 100%|██████████| 21154/21154 [00:09<00:00, 2176.94 examples/s].92s/it]
Creating parquet from Arrow format: 100%|██████████| 212/212 [00:01<00:00, 187.76ba/s]
Map: 100%|██████████| 21154/21154 [00:08<00:00, 2353.18 examples/s].36s/it]
Creating parquet from Arrow format: 100%|██████████| 212/212 [00:00<00:00, 320.48ba/s]
Map: 100%|██████████| 21154/21154 [00:08<00:00, 2639.89 examples/s].42s/it]
Creating parquet from Arrow format: 100%|██████████| 212/212 [00:00<00:00, 300.00ba/s]
Map: 100%|██████████| 21154/21154 [00:07<00:00, 2652.02 examples/s].67s/it]
Creating parquet from Arrow format: 100%|██████████| 212/212 [00:00<00:00, 216.66ba/s]
Map: 100%|██████████| 21153/21153 [00:08<00:00, 2638.08 examples/s].12s/it]
Creating parquet from Arrow format: 100%|██████████| 212/212 [00:00<00:00, 266.92ba/s]
Uploading the dataset shards: 

CommitInfo(commit_url='https://huggingface.co/datasets/PodYapolsky/LogoDet-3K/commit/312539077a17bd6b56b871ca032e84512f4a6cdd', commit_message='Upload dataset', commit_description='', oid='312539077a17bd6b56b871ca032e84512f4a6cdd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/PodYapolsky/LogoDet-3K', endpoint='https://huggingface.co', repo_type='dataset', repo_id='PodYapolsky/LogoDet-3K'), pr_revision=None, pr_num=None)