In [1]:
import os
import json
import shutil
from pathlib import Path

**Step 1: Balance the images and keys** 

In [2]:
# Source folders: invoice images and their JSON key files.
images_dir = Path("data/image")
keys_dir = Path("data/key")

# Stems of all JSON key files; an image is kept only if its stem is in this set.
keys = set()
for key_path in keys_dir.glob("*.json"):
    keys.add(key_path.stem)

# Delete every .jpg image that has no key file with the same stem.
for jpg_path in images_dir.glob("*.jpg"):
    if jpg_path.stem in keys:
        continue
    print(f"Deleting {jpg_path}...")
    jpg_path.unlink()

print("Cleanup completed.")

Deleting data/image/invoice_833.jpg...
Deleting data/image/invoice_727.jpg...
Deleting data/image/invoice_627.jpg...
Deleting data/image/invoice_841.jpg...
Deleting data/image/invoice_922.jpg...
Deleting data/image/invoice_883.jpg...
Deleting data/image/invoice_818.jpg...
Deleting data/image/invoice_534.jpg...
Deleting data/image/invoice_614.jpg...
Deleting data/image/invoice_941.jpg...
Deleting data/image/invoice_934.jpg...
Deleting data/image/invoice_754.jpg...
Deleting data/image/invoice_998.jpg...
Deleting data/image/invoice_968.jpg...
Deleting data/image/invoice_887.jpg...
Deleting data/image/invoice_694.jpg...
Deleting data/image/invoice_628.jpg...
Deleting data/image/invoice_943.jpg...
Deleting data/image/invoice_517.jpg...
Deleting data/image/invoice_882.jpg...
Deleting data/image/invoice_851.jpg...
Deleting data/image/invoice_538.jpg...
Deleting data/image/invoice_767.jpg...
Deleting data/image/invoice_641.jpg...
Deleting data/image/invoice_741.jpg...
Deleting data/image/invoi

**Step 2: Convert data to Donut format**

In [4]:
from src.data.tools.metadata_generator import DonutMetadataGenerator
from src.data.tools.dataset_generator import DonutDatasetGenerator

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Working area for the processed copy of the dataset, relative to the current directory.
ROOT_PTH = Path(os.getcwd()).joinpath("data/processed/")

# JSON keys are copied to <processed>/key and images to <processed>/key/img.
ROOT_PTH_KEY = ROOT_PTH.joinpath("key/img")
ROOT_PTH_KEY.mkdir(parents=True, exist_ok=True)

dst_dir_json = str(ROOT_PTH_KEY.parent)
dst_dir_img = str(ROOT_PTH_KEY)

# copy JSON files from src to dst (only *.json, so stray files or folders are ignored)
for src_file in Path(keys_dir).glob("*.json"):
    shutil.copy(src_file, os.path.join(dst_dir_json, src_file.name))

# copy images from src to dst
for src_file in Path(images_dir).iterdir():
    # shutil.copy cannot copy directories, so skip anything that is not a regular file
    if not src_file.is_file():
        continue
    # copy img file only if a key file with the same name exists in dst_dir_json;
    # compare on the stem so the check does not assume a three-letter extension
    if os.path.isfile(os.path.join(dst_dir_json, src_file.stem + ".json")):
        shutil.copy(src_file, os.path.join(dst_dir_img, src_file.name))

# Convert to Donut format
base_path = ROOT_PTH
data_dir_path = Path(base_path).joinpath("key")
# glob() yields entries in arbitrary, filesystem-dependent order; sort them so the
# train/validation/test membership is identical on every run.
files_list = sorted(data_dir_path.glob("*.json"))

# split files_list into 3 parts: 85% train, 10% validation, 5% test
train_end = int(len(files_list) * 0.85)
validation_end = int(len(files_list) * 0.95)

train_files_list = files_list[:train_end]
print("Train set size:", len(train_files_list))
validation_files_list = files_list[train_end:validation_end]
print("Validation set size:", len(validation_files_list))
test_files_list = files_list[validation_end:]
print("Test set size:", len(test_files_list))

metadata_generator = DonutMetadataGenerator()
metadata_generator.generate(base_path, train_files_list, "train")
metadata_generator.generate(base_path, validation_files_list, "validation")
metadata_generator.generate(base_path, test_files_list, "test")

# Generate dataset
dataset_generator = DonutDatasetGenerator()
dataset_generator.generate(base_path)

# Make sure no "final" directory is left under base_path.
final_dir = Path(base_path).joinpath("final")
if final_dir.exists():
    shutil.rmtree(final_dir)

# Disabled step, kept for reference:
# # rename to final data
# os.rename(
#     os.path.join(str(base_path), "img"), os.path.join(str(base_path), "final")
# )
# shutil.rmtree(os.path.join(str(base_path), "key"))

Train set size: 425
Validation set size: 50
Test set size: 26


Downloading data: 100%|██████████| 429/429 [00:00<00:00, 51984.99files/s]
Downloading data: 100%|██████████| 54/54 [00:00<00:00, 361231.92files/s]
Downloading data: 100%|██████████| 30/30 [00:00<00:00, 255231.48files/s]
Generating train split: 425 examples [00:00, 7238.67 examples/s]
Generating validation split: 50 examples [00:00, 6831.11 examples/s]
Generating test split: 26 examples [00:00, 3863.53 examples/s]


Dataset has 425 images
Dataset features are: dict_keys(['image', 'ground_truth'])
Random sample is 112
OCR text is {"gt_parse": {"header": {"invoice_no": "75367021", "invoice_date": "11/12/2018", "seller": "Martinez-Ramirez 358 Tiffany Prairie Jerryland, AZ 91504", "client": "Lawrence, Stevens and Robinson Unit 4195 Box 3361 DPO AP 86918", "seller_tax_id": "937-84-3596", "client_tax_id": "912-77-9229", "iban": "GB48JUXI42713843742192"}, "items": [{"item_desc": "6'x3' Marble Restaurant Dining Table Top Marquetry Inlay Occasional Decor H5017B", "item_qty": "3,00", "item_net_price": "5138,07", "item_net_worth": "15414,21", "item_vat": "10%", "item_gross_worth": "16 955,63"}, {"item_desc": "Carnelian Stone Floral Pattern Dinette Table Top Marble Coffee Table 30 Inches", "item_qty": "5,00", "item_net_price": "993,75", "item_net_worth": "4968,75", "item_vat": "10%", "item_gross_worth": "5 465,63"}, {"item_desc": "15\"x15\" White Marble Chess Top Table Carnelian Inlay Art Black Friday Gift De

**Step 3: Upload the dataset to the Hugging Face Hub.**

In [None]:
from datasets import load_dataset

class DonutDatasetUploader:
    """Loads an image-folder dataset from disk and pushes it to the Hugging Face Hub."""

    def upload(self, data_dir, dataset_name, private=False):
        """Load ``<data_dir>/img`` with the "imagefolder" builder and push it.

        Args:
            data_dir: Directory that contains the ``img`` folder to upload.
            dataset_name: Repository id handed to ``push_to_hub``.
            private: Forwarded to ``push_to_hub``. Defaults to ``False``, the
                value that was previously hard-coded, so existing calls behave
                the same.
        """
        # define paths
        img_dir_path = Path(data_dir).joinpath("img")

        # load_dataset documents data_dir as a string, so pass a plain str
        dataset = load_dataset("imagefolder", data_dir=str(img_dir_path))

        dataset.push_to_hub(dataset_name, private=private)



In [None]:
# Push the contents of data/processed/img to the target dataset repository.
dataset_uploader = DonutDatasetUploader()
dataset_uploader.upload(
    data_dir="data/processed",
    dataset_name="Rajan/AIMT-invoices-donut-data",
)
