In [11]:
# run this script after extracting the dataset into a folder called "archive"
import os
import shutil
import random
from tqdm import tqdm

def make_train_val(folder, divider: int, seed=None):
    """Copy each class sub-folder of ``folder`` into a train and a test split.

    Two sibling folders are created next to ``folder``: ``<folder>_train`` and
    ``<folder>_test_only``. For every sub-folder (class label) the files are
    shuffled and every ``divider``-th one is copied to the test split; the
    rest are copied to the train split. A class with fewer than ``divider``
    files still contributes exactly one file to the test split. The source
    folder itself is left untouched.

    Args:
        folder: Path of the folder whose sub-folders are the class labels.
            A trailing path separator is ignored.
        divider: Roughly one file out of ``divider`` goes to the test split.
            Must be >= 1 (``1`` sends every file to the test split).
        seed: Optional seed for a reproducible split. When ``None`` the
            module-level ``random`` generator is used.

    Raises:
        ValueError: If ``divider`` is smaller than 1.
    """
    if divider < 1:
        raise ValueError(f"divider must be >= 1, got {divider}")

    # Without this, "data/" would produce "data/_train" *inside* the source folder.
    folder = os.path.normpath(folder)
    train_folder, val_folder = f"{folder}_train", f"{folder}_test_only"

    # A private generator keeps a seeded split from disturbing global random state.
    rng = random if seed is None else random.Random(seed)

    # Sorted so that a seeded run does not depend on the directory listing order.
    for label in tqdm(sorted(os.listdir(folder)), desc="Making train and test sets"):
        label_dir = os.path.join(folder, label)
        # Stray files next to the class folders are not labels.
        if not os.path.isdir(label_dir):
            continue

        os.makedirs(os.path.join(train_folder, label), exist_ok=True)
        os.makedirs(os.path.join(val_folder, label), exist_ok=True)

        # Only regular files can be copied; nested directories would make
        # shutil.copy raise and abort the whole split.
        filenames = sorted(
            name for name in os.listdir(label_dir)
            if os.path.isfile(os.path.join(label_dir, name))
        )
        rng.shuffle(filenames)

        # Capping at the file count guarantees index 0 is the only hit for
        # small classes, i.e. the test split gets at least one file.
        modulo = min(len(filenames), divider)
        for i, filename in enumerate(filenames):
            target_root = val_folder if i % modulo == 0 else train_folder
            shutil.copy(
                os.path.join(label_dir, filename),
                os.path.join(target_root, label, filename),
            )

# The extracted dataset ships as "vinted_train"; it is renamed to "vinted" so
# that make_train_val can write the new "vinted_train" / "vinted_test_only"
# folders next to it, after which the temporary "vinted" copy is removed.
#
# Guard: once "vinted_test_only" exists the split has already been made.
# Running the steps again would rename the *already split* train folder and
# split it a second time, shrinking the train set on every re-run of the cell.
if os.path.isdir("./archive/vinted_test_only"):
    print("Split already present (./archive/vinted_test_only exists); skipping.")
else:
    os.rename("./archive/vinted_train", "./archive/vinted")
    make_train_val(folder="archive/vinted", divider=5)
    shutil.rmtree("./archive/vinted")

Making train and test sets: 100%|██████████| 514/514 [00:27<00:00, 18.56it/s]


In [12]:
# Run this cell after the split: it drops every class that is too small.
# A class whose folder under vinted_train holds fewer than `min_nr_images`
# entries is deleted there and, when present, in the other split folders too.

min_nr_images = 100

for folder in ["./archive/vinted_train"]:
    progress = tqdm(os.listdir(folder), desc=f"Removing classes with less than {min_nr_images} images")
    for label in progress:
        class_dir = os.path.join(folder, label)
        # plain files lying next to the class folders are left alone
        if not os.path.isdir(class_dir):
            continue
        # large enough -> keep the class
        if len(os.listdir(class_dir)) >= min_nr_images:
            continue
        shutil.rmtree(class_dir)
        # remove the same label from the sibling splits, if they have it
        for sibling_root in ("./archive/vinted_val", "./archive/vinted_test_only"):
            sibling_dir = os.path.join(sibling_root, label)
            if os.path.isdir(sibling_dir):
                shutil.rmtree(sibling_dir)

Removing classes with less than 100 images: 100%|██████████| 514/514 [00:00<00:00, 1539.32it/s]
