In [1]:
# run this script after extracting the dataset into a folder called "archive"
# it creates new folders with only a few classes

import os
import shutil
import random
from tqdm import tqdm
import json


def _collision_free_destination(source_file, target_dir, filename):
    """Pick where `source_file` should land in `target_dir` without clobbering other data.

    Returns the destination path to copy to, or None when a byte-identical
    copy is already present (so nothing needs to be copied).
    """
    import filecmp  # local import: only needed when resolving name clashes

    stem, ext = os.path.splitext(filename)
    candidate = os.path.join(target_dir, filename)
    suffix = 0
    while os.path.exists(candidate):
        # Same content already there (e.g. the merge is being re-run): skip it
        # instead of producing "<name>_1", "<name>_2", ... duplicates.
        if os.path.isfile(candidate) and filecmp.cmp(source_file, candidate, shallow=False):
            return None
        suffix += 1
        candidate = os.path.join(target_dir, f"{stem}_{suffix}{ext}")
    return candidate


def merge_folders(source_root,source_dir_list, target_dir):
    """
    Copy every file found under several source folders into one flat folder.

    Each entry of `source_dir_list` is joined onto `source_root` and walked
    recursively; every file found is copied directly into `target_dir`
    (the sub-folder structure of the sources is not reproduced).

    Args:
        source_root: directory that contains the folders to merge.
        source_dir_list: iterable of folder names, relative to `source_root`.
        target_dir: directory that receives the copies; it is created,
            including missing parent directories, when it does not exist.

    Notes:
        Because the sources are flattened, two different files may share a
        name. Instead of silently overwriting the earlier one, the later file
        is stored as "<stem>_<n><ext>". Files whose content is already present
        under the chosen name are skipped, so re-running the merge into the
        same target does not create duplicates.
    """
    os.makedirs(target_dir, exist_ok=True)
    for source_dir in source_dir_list:
        full_path_source = os.path.join(source_root, source_dir)
        for root, _dirs, files in os.walk(full_path_source):
            for file in files:
                source_file = os.path.join(root, file)
                destination = _collision_free_destination(source_file, target_dir, file)
                if destination is not None:
                    shutil.copy(source_file, destination)


def merge_folders_from_dict(source_path, destination_path, classes_file='final_classes.json'):
    """
    Build a merged dataset folder from a class-mapping JSON file.

    The JSON file is loaded as a dictionary. For every key, a folder with
    that name is created under `destination_path`, and the key's value is
    handed to `merge_folders` as the list of source folders (relative to
    `source_path`) whose files are merged into it.

    Args:
        source_path: directory containing the original class folders.
        destination_path: directory that receives one folder per key of the
            mapping; created (with missing parents) if it does not exist.
        classes_file: path of the JSON mapping; defaults to
            'final_classes.json', resolved against the current working
            directory.

    NOTE(review): each value is presumably a list of folder names -- confirm
    against the actual contents of the mapping file.
    """
    os.makedirs(destination_path, exist_ok=True)

    with open(classes_file, 'r') as dict_reader:
        categories = json.load(dict_reader)

    for merged_class, source_dirs in categories.items():
        target_dir = os.path.join(destination_path, merged_class)
        merge_folders(source_path, source_dirs, target_dir)

# Replace the original train/val folders with their merged-class versions.
# Each pair is (original split folder, temporary folder holding the merged copy).
_split_dirs = (
    ('./archive/vinted_train', './archive/vinted_train_merged_folder'),
    ('./archive/vinted_val', './archive/vinted_val_merged_folder'),
)

# 1) build every merged copy first, so nothing is deleted if a merge fails
for original_dir, merged_dir in _split_dirs:
    merge_folders_from_dict(original_dir, merged_dir)

# 2) drop the original, un-merged folders
for original_dir, _ in _split_dirs:
    shutil.rmtree(original_dir)

# 3) move the merged copies into the original locations
for original_dir, merged_dir in _split_dirs:
    os.rename(merged_dir, original_dir)


In [2]:

import os
import shutil
import random
from tqdm import tqdm

def make_train_val(folder, divider: int, seed=None):
    """
    Split a folder of per-class sub-folders into a train copy and a test copy.

    For every class sub-folder of `folder`, the files are shuffled and copied
    into "<folder>_train/<class>" or "<folder>_test_only/<class>". Every
    `divider`-th file of the shuffled list (positions 0, divider, 2*divider,
    ...) goes to the test copy, so a non-empty class always contributes at
    least one test file; the remaining files go to the train copy. The source
    folder itself is left untouched.

    Args:
        folder: directory whose immediate sub-directories are the classes.
            Entries of `folder` that are not directories are ignored.
        divider: roughly 1/divider of each class goes to the test copy.
            Must be >= 1.
        seed: optional seed for the shuffle. When given, a private random
            generator is used and the split is reproducible; when None, the
            module-level `random` state is used.

    Raises:
        ValueError: if `divider` is smaller than 1.
    """
    # Fail before touching the filesystem rather than part-way through a copy.
    if divider < 1:
        raise ValueError(f"divider must be a positive integer, got {divider!r}")

    train_folder, val_folder = f"{folder}_train", f"{folder}_test_only"
    rng = random.Random(seed) if seed is not None else random

    # Sorted so that the sequence of shuffles does not depend on the arbitrary
    # order in which the OS lists directory entries.
    for label in tqdm(sorted(os.listdir(folder)), desc="Making train and test sets"):
        label_dir = os.path.join(folder, label)
        if not os.path.isdir(label_dir):
            continue

        os.makedirs(os.path.join(train_folder, label), exist_ok=True)
        os.makedirs(os.path.join(val_folder, label), exist_ok=True)

        # Only regular files can be copied with shutil.copy; nested directories
        # inside a class folder are skipped instead of aborting the run.
        filenames = sorted(
            name for name in os.listdir(label_dir)
            if os.path.isfile(os.path.join(label_dir, name))
        )
        rng.shuffle(filenames)

        for i, filename in enumerate(filenames):
            # position 0 always matches, which guarantees one test file per class
            destination_root = val_folder if i % divider == 0 else train_folder
            shutil.copy(
                os.path.join(label_dir, filename),
                os.path.join(destination_root, label, filename),
            )

# Seed the global RNG so the shuffle inside make_train_val yields the same
# train/test split on every run (given an unchanged directory listing order).
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

# Move the merged training images aside so make_train_val can write its output
# next to them as archive/vinted_train and archive/vinted_test_only.
os.rename("./archive/vinted_train", "./archive/vinted")
make_train_val(folder="archive/vinted", divider=5)
# The staging copy is no longer needed once both splits have been written.
shutil.rmtree("./archive/vinted")

Making train and test sets: 100%|██████████| 21/21 [00:32<00:00,  1.53s/it]


In [3]:
# Run this cell last: it drops every class that has too few training images.
# A class whose folder in vinted_train holds fewer than `min_nr_images` entries
# is deleted there and, if present, in vinted_val and vinted_test_only as well.

min_nr_images = 100

for folder in ["./archive/vinted_train"]:
    progress_text = f"Removing classes with less than {min_nr_images} images"
    for label in tqdm(os.listdir(folder), desc=progress_text):
        class_dir = os.path.join(folder, label)
        # skip stray files that sit next to the class folders
        if not os.path.isdir(class_dir):
            continue
        # classes with enough images are kept as they are
        if len(os.listdir(class_dir)) >= min_nr_images:
            continue
        shutil.rmtree(class_dir)
        # keep the other splits consistent with the pruned training set
        for split_root in ("./archive/vinted_val", "./archive/vinted_test_only"):
            split_class_dir = os.path.join(split_root, label)
            if os.path.isdir(split_class_dir):
                shutil.rmtree(split_class_dir)

Removing classes with less than 100 images: 100%|██████████| 21/21 [00:00<00:00, 404.71it/s]
