In [29]:
import os
import shutil

Pick a name for the output directory that is not already in use.

In [30]:
def get_dir_name(dir_name):
    """Return a directory name that is not taken yet.

    If ``dir_name`` does not exist as a directory it is returned unchanged.
    Otherwise the siblings of ``dir_name`` are scanned for entries named
    ``<basename>_<digits>`` and the name with the next free number
    (zero-padded to two digits, starting at ``_01``) is returned.

    Args:
        dir_name: Desired directory name, relative to the current working
            directory or an absolute/relative path.

    Returns:
        ``dir_name`` itself, or ``<dir_name>_NN`` when ``dir_name`` is taken.
    """
    if not os.path.isdir(dir_name):
        return dir_name

    # Drop trailing separators so the numeric suffix lands on the name itself.
    clean_name = os.path.normpath(dir_name)
    parent, base = os.path.split(clean_name)
    prefix = f"{base}_"

    # Only entries of the exact form "<base>_<digits>" count. Unrelated names
    # that merely contain the base name (e.g. "prepared_dataset") are ignored,
    # and the result does not depend on the order os.listdir() returns.
    used_numbers = [
        int(entry[len(prefix):])
        for entry in os.listdir(parent or os.getcwd())
        if entry.startswith(prefix) and entry[len(prefix):].isdecimal()
    ]
    next_number = max(used_numbers, default=0) + 1

    return f"{clean_name}_{next_number:02}"

Check whether the dataset can be split: the images and labels folders must contain the same number of files.

In [31]:
def is_dataset_valid(images_path, labels_path):
    """Tell whether the images and labels folders hold equally many entries.

    Args:
        images_path: Directory listed to count the images.
        labels_path: Directory listed to count the labels.

    Returns:
        True when both directories contain the same number of entries,
        False otherwise. Only the counts are compared, not the file names.
    """
    image_count = len(os.listdir(images_path))
    label_count = len(os.listdir(labels_path))
    return image_count == label_count

Create a split directory together with its `images` and `labels` subdirectories.

In [32]:
def create_dir_collection(src_path):
    """Create ``src_path`` and, inside it, the ``images`` and ``labels`` folders.

    Args:
        src_path: Directory to create; its parent must already exist because
            ``os.mkdir`` does not create intermediate directories.

    Raises:
        FileExistsError: If any of the three directories already exists.
    """
    os.mkdir(src_path)
    for sub_name in ("images", "labels"):
        os.mkdir(f"{src_path}/{sub_name}")

Check whether a directory exists.

In [33]:
def is_dir_exist(path):
    """Return True when ``path`` refers to an existing directory, else False."""
    found = os.path.isdir(path)
    return found

Flatten an existing train/valid/test dataset into a single `prepared_dataset` folder so that it can be split again.

In [34]:
def decompose(path):
    """Flatten a train/valid/test dataset into ``./prepared_dataset``.

    Every file found in ``<path>/<split>/images`` and ``<path>/<split>/labels``
    (for the splits ``train``, ``valid`` and ``test``) is copied into
    ``prepared_dataset/images`` and ``prepared_dataset/labels`` under the
    current working directory.

    Args:
        path: Root directory of the dataset to flatten.

    Raises:
        Exception: If ``path`` does not exist, or if a ``prepared_dataset``
            directory is already present in the current working directory.
        FileNotFoundError: If one of the expected ``<split>/images`` or
            ``<split>/labels`` source directories is missing.
    """
    if not os.path.exists(path):
        raise Exception("the given path is incorrect!")

    split_names = ('train', 'valid', 'test')
    kinds = ('images', 'labels')

    # The destination must not exist yet.
    pd_dir = f"{os.getcwd()}/prepared_dataset"
    if os.path.isdir(pd_dir):
        raise Exception("you must delete 'prepared_dataset folder'!")

    # Check the whole source layout BEFORE creating anything. Otherwise a
    # missing split would leave a half-filled 'prepared_dataset' behind, and
    # the next call would be rejected by the check above.
    for split in split_names:
        for kind in kinds:
            src_dir = f"{path}/{split}/{kind}"
            if not os.path.isdir(src_dir):
                raise FileNotFoundError(f"missing source directory: {src_dir}")

    os.mkdir(pd_dir)
    for kind in kinds:
        os.mkdir(f"{pd_dir}/{kind}")

    # NOTE(review): files are copied under their own names, so two splits
    # holding a file with the same name would end up as a single file here.
    for split in split_names:
        for kind in kinds:
            src_dir = f"{path}/{split}/{kind}"
            dst_dir = f"{pd_dir}/{kind}"
            for file_name in os.listdir(src_dir):
                shutil.copy2(f"{src_dir}/{file_name}", dst_dir)

Computer Vision dataset split

In [35]:
def cv_split(raw_dataset_path: str, train_s: int, valid_s: int, test_s: int):
    """Split a flat images/labels dataset into train, valid and test folders.

    The files in ``<raw_dataset_path>/images`` and ``<raw_dataset_path>/labels``
    are copied into ``<out>/train``, ``<out>/valid`` and ``<out>/test`` (each
    with its own ``images`` and ``labels`` subfolder), where ``<out>`` is the
    name returned by ``get_dir_name('dataset')``.

    Args:
        raw_dataset_path: Directory containing the ``images`` and ``labels``
            folders to split.
        train_s: Percentage of samples for the train split.
        valid_s: Percentage of samples for the valid split.
        test_s: Percentage of samples for the test split.

    Raises:
        Exception: If the three percentages do not add up to 100, or if the
            images and labels folders hold a different number of files.
    """
    if train_s + valid_s + test_s != 100:
        raise Exception('make sure that your split sizes add up to 100!')

    images_path = os.path.join(raw_dataset_path, 'images')
    labels_path = os.path.join(raw_dataset_path, 'labels')

    if not is_dataset_valid(images_path, labels_path):
        raise Exception('make sure that number of images equal to labels!')

    # os.listdir() returns entries in arbitrary order, and images and labels
    # are paired purely by position below. Sorting both listings by file stem
    # keeps the i-th image next to the i-th label.
    # Assumes an image and its label share the same stem -- TODO confirm.
    def by_stem(name):
        return os.path.splitext(name)[0], name

    files = (sorted(os.listdir(images_path), key=by_stem),
             sorted(os.listdir(labels_path), key=by_stem))
    dataset_dir = get_dir_name('dataset')
    os.mkdir(dataset_dir)

    # number of elements per split
    num_elements = get_elements(files[0], train_s, valid_s, test_s)

    # creating train dir: the first num_elements['train'] pairs
    train_path = f'{dataset_dir}/train'
    create_dir_collection(train_path)
    setup_train(files, raw_dataset_path, train_path, num_elements['train'])

    # creating valid dir: starts right after the train pairs
    valid_path = f'{dataset_dir}/valid'
    create_dir_collection(valid_path)
    valid_index = num_elements['train']
    setup_test_or_valid(files, raw_dataset_path, valid_path, num_elements['valid'], valid_index)

    # creating test dir: starts after the train and valid pairs
    # (get_elements() reports the test count under the key 'temp')
    test_path = f'{dataset_dir}/test'
    create_dir_collection(test_path)
    test_index = num_elements['train'] + num_elements['valid']
    setup_test_or_valid(files, raw_dataset_path, test_path, num_elements['temp'], test_index)


def get_elements(f_list, train_s: int, valid_s: int, test_s: int):
    """Compute how many files go into each split.

    Each split gets ``len(f_list) * percentage / 100`` files. Because that
    share may be fractional, the first split (in the order train, valid,
    test) with a fractional share is rounded up and the others are rounded
    down. When the percentages add up to 100, any file still left over after
    rounding is added to the test split so that no file is dropped.

    Args:
        f_list: Sized collection of files; only its length is used.
        train_s: Percentage for the train split.
        valid_s: Percentage for the valid split.
        test_s: Percentage for the test split.

    Returns:
        Dict with the integer counts under the keys ``'train'``, ``'valid'``
        and ``'temp'`` (the test split).
    """
    total = len(f_list)

    # Integer arithmetic: quotient is the floored share, remainder tells
    # whether the share was fractional.
    train_e, train_rem = divmod(total * train_s, 100)
    valid_e, valid_rem = divmod(total * valid_s, 100)
    test_e, test_rem = divmod(total * test_s, 100)

    # Only the first fractional share is rounded up.
    if train_rem:
        train_e += 1
    elif valid_rem:
        valid_e += 1
    elif test_rem:
        test_e += 1

    train_e, valid_e, test_e = int(train_e), int(valid_e), int(test_e)

    # Rounding train up and both others down can leave one file unassigned
    # (e.g. 7 files at 80/10/10 gives 6/0/0). Hand it to the test split.
    if train_s + valid_s + test_s == 100:
        test_e += total - (train_e + valid_e + test_e)

    elements = {
        'train': train_e,
        'valid': valid_e,
        'temp': test_e
    }
    return elements


def setup_train(files, src_path, dst_path, num_elements):
    """Copy the first ``num_elements`` image/label pairs into the train split.

    Args:
        files: Pair ``(image_names, label_names)``; entries are matched by
            position.
        src_path: Directory holding the source ``images`` and ``labels`` folders.
        dst_path: Directory holding the destination ``images`` and ``labels``
            folders, which must already exist.
        num_elements: Number of leading pairs to copy.
    """
    chosen_images = files[0][:num_elements]
    chosen_labels = files[1][:num_elements]

    images_in, labels_in = f"{src_path}/images", f"{src_path}/labels"
    images_out, labels_out = f"{dst_path}/images", f"{dst_path}/labels"

    for img_file, lbl_file in zip(chosen_images, chosen_labels):
        # image first, then its label, one pair at a time
        shutil.copy2(os.path.join(images_in, img_file), images_out)
        shutil.copy2(os.path.join(labels_in, lbl_file), labels_out)


def setup_test_or_valid(files, src_path, dst_path, num_elements, start_index):
    """Copy a window of image/label pairs into a valid or test split.

    Args:
        files: Pair ``(image_names, label_names)``; entries are matched by
            position.
        src_path: Directory holding the source ``images`` and ``labels`` folders.
        dst_path: Directory holding the destination ``images`` and ``labels``
            folders, which must already exist.
        num_elements: Number of pairs to copy.
        start_index: Position of the first pair to copy.
    """
    window = slice(start_index, start_index + num_elements)
    chosen_images = files[0][window]
    chosen_labels = files[1][window]

    images_in, labels_in = f"{src_path}/images", f"{src_path}/labels"
    images_out, labels_out = f"{dst_path}/images", f"{dst_path}/labels"

    for img_file, lbl_file in zip(chosen_images, chosen_labels):
        # image first, then its label, one pair at a time
        shutil.copy2(os.path.join(images_in, img_file), images_out)
        shutil.copy2(os.path.join(labels_in, lbl_file), labels_out)


First, flatten your dataset with the `decompose` function. Set the `dataset_path` variable to the path of your dataset.

Make sure that there is no folder called 'prepared_dataset' in your current working directory; `decompose` refuses to run if it already exists.

In [36]:
# Root of the dataset to flatten; decompose() reads its train/valid/test
# subfolders, each with an 'images' and a 'labels' folder.
dataset_path = 'clash_roboflow'
# Copies every file into ./prepared_dataset (raises if that folder already exists).
decompose(dataset_path)

Set the split percentages below; the three values must add up to 100.

In [38]:
# Split percentages; cv_split() raises unless the three values sum to 100.
train_size = 80
valid_size = 10
test_size = 10
# Splits prepared_dataset/images and prepared_dataset/labels into
# train/valid/test folders under the name chosen by get_dir_name('dataset').
cv_split('prepared_dataset', train_size, valid_size, test_size)