In [3]:
import os
import random
import shutil

In [4]:
def clear_directory(directory):
    """Leave ``directory`` as an existing, empty directory.

    If the path already exists it is removed recursively with
    ``shutil.rmtree`` (everything below it is deleted) and then recreated.
    Missing parent directories are created as well.

    Args:
        directory: Path of the directory to reset.
    """
    if not os.path.exists(directory):
        # Nothing to wipe: just create the (possibly nested) directory.
        os.makedirs(directory)
        return

    shutil.rmtree(directory)
    os.makedirs(directory)

def copy_files(files, source_dir, labels_dir, output_dir, lst_names_file):
    """Copy images and their label files into ``output_dir`` and list them.

    For every name in ``files`` the image is taken from ``source_dir`` and the
    label from ``labels_dir`` (same base name with a ``.txt`` extension). The
    image goes to ``output_dir/images`` and the label to ``output_dir/labels``.
    Files that already exist at the destination are left untouched. Every
    processed image name is written to ``lst_names_file``, one per line.

    Images without a label file are reported and skipped: they are neither
    copied nor listed.

    Args:
        files: Iterable of image file names (not paths) located in ``source_dir``.
        source_dir: Directory containing the images.
        labels_dir: Directory containing the ``.txt`` label files.
        output_dir: Destination root; ``images`` and ``labels`` sub-folders
            are created if missing.
        lst_names_file: Path of the text file to (over)write with the names
            of the processed images.
    """
    images_out_dir = os.path.join(output_dir, 'images')
    labels_out_dir = os.path.join(output_dir, 'labels')
    # Create the targets up front so the function also works on a fresh output_dir.
    os.makedirs(images_out_dir, exist_ok=True)
    os.makedirs(labels_out_dir, exist_ok=True)
    list_parent = os.path.dirname(lst_names_file)
    if list_parent:
        os.makedirs(list_parent, exist_ok=True)

    with open(lst_names_file, 'w') as list_file:
        for file_name in files:
            # splitext keeps dots inside the name ('a.v2.jpg' -> 'a.v2') and,
            # unlike splitting on '.', does not yield '' for names without an extension.
            base_name = os.path.splitext(file_name)[0]
            source_image_path = os.path.join(source_dir, file_name)  # path to each image in source_dir
            label_file = os.path.join(labels_dir, base_name + '.txt')  # path to each label in labels_dir

            # A missing label is not fatal: report it and move on to the next image.
            if not os.path.isfile(label_file):
                print(f"Cannot find label for file {file_name}")
                continue

            output_image_path = os.path.join(images_out_dir, file_name)  # where the image will be copied
            output_label_path = os.path.join(labels_out_dir, base_name + '.txt')  # where the label will be copied

            # Copy image file
            if not os.path.exists(output_image_path):
                shutil.copy(source_image_path, output_image_path)
                print(f'Copied {file_name} to {output_image_path}')
            else:
                print(f'File {file_name} already exists in {output_image_path}')

            # Copy label file
            if not os.path.exists(output_label_path):
                shutil.copy(label_file, output_label_path)
                print(f'Copied {base_name}.txt to {output_label_path}')
            else:
                print(f'Label file {base_name}.txt already exists in {output_label_path}')
            list_file.write(file_name + '\n')

In [None]:
def split_train_val(source_dir, train_dir, val_dir, labels_dir, val_ratio=0.4, seed=None):
    """Randomly split the files of ``source_dir`` into training and validation sets.

    The ``images`` and ``labels`` sub-folders of ``train_dir`` and ``val_dir``
    are emptied (existing content is deleted) and then filled through
    ``copy_files``, which also writes ``train.txt`` / ``val.txt`` into the
    respective directory.

    Args:
        source_dir: Directory whose regular files are treated as images.
        train_dir: Output root for the training split.
        val_dir: Output root for the validation split.
        labels_dir: Directory holding the ``.txt`` label files.
        val_ratio: Fraction of the files assigned to validation, in [0, 1].
            The training set receives ``int(n * (1 - val_ratio))`` files.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` state is used.

    Raises:
        ValueError: If ``val_ratio`` is outside [0, 1].
    """
    if not 0 <= val_ratio <= 1:
        raise ValueError(f"val_ratio must be between 0 and 1, got {val_ratio}")

    # Read the source before anything is deleted, so a wrong source_dir fails
    # without having wiped existing output folders. os.listdir returns entries
    # in arbitrary order; sorting makes a seeded shuffle reproducible.
    image_files = sorted(
        f for f in os.listdir(source_dir)
        if os.path.isfile(os.path.join(source_dir, f))
    )

    # Creating directories
    for split_dir in (train_dir, val_dir):
        for sub_folder in ('labels', 'images'):
            clear_directory(os.path.join(split_dir, sub_folder))

    if seed is None:
        random.shuffle(image_files)
    else:
        # Private generator: leaves the global random state untouched.
        random.Random(seed).shuffle(image_files)

    split_index = int(len(image_files) * (1 - val_ratio))
    train = image_files[:split_index]
    print(f"Numb of images in training set: {len(train)}")
    val = image_files[split_index:]
    print(f"Numb of images in validation set: {len(val)}")

    # Copy training and validation files
    copy_files(train, source_dir, labels_dir, train_dir, os.path.join(train_dir, 'train.txt'))
    copy_files(val, source_dir, labels_dir, val_dir, os.path.join(val_dir, 'val.txt'))

    # Report what actually landed on disk rather than what was selected, in
    # case copy_files skipped any entry. The folders were emptied above, so
    # their content is exactly what was copied in this run.
    train_copied = len(os.listdir(os.path.join(train_dir, 'images')))
    val_copied = len(os.listdir(os.path.join(val_dir, 'images')))
    print(f"Copied {train_copied} files to {train_dir}")
    print(f"Copied {val_copied} files to {val_dir}")

# Example usage: split the dataset 60/40 into ./train_dir and ./val_dir
# (both output paths are relative to the current working directory).
source_dir = '/data/datasets/model_validation/val_yolo/images'
labels_dir = '/data/datasets/model_validation/val_yolo/labels'
split_train_val(
    source_dir,
    train_dir='train_dir',
    val_dir='val_dir',
    labels_dir=labels_dir,
    val_ratio=0.4,
)

In [None]:
def get_file_names(directory, folder):
    """Return the extension-less names of all entries in ``directory/folder``.

    Args:
        directory: Base directory.
        folder: Folder joined onto ``directory`` with ``os.path.join``.

    Returns:
        A set with the name of every directory entry, stripped of its last
        extension.
    """
    target = os.path.join(directory, folder)
    stems = set()
    for entry in os.listdir(target):
        stem, _extension = os.path.splitext(entry)
        stems.add(stem)
    return stems

# Collect the base names present in each split folder.
# The second argument of every call is an absolute path, so the os.path.join
# inside get_file_names ignores the first argument and uses the second as is.
train_images_folder = get_file_names(
    '/data/datasets/model_validation/split_train_val/train_dir',
    '/data/datasets/model_validation/split_train_val/train_dir/images',
)
train_labels_folder = get_file_names(
    '/data/datasets/model_validation/split_train_val/train_dir',
    '/data/datasets/model_validation/split_train_val/train_dir/labels',
)
val_images_folder = get_file_names(
    '/data/datasets/model_validation/split_train_val/val_dir',
    '/data/datasets/model_validation/split_train_val/val_dir/images',
)
val_labels_folder = get_file_names(
    '/data/datasets/model_validation/split_train_val/val_dir',
    '/data/datasets/model_validation/split_train_val/val_dir/labels',
)

# A base name present in both splits means the same sample is in train and val.
images_duplicates = train_images_folder & val_images_folder
labels_duplicates = train_labels_folder & val_labels_folder

print(
    f'Duplicates in train_images_folder and val_images_folder found: {images_duplicates}'
    if images_duplicates
    else 'No duplicates in train_images_folder and val_images_folder'
)
print(
    f'Duplicates in train_labels_folder and val_labels_folder found: {labels_duplicates}'
    if labels_duplicates
    else 'No duplicates in train_labels_folder and val_labels_folder'
)

In [7]:
def mismatched_img_and_label(directory, folder_with_img, folder_with_label):
    """Print images that lack a label and labels that lack an image.

    Entries of the two folders are compared by name without the last
    extension. Nothing is printed when both folders hold the same base names.

    Args:
        directory: Base directory; also shown in the printed messages.
        folder_with_img: Folder with the images, joined onto ``directory``.
        folder_with_label: Folder with the labels, joined onto ``directory``.
    """
    def _stems(folder):
        # Base names (last extension stripped) of every entry in directory/folder.
        return {
            os.path.splitext(entry)[0]
            for entry in os.listdir(os.path.join(directory, folder))
        }

    image_stems = _stems(folder_with_img)
    label_stems = _stems(folder_with_label)

    images_without_label = image_stems.difference(label_stems)
    labels_without_image = label_stems.difference(image_stems)

    if images_without_label:
        print(f'No corresponding label for image in {directory}:{images_without_label}')
    if labels_without_image:
        print(f'No corresponding image for label in {directory}:{labels_without_image}')
# Check each split against its OWN images/labels folders. The folder names are
# joined onto the split directory inside mismatched_img_and_label.
mismatched_img_and_label('/data/datasets/model_validation/split_train_val/train_dir', 'images', 'labels')
# The validation check must read val_dir/images and val_dir/labels; pointing it
# at the train_dir folders would just re-check the training split.
mismatched_img_and_label('/data/datasets/model_validation/split_train_val/val_dir', 'images', 'labels')