In [4]:
import os
import random
import shutil
from sklearn.model_selection import train_test_split


In [5]:
def _group_by_stem(directory):
    """Group the regular files in ``directory`` by file name without extension.

    Returns a dict mapping each stem to the list of file names carrying it.
    Keys appear in sorted file-name order; sub-directories are skipped
    because ``shutil.copy`` cannot copy them.
    """
    groups = {}
    for name in sorted(os.listdir(directory)):
        if os.path.isfile(os.path.join(directory, name)):
            groups.setdefault(os.path.splitext(name)[0], []).append(name)
    return groups


def split_dataset(image_dir, label_dir, train_image_dir, val_image_dir, train_label_dir, val_label_dir, val_size=0.2, random_seed=42):
    """Copy matching image/label files into train and validation directories.

    An image and a label belong together when their file names are equal
    once the extension is removed. Files without a partner in the other
    directory are ignored, as are sub-directories. Source files are copied,
    not moved, so ``image_dir`` and ``label_dir`` are left untouched.

    Args:
        image_dir: Directory holding the source images.
        label_dir: Directory holding the source labels.
        train_image_dir: Destination for training images (created if missing).
        val_image_dir: Destination for validation images (created if missing).
        train_label_dir: Destination for training labels (created if missing).
        val_label_dir: Destination for validation labels (created if missing).
        val_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_seed: Forwarded to ``train_test_split`` as ``random_state``.

    Raises:
        ValueError: If a paired stem is carried by more than one file in
            either directory (the pairing would be ambiguous), or if no
            image/label pair is found at all.
    """
    # Create the output directories if they do not exist
    for directory in (train_image_dir, val_image_dir, train_label_dir, val_label_dir):
        os.makedirs(directory, exist_ok=True)

    image_groups = _group_by_stem(image_dir)
    label_groups = _group_by_stem(label_dir)

    # Build the two lists in lock-step so that images[i] and labels[i] are
    # always partners. Sorting each directory on its own and pairing by
    # position can misalign them (e.g. "x.PNG" < "x.a.PNG" but
    # "x.a.txt" < "x.txt").
    images, labels = [], []
    for stem, image_names in image_groups.items():
        label_names = label_groups.get(stem)
        if label_names is None:
            continue
        if len(image_names) > 1 or len(label_names) > 1:
            raise ValueError(
                f"Ambiguous pairing for stem {stem!r}: "
                f"images {image_names!r}, labels {label_names!r}"
            )
        images.append(image_names[0])
        labels.append(label_names[0])

    if not images:
        raise ValueError(
            f"No matching image/label pairs found in {image_dir!r} and {label_dir!r}"
        )

    # Split the dataset; both lists receive the same permutation, so pairs
    # stay together.
    train_images, val_images, train_labels, val_labels = train_test_split(
        images, labels, test_size=val_size, random_state=random_seed, shuffle=True
    )

    # Copy the files to the respective directories
    copy_plan = (
        (train_images, image_dir, train_image_dir),
        (val_images, image_dir, val_image_dir),
        (train_labels, label_dir, train_label_dir),
        (val_labels, label_dir, val_label_dir),
    )
    for names, source_dir, target_dir in copy_plan:
        for name in names:
            shutil.copy(os.path.join(source_dir, name), os.path.join(target_dir, name))

    print(f"Training set: {len(train_images)} images")
    print(f"Validation set: {len(val_images)} images")


In [6]:
if __name__ == "__main__":
    # Input folders read by split_dataset; point these at your own dataset.
    image_directory = 'Crack_Segmentation_Dataset/images'
    label_directory = 'Crack_Segmentation_Dataset/labels'

    # Output folders that receive the image split.
    train_image_directory = 'Crack_Segmentation_Dataset/images_/train'
    val_image_directory = 'Crack_Segmentation_Dataset/images_/val'

    # Output folders that receive the label split.
    train_label_directory = 'Crack_Segmentation_Dataset/labels_/train_label'
    val_label_directory = 'Crack_Segmentation_Dataset/labels_/val_label'

    # val_size and random_seed are left at split_dataset's defaults.
    split_dataset(
        image_dir=image_directory,
        label_dir=label_directory,
        train_image_dir=train_image_directory,
        val_image_dir=val_image_directory,
        train_label_dir=train_label_directory,
        val_label_dir=val_label_directory,
    )

Training set: 7909 images
Validation set: 1978 images
