## Set up folder structure for the YOLO model 

In [1]:
# Combine all .jpg and .txt files with matching file names from separate folders
import os
import shutil
import random


def match_and_combine_files(jpg_folder, txt_folder, output_folder):
    """Copy every .jpg/.txt pair that shares a base name into one folder.

    A file in ``jpg_folder`` ending in ``.jpg`` and a file in ``txt_folder``
    ending in ``.txt`` form a pair when their names are equal once the
    extension is stripped. Both files of each pair are copied into
    ``output_folder`` (created if missing) and one line is printed per pair.
    Files without a partner are left untouched.

    Args:
        jpg_folder: Folder that is scanned for ``.jpg`` files.
        txt_folder: Folder that is scanned for ``.txt`` files.
        output_folder: Destination folder for the copied pairs.
    """
    def index_by_stem(folder, extension):
        # Map "name" -> "name<extension>" for the matching entries of `folder`.
        index = {}
        for entry in os.listdir(folder):
            if entry.endswith(extension):
                stem, _ = os.path.splitext(entry)
                index[stem] = entry
        return index

    # Create the destination before reading the sources, as callers may rely
    # on it existing even when nothing matches.
    os.makedirs(output_folder, exist_ok=True)

    images = index_by_stem(jpg_folder, '.jpg')
    labels = index_by_stem(txt_folder, '.txt')

    # Only base names present in both folders are copied.
    for stem in images.keys() & labels.keys():
        image_name = images[stem]
        label_name = labels[stem]

        for folder, name in ((jpg_folder, image_name), (txt_folder, label_name)):
            shutil.copy(os.path.join(folder, name), output_folder)

        print(f"Copied: {image_name} and {label_name} to {output_folder}")



In [2]:
# Oxford: gather the images from `imgs_filtered_Oxford` and the .txt files from
# `oxford` (presumably the YOLO label files -- confirm) into `combined`.
# NOTE(review): these are absolute, machine-specific paths; this cell only runs
# on a machine that has this exact folder layout.
jpg_folder = '/Users/ninageyer/Library/CloudStorage/OneDrive-Personal/Dokumente/MScSEDS/2_DLSS/DLSS_project/YOLO/imgs_filtered_Oxford'
txt_folder = '/Users/ninageyer/Library/CloudStorage/OneDrive-Personal/Dokumente/MScSEDS/2_DLSS/DLSS_project/YOLO/oxford'
output_folder = '/Users/ninageyer/Library/CloudStorage/OneDrive-Personal/Dokumente/MScSEDS/2_DLSS/DLSS_project/YOLO/combined'

# Copies each .jpg/.txt pair with the same base name and prints one line per pair.
match_and_combine_files(jpg_folder, txt_folder, output_folder)

Copied: 4420.jpg and 4420.txt to /Users/ninageyer/Library/CloudStorage/OneDrive-Personal/Dokumente/MScSEDS/2_DLSS/DLSS_project/YOLO/combined
Copied: 4999.jpg and 4999.txt to /Users/ninageyer/Library/CloudStorage/OneDrive-Personal/Dokumente/MScSEDS/2_DLSS/DLSS_project/YOLO/combined
Copied: 4233.jpg and 4233.txt to /Users/ninageyer/Library/CloudStorage/OneDrive-Personal/Dokumente/MScSEDS/2_DLSS/DLSS_project/YOLO/combined
Copied: 7128.jpg and 7128.txt to /Users/ninageyer/Library/CloudStorage/OneDrive-Personal/Dokumente/MScSEDS/2_DLSS/DLSS_project/YOLO/combined
Copied: 6974.jpg and 6974.txt to /Users/ninageyer/Library/CloudStorage/OneDrive-Personal/Dokumente/MScSEDS/2_DLSS/DLSS_project/YOLO/combined
Copied: 6761.jpg and 6761.txt to /Users/ninageyer/Library/CloudStorage/OneDrive-Personal/Dokumente/MScSEDS/2_DLSS/DLSS_project/YOLO/combined
Copied: 813.jpg and 813.txt to /Users/ninageyer/Library/CloudStorage/OneDrive-Personal/Dokumente/MScSEDS/2_DLSS/DLSS_project/YOLO/combined
Copied: 3544.jp

In [3]:
# Split the combined folder into Training, Validation and Testing sets

def split_dataset(source_folder, output_folder, train_ratio=0.7, val_ratio=0.2, test_ratio=0.1, seed=None):
    """Randomly split the images of a folder into Training/Validation/Testing.

    Every ``.jpg`` in ``source_folder`` is assigned to exactly one split and
    copied, together with its same-named ``.txt`` file when one exists, into
    ``output_folder/Training``, ``output_folder/Validation`` or
    ``output_folder/Testing``. Files are copied, not moved, and a summary of
    the split sizes is printed.

    Args:
        source_folder: Folder holding the ``.jpg`` files and their ``.txt`` files.
        output_folder: Folder in which the three split folders are created.
        train_ratio: Fraction of the images assigned to Training.
        val_ratio: Fraction of the images assigned to Validation.
        test_ratio: Expected fraction for Testing. Testing always receives
            every image left over after Training and Validation; this value
            is only used to warn when the three ratios do not sum to 1.
        seed: Optional seed for a reproducible split. With ``None`` (default)
            the module-level ``random`` state is used, as before.

    Note:
        Files already present in the split folders are not removed, so
        re-running with a different shuffle can leave the same image in more
        than one split. Start from empty split folders for a clean split.
    """
    import warnings  # local import: nothing else in this notebook needs it

    ratio_sum = train_ratio + val_ratio + test_ratio
    if abs(ratio_sum - 1.0) > 1e-9:
        warnings.warn(
            f"train/val/test ratios sum to {ratio_sum:g}, not 1; "
            "Testing receives whatever remains after Training and Validation.",
            stacklevel=2,
        )

    # Ensure the output directories exist
    train_folder = os.path.join(output_folder, 'Training')
    val_folder = os.path.join(output_folder, 'Validation')
    test_folder = os.path.join(output_folder, 'Testing')
    for folder in (train_folder, val_folder, test_folder):
        os.makedirs(folder, exist_ok=True)

    # Sorted so the order going into the shuffle does not depend on listdir/set
    # ordering; otherwise a fixed seed would not give a fixed split.
    base_filenames = sorted(
        {os.path.splitext(f)[0] for f in os.listdir(source_folder) if f.endswith('.jpg')}
    )

    # Shuffle the filenames
    if seed is None:
        random.shuffle(base_filenames)
    else:
        random.Random(seed).shuffle(base_filenames)

    total_files = len(base_filenames)

    def cutoff(fraction):
        # Round away float noise before truncating: 100 * (0.7 + 0.2) evaluates
        # to 89.99999999999999, which int() alone would cut down to 89.
        return int(round(total_files * fraction, 9))

    train_cutoff = cutoff(train_ratio)
    val_cutoff = cutoff(train_ratio + val_ratio)

    train_files = base_filenames[:train_cutoff]
    val_files = base_filenames[train_cutoff:val_cutoff]
    test_files = base_filenames[val_cutoff:]

    def copy_files(file_list, destination_folder):
        # Copy the image and, when present, its .txt file for each base name.
        for base_name in file_list:
            for extension in ('.jpg', '.txt'):
                src = os.path.join(source_folder, base_name + extension)
                if os.path.exists(src):
                    shutil.copy(src, destination_folder)

    # Copy the files
    copy_files(train_files, train_folder)
    copy_files(val_files, val_folder)
    copy_files(test_files, test_folder)

    print(f"Dataset split complete:\n"
          f"Training: {len(train_files)} pairs\n"
          f"Validation: {len(val_files)} pairs\n"
          f"Test: {len(test_files)} pairs")

# Run the split on the `combined` folder with the default 0.7/0.2/0.1 ratios.
# NOTE: `output_folder` is rebound here to the YOLO project root (in the
# Oxford cell it pointed at `combined`); Training/, Validation/ and Testing/
# are created inside it.
# NOTE(review): absolute, machine-specific paths -- adjust before running elsewhere.
source_folder = '/Users/ninageyer/Library/CloudStorage/OneDrive-Personal/Dokumente/MScSEDS/2_DLSS/DLSS_project/YOLO/combined'
output_folder = '/Users/ninageyer/Library/CloudStorage/OneDrive-Personal/Dokumente/MScSEDS/2_DLSS/DLSS_project/YOLO'

split_dataset(source_folder, output_folder)


Dataset split complete:
Training: 1551 pairs
Validation: 444 pairs
Test: 222 pairs
