In [3]:
import splitfolders
from pathlib import Path
import shutil
import yaml
import random

In [5]:
def _copy_split_files(image_list, split_name, augmented_path, split_output):
    """Copy images and their label files into one split directory.

    Args:
        image_list: Image paths belonging to this split.
        split_name: Name of the split sub-directory ('train', 'test', 'val').
        augmented_path: Source dataset root; labels are read from its
            ``labels`` sub-directory.
        split_output: Root of the split dataset being written.

    Returns:
        Number of images that had no label file in the source dataset.
    """
    images_dir = split_output / split_name / 'images'
    labels_dir = split_output / split_name / 'labels'
    source_labels = augmented_path / 'labels'

    missing = 0
    for img_path in image_list:
        shutil.copy2(img_path, images_dir / img_path.name)

        # The label shares the image's file name, with a .txt extension.
        label_name = img_path.with_suffix('.txt').name
        label_path = source_labels / label_name
        if label_path.exists():
            shutil.copy2(label_path, labels_dir / label_name)
        else:
            # Create an empty label so every image still has a label file.
            (labels_dir / label_name).touch()
            missing += 1
    return missing


def _count_objects(labels_dir, num_classes):
    """Count label files and annotated objects per class in ``labels_dir``.

    The first whitespace-separated token of each non-blank line is taken as
    the class ID. IDs written as floats (e.g. ``1.0``) are accepted; IDs
    outside ``range(num_classes)`` are ignored and unparsable lines are
    reported with a warning.

    Returns:
        A ``(num_label_files, counts)`` tuple where ``counts[i]`` is the
        number of objects of class ``i``.
    """
    counts = [0] * num_classes
    num_labels = 0
    for label_file in labels_dir.glob('*.txt'):
        num_labels += 1
        with open(label_file, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.split()
                if not parts:
                    continue
                try:
                    # Handle both integer and float class IDs.
                    class_id = int(float(parts[0]))
                except (ValueError, OverflowError) as e:
                    # OverflowError covers values such as "inf".
                    print(f"Warning: Could not parse line in {label_file}: {line.strip()} - Error: {e}")
                    continue
                if 0 <= class_id < num_classes:
                    counts[class_id] += 1
    return num_labels, counts


def prepare_yolo_dataset_10k_3k_3k(
    augmented_path=Path(r"D:\VScodefiles\DeepLearningProject\Augmented"),
    split_output=Path(r"E:\sonardataset_10k_3k_3k"),
    train_count=10000,
    test_count=3000,
    val_count=3000,
    class_names=('MILCO', 'NOMBO'),
    yaml_path=Path("sonar_dataset_10k_3k_3k.yaml"),
    seed=None,
):
    """Split a YOLO dataset into fixed-size train/test/val sets.

    With the defaults this produces 10,000 train, 3,000 test and 3,000
    validation images (16,000 in total); any images beyond that are left out.
    Images are read from ``augmented_path / 'images'`` (``*.jpg`` only) and
    labels from ``augmented_path / 'labels'``. Any existing ``split_output``
    directory is deleted and rebuilt, a dataset YAML file is written to
    ``yaml_path`` and per-split statistics are printed.

    Args:
        augmented_path: Root of the source dataset.
        split_output: Directory to (re)create the split dataset in.
        train_count: Number of training images.
        test_count: Number of test images.
        val_count: Number of validation images.
        class_names: Class names, indexed by class ID.
        yaml_path: Where to write the dataset YAML file.
        seed: Seed for the shuffle. ``None`` (the default) uses the global
            ``random`` state, as before; an integer makes the split
            reproducible.

    Returns:
        The ``split_output`` path, or ``None`` when there are fewer images
        than the three counts require (nothing is modified in that case).

    Raises:
        ValueError: If a count is negative, or if ``split_output`` is the
            source directory or one of its parents (deleting it would
            destroy the source data).
    """
    augmented_path = Path(augmented_path)
    split_output = Path(split_output)
    yaml_path = Path(yaml_path)
    class_names = list(class_names)

    if min(train_count, test_count, val_count) < 0:
        raise ValueError("train_count, test_count and val_count must be non-negative")

    # split_output is wiped below, so it must not contain the source data.
    resolved_source = augmented_path.resolve()
    resolved_output = split_output.resolve()
    if resolved_output == resolved_source or resolved_output in resolved_source.parents:
        raise ValueError(
            f"split_output ({split_output}) must not be or contain augmented_path ({augmented_path})"
        )

    # Sort first: glob order is filesystem-dependent, which would make a
    # seeded shuffle irreproducible across machines.
    all_images = sorted((augmented_path / "images").glob("*.jpg"))
    print(f"Total images found: {len(all_images)}")

    # Verify we have enough images before touching anything on disk.
    total_needed = train_count + test_count + val_count
    if len(all_images) < total_needed:
        print(f"❌ Not enough images! Have {len(all_images)}, need {total_needed}")
        return

    # Shuffle randomly; a dedicated generator keeps a seeded run from
    # disturbing the global random state.
    if seed is None:
        random.shuffle(all_images)
    else:
        random.Random(seed).shuffle(all_images)

    # Split the data into consecutive, non-overlapping slices.
    test_end = train_count + test_count
    train_images = all_images[:train_count]
    test_images = all_images[train_count:test_end]
    val_images = all_images[test_end:total_needed]

    print("Dataset split:")
    print(f"Train: {len(train_images)} images")
    print(f"Test: {len(test_images)} images")
    print(f"Val: {len(val_images)} images")

    # Remove existing output directory
    if split_output.exists():
        shutil.rmtree(split_output)

    # Create directory structure
    for split in ['train', 'test', 'val']:
        (split_output / split / 'images').mkdir(parents=True, exist_ok=True)
        (split_output / split / 'labels').mkdir(parents=True, exist_ok=True)

    # Copy files to respective directories
    copy_jobs = [
        ("Copying training images...", train_images, 'train'),
        ("Copying test images...", test_images, 'test'),
        ("Copying validation images...", val_images, 'val'),
    ]
    for message, images, split in copy_jobs:
        print(message)
        missing = _copy_split_files(images, split, augmented_path, split_output)
        if missing:
            # Surface missing labels instead of hiding them: a large number
            # usually means image and label names do not match.
            print(f"Warning: {missing} {split} images had no label file; empty labels were created")

    # Create YAML file
    yaml_content = {
        'path': str(split_output.absolute()),
        'train': 'train/images',
        'val': 'val/images',
        'test': 'test/images',
        'nc': len(class_names),
        'names': class_names
    }

    with open(yaml_path, 'w', encoding='utf-8') as f:
        yaml.dump(yaml_content, f, default_flow_style=False, sort_keys=False)

    # Print final statistics
    print("\n=== Final Dataset Statistics ===")
    for split in ['train', 'val', 'test']:
        num_images = len(list((split_output / split / 'images').glob('*.jpg')))
        num_labels, class_counts = _count_objects(
            split_output / split / 'labels', len(class_names)
        )

        print(f"{split.upper()}:")
        print(f"  Images: {num_images}")
        print(f"  Labels: {num_labels}")
        for class_name, count in zip(class_names, class_counts):
            print(f"  {class_name} objects: {count}")
        print(f"  Total objects: {sum(class_counts)}")

    print(f"\n✅ Dataset prepared! YAML file: {yaml_path}")
    return split_output

# Run the preparation. As the final expression of the notebook cell, the
# function's return value (the output directory, or None when there are too
# few images) is echoed below the printed statistics.
prepare_yolo_dataset_10k_3k_3k()

Total images found: 16225
Dataset split:
Train: 10000 images
Test: 3000 images
Val: 3000 images
Copying training images...
Copying test images...
Copying validation images...

=== Final Dataset Statistics ===
TRAIN:
  Images: 10000
  Labels: 10000
  MILCO objects: 4919
  NOMBO objects: 2581
  Total objects: 7500
VAL:
  Images: 3000
  Labels: 3000
  MILCO objects: 1420
  NOMBO objects: 753
  Total objects: 2173
TEST:
  Images: 3000
  Labels: 3000
  MILCO objects: 1511
  NOMBO objects: 833
  Total objects: 2344

✅ Dataset prepared! YAML file: sonar_dataset_10k_3k_3k.yaml


WindowsPath('E:/sonardataset_10k_3k_3k')