In [1]:
import os
import shutil
from pathlib import Path
from datetime import datetime

def _is_image_name(file_name: str, extensions: tuple) -> bool:
    """Return True when *file_name* ends with one of *extensions* (case-insensitive)."""
    return file_name.lower().endswith(extensions)


def _collect_image_stems(directory: Path, extensions: tuple) -> set:
    """Return the stems of all image files located directly inside *directory*."""
    return {
        candidate.stem
        for candidate in directory.iterdir()
        if candidate.is_file() and _is_image_name(candidate.name, extensions)
    }


def reorganize_dataset(
    generated_dataset_name: str,
    original_data_root: Path,
    output_base_root: Path,
    image_extensions: tuple = (".jpg", ".png"),
):
    """
    Reorganizes a generated dataset to ensure only original images are in val/test splits
    and all augmented + original train images are in the train split.

    Files are *moved* (not copied) out of the generated dataset, so the source
    directory is emptied of every image that gets relocated. An image whose
    destination file already exists is left where it is and counted as skipped.
    When an image has no matching ``.txt`` label, an empty label file is created
    next to it in the destination.

    An image is classified as "original val" / "original test" when its file
    stem matches the stem of an image found in ``<original_data_root>/Imagenes/val``
    or ``.../Imagenes/test``. Extension matching is case-insensitive in both the
    original data and the generated dataset, so ``foo.JPG`` and ``foo.jpg`` are
    treated alike on every platform.

    Args:
        generated_dataset_name (str): The name of the dataset to reorganize (e.g., "modelo_yolov11_dataset_completo").
                                      It is used both as the source path and as the prefix of the output folder.
        original_data_root (Path): The root path of your original (unaugmented) data.
                                   (e.g., Path("data"))
        output_base_root (Path): The base directory where the reorganized dataset will be saved.
                                 (e.g., Path("fixed"))
        image_extensions (tuple): File extensions (with leading dot) that count as images.
                                  Defaults to (".jpg", ".png").

    Returns:
        None. Progress and the per-split totals are printed.
    """
    # Accept a single extension given as a bare string, and normalise case once.
    if isinstance(image_extensions, str):
        image_extensions = (image_extensions,)
    extensions = tuple(ext.lower() for ext in image_extensions)

    GENERATED_DATASET_ROOT = Path(generated_dataset_name)
    OUTPUT_DATASET_ROOT = Path(output_base_root) / f"{generated_dataset_name}_reorganized"

    print(f"\n--- Starting reorganization for: {generated_dataset_name} ---")
    print(f"Source: {GENERATED_DATASET_ROOT}")
    print(f"Destination: {OUTPUT_DATASET_ROOT}")

    # --- Configuration for original data paths ---
    ORIGINAL_IMG_DIR = Path(original_data_root) / "Imagenes"

    # --- Step 1: Identify Original Validation and Test Images ---
    original_val_img_path = ORIGINAL_IMG_DIR / "val"
    if original_val_img_path.is_dir():
        original_val_stems = _collect_image_stems(original_val_img_path, extensions)
    else:
        original_val_stems = set()
        print(f"Warning: Original validation image path not found: {original_val_img_path}")

    original_test_img_path = ORIGINAL_IMG_DIR / "test"
    if original_test_img_path.is_dir():
        original_test_stems = _collect_image_stems(original_test_img_path, extensions)
    else:
        original_test_stems = set()
        print(f"Warning: Original test image path not found: {original_test_img_path}")

    print(f"Found {len(original_val_stems)} original validation images from '{original_val_img_path}'.")
    print(f"Found {len(original_test_stems)} original test images from '{original_test_img_path}'.")

    # --- Step 2: Prepare New Output Directories ---
    for split_name in ("train", "val", "test"):
        for kind in ("images", "labels"):
            (OUTPUT_DATASET_ROOT / split_name / kind).mkdir(parents=True, exist_ok=True)

    print(f"\nCreated new dataset structure in {OUTPUT_DATASET_ROOT}")

    # --- Step 3: Reorganize Files ---
    moved_counts = {'train': 0, 'val': 0, 'test': 0}
    skipped_existing = 0

    # Every generated split is inspected because augmented data (and misplaced
    # originals) could be in any of them.
    for current_split_name in ("train", "val", "test"):
        current_img_dir = GENERATED_DATASET_ROOT / current_split_name / "images"
        current_lbl_dir = GENERATED_DATASET_ROOT / current_split_name / "labels"

        if not current_img_dir.is_dir():
            print(f"Warning: Skipping {current_img_dir}, path does not exist.")
            continue

        print(f"\nProcessing files from: {current_img_dir}")

        # Snapshot the listing before moving anything: removing files from a
        # directory while it is still being scanned is not guaranteed to be
        # safe. Sorting makes the processing order (and therefore which
        # duplicate wins) deterministic.
        with os.scandir(current_img_dir) as entries:
            snapshot = sorted(entries, key=lambda item: item.name)

        for entry in snapshot:
            if not (entry.is_file() and _is_image_name(entry.name, extensions)):
                if entry.is_dir():
                    print(f"Skipping directory: {entry.path}")
                continue

            img_file = entry.name
            stem = Path(img_file).stem
            lbl_path = current_lbl_dir / (stem + ".txt")

            # Originals from val/test go back to their split; everything else
            # (augmented images and original train images) goes to train.
            if stem in original_val_stems:
                target_split = "val"
            elif stem in original_test_stems:
                target_split = "test"
            else:
                target_split = "train"

            dest_img_path = OUTPUT_DATASET_ROOT / target_split / "images" / img_file
            dest_lbl_path = OUTPUT_DATASET_ROOT / target_split / "labels" / (stem + ".txt")

            try:
                # Never overwrite: the same original may appear in several
                # generated splits, and only the first occurrence is kept.
                if dest_img_path.exists():
                    skipped_existing += 1
                    continue

                shutil.move(entry.path, str(dest_img_path))
                if lbl_path.exists():
                    shutil.move(str(lbl_path), str(dest_lbl_path))
                else:
                    # If no label file, create an empty one in the destination
                    dest_lbl_path.touch()
            except OSError as e:
                # shutil.Error is a subclass of OSError, so copy/move failures
                # are reported here while programming errors still surface.
                print(f"Error moving {img_file} to {target_split}: {e}")
            else:
                moved_counts[target_split] += 1

    print(f"\n--- Reorganization for {generated_dataset_name} Complete ---")
    print(f"Total images moved to new train: {moved_counts['train']}")
    print(f"Total images moved to new val: {moved_counts['val']}")
    print(f"Total images moved to new test: {moved_counts['test']}")
    if skipped_existing:
        print(f"Images skipped because the destination already existed: {skipped_existing}")
    print(f"New organized dataset is in: {OUTPUT_DATASET_ROOT}")
    print("Remember to update your YOLO config (data.yaml) to point to this new structure.")
    print("-" * 50)



# Root of the original (unaugmented) data and base folder for the fixed copies.
ORIGINAL_DATA_ROOT = Path("data")
FIXED_OUTPUT_BASE = Path("fixed")
FIXED_OUTPUT_BASE.mkdir(exist_ok=True)

# Reorganize each generated dataset in turn, in the same order as before.
for _dataset_name in (
    "modelo_yolov11_dataset_completo",
    "modelo_yolov11_dataset_filtrado",
):
    reorganize_dataset(_dataset_name, ORIGINAL_DATA_ROOT, FIXED_OUTPUT_BASE)

print("\nAll reorganization tasks finished. Check the 'fixed' directory.")


--- Starting reorganization for: modelo_yolov11_dataset_completo ---
Source: modelo_yolov11_dataset_completo
Destination: fixed\modelo_yolov11_dataset_completo_reorganized
Found 107 original validation images from 'data\Imagenes\val'.
Found 372 original test images from 'data\Imagenes\test'.

Created new dataset structure in fixed\modelo_yolov11_dataset_completo_reorganized

Processing files from: modelo_yolov11_dataset_completo\train\images

Processing files from: modelo_yolov11_dataset_completo\val\images

Processing files from: modelo_yolov11_dataset_completo\test\images

--- Reorganization for modelo_yolov11_dataset_completo Complete ---
Total images moved to new train: 11215
Total images moved to new val: 91
Total images moved to new test: 294
New organized dataset is in: fixed\modelo_yolov11_dataset_completo_reorganized
Remember to update your YOLO config (data.yaml) to point to this new structure.
--------------------------------------------------

--- Starting reorganization f