In [11]:
# integrity_check.ipynb (convertible to Jupyter Notebook)

# ------------------------------
# Cell 1: Imports and Constants
# ------------------------------
from pathlib import Path
import os

# Root path of the processed dataset.
# Defaults to the original location, but can be overridden by setting the
# SEGMENTATION_DATA_DIR environment variable so the notebook is not tied to
# one machine's absolute path. An unset or empty variable falls back to the default.
BASE_DIR = Path(
    os.environ.get("SEGMENTATION_DATA_DIR")
    or "/home/ulixes/segmentation_cv/data_augmentation/data/processed/"
)

# Paths to check.
# Maps each split name (a sub-directory of BASE_DIR) to a list of
# (dir1, dir2) pairs; both entries of a pair are paths relative to that
# split's directory and are compared against each other.
checks = {
    "Val": [
        ("label", "processed_labels"),
        ("color", "resized"),
        ("color", "label")
    ],
    "Test": [
        ("label", "processed_labels"),
        ("color", "resized"),
        ("color", "label")
    ],
    "Train": [
        ("augmented/images", "augmented/masks"),
        ("color", "resized"),
        ("label", "resized_label"),
        ("color", "label")
    ]
}

In [12]:
# ------------------------------
# Cell 2: Comparison Function
# ------------------------------
def compare_dirs(base_path: Path, dir1: str, dir2: str) -> dict:
    """Compare two sub-directories of ``base_path`` by file stem.

    Only regular files located directly inside each directory are considered
    (sub-directories are not descended into). Every file is identified by its
    name without the final extension, so ``img_001.jpg`` and ``img_001.png``
    are treated as the same entry.

    Args:
        base_path: Directory that contains both ``dir1`` and ``dir2``.
        dir1: Path of the first directory, relative to ``base_path``.
        dir2: Path of the second directory, relative to ``base_path``.

    Returns:
        A dict with the keys:
            ``dir1`` / ``dir2``: the full directory paths as strings;
            ``only_in_dir1`` / ``only_in_dir2``: sorted stems present in one
            directory but not the other;
            ``common``: sorted stems present in both;
            ``count_dir1`` / ``count_dir2``: number of distinct stems found in
            each directory (files sharing a stem are counted once);
            ``count_common``: number of stems present in both.

    Raises:
        FileNotFoundError: If either directory does not exist.
        NotADirectoryError: If either path exists but is not a directory.
    """
    path1 = base_path / dir1
    path2 = base_path / dir2

    # Fail fast with an explicit message instead of letting iterdir() raise
    # part-way through the comparison.
    for path in (path1, path2):
        if not path.exists():
            raise FileNotFoundError(f"Directory to compare does not exist: {path}")
        if not path.is_dir():
            raise NotADirectoryError(f"Path to compare is not a directory: {path}")

    # Extract filenames without extensions
    files1 = {f.stem for f in path1.iterdir() if f.is_file()}
    files2 = {f.stem for f in path2.iterdir() if f.is_file()}

    # sorted() accepts any iterable, so no intermediate list is needed.
    only_in_1 = sorted(files1 - files2)
    only_in_2 = sorted(files2 - files1)
    common = sorted(files1 & files2)

    return {
        "dir1": str(path1),
        "dir2": str(path2),
        "only_in_dir1": only_in_1,
        "only_in_dir2": only_in_2,
        "common": common,
        "count_dir1": len(files1),
        "count_dir2": len(files2),
        "count_common": len(common)
    }



In [13]:
# ------------------------------
# Cell 3: Run All Comparisons
# ------------------------------
def _print_comparison(result, dir1, dir2):
    """Print the per-directory counts and mismatch tallies for one compared pair."""
    print(f"\n🔍 Comparing:")
    print(f"  {result['dir1']} ({result['count_dir1']} files)")
    print(f"  {result['dir2']} ({result['count_dir2']} files)")
    print(f"  ✅ Common files: {result['count_common']}")

    # Mismatch lines are only shown when there is something to report.
    if result['only_in_dir1']:
        print(f"  ❌ Only in {dir1}: {len(result['only_in_dir1'])} files")
    if result['only_in_dir2']:
        print(f"  ❌ Only in {dir2}: {len(result['only_in_dir2'])} files")


# Collect every comparison result so it stays available after the loop.
summary = []
for split, pairs in checks.items():
    print(f"\n📂 Checking split: {split}")
    split_root = BASE_DIR / split  # all pairs of a split share the same root
    for dir1, dir2 in pairs:
        result = compare_dirs(split_root, dir1, dir2)
        summary.append(result)
        _print_comparison(result, dir1, dir2)




📂 Checking split: Val

🔍 Comparing:
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Val/label (734 files)
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Val/processed_labels (734 files)
  ✅ Common files: 734

🔍 Comparing:
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Val/color (734 files)
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Val/resized (734 files)
  ✅ Common files: 734

🔍 Comparing:
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Val/color (734 files)
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Val/label (734 files)
  ✅ Common files: 734

📂 Checking split: Test

🔍 Comparing:
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Test/label (3694 files)
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Test/processed_labels (3694 files)
  ✅ Common files: 3694

🔍 Comparing:
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Test/color (36

In [14]:

# ------------------------------
# Cell 4: Summary
# ------------------------------
# Closing notes, emitted one per line in a single print call.
closing_notes = (
    "\n✅ Done. All file comparisons are complete.",
    "Checked for consistency across: color vs resized, label vs resized_label, augmented/images vs masks, etc.",
    "Compared files by filename stem (ignoring extension differences like .jpg vs .png).",
)
print(*closing_notes, sep="\n")



✅ Done. All file comparisons are complete.
Checked for consistency across: color vs resized, label vs resized_label, augmented/images vs masks, etc.
Compared files by filename stem (ignoring extension differences like .jpg vs .png).
