In [1]:
# integrity_check.ipynb (convertible to Jupyter Notebook)

# ------------------------------
# Cell 1: Imports and Constants
# ------------------------------
from pathlib import Path
import os

# Root of the processed dataset; each split folder (Val/Test/Train) is resolved beneath it.
BASE_DIR = Path("/home/ulixes/segmentation_cv/data_augmentation/data/processed/")

# Directory pairs (relative to a split folder) whose file stems are compared.
# Val and Test use the same three pairs, so they share one template.
_EVAL_PAIRS = [
    ("label", "processed_labels"),
    ("color", "resized"),
    ("color", "label"),
]

# split name -> list of (dir1, dir2) pairs; each split gets its own list object.
checks = {
    "Val": list(_EVAL_PAIRS),
    "Test": list(_EVAL_PAIRS),
    "Train": [
        ("augmented/images", "augmented/masks"),
        ("color", "resized"),
        ("label", "resized_label"),
        ("color", "label"),
    ],
}

In [2]:
# ------------------------------
# Cell 2: Comparison Function
# ------------------------------
def compare_dirs(base_path: Path, dir1: str, dir2: str):
    """Compare two sub-directories of ``base_path`` by filename stem.

    Only regular files directly inside each directory are considered
    (no recursion); extensions are ignored, so ``a.jpg`` and ``a.png``
    are treated as the same entry when matching.

    Args:
        base_path: Directory that contains both ``dir1`` and ``dir2``.
        dir1: First directory, relative to ``base_path``.
        dir2: Second directory, relative to ``base_path``.

    Returns:
        A dict with:
          - ``dir1`` / ``dir2``: the resolved directory paths as strings.
          - ``only_in_dir1`` / ``only_in_dir2``: sorted stems present in
            one directory but not the other.
          - ``common``: sorted stems present in both.
          - ``count_dir1`` / ``count_dir2``: number of files found in each
            directory. This counts files, not unique stems, so a directory
            holding both ``x.jpg`` and ``x.png`` reports 2 and the
            duplicate shows up as a count mismatch instead of being hidden.
          - ``count_common``: number of shared stems.

    Raises:
        FileNotFoundError: If either directory does not exist
            (raised by ``Path.iterdir``).
    """
    path1 = Path(base_path) / dir1
    path2 = Path(base_path) / dir2

    # Keep the raw per-file stem lists so the reported counts reflect files,
    # then de-duplicate into sets for the actual comparison.
    stems1 = [f.stem for f in path1.iterdir() if f.is_file()]
    stems2 = [f.stem for f in path2.iterdir() if f.is_file()]
    files1 = set(stems1)
    files2 = set(stems2)

    common = sorted(files1 & files2)

    return {
        "dir1": str(path1),
        "dir2": str(path2),
        "only_in_dir1": sorted(files1 - files2),
        "only_in_dir2": sorted(files2 - files1),
        "common": common,
        "count_dir1": len(stems1),
        "count_dir2": len(stems2),
        "count_common": len(common)
    }



In [3]:
# ------------------------------
# Cell 3: Run All Comparisons
# ------------------------------
# Collect one result dict per compared directory pair, printing a short report as we go.
summary = []
for split, pairs in checks.items():
    print(f"\n📂 Checking split: {split}")
    split_root = BASE_DIR / split
    for dir1, dir2 in pairs:
        result = compare_dirs(split_root, dir1, dir2)
        summary.append(result)

        print("\n🔍 Comparing:")
        for side in ("dir1", "dir2"):
            print(f"  {result[side]} ({result['count_' + side]} files)")
        print(f"  ✅ Common files: {result['count_common']}")

        # Report a mismatch line only for the side that actually has extra stems.
        for label, key in ((dir1, "only_in_dir1"), (dir2, "only_in_dir2")):
            if result[key]:
                print(f"  ❌ Only in {label}: {len(result[key])} files")




📂 Checking split: Val

🔍 Comparing:
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Val/label (734 files)
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Val/processed_labels (734 files)
  ✅ Common files: 734

🔍 Comparing:
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Val/color (734 files)
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Val/resized (734 files)
  ✅ Common files: 734

🔍 Comparing:
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Val/color (734 files)
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Val/label (734 files)
  ✅ Common files: 734

📂 Checking split: Test

🔍 Comparing:
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Test/label (3694 files)
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Test/processed_labels (3694 files)
  ✅ Common files: 3694

🔍 Comparing:
  /home/ulixes/segmentation_cv/data_augmentation/data/processed/Test/color (36

In [4]:

# ------------------------------
# Cell 4: Summary
# ------------------------------
# Closing summary for the directory comparisons (one message per line).
print(
    "\n✅ Done. All file comparisons are complete.",
    "Checked for consistency across: color vs resized, label vs resized_label, augmented/images vs masks, etc.",
    "Compared files by filename stem (ignoring extension differences like .jpg vs .png).",
    sep="\n",
)



✅ Done. All file comparisons are complete.
Checked for consistency across: color vs resized, label vs resized_label, augmented/images vs masks, etc.
Compared files by filename stem (ignoring extension differences like .jpg vs .png).


# Directory Equality Checks

In [5]:
# mask_value_check.ipynb (convertible into Jupyter notebook cells)

# ------------------------------
# Cell 1: Imports and Constants
# ------------------------------
import os
from pathlib import Path
import numpy as np
import cv2

# Keywords that mark a filename as a cat; is_cat() matches them as substrings.
cat_breeds = [
    'abyssinian',
    'bengal',
    'birman',
    'bombay',
    'british',
    'egyptian',
    'maine',
    'persian',
    'ragdoll',
    'russian',
    'siamese',
    'sphynx',
]

# Mask folders to validate, keyed by the label printed in the report.
BASE_DIR = Path("/home/ulixes/segmentation_cv/data_augmentation/data/processed")
_train_root = BASE_DIR / "Train"
mask_dirs = {
    "Train (resized_label)": _train_root / "resized_label",
    "Train (augmented masks)": _train_root / "augmented" / "masks",
    "Val (processed_labels)": BASE_DIR / "Val" / "processed_labels",
    "Test (processed_labels)": BASE_DIR / "Test" / "processed_labels",
}


In [6]:

# ------------------------------
# Cell 2: Utility - Class Inference
# ------------------------------
def is_cat(filename: str) -> bool:
    """Return True if the lower-cased filename contains any keyword from ``cat_breeds``.

    Matching is a plain substring test, so the keyword may appear anywhere
    in the name.
    """
    lowered = filename.lower()
    for breed in cat_breeds:
        if breed in lowered:
            return True
    return False


# ------------------------------
# Cell 3: Validation Function
# ------------------------------
def validate_mask_values(mask_dir: Path, name: str):
    """Check that every PNG mask in ``mask_dir`` holds exactly the expected pixel values.

    A mask passes when its set of unique values equals ``{0, 1, 255}`` for
    filenames recognised by ``is_cat`` and ``{0, 2, 255}`` otherwise. For
    multi-channel images only the first channel is inspected. A short
    report (counts plus up to five failing examples) is printed.

    Args:
        mask_dir: Directory containing the mask files (not searched recursively).
        name: Label printed as the report heading.

    Returns:
        None. Results are reported via ``print`` only.

    Raises:
        FileNotFoundError: If ``mask_dir`` does not exist (raised by ``os.listdir``).
    """
    mask_dir = Path(mask_dir)  # tolerate a plain string path
    total_files = 0
    valid_files = 0
    errors = []

    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # "Error examples" shown below identical from run to run.
    for file in sorted(os.listdir(mask_dir)):
        # Case-insensitive so that masks saved as ".PNG" are validated
        # instead of being silently skipped.
        if not file.lower().endswith(".png"):
            continue

        total_files += 1
        file_path = mask_dir / file
        mask = cv2.imread(str(file_path), cv2.IMREAD_UNCHANGED)
        if mask is None:
            # cv2.imread signals an unreadable file by returning None.
            errors.append((file, "Could not read file"))
            continue

        if mask.ndim == 3:
            mask = mask[:, :, 0]  # keep only the first channel

        unique_values = np.unique(mask)

        # The foreground class id depends on the species inferred from the filename.
        if is_cat(file):
            expected_values = {0, 1, 255}
        else:
            expected_values = {0, 2, 255}

        # Exact set equality: a missing value fails just like an unexpected one.
        if set(unique_values.tolist()) == expected_values:
            valid_files += 1
        else:
            errors.append((file, unique_values))

    print(f"\n✅ {name}")
    print(f"  Checked: {total_files} PNG masks")
    print(f"  Passed:  {valid_files}")
    print(f"  Failed:  {len(errors)}")
    if errors:
        print(f"  ❌ Error examples:")
        for e in errors[:5]:
            print(f"    - {e[0]}: {e[1]}")



In [7]:

# ------------------------------
# Cell 4: Run All Checks
# ------------------------------
# Validate every configured mask folder, in the order they were declared.
for name in mask_dirs:
    directory = mask_dirs[name]
    validate_mask_values(directory, name)


# ------------------------------
# Cell 5: Summary
# ------------------------------
print(
    "\nAll mask value checks completed.",
    "Each mask is expected to contain either [0, 1, 255] for cats or [0, 2, 255] for dogs, based on the filename.",
    sep="\n",
)



✅ Train (resized_label)
  Checked: 2939 PNG masks
  Passed:  2939
  Failed:  0

✅ Train (augmented masks)
  Checked: 8721 PNG masks
  Passed:  8721
  Failed:  0

✅ Val (processed_labels)
  Checked: 734 PNG masks
  Passed:  734
  Failed:  0

✅ Test (processed_labels)
  Checked: 3694 PNG masks
  Passed:  3694
  Failed:  0

All mask value checks completed.
Each mask is expected to contain either [0, 1, 255] for cats or [0, 2, 255] for dogs, based on the filename.


In [11]:
# ------------------------------
# Fixed Cell 5: Correct Combined Folder Comparison (Train)
# ------------------------------
def compare_combined_to_resized_clip_train():
    """Compare Train augmented images plus Train resized against Train resized_clip, by stem.

    The stems of files in ``Train/augmented/images`` and ``Train/resized``
    (both under the module-level ``BASE_DIR``) are merged into one set and
    compared with the stems found in the hard-coded ``resized_clip`` folder.
    Counts and any one-sided differences are printed; nothing is returned.
    """
    train_root = BASE_DIR / "Train"
    clip_dir = Path("/home/ulixes/segmentation_cv/unet/data/processed/Train/resized_clip")

    def stems_in(folder):
        # Extension-less names of the regular files directly inside `folder`.
        return {entry.stem for entry in folder.iterdir() if entry.is_file()}

    source_stems = stems_in(train_root / "augmented" / "images") | stems_in(train_root / "resized")
    clip_stems = stems_in(clip_dir)

    missing_from_clip = source_stems - clip_stems
    extra_in_clip = clip_stems - source_stems

    print("\n🔁 Comparing combined (augmented + resized) with TRAIN resized_clip:")
    print(f"  Combined files: {len(source_stems)}")
    print(f"  Resized_clip files: {len(clip_stems)}")
    print(f"  ✅ Common files: {len(source_stems & clip_stems)}")

    # Mismatch lines are printed only when the corresponding difference is non-empty.
    if missing_from_clip:
        print(f"  ❌ Only in combined dirs: {len(missing_from_clip)}")
    if extra_in_clip:
        print(f"  ❌ Only in resized_clip: {len(extra_in_clip)}")


In [12]:
# ------------------------------
# Cell 6: Extra Test - resized vs resized_clip
# ------------------------------
def compare_resized_to_resized_clip(split: str):
    """Compare ``<split>/resized`` with the matching ``resized_clip`` folder, by stem.

    Args:
        split: Split folder name; it is used both under the module-level
            ``BASE_DIR`` and inside the hard-coded ``resized_clip`` path.

    Prints the size of each stem set, the size of their intersection, and
    a line for each side that has stems the other lacks. Returns nothing.
    """
    source_dir = BASE_DIR / split / "resized"
    clip_dir = Path(f"/home/ulixes/segmentation_cv/unet/data/processed/{split}/resized_clip")

    source_stems = set()
    for entry in source_dir.iterdir():
        if entry.is_file():
            source_stems.add(entry.stem)

    clip_stems = set()
    for entry in clip_dir.iterdir():
        if entry.is_file():
            clip_stems.add(entry.stem)

    missing_from_clip = source_stems.difference(clip_stems)
    extra_in_clip = clip_stems.difference(source_stems)
    shared = source_stems.intersection(clip_stems)

    print(f"\n📂 Comparing {split}/resized vs resized_clip:")
    print(f"  Resized: {len(source_stems)}")
    print(f"  Resized_clip: {len(clip_stems)}")
    print(f"  ✅ Common files: {len(shared)}")

    if missing_from_clip:
        print(f"  ❌ Only in resized: {len(missing_from_clip)}")
    if extra_in_clip:
        print(f"  ❌ Only in resized_clip: {len(extra_in_clip)}")



In [15]:

# ------------------------------
# Cell 7: Run Extra Tests
# ------------------------------
# Train is checked against the combined (augmented + resized) set;
# Val and Test are checked against their plain resized folders.
compare_combined_to_resized_clip_train()
for _clip_split in ("Val", "Test"):
    compare_resized_to_resized_clip(_clip_split)


🔁 Comparing combined (augmented + resized) with TRAIN resized_clip:
  Combined files: 11660
  Resized_clip files: 11660
  ✅ Common files: 11660

📂 Comparing Val/resized vs resized_clip:
  Resized: 734
  Resized_clip: 734
  ✅ Common files: 734

📂 Comparing Test/resized vs resized_clip:
  Resized: 3694
  Resized_clip: 3694
  ✅ Common files: 3694
