In [1]:
# ==============================================================================
# 🔎 Auto Label Comparison for Review Candidates (V32)
# ------------------------------------------------------------------------------

# Macro Description
# -----------------
# This script automates label checking for images in:
#     .../V32_YYYYMMDD_xxxxxx/review_candidates_by_predicted_class/<predicted_label>/*.jpg

# It compares the *predicted label* implied by each review folder name against the
# *true label* implied by your original dataset folder structure:
#     .../img_datasets/ferckjalfaga_dataset_14_labels/<true_label>/*.jpg

# Key features:
#   1) Walks every predicted-class folder directly under `review_candidates_by_predicted_class/`
#      (one level deep; images in deeper sub-folders are not scanned)
#   2) Indexes the original dataset once (filename -> true_label + absolute path)
#   3) For each review image, finds its true label (by filename) and flags mismatches
#   4) Writes a detailed audit CSV:
#         auto_label_comparison_V32.csv
#         Columns: filename, predicted_label, true_label, source_path, review_path, status
#   5) (Optional) Writes a curation patch CSV you can feed to V33 training:
#         curated_additions_V32.csv
#         Columns: filepath, correct_label, notes
#      - Uses ORIGINAL dataset paths (preferred for pipeline ingestion)
#      - Only MISMATCH rows are written, and those always have an original path, so the
#        fallback to review folder paths in the code is not reached in practice

# Assumptions / Notes:
# - Filenames are unique across the dataset. If not, the script marks them as AMBIGUOUS.
# - If you want absolute reproducibility, keep the original dataset stable while curating.
# - This is metadata-driven (no visual validation). Use it to quickly capture obvious
#   false positives / negatives; do manual spot checks where necessary.

# Recommended Flow:
#   1) Run the script
#   2) Open `auto_label_comparison_V32.csv` → filter rows where status == "MISMATCH"
#   3) Review quickly; use the generated `curated_additions_V32.csv` as your patch
#   4) Integrate the patch into V33 dataset-building & oversampling
# ==============================================================================

In [2]:
import os
import csv
from collections import defaultdict

In [3]:
# ---------------------------------------------------------
# 🔧 CONFIGURATION — EDIT THESE PATHS IF NEEDED
# ---------------------------------------------------------
# NOTE: the two absolute paths below are machine-specific; every other path in this
# cell is derived from them, so they are the only values to change on another machine.

# V32 run directory (both output CSVs are written into this folder)
V32_DIR = "/Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V32_20251008_115114"

# Review candidates root (predicted-label folders live here)
REVIEW_DIR = os.path.join(V32_DIR, "review_candidates_by_predicted_class")

# Original dataset root (true-label folders live here)
DATASET_ROOT = "/Users/natalyagrokh/AI/ml_expressions/img_datasets/ferckjalfaga_dataset_14_labels"

# Output CSVs
COMPARISON_CSV = os.path.join(V32_DIR, "auto_label_comparison_V32.csv")
GENERATE_PATCH = True  # set False if you don't want curated_additions_V32.csv created
PATCH_CSV = os.path.join(V32_DIR, "curated_additions_V32.csv")

# File extensions to consider as images (lower-case, with the leading dot;
# is_image() lower-cases the extension before looking it up here)
IMG_EXTS = {".jpg", ".jpeg", ".png", ".tif", ".tiff", ".bmp", ".webp"}

In [4]:
# ---------------------------------------------------------
# 🧰 HELPERS
# ---------------------------------------------------------
def is_image(fname: str) -> bool:
    """Tell whether ``fname`` ends in one of the extensions listed in ``IMG_EXTS``.

    The extension is lower-cased before the lookup, so ``photo.JPG`` and
    ``photo.jpg`` are treated alike.
    """
    _stem, extension = os.path.splitext(fname)
    return extension.lower() in IMG_EXTS


def index_original_dataset(dataset_root: str):
    """
    Build a filename-based lookup of the original dataset in a single pass.

    Every sub-directory of ``dataset_root`` is treated as one true-label folder,
    and only the image files sitting directly inside it are considered.

    Returns a 3-tuple:
        true_label_by_filename : dict mapping filename -> label folder name,
                                 or the string 'AMBIGUOUS'
        source_path_by_filename: dict mapping filename -> path inside the dataset,
                                 or the string 'AMBIGUOUS'
        duplicates             : set of filenames found in more than one label folder
    """
    # Pass 1: gather every (label, path) sighting of each image filename.
    sightings = {}
    for label_name in os.listdir(dataset_root):
        label_dir = os.path.join(dataset_root, label_name)
        if os.path.isdir(label_dir):
            for image_name in filter(is_image, os.listdir(label_dir)):
                sightings.setdefault(image_name, []).append(
                    (label_name, os.path.join(label_dir, image_name))
                )

    # Pass 2: a single sighting resolves cleanly; several make the name ambiguous.
    label_lookup = {}
    path_lookup = {}
    repeated_names = set()
    for image_name, hits in sightings.items():
        if len(hits) > 1:
            repeated_names.add(image_name)
            label_lookup[image_name] = "AMBIGUOUS"
            path_lookup[image_name] = "AMBIGUOUS"
        else:
            only_label, only_path = hits[0]
            label_lookup[image_name] = only_label
            path_lookup[image_name] = only_path

    return label_lookup, path_lookup, repeated_names


def walk_review_candidates(review_root: str):
    """
    Yield (predicted_label, review_path, filename) for every image located
    directly inside ``review_root/<predicted_label>/``.

    Only one level is scanned: entries of ``review_root`` that are not
    directories are skipped, and sub-folders inside a label folder are not
    descended into. ``review_path`` is ``review_root`` joined with the label and
    filename, so it is absolute whenever ``review_root`` is.

    Label folders and the files inside them are visited in sorted order.
    ``os.listdir`` makes no ordering promise, so without sorting the row order
    of the audit CSV could change between runs or filesystems.
    """
    for predicted_label in sorted(os.listdir(review_root)):
        pred_dir = os.path.join(review_root, predicted_label)
        if not os.path.isdir(pred_dir):
            continue
        for fname in sorted(os.listdir(pred_dir)):
            if not is_image(fname):
                continue
            yield predicted_label, os.path.join(pred_dir, fname), fname

In [5]:
# ---------------------------------------------------------
# 🚀 MAIN
# ---------------------------------------------------------
def _write_csv(path: str, header, rows) -> None:
    """Write ``header`` followed by ``rows`` to ``path`` as a UTF-8 encoded CSV."""
    parent = os.path.dirname(path)
    if parent:
        # A bare filename has no directory part, and os.makedirs("") would raise.
        os.makedirs(parent, exist_ok=True)
    # Explicit UTF-8 keeps the file identical across platforms; the locale default
    # can fail on non-ASCII filenames or notes.
    with open(path, "w", newline="", encoding="utf-8") as f:
        w = csv.writer(f)
        w.writerow(header)
        w.writerows(rows)


def main():
    """
    Compare review-folder (predicted) labels against dataset-folder (true) labels.

    Steps:
      1) Verify that REVIEW_DIR and DATASET_ROOT exist.
      2) Index the original dataset by filename.
      3) Give every review image one status: MATCH, MISMATCH, AMBIGUOUS_FILENAME
         or MISSING_IN_ORIGINAL.
      4) Write the audit table to COMPARISON_CSV.
      5) If GENERATE_PATCH is true, write the MISMATCH rows to PATCH_CSV.

    Raises:
        FileNotFoundError: if REVIEW_DIR or DATASET_ROOT is not an existing directory.
    """
    if not os.path.isdir(REVIEW_DIR):
        raise FileNotFoundError(f"Review directory not found: {REVIEW_DIR}")
    if not os.path.isdir(DATASET_ROOT):
        raise FileNotFoundError(f"Dataset root not found: {DATASET_ROOT}")

    print("üìã Indexing original dataset (this is fast for moderate datasets)...")
    true_label_by_filename, source_path_by_filename, duplicates = index_original_dataset(DATASET_ROOT)
    print(f"   ‚Üí Indexed {len(true_label_by_filename)} unique filenames "
          f"(ambiguous filenames: {len(duplicates)})")

    rows = []
    status_counts = {}  # status -> number of review files with that status

    print("üîç Scanning review candidates and comparing labels...")
    for predicted_label, review_abs_path, fname in walk_review_candidates(REVIEW_DIR):
        true_label = true_label_by_filename.get(fname, "UNKNOWN_SOURCE")
        source_path = source_path_by_filename.get(fname, "")

        if true_label == "AMBIGUOUS":
            status = "AMBIGUOUS_FILENAME"
        elif true_label == "UNKNOWN_SOURCE":
            status = "MISSING_IN_ORIGINAL"
        elif true_label == predicted_label:
            status = "MATCH"
        else:
            status = "MISMATCH"
        status_counts[status] = status_counts.get(status, 0) + 1

        rows.append([
            fname,                 # filename only
            predicted_label,       # from review folder name
            true_label,            # from original dataset index
            source_path,           # path in original dataset ('AMBIGUOUS' or '' if unresolved)
            review_abs_path,       # path in review tree
            status                 # MATCH / MISMATCH / AMBIGUOUS_FILENAME / MISSING_IN_ORIGINAL
        ])

    total = len(rows)
    mismatches = status_counts.get("MISMATCH", 0)

    # Write comparison CSV (rich audit)
    _write_csv(
        COMPARISON_CSV,
        ["filename", "predicted_label", "true_label",
         "source_path", "review_path", "status"],
        rows,
    )

    print(f"‚úÖ Wrote comparison: {COMPARISON_CSV}")
    print(f"   ‚Üí Checked {total} files; mismatches: {mismatches}; "
          f"ambiguous filenames: {len(duplicates)}")
    # Per-status counts, so files missing from the original dataset are visible too.
    breakdown = ", ".join(
        f"{name}={count}" for name, count in sorted(status_counts.items())
    )
    print(f"   status breakdown: {breakdown or 'no review images found'}")

    # Optionally create a ready-to-use curation patch for V33
    if GENERATE_PATCH:
        # MISMATCH is only assigned when the true label resolved to a real dataset
        # folder, so ambiguous and missing files never reach the patch.
        patch_rows = []
        for fname, predicted_label, true_label, source_path, review_path, status in rows:
            if status == "MISMATCH":
                # Prefer the ORIGINAL dataset path for ingestion; it's stable for training
                filepath_for_patch = source_path or review_path
                patch_rows.append([filepath_for_patch, true_label, f"auto from review vs. dataset ({predicted_label}‚Üí{true_label})"])

        # If nothing mismatched, still write an empty, well-formed file
        _write_csv(PATCH_CSV, ["filepath", "correct_label", "notes"], patch_rows)

        print(f"üìù Wrote curated patch: {PATCH_CSV} "
              f"(rows: {len(patch_rows)}; only MISMATCH entries with resolvable true labels)")

    # Guidance summary
    print("\nNext steps:")
    print("  1) Open the comparison CSV and filter `status == MISMATCH` to inspect disagreements.")
    print("  2) If PATCH is enabled, review `curated_additions_V32.csv` quickly; "
          "append notes or adjust any edge cases.")
    print("  3) Feed the patch CSV into your V33 dataset builder and oversample it.")

# Run the comparison when this code is executed directly. A notebook kernel also
# runs cells with __name__ == "__main__", so executing this cell calls main().
if __name__ == "__main__":
    main()

üìã Indexing original dataset (this is fast for moderate datasets)...
   ‚Üí Indexed 6174 unique filenames (ambiguous filenames: 0)
üîç Scanning review candidates and comparing labels...
‚úÖ Wrote comparison: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V32_20251008_115114/auto_label_comparison_V32.csv
   ‚Üí Checked 11317 files; mismatches: 2016; ambiguous filenames: 0
üìù Wrote curated patch: /Users/natalyagrokh/AI/ml_expressions/img_expressions/sup_training/V32_20251008_115114/curated_additions_V32.csv (rows: 2016; only MISMATCH entries with resolvable true labels)

Next steps:
  1) Open the comparison CSV and filter `status == MISMATCH` to inspect disagreements.
  2) If PATCH is enabled, review `curated_additions_V32.csv` quickly; append notes or adjust any edge cases.
  3) Feed the patch CSV into your V33 dataset builder and oversample it.
