## Imports:

In [1]:
import cv2
import numpy as np
from pathlib import Path
from PIL import Image
from transformers import pipeline
import time

## Now use old cleaning techniques from tiny dataset on the whole dataset V3:

In [5]:
# =============================================================================
# TEST 4: FULL PETIMAGES DATASET - CLIP FILTER WITH BLUR DETECTION
# =============================================================================
# 
# This processes the entire PetImages dataset and outputs:
# 1. clip_exclude_list.txt - paths only (same format as exclude_list.txt)
# 2. clip_exclude_details.txt - rejected images with scores and reasons
# 3. clip_all_images.txt - ALL images with status, scores, reasons
#
# =============================================================================

import cv2
import numpy as np
from pathlib import Path
from PIL import Image
from transformers import pipeline
import time

# -----------------------------------------------------------------------------
# SETUP: Load CLIP model
# -----------------------------------------------------------------------------

# Zero-shot image classifier shared by every CLIP check in this cell
# (check_image_multi calls it three times per image).
# NOTE(review): device=0 presumably pins the pipeline to the first GPU, so this
# cell may not start on a CPU-only machine — confirm before running elsewhere.
classifier = pipeline(
    "zero-shot-image-classification",
    model="openai/clip-vit-large-patch14-336",
    device=0
)

# -----------------------------------------------------------------------------
# THRESHOLDS
# -----------------------------------------------------------------------------

# Strict rule in should_keep_v3: an image is kept outright when
# photo >= PHOTO_THRESHOLD and animal >= ANIMAL_THRESHOLD.
PHOTO_THRESHOLD = 0.60
# Lenient photo floor: required by the blurry-image rule; a photo score below
# it is rejected as "not a real photo".
PHOTO_THRESHOLD_LENIENT = 0.40
ANIMAL_THRESHOLD = 0.50
# Lenient animal floor, applied only to images classed as blurry.
ANIMAL_THRESHOLD_LENIENT = 0.15
# A text score at or above this switches the rejection reason to "... has text".
TEXT_THRESHOLD = 0.35
# Laplacian-variance cut-off: a blur score below this counts as blurry.
BLUR_THRESHOLD = 5500

# -----------------------------------------------------------------------------
# HELPER FUNCTIONS
# -----------------------------------------------------------------------------

def get_blur_score(filepath):
    """Return the variance of the Laplacian of the grayscale image at *filepath*.

    A low variance means few sharp edges (blurry); a high variance means sharp.
    When OpenCV cannot decode the file, ``cv2.imread`` yields ``None`` and the
    score is reported as ``0`` — which callers comparing against a blur
    threshold will read as "blurry".
    """
    gray = cv2.imread(str(filepath), cv2.IMREAD_GRAYSCALE)
    if gray is not None:
        edges = cv2.Laplacian(gray, cv2.CV_64F)
        return edges.var()
    return 0

def check_image_multi(filepath):
    """Run three independent two-label CLIP classifications on one image.

    Each classification pits a "positive" label against its opposite and
    yields the score of the positive label.

    Args:
        filepath: Path of the image file to open with PIL.

    Returns:
        Tuple ``(photo_score, animal_score, text_score)``.

    Raises:
        Whatever ``Image.open`` or the classifier raises (e.g. for a file PIL
        cannot identify); the caller is expected to handle it.
    """
    # (positive label, negative label) for photo, animal and text, in that order.
    label_pairs = (
        ("camera photograph", "digital artwork"),
        ("an animal", "not an animal"),
        ("text and words", "no text"),
    )

    scores = []
    # The context manager closes the underlying file deterministically instead
    # of leaving it to garbage collection, even if a classifier call raises.
    with Image.open(filepath) as img:
        for positive, negative in label_pairs:
            top = classifier(img, candidate_labels=[positive, negative])[0]
            # Only the top result is read; if it is the negative label, the
            # positive score is taken as its complement.
            if top['label'] == positive:
                scores.append(top['score'])
            else:
                scores.append(1 - top['score'])

    photo_score, animal_score, text_score = scores
    return photo_score, animal_score, text_score

def should_keep_v3(photo_score, animal_score, text_score, blur_score):
    """Decide whether an image passes the V3 filter.

    Returns a ``(keep, reason)`` tuple, where ``keep`` is a bool and ``reason``
    is a short human-readable explanation of the decision.
    """
    blurry = blur_score < BLUR_THRESHOLD

    # Strict rule: confidently a photo and confidently an animal.
    strict_pass = photo_score >= PHOTO_THRESHOLD and animal_score >= ANIMAL_THRESHOLD
    if strict_pass:
        return True, "real photo with animal"

    # Blurry images get the lenient thresholds for both photo and animal.
    if blurry and photo_score >= PHOTO_THRESHOLD_LENIENT:
        if animal_score >= ANIMAL_THRESHOLD_LENIENT:
            return True, "blurry photo with animal (lenient)"
        if animal_score < ANIMAL_THRESHOLD_LENIENT:
            return False, (
                "blurry photo, no animal, has text"
                if text_score >= TEXT_THRESHOLD
                else "blurry photo but no animal"
            )

    if photo_score < PHOTO_THRESHOLD_LENIENT:
        return False, "not a real photo"

    if animal_score < ANIMAL_THRESHOLD:
        return False, (
            "real photo, no animal, has text"
            if text_score >= TEXT_THRESHOLD
            else "real photo but no animal"
        )

    # Everything left over: passed the lenient photo floor and the animal
    # threshold but none of the keep rules above.
    return False, "did not meet criteria"

# -----------------------------------------------------------------------------
# PATHS
# -----------------------------------------------------------------------------

# Dataset root; the "Cat" and "Dog" sub-folders under it are globbed below.
data_path = Path(r"C:\AWrk\cats_dogs_project\data\PetImages")
# All report files are written here; the folder is created if it is missing.
output_dir = Path(r"C:\AWrk\cats_dogs_project\outputs")
output_dir.mkdir(parents=True, exist_ok=True)

# Report files: rejected paths only / rejected rows with scores / every image.
exclude_list_path = output_dir / "clip_exclude_list.txt"
exclude_details_path = output_dir / "clip_exclude_details.txt"
all_images_path = output_dir / "clip_all_images.txt"

# -----------------------------------------------------------------------------
# LOAD DATASET
# -----------------------------------------------------------------------------

# Collect every .jpg in each class folder. Glob results come back in no
# guaranteed order, so sort them: the processing order — and with it the row
# order of the report files and the "first N" previews — is then reproducible.
cat_files = sorted((data_path / "Cat").glob("*.jpg"))
dog_files = sorted((data_path / "Dog").glob("*.jpg"))
all_files = cat_files + dog_files

print("=" * 100)
print("TEST 4: FULL PETIMAGES DATASET")
print("=" * 100)
print(f"Total images to process: {len(all_files)}")
print(f"  Cats: {len(cat_files)}")
print(f"  Dogs: {len(dog_files)}")
print()

# -----------------------------------------------------------------------------
# PROCESS ALL IMAGES
# -----------------------------------------------------------------------------

# Result accumulators. `rejected` also receives images that raised an error,
# so they end up in the exclude list as well.
kept = []
rejected = []
errors = []

start_time = time.time()

print("Processing images...")
print("-" * 100)

# The report files are opened as UTF-8 explicitly so that an unusual character
# in an exception message cannot abort the run on a platform whose default
# text encoding is narrower.
with open(exclude_details_path, "w", encoding="utf-8") as details_file, \
     open(all_images_path, "w", encoding="utf-8") as all_file:

    # Write headers
    details_file.write("# CLIP Filter Results - Rejected Images Only\n")
    details_file.write("# path\tphoto\tanimal\ttext\tblur\treason\n")
    details_file.write("-" * 100 + "\n")

    all_file.write("# CLIP Filter Results - All Images\n")
    all_file.write("# path\tstatus\tphoto\tanimal\ttext\tblur\treason\n")
    all_file.write("-" * 120 + "\n")

    for i, f in enumerate(all_files):
        # Relative path format matching existing exclude_list.txt
        rel_path = f"..\\data\\PetImages\\{f.parent.name}\\{f.name}"

        try:
            photo, animal, text = check_image_multi(f)
            blur = get_blur_score(f)
            keep, reason = should_keep_v3(photo, animal, text, blur)

            status = "KEEP" if keep else "REJECT"

            # Every image gets a row in the "all images" report.
            all_file.write(f"{rel_path}\t{status}\t{photo:.2f}\t{animal:.2f}\t{text:.2f}\t{blur:.1f}\t{reason}\n")

            if keep:
                kept.append((rel_path, photo, animal, text, blur, reason))
            else:
                rejected.append((rel_path, photo, animal, text, blur, reason))
                details_file.write(f"{rel_path}\t{photo:.2f}\t{animal:.2f}\t{text:.2f}\t{blur:.1f}\t{reason}\n")

        except Exception as e:
            # Deliberately broad: any failure on one image (e.g. a file PIL
            # cannot open) is recorded and the image is treated as rejected,
            # rather than aborting the whole run.
            errors.append((rel_path, str(e)))
            rejected.append((rel_path, 0, 0, 0, 0, f"ERROR: {e}"))
            details_file.write(f"{rel_path}\tERROR\t\t\t\t{e}\n")
            all_file.write(f"{rel_path}\tERROR\t\t\t\t\t{e}\n")

        # Push both reports to disk after every image so partial results can be
        # inspected while the long run is still in progress.
        all_file.flush()
        details_file.flush()

        # Progress update every 500 images
        if (i + 1) % 500 == 0:
            elapsed = time.time() - start_time
            rate = (i + 1) / elapsed
            remaining = (len(all_files) - i - 1) / rate
            print(f"  {i + 1}/{len(all_files)} ({(i+1)/len(all_files)*100:.1f}%) - "
                  f"Rejected: {len(rejected)} - "
                  f"ETA: {remaining/60:.1f} min")

# Write simple exclude list (just paths)
with open(exclude_list_path, "w", encoding="utf-8") as exclude_file:
    for rel_path, *_ in rejected:
        exclude_file.write(rel_path + "\n")

# -----------------------------------------------------------------------------
# FINAL SUMMARY
# -----------------------------------------------------------------------------

from collections import Counter

elapsed = time.time() - start_time

print()
print("=" * 100)
print("COMPLETE")
print("=" * 100)
print(f"Time elapsed: {elapsed/60:.1f} minutes")
print(f"Total processed: {len(all_files)}")
print(f"KEPT: {len(kept)}")
print(f"REJECTED: {len(rejected)}")
print(f"ERRORS: {len(errors)}")

# Breakdown by class (the class folder name is embedded in the relative path)
rejected_cats = sum(1 for r in rejected if "\\Cat\\" in r[0])
rejected_dogs = sum(1 for r in rejected if "\\Dog\\" in r[0])
print()
print(f"Rejected Cats: {rejected_cats}")
print(f"Rejected Dogs: {rejected_dogs}")

# Breakdown by reason, most frequent first (ties keep first-seen order)
print()
print("Rejection reasons:")
reason_counts = Counter(reason for *_, reason in rejected)
for reason, count in reason_counts.most_common():
    print(f"  {count:5d}  {reason}")

# Output files
print()
print("Output files:")
print(f"  1. {exclude_list_path}")
print("     Paths only (same format as exclude_list.txt)")
print()
print(f"  2. {exclude_details_path}")
print("     Rejected images with scores and reasons")
print()
print(f"  3. {all_images_path}")
print("     ALL images with status, scores, and reasons")

# Show first 20 rejected for quick review
print()
print("First 20 rejected images:")
print("-" * 100)
print(f"{'path':<45} {'photo':>5} {'animal':>6} {'text':>5} {'blur':>8}  reason")
print("-" * 100)
for rel_path, photo, animal, text, blur, reason in rejected[:20]:
    print(f"{rel_path:<45} {photo:>5.2f} {animal:>6.2f} {text:>5.2f} {blur:>8.1f}  {reason}")

# Only the first 10 errors are echoed; the full list is in the report files.
if errors:
    print()
    print(f"Errors ({len(errors)}):")
    for path, err in errors[:10]:
        print(f"  {path}: {err}")

KeyboardInterrupt: 

## Problems with V3

The code needed to be updated because some valid cat and dog images were being rejected. It was, however, correctly rejecting genuinely bad images, which was good. There were just a few too many false positives. The false positives were mostly blurry images that had a high animal score, and images containing multiple animals.

Fixes:

1. High Animal Override Now Requires Very Low Blur
New: If animal ≥ 0.80 AND blur < 500 (AND photo ≥ 0.25), then keep it
The critical insight is that drawings of animals also score high on animal detection but are NOT blurry. Real blurry photos have blur < 500, while drawings/edited images have blur > 1500. The old version was incorrectly keeping sharp drawings because they had high animal scores.

2. If a photo is very clearly real (according to CLIP) but the animal score is low, try a multiple-animals check. CLIP struggles with multiple animals and needs specific prompting for them, so the new rule is: if photo >= 0.90 AND animal < 0.50, run the secondary check and keep the image if multiple >= 0.30 OR single >= 0.30.

# V3+ new rescue rules:

  Rescue 1: blur < 500 AND animal >= 0.80 AND photo >= 0.25                  
              KEEP "RESCUED: very blurry with high animal confidence"      
                                                                             
            Why: Real blurry photos have blur < 500                          
                 Drawings have sharp edges (blur > 1500)                     
            Catches: yawning cat, unusual poses, solid backgrounds           
                                                                             
  Rescue 2: photo >= 0.90 AND animal < 0.50                                  
            Run secondary CLIP check for multiple animals                
              multiple >= 0.30 OR single >= 0.30                           
              KEEP "RESCUED: multiple animals detected"                
                                                                             
            Why: "an animal" prompt fails with multiple cats/dogs            
            Catches: group photos, cats in cages, kittens held by humans     
                                                                             
  No rescue applied:                                                         
              REJECT (return V3's original rejection reason) 

In [2]:
# =============================================================================
# TEST 4: FULL PETIMAGES DATASET - CLIP FILTER V3+ (V3 with Rescue Rules)
# =============================================================================
# 
# DESIGN PRINCIPLE:
# If V3 keeps it → V3+ keeps it. Always.
# Rescue rules can ONLY save images that V3 would reject.
#
# RESCUE RULES:
# 1. Very blurry + high animal: blur < 500, animal >= 0.80, photo >= 0.25
# 2. Multiple animals fallback: photo >= 0.90, animal < 0.50 → secondary CLIP check
#
# OUTPUTS:
# 1. clip_exclude_list_v3plus.txt - paths only (for loading into training)
# 2. clip_exclude_details_v3plus.txt - rejected images with scores and reasons
# 3. clip_all_images_v3plus.txt - ALL images with status, scores, reasons
# 4. clip_rescued_images_v3plus.txt - images saved by rescue rules
#
# =============================================================================

import cv2
import numpy as np
from pathlib import Path
from PIL import Image
from transformers import pipeline
import time

# -----------------------------------------------------------------------------
# SETUP: Load CLIP model
# -----------------------------------------------------------------------------

# Zero-shot image classifier shared by check_image_multi and
# check_multiple_animals in this cell.
# NOTE(review): device=0 presumably pins the pipeline to the first GPU, so this
# cell may not start on a CPU-only machine — confirm before running elsewhere.
classifier = pipeline(
    "zero-shot-image-classification",
    model="openai/clip-vit-large-patch14-336",
    device=0
)

# -----------------------------------------------------------------------------
# V3 THRESHOLDS (unchanged - proven baseline)
# -----------------------------------------------------------------------------

# Strict rule in should_keep_v3: an image is kept outright when
# photo >= PHOTO_THRESHOLD and animal >= ANIMAL_THRESHOLD.
PHOTO_THRESHOLD = 0.60
# Lenient photo floor: required by the blurry-image rule; a photo score below
# it is rejected as "not a real photo".
PHOTO_THRESHOLD_LENIENT = 0.40
ANIMAL_THRESHOLD = 0.50
# Lenient animal floor, applied only to images classed as blurry.
ANIMAL_THRESHOLD_LENIENT = 0.15
# A text score at or above this switches the rejection reason to "... has text".
TEXT_THRESHOLD = 0.35
# Laplacian-variance cut-off: a blur score below this counts as blurry.
BLUR_THRESHOLD = 5500

# -----------------------------------------------------------------------------
# RESCUE RULE THRESHOLDS (V3+ additions)
# -----------------------------------------------------------------------------

# Rescue Rule 1: Very blurry + high animal
# All three conditions must hold for should_keep_v3plus to apply this rescue.
RESCUE1_BLUR_MAX = 500          # Must be VERY blurry (real photos only)
RESCUE1_ANIMAL_MIN = 0.80       # Must have high animal confidence
RESCUE1_PHOTO_MIN = 0.25        # Must have SOME photo signal

# Rescue Rule 2: Multiple animals fallback
# The first two values gate whether the secondary CLIP check runs at all;
# RESCUE2_MULTIPLE_MIN is then compared against BOTH the "multiple animals"
# and the "one animal" score, and either one reaching it rescues the image.
RESCUE2_PHOTO_MIN = 0.90        # Only try when photo is high
RESCUE2_ANIMAL_MAX = 0.50       # Only when standard animal check failed
RESCUE2_MULTIPLE_MIN = 0.30     # Threshold for multiple animals detection

# -----------------------------------------------------------------------------
# HELPER FUNCTIONS
# -----------------------------------------------------------------------------

def get_blur_score(filepath):
    """Return the variance of the Laplacian of the grayscale image at *filepath*.

    A low variance means few sharp edges (blurry); a high variance means sharp.
    When OpenCV cannot decode the file, ``cv2.imread`` yields ``None`` and the
    score is reported as ``0`` — which callers comparing against a blur
    threshold will read as "blurry".
    """
    gray = cv2.imread(str(filepath), cv2.IMREAD_GRAYSCALE)
    if gray is not None:
        edges = cv2.Laplacian(gray, cv2.CV_64F)
        return edges.var()
    return 0


def check_image_multi(filepath):
    """Run three independent two-label CLIP classifications on one image.

    Each classification pits a "positive" label against its opposite and
    yields the score of the positive label.

    Args:
        filepath: Path of the image file to open with PIL.

    Returns:
        Tuple ``(photo_score, animal_score, text_score)``.

    Raises:
        Whatever ``Image.open`` or the classifier raises (e.g. for a file PIL
        cannot identify); the caller is expected to handle it.
    """
    # (positive label, negative label) for photo, animal and text, in that order.
    label_pairs = (
        ("camera photograph", "digital artwork"),
        ("an animal", "not an animal"),
        ("text and words", "no text"),
    )

    scores = []
    # The context manager closes the underlying file deterministically instead
    # of leaving it to garbage collection, even if a classifier call raises.
    with Image.open(filepath) as img:
        for positive, negative in label_pairs:
            top = classifier(img, candidate_labels=[positive, negative])[0]
            # Only the top result is read; if it is the negative label, the
            # positive score is taken as its complement.
            if top['label'] == positive:
                scores.append(top['score'])
            else:
                scores.append(1 - top['score'])

    photo_score, animal_score, text_score = scores
    return photo_score, animal_score, text_score


def check_multiple_animals(filepath):
    """
    Secondary CLIP check for multiple animals.

    Use when the standard "an animal" check fails but the image looks like a
    real photo. The image is classified against three labels
    ("multiple animals", "one animal", "no animals").

    Args:
        filepath: Path of the image file to open with PIL.

    Returns:
        (multiple_score, single_score) — the scores of the "multiple animals"
        and "one animal" labels; a label missing from the result counts as 0.
    """
    # Close the image file deterministically rather than relying on garbage
    # collection, even if the classifier call raises.
    with Image.open(filepath) as img:
        result = classifier(img, candidate_labels=["multiple animals", "one animal", "no animals"])

    scores = {r['label']: r['score'] for r in result}
    multiple_score = scores.get("multiple animals", 0)
    single_score = scores.get("one animal", 0)

    return multiple_score, single_score


def should_keep_v3(photo_score, animal_score, text_score, blur_score):
    """Baseline V3 decision. Returns a ``(keep, reason)`` tuple.

    ``keep`` is a bool; ``reason`` is a short human-readable explanation.
    """
    blurry = blur_score < BLUR_THRESHOLD

    # Strict rule: confidently a photo and confidently an animal.
    strict_pass = photo_score >= PHOTO_THRESHOLD and animal_score >= ANIMAL_THRESHOLD
    if strict_pass:
        return True, "real photo with animal"

    # Blurry images get the lenient thresholds for both photo and animal.
    if blurry and photo_score >= PHOTO_THRESHOLD_LENIENT:
        if animal_score >= ANIMAL_THRESHOLD_LENIENT:
            return True, "blurry photo with animal (lenient)"
        if animal_score < ANIMAL_THRESHOLD_LENIENT:
            return False, (
                "blurry photo, no animal, has text"
                if text_score >= TEXT_THRESHOLD
                else "blurry photo but no animal"
            )

    if photo_score < PHOTO_THRESHOLD_LENIENT:
        return False, "not a real photo"

    if animal_score < ANIMAL_THRESHOLD:
        return False, (
            "real photo, no animal, has text"
            if text_score >= TEXT_THRESHOLD
            else "real photo but no animal"
        )

    # Everything left over: passed the lenient photo floor and the animal
    # threshold but none of the keep rules above.
    return False, "did not meet criteria"


def should_keep_v3plus(photo_score, animal_score, text_score, blur_score, filepath=None):
    """
    Apply the V3 decision, then offer V3 rejections two rescue rules.

    An image V3 keeps is always kept. An image V3 rejects is kept only if a
    rescue rule fires; otherwise V3's own rejection reason is returned.
    Rescue rule 2 needs to re-read the image, so it is skipped when
    ``filepath`` is None.

    Returns (keep, reason, rescued, rescue_details)
        keep: bool
        reason: str
        rescued: bool - True only when a rescue rule saved the image
        rescue_details: str - the scores behind the rescue, "" otherwise
    """
    baseline_keep, baseline_reason = should_keep_v3(photo_score, animal_score, text_score, blur_score)
    if baseline_keep:
        return True, baseline_reason, False, ""

    # Rescue rule 1: very blurry image with a high animal score.
    rescue1_applies = (
        blur_score < RESCUE1_BLUR_MAX
        and animal_score >= RESCUE1_ANIMAL_MIN
        and photo_score >= RESCUE1_PHOTO_MIN
    )
    if rescue1_applies:
        details = f"blur={blur_score:.0f}, animal={animal_score:.2f}"
        return True, "RESCUED: very blurry with high animal", True, details

    # Rescue rule 2: confident photo whose standard animal check failed —
    # ask CLIP the multiple/one/no animals question instead.
    rescue2_candidate = (
        filepath is not None
        and photo_score >= RESCUE2_PHOTO_MIN
        and animal_score < RESCUE2_ANIMAL_MAX
    )
    if rescue2_candidate:
        multiple_score, single_score = check_multiple_animals(filepath)
        # Either score reaching the threshold is enough.
        if multiple_score >= RESCUE2_MULTIPLE_MIN or single_score >= RESCUE2_MULTIPLE_MIN:
            details = f"multi={multiple_score:.2f}, single={single_score:.2f}"
            return True, "RESCUED: multiple animals detected", True, details

    # No rescue rule fired: pass V3's verdict through unchanged.
    return False, baseline_reason, False, ""

# -----------------------------------------------------------------------------
# PATHS
# -----------------------------------------------------------------------------

# Dataset root; the "Cat" and "Dog" sub-folders under it are globbed below.
data_path = Path(r"C:\AWrk\cats_dogs_project\data\PetImages")
# All report files are written here; the folder is created if it is missing.
output_dir = Path(r"C:\AWrk\cats_dogs_project\outputs")
output_dir.mkdir(parents=True, exist_ok=True)

# Report files: rejected paths only / rejected rows with scores / every image /
# images kept only because a rescue rule fired.
exclude_list_path = output_dir / "clip_exclude_list_v3plus.txt"
exclude_details_path = output_dir / "clip_exclude_details_v3plus.txt"
all_images_path = output_dir / "clip_all_images_v3plus.txt"
rescued_path = output_dir / "clip_rescued_images_v3plus.txt"

# -----------------------------------------------------------------------------
# LOAD DATASET
# -----------------------------------------------------------------------------

# Collect every .jpg in each class folder. Glob results come back in no
# guaranteed order, so sort them: the processing order — and with it the row
# order of the report files and the "first N" previews — is then reproducible.
cat_files = sorted((data_path / "Cat").glob("*.jpg"))
dog_files = sorted((data_path / "Dog").glob("*.jpg"))
all_files = cat_files + dog_files

print("=" * 110)
print("FULL PETIMAGES DATASET - V3+ ALGORITHM")
print("=" * 110)
print(f"Total images to process: {len(all_files)}")
print(f"  Cats: {len(cat_files)}")
print(f"  Dogs: {len(dog_files)}")
print()
print("V3+ Design: V3 decisions preserved, rescue rules can only SAVE rejected images")
print()
print("Rescue Rules:")
print(f"  1. Very blurry + high animal: blur < {RESCUE1_BLUR_MAX}, animal >= {RESCUE1_ANIMAL_MIN}, photo >= {RESCUE1_PHOTO_MIN}")
print(f"  2. Multiple animals fallback: photo >= {RESCUE2_PHOTO_MIN}, animal < {RESCUE2_ANIMAL_MAX}, then CLIP check")
print()

# -----------------------------------------------------------------------------
# PROCESS ALL IMAGES
# -----------------------------------------------------------------------------

# Result accumulators. `kept` includes rescued images; `rescued` is the subset
# saved by a rescue rule. `rejected` also receives images that raised an
# error, so they end up in the exclude list as well.
kept = []
rejected = []
rescued = []
errors = []

start_time = time.time()

print("Processing images...")
print("-" * 110)

# The report files are opened as UTF-8 explicitly so that an unusual character
# in an exception message cannot abort the run on a platform whose default
# text encoding is narrower.
with open(exclude_details_path, "w", encoding="utf-8") as details_file, \
     open(all_images_path, "w", encoding="utf-8") as all_file, \
     open(rescued_path, "w", encoding="utf-8") as rescued_file:

    # Write headers
    details_file.write("# CLIP Filter V3+ Results - Rejected Images Only\n")
    details_file.write("# path\tphoto\tanimal\ttext\tblur\treason\n")
    details_file.write("-" * 110 + "\n")

    all_file.write("# CLIP Filter V3+ Results - All Images\n")
    all_file.write("# path\tstatus\tphoto\tanimal\ttext\tblur\treason\trescue_details\n")
    all_file.write("-" * 130 + "\n")

    rescued_file.write("# CLIP Filter V3+ Results - Rescued Images\n")
    rescued_file.write("# These images would have been REJECTED by V3 but were SAVED by rescue rules\n")
    rescued_file.write("# path\tphoto\tanimal\ttext\tblur\treason\trescue_details\n")
    rescued_file.write("-" * 130 + "\n")

    for i, f in enumerate(all_files):
        # Relative path format matching existing exclude_list.txt
        rel_path = f"..\\data\\PetImages\\{f.parent.name}\\{f.name}"

        try:
            photo, animal, text = check_image_multi(f)
            blur = get_blur_score(f)
            keep, reason, was_rescued, rescue_details = should_keep_v3plus(photo, animal, text, blur, f)

            status = "KEEP" if keep else "REJECT"
            if was_rescued:
                status = "RESCUED"

            # Every image gets a row in the "all images" report.
            all_file.write(f"{rel_path}\t{status}\t{photo:.2f}\t{animal:.2f}\t{text:.2f}\t{blur:.1f}\t{reason}\t{rescue_details}\n")

            if keep:
                kept.append((rel_path, photo, animal, text, blur, reason, was_rescued, rescue_details))
                if was_rescued:
                    rescued.append((rel_path, photo, animal, text, blur, reason, rescue_details))
                    rescued_file.write(f"{rel_path}\t{photo:.2f}\t{animal:.2f}\t{text:.2f}\t{blur:.1f}\t{reason}\t{rescue_details}\n")
            else:
                rejected.append((rel_path, photo, animal, text, blur, reason))
                details_file.write(f"{rel_path}\t{photo:.2f}\t{animal:.2f}\t{text:.2f}\t{blur:.1f}\t{reason}\n")

        except Exception as e:
            # Deliberately broad: any failure on one image (e.g. a file PIL
            # cannot open) is recorded and the image is treated as rejected,
            # rather than aborting the whole run.
            errors.append((rel_path, str(e)))
            rejected.append((rel_path, 0, 0, 0, 0, f"ERROR: {e}"))
            details_file.write(f"{rel_path}\tERROR\t\t\t\t{e}\n")
            all_file.write(f"{rel_path}\tERROR\t\t\t\t\t{e}\t\n")

        # Force every report to disk after each image — including ERROR rows,
        # which were previously left in the buffer — so partial results are
        # available while the long run is in progress.
        all_file.flush()
        details_file.flush()
        rescued_file.flush()

        # Progress update every 500 images
        if (i + 1) % 500 == 0:
            elapsed = time.time() - start_time
            rate = (i + 1) / elapsed
            remaining = (len(all_files) - i - 1) / rate
            print(f"  {i + 1}/{len(all_files)} ({(i+1)/len(all_files)*100:.1f}%) - "
                  f"Kept: {len(kept)} (rescued: {len(rescued)}) - "
                  f"Rejected: {len(rejected)} - "
                  f"ETA: {remaining/60:.1f} min")

# Write simple exclude list (just paths)
with open(exclude_list_path, "w", encoding="utf-8") as exclude_file:
    for rel_path, *_ in rejected:
        exclude_file.write(rel_path + "\n")

# -----------------------------------------------------------------------------
# FINAL SUMMARY
# -----------------------------------------------------------------------------

from collections import Counter

elapsed = time.time() - start_time

print()
print("=" * 110)
print("COMPLETE - V3+ ALGORITHM")
print("=" * 110)
print(f"Time elapsed: {elapsed/60:.1f} minutes")
print(f"Total processed: {len(all_files)}")
print()
print(f"KEPT:     {len(kept)}")
# `kept` contains the rescued images too, so subtract them for the V3-only count.
print(f"  - By V3 rules:    {len(kept) - len(rescued)}")
print(f"  - By rescue rules: {len(rescued)}")
print(f"REJECTED: {len(rejected)}")
print(f"ERRORS:   {len(errors)}")

# Breakdown by class (the class folder name is embedded in the relative path)
rejected_cats = sum(1 for r in rejected if "\\Cat\\" in r[0])
rejected_dogs = sum(1 for r in rejected if "\\Dog\\" in r[0])
rescued_cats = sum(1 for r in rescued if "\\Cat\\" in r[0])
rescued_dogs = sum(1 for r in rescued if "\\Dog\\" in r[0])

print()
print("By class:")
print(f"  Rejected Cats: {rejected_cats}")
print(f"  Rejected Dogs: {rejected_dogs}")
print(f"  Rescued Cats:  {rescued_cats}")
print(f"  Rescued Dogs:  {rescued_dogs}")

# Breakdown by reason, most frequent first (ties keep first-seen order).
# Every entry in `rejected` is a 6-tuple whose last field is the reason.
print()
print("Rejection reasons:")
reason_counts = Counter(item[5] for item in rejected)
for reason, count in reason_counts.most_common():
    print(f"  {count:5d}  {reason}")

# Rescue breakdown (entries in `rescued` are 7-tuples; field 5 is the reason)
print()
print("Rescue reasons:")
rescue_counts = Counter(item[5] for item in rescued)
for reason, count in rescue_counts.most_common():
    print(f"  {count:5d}  {reason}")

# Output files
print()
print("Output files:")
print(f"  1. {exclude_list_path}")
print("     Paths only (for loading into training)")
print()
print(f"  2. {exclude_details_path}")
print("     Rejected images with scores and reasons")
print()
print(f"  3. {all_images_path}")
print("     ALL images with status, scores, and reasons")
print()
print(f"  4. {rescued_path}")
print("     Images saved by rescue rules (would have been rejected by V3)")

# Show rescued images for review
if rescued:
    print()
    print(f"RESCUED IMAGES ({len(rescued)} total) - First 30:")
    print("-" * 130)
    print(f"{'path':<50} {'photo':>5} {'animal':>6} {'text':>5} {'blur':>8}  {'reason':<40} rescue_details")
    print("-" * 130)
    for rel_path, photo, animal, text, blur, reason, rescue_details in rescued[:30]:
        short_path = rel_path.replace("..\\data\\PetImages\\", "")
        print(f"{short_path:<50} {photo:>5.2f} {animal:>6.2f} {text:>5.2f} {blur:>8.1f}  {reason:<40} {rescue_details}")
    if len(rescued) > 30:
        print(f"... and {len(rescued) - 30} more (see {rescued_path})")

# Show first 20 rejected for quick review
print()
print("First 20 rejected images:")
print("-" * 110)
print(f"{'path':<50} {'photo':>5} {'animal':>6} {'text':>5} {'blur':>8}  reason")
print("-" * 110)
for rel_path, photo, animal, text, blur, reason in rejected[:20]:
    short_path = rel_path.replace("..\\data\\PetImages\\", "")
    print(f"{short_path:<50} {photo:>5.2f} {animal:>6.2f} {text:>5.2f} {blur:>8.1f}  {reason}")

# Only the first 10 errors are echoed; the full list is in the report files.
if errors:
    print()
    print(f"Errors ({len(errors)}):")
    for path, err in errors[:10]:
        print(f"  {path}: {err}")

print()
print("=" * 110)
print("V3+ ALGORITHM SUMMARY")
print("=" * 110)
print("Design principle: If V3 keeps it → V3+ keeps it. Always.")
print("Rescue rules can ONLY save images that V3 would reject.")
print()
print("Rescue Rule 1: Very blurry + high animal")
print(f"  blur < {RESCUE1_BLUR_MAX}, animal >= {RESCUE1_ANIMAL_MIN}, photo >= {RESCUE1_PHOTO_MIN}")
print("  Catches: yawning cats, unusual poses, solid backgrounds")
print()
print("Rescue Rule 2: Multiple animals fallback")
print(f"  photo >= {RESCUE2_PHOTO_MIN}, animal < {RESCUE2_ANIMAL_MAX}")
print(f"  Secondary CLIP check, threshold >= {RESCUE2_MULTIPLE_MIN}")
print("  Catches: group photos, cats in cages, multiple kittens")
print("=" * 110)

Loading weights:   0%|          | 0/590 [00:00<?, ?it/s]

CLIPModel LOAD REPORT from: openai/clip-vit-large-patch14-336
Key                                  | Status     |  | 
-------------------------------------+------------+--+-
vision_model.embeddings.position_ids | UNEXPECTED |  | 
text_model.embeddings.position_ids   | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
The image processor of type `CLIPImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. 


FULL PETIMAGES DATASET - V3+ ALGORITHM
Total images to process: 25000
  Cats: 12500
  Dogs: 12500

V3+ Design: V3 decisions preserved, rescue rules can only SAVE rejected images

Rescue Rules:
  1. Very blurry + high animal: blur < 500, animal >= 0.8, photo >= 0.25
  2. Multiple animals fallback: photo >= 0.9, animal < 0.5, then CLIP check

Processing images...
--------------------------------------------------------------------------------------------------------------


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


  500/25000 (2.0%) - Kept: 498 (rescued: 0) - Rejected: 2 - ETA: 183.2 min
  1000/25000 (4.0%) - Kept: 997 (rescued: 1) - Rejected: 3 - ETA: 179.3 min
  1500/25000 (6.0%) - Kept: 1496 (rescued: 3) - Rejected: 4 - ETA: 176.9 min
  2000/25000 (8.0%) - Kept: 1995 (rescued: 4) - Rejected: 5 - ETA: 173.5 min
  2500/25000 (10.0%) - Kept: 2494 (rescued: 4) - Rejected: 6 - ETA: 170.1 min
  3000/25000 (12.0%) - Kept: 2994 (rescued: 5) - Rejected: 6 - ETA: 166.5 min
  3500/25000 (14.0%) - Kept: 3492 (rescued: 5) - Rejected: 8 - ETA: 162.9 min
  4000/25000 (16.0%) - Kept: 3990 (rescued: 5) - Rejected: 10 - ETA: 159.1 min
  4500/25000 (18.0%) - Kept: 4489 (rescued: 5) - Rejected: 11 - ETA: 155.3 min
  5000/25000 (20.0%) - Kept: 4987 (rescued: 7) - Rejected: 13 - ETA: 151.6 min
  5500/25000 (22.0%) - Kept: 5485 (rescued: 10) - Rejected: 15 - ETA: 147.9 min
  6000/25000 (24.0%) - Kept: 5985 (rescued: 11) - Rejected: 15 - ETA: 144.1 min
  6500/25000 (26.0%) - Kept: 6483 (rescued: 11) - Rejected: 17 -



  24000/25000 (96.0%) - Kept: 23917 (rescued: 29) - Rejected: 83 - ETA: 8.2 min
  24500/25000 (98.0%) - Kept: 24414 (rescued: 30) - Rejected: 86 - ETA: 4.1 min
  25000/25000 (100.0%) - Kept: 24914 (rescued: 31) - Rejected: 86 - ETA: 0.0 min

COMPLETE - V3+ ALGORITHM
Time elapsed: 203.8 minutes
Total processed: 25000

KEPT:     24914
  - By V3 rules:    24883
  - By rescue rules: 31
REJECTED: 86
ERRORS:   2

By class:
  Rejected Cats: 44
  Rejected Dogs: 42
  Rescued Cats:  16
  Rescued Dogs:  15

Rejection reasons:
     43  not a real photo
     22  blurry photo, no animal, has text
      7  real photo, no animal, has text
      6  blurry photo but no animal
      3  did not meet criteria
      3  real photo but no animal
      1  ERROR: cannot identify image file 'C:\\AWrk\\cats_dogs_project\\data\\PetImages\\Cat\\666.jpg'
      1  ERROR: cannot identify image file 'C:\\AWrk\\cats_dogs_project\\data\\PetImages\\Dog\\11702.jpg'

Rescue reasons:
     23  RESCUED: multiple animals detect