## Cleaning

In [1]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import os
import shutil
import hashlib

In [2]:
# Load file lists
# Sorted so the scan order (and therefore which copy of a duplicate is treated
# as the "original" further down) is the same on every OS / filesystem.
data_path = Path("../data/PetImages")
cat_files = sorted((data_path / "Cat").glob("*.jpg"))
dog_files = sorted((data_path / "Dog").glob("*.jpg"))
all_files = cat_files + dog_files

print(f"Total files: {len(all_files)}")

Total files: 25000


In [3]:
# Recreate the problems list from exploration
def check_image(path):
    """Decide whether the image at `path` is usable.

    Returns:
        (True, None) when the image loads and passes every check, otherwise
        (False, reason) where reason is a short string. Any exception raised
        while checking is reported as (False, str(exception)).
    """
    try:
        img = cv2.imread(str(path))
        if img is None:
            return False, "failed to load"

        # Touch every pixel; the original author's intent was to surface
        # truncated files here.
        img.sum()

        height, width = img.shape[:2]
        if min(height, width) < 10:
            return False, f"too small: {width}x{height}"

        three_channel = img.ndim == 3 and img.shape[2] == 3
        if not three_channel:
            return False, f"not RGB: shape {img.shape}"
    except Exception as exc:
        return False, str(exc)
    return True, None

# Run every file through check_image and keep (path, reason) for each failure.
problems = []
for image_path in all_files:
    passed, reason = check_image(image_path)
    if passed:
        continue
    problems.append((image_path, reason))

print(f"Corrupted/problematic: {len(problems)}")

Corrupted/problematic: 4


In [4]:
# Recreate tiny images list: (path, width, height) for every loadable image
# with either side under 100 pixels.
tiny = []
for image_path in all_files:
    pixels = cv2.imread(str(image_path))
    if pixels is None:
        continue
    height, width = pixels.shape[:2]
    if min(width, height) < 100:
        tiny.append((image_path, width, height))

print(f"Tiny images (<100px): {len(tiny)}")

Tiny images (<100px): 180


## Start by Looking for Duplicates

In [5]:
# Check for duplicate images using file hash
import hashlib

def get_file_hash(filepath):
    """Return the MD5 hex digest of the file's raw bytes.

    The whole file is read into memory in one go.
    """
    with open(filepath, 'rb') as handle:
        payload = handle.read()
    return hashlib.md5(payload).hexdigest()

print("Hashing all files... (this takes a minute)")
hash_to_path = {}   # content hash -> first path seen with that content
duplicates = []     # (later_path, first_path) pairs whose bytes are identical

# The loop index was never used, so iterate over the paths directly.
for f in all_files:
    h = get_file_hash(f)
    if h in hash_to_path:
        duplicates.append((f, hash_to_path[h]))
    else:
        hash_to_path[h] = f

print(f"\nFound {len(duplicates)} duplicate images")

Hashing all files... (this takes a minute)

Found 31 duplicate images


In [6]:
# Check for duplicate images using file hash
import hashlib

def file_hash(path):
    """Return the MD5 hex digest of the bytes stored at `path`."""
    digest = hashlib.md5()
    with open(path, 'rb') as stream:
        digest.update(stream.read())
    return digest.hexdigest()

# Map each content hash to the first file carrying it; any later file with the
# same hash is recorded as (duplicate, first_seen).
seen = {}
duplicates = []

for candidate in all_files:
    digest = file_hash(candidate)
    first_seen = seen.get(digest)
    if first_seen is None:
        seen[digest] = candidate
    else:
        duplicates.append((candidate, first_seen))

print(f"Duplicates: {len(duplicates)}")
for dup_path, orig_path in duplicates:
    dup_label = f"{dup_path.parent.name}/{dup_path.name}"
    orig_label = f"{orig_path.parent.name}/{orig_path.name}"
    print(f"  {dup_label} == {orig_label}")

Duplicates: 31
  Cat/11974.jpg == Cat/11706.jpg
  Cat/1639.jpg == Cat/10273.jpg
  Cat/1930.jpg == Cat/10052.jpg
  Cat/2339.jpg == Cat/12408.jpg
  Cat/3345.jpg == Cat/10360.jpg
  Cat/3910.jpg == Cat/11197.jpg
  Cat/4435.jpg == Cat/3411.jpg
  Cat/4575.jpg == Cat/4126.jpg
  Cat/4667.jpg == Cat/11053.jpg
  Cat/4726.jpg == Cat/2580.jpg
  Cat/6204.jpg == Cat/6143.jpg
  Cat/7545.jpg == Cat/3112.jpg
  Cat/7912.jpg == Cat/3908.jpg
  Cat/8137.jpg == Cat/11595.jpg
  Cat/9240.jpg == Cat/4509.jpg
  Cat/9588.jpg == Cat/4795.jpg
  Cat/9966.jpg == Cat/4442.jpg
  Dog/10401.jpg == Cat/11565.jpg
  Dog/10797.jpg == Cat/11565.jpg
  Dog/11702.jpg == Cat/666.jpg
  Dog/2877.jpg == Cat/11565.jpg
  Dog/3082.jpg == Dog/2027.jpg
  Dog/6058.jpg == Dog/10.jpg
  Dog/6700.jpg == Dog/3832.jpg
  Dog/74.jpg == Dog/180.jpg
  Dog/8044.jpg == Dog/5874.jpg
  Dog/8087.jpg == Dog/1947.jpg
  Dog/8591.jpg == Dog/39.jpg
  Dog/8736.jpg == Cat/8456.jpg
  Dog/980.jpg == Dog/1273.jpg
  Dog/9956.jpg == Dog/11688.jpg


In [7]:
# Check if any duplicate appears in BOTH Cat and Dog folders
# (would be a data quality problem)

cross_class = []
for dup_path, orig_path in duplicates:
    # Class is the name of the folder that directly contains the file.
    if dup_path.parent.name != orig_path.parent.name:
        cross_class.append((dup_path, orig_path))

print(f"Cross-class duplicates: {len(cross_class)}")
for dup_path, orig_path in cross_class:
    print(f"  {dup_path.parent.name}/{dup_path.name} <-> {orig_path.parent.name}/{orig_path.name}")

Cross-class duplicates: 5
  Dog/10401.jpg <-> Cat/11565.jpg
  Dog/10797.jpg <-> Cat/11565.jpg
  Dog/11702.jpg <-> Cat/666.jpg
  Dog/2877.jpg <-> Cat/11565.jpg
  Dog/8736.jpg <-> Cat/8456.jpg


In [9]:
# Build exclusion list
exclude = set()

# corrupted
exclude.update(str(path) for path, _err in problems)

# duplicates (keep the original, drop the duplicate)
exclude.update(str(dup) for dup, _orig in duplicates)

# cross-class - drop both since we can't trust either label
for d, o in cross_class:
    exclude.update((str(d), str(o)))

# tiny images - your call whether to include this
# for path, w, h in tiny:
#     exclude.add(str(path))

print(f"Excluding {len(exclude)} files")

# Create the output folder on first run so the write cannot fail with
# FileNotFoundError, and pin the encoding so the file reads back the same on
# every platform.
exclude_file = Path("../outputs/exclude_list.txt")
exclude_file.parent.mkdir(parents=True, exist_ok=True)
with open(exclude_file, "w", encoding="utf-8") as f:
    for p in sorted(exclude):
        f.write(p + "\n")

Excluding 36 files


## Single Colour Images

In [10]:
# Detect likely blank or single-color images
def is_blank(img, threshold=5):
    """Flag an image whose grayscale pixel values barely vary.

    The image is converted with cv2.COLOR_BGR2GRAY and the standard deviation
    of the result is compared against `threshold`; a value below it is treated
    as blank / solid colour.
    """
    luminance = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    spread = luminance.std()
    return spread < threshold

# Collect every loadable file that is_blank() flags.
blanks = []
for image_path in all_files:
    pixels = cv2.imread(str(image_path))
    if pixels is None:
        continue
    if is_blank(pixels):
        blanks.append(image_path)

print(f"Likely blank images: {len(blanks)}")
for blank_path in blanks:
    print(f"  {blank_path.parent.name}/{blank_path.name}")

Likely blank images: 1
  Cat/835.jpg


In [11]:
# Cartoon detection - skip grayscale images

def is_grayscale(img):
    """Report whether the three channels of `img` are (nearly) identical.

    The mean absolute difference between the first and second channel is added
    to that between the second and third; a sum below 5 counts as grayscale.
    """
    blue, green, red = (channel.astype(float) for channel in cv2.split(img))
    blue_green_gap = np.abs(blue - green).mean()
    green_red_gap = np.abs(green - red).mean()
    return blue_green_gap + green_red_gap < 5

def unique_colors(img):
    """Count distinct colours in a downsampled, quantised copy of `img`.

    The image is resized to 150x150 and every channel value is rounded down to
    a multiple of 4 before counting, so near-identical shades collapse into one.

    Returns:
        Number of distinct (quantised) pixel triples.
    """
    small = cv2.resize(img, (150, 150))
    quantized = (small // 4) * 4
    pixels = quantized.reshape(-1, 3)
    # np.unique over rows does the de-duplication in C; building a Python set
    # of 22,500 tuples per image made the full-dataset scan impractically slow.
    return len(np.unique(pixels, axis=0))

# Check only color images: grayscale ones are set aside, the rest get a
# unique-colour count.
color_counts = []
grayscale_images = []

for image_path in all_files:
    pixels = cv2.imread(str(image_path))
    if pixels is None:
        continue
    if is_grayscale(pixels):
        grayscale_images.append(image_path)
        continue
    color_counts.append((image_path, unique_colors(pixels)))

print(f"Grayscale images: {len(grayscale_images)} (skipped)")
print(f"Color images checked: {len(color_counts)}")

# Ascending by colour count, so the most cartoon-like candidates come first.
color_counts.sort(key=lambda entry: entry[1])

print("\nFewest unique colors (color images only):")
for image_path, n_colors in color_counts[:40]:
    print(f"  {n_colors:5d}  {image_path.parent.name}/{image_path.name}")

KeyboardInterrupt: 

In [None]:
def gradient_distribution(img):
    """
    Photos: lots of small gradients (fur, skin, lighting)
    Cartoons: mostly zeros (flat) and big jumps (edges), nothing in between

    Returns the fraction of neighbouring-pixel differences (horizontal and
    vertical, on the grayscale image) that fall in the "subtle" band [3, 25).
    Photos are expected to score high. An image with no neighbouring pixels
    at all (e.g. 1x1) returns 0.0 instead of NaN so sorting stays well-defined.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(float)

    # Pixel-to-pixel differences
    dx = np.abs(np.diff(gray, axis=1)).ravel()
    dy = np.abs(np.diff(gray, axis=0)).ravel()
    gradients = np.concatenate([dx, dy])

    total = gradients.size
    if total == 0:
        return 0.0

    # Only the subtle band is needed for the score; differences below 3 (flat)
    # and at or above 25 (hard edges) are implicitly the remainder.
    small = np.count_nonzero((gradients >= 3) & (gradients < 25))

    return small / total

# Score all color images (unreadable and grayscale files are skipped)
gradient_scores = []
for image_path in all_files:
    pixels = cv2.imread(str(image_path))
    if pixels is None or is_grayscale(pixels):
        continue
    gradient_scores.append((image_path, gradient_distribution(pixels)))

# Low score = few subtle gradients = likely cartoon
gradient_scores.sort(key=lambda entry: entry[1])

print("Lowest subtle gradient ratio (likely cartoons):")
for image_path, ratio in gradient_scores[:40]:
    print(f"  {ratio:.3f}  {image_path.parent.name}/{image_path.name}")

## Using subtle image gradients to find cartoons and text in bad images (this didn't work)

In [None]:
def best_texture_region(img, patch_size=64):
    """
    Find the patch with the most photo-like texture.
    Real photo: at least one region (the animal) has subtle gradients
    Cartoon: flat everywhere, even on the subject

    Args:
        img: BGR image array.
        patch_size: side length of the square window; windows are placed every
            patch_size // 2 pixels.

    Returns:
        The highest fraction of "subtle" neighbouring-pixel differences
        (in [3, 25)) found in any window, or 0.0 if no window contains a
        pair of neighbouring pixels.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).astype(float)
    h, w = gray.shape
    step = max(patch_size // 2, 1)

    best_score = 0.0

    # max(..., 0): an image smaller than the patch still gets one window
    # (slicing clips it to the image) instead of being scored 0 by default.
    # + 1: the last window that fits entirely inside the image is included.
    for y in range(0, max(h - patch_size, 0) + 1, step):
        for x in range(0, max(w - patch_size, 0) + 1, step):
            patch = gray[y:y+patch_size, x:x+patch_size]

            dx = np.abs(np.diff(patch, axis=1)).ravel()
            dy = np.abs(np.diff(patch, axis=0)).ravel()
            gradients = np.concatenate([dx, dy])
            if gradients.size == 0:
                continue

            # Subtle gradients only (the fur/texture signal)
            subtle = np.count_nonzero((gradients >= 3) & (gradients < 25))
            best_score = max(best_score, subtle / gradients.size)

    return best_score

# Score all color images (unreadable and grayscale files are skipped)
texture_scores = []
for image_path in all_files:
    pixels = cv2.imread(str(image_path))
    if pixels is None or is_grayscale(pixels):
        continue
    texture_scores.append((image_path, best_texture_region(pixels)))

# Low score = no region has real texture = likely cartoon
texture_scores.sort(key=lambda entry: entry[1])

print("No textured region found (likely cartoons):")
for image_path, best in texture_scores[:40]:
    print(f"  {best:.3f}  {image_path.parent.name}/{image_path.name}")

## Using CLIP for Preprocessing Instead

The heuristic approaches didn't work well - they flag grayscale real photos and miss actual cartoons.

CLIP can classify images by asking about the medium (photo vs cartoon) without knowing about cats/dogs.

In [14]:
import torch
cuda_available = torch.cuda.is_available()
print(cuda_available)  # True
if cuda_available:
    print(torch.cuda.get_device_name(0))  # should show your GPU
else:
    # get_device_name(0) raises when no CUDA device exists; report it instead.
    print("No CUDA device detected")

True
NVIDIA GeForce RTX 3050 Laptop GPU


In [22]:
# Using the larger CLIP model - uses ~1.7GB VRAM
# clip-vit-large-patch14-336 has higher resolution input (336x336) = better quality

from transformers import pipeline
from PIL import Image
from pathlib import Path

# Zero-shot image classifier backed by CLIP. Runs on the first GPU when CUDA
# is available and falls back to CPU (device=-1) otherwise, so the notebook
# still executes on a machine without a GPU.
classifier = pipeline(
    "zero-shot-image-classification",
    model="openai/clip-vit-large-patch14-336",
    device=0 if torch.cuda.is_available() else -1,
)

print("Model loaded")

config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/590 [00:00<?, ?it/s]

CLIPModel LOAD REPORT from: openai/clip-vit-large-patch14-336
Key                                  | Status     |  | 
-------------------------------------+------------+--+-
text_model.embeddings.position_ids   | UNEXPECTED |  | 
vision_model.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


tokenizer_config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Model loaded


In [23]:
# Simple binary - is it a real photo or not?
# CLIP works better with simple, clear labels
# Order matters: the test cell that follows reads labels[0] as the "photo" label.

labels = [
    "a photograph of a real animal",
    "a cartoon, drawing, illustration, or clip art"
]

In [24]:
# Quick test to see if this works better

# Classify the first 20 files and print: photo score, winning label, file.
for f in all_files[:20]:
    try:
        # Context manager releases the file handle once the classifier is done.
        with Image.open(f) as img:
            result = classifier(img, candidate_labels=labels)
        photo_score = result[0]['score'] if result[0]['label'] == labels[0] else result[1]['score']
        top = result[0]['label'][:50]
        print(f"{photo_score:.2f}  {top:50}  {f.parent.name}/{f.name}")
    except Exception as e:
        # Include the cause so a failure can be diagnosed from the output.
        print(f"Error: {f.name} - {e}")

0.24  a cartoon, drawing, illustration, or clip art       Cat/0.jpg
0.21  a cartoon, drawing, illustration, or clip art       Cat/1.jpg
0.33  a cartoon, drawing, illustration, or clip art       Cat/10.jpg
0.43  a cartoon, drawing, illustration, or clip art       Cat/100.jpg
0.08  a cartoon, drawing, illustration, or clip art       Cat/1000.jpg
0.24  a cartoon, drawing, illustration, or clip art       Cat/10000.jpg
0.28  a cartoon, drawing, illustration, or clip art       Cat/10001.jpg
0.15  a cartoon, drawing, illustration, or clip art       Cat/10002.jpg
0.05  a cartoon, drawing, illustration, or clip art       Cat/10003.jpg
0.03  a cartoon, drawing, illustration, or clip art       Cat/10004.jpg
0.05  a cartoon, drawing, illustration, or clip art       Cat/10005.jpg
0.21  a cartoon, drawing, illustration, or clip art       Cat/10006.jpg
0.08  a cartoon, drawing, illustration, or clip art       Cat/10007.jpg
0.02  a cartoon, drawing, illustration, or clip art       Cat/10008.jpg
0.53  

In [26]:
# Even simpler labels
labels = [
    "a photo",
    "a drawing"
]

# Print: photo score, winning label, file. With two labels the photo score is
# the top score when "a photo" wins, and its complement otherwise.
for f in all_files[:20]:
    try:
        # Context manager releases the file handle once the classifier is done.
        with Image.open(f) as img:
            result = classifier(img, candidate_labels=labels)
        photo_score = result[0]['score'] if result[0]['label'] == "a photo" else 1 - result[0]['score']
        top = result[0]['label']
        print(f"{photo_score:.2f}  {top:15}  {f.parent.name}/{f.name}")
    except Exception as e:
        # Include the cause so a failure can be diagnosed from the output.
        print(f"Error: {f.name} - {e}")

0.85  a photo          Cat/0.jpg
0.72  a photo          Cat/1.jpg
0.89  a photo          Cat/10.jpg
0.67  a photo          Cat/100.jpg
0.99  a photo          Cat/1000.jpg
0.38  a drawing        Cat/10000.jpg
0.93  a photo          Cat/10001.jpg
0.89  a photo          Cat/10002.jpg
0.80  a photo          Cat/10003.jpg
0.65  a photo          Cat/10004.jpg
0.57  a photo          Cat/10005.jpg
0.82  a photo          Cat/10006.jpg
0.72  a photo          Cat/10007.jpg
0.32  a drawing        Cat/10008.jpg
0.71  a photo          Cat/10009.jpg
0.68  a photo          Cat/1001.jpg
0.78  a photo          Cat/10010.jpg
0.48  a drawing        Cat/10011.jpg
0.83  a photo          Cat/10012.jpg
0.88  a photo          Cat/10013.jpg


In [27]:
# Different angle - ask about the medium
labels = [
    "camera photograph",
    "digital artwork"
]

# Print: top score, top label, file.
for f in all_files[:20]:
    try:
        # Context manager releases the file handle once the classifier is done.
        with Image.open(f) as img:
            result = classifier(img, candidate_labels=labels)
        print(f"{result[0]['score']:.2f}  {result[0]['label']:20}  {f.parent.name}/{f.name}")
    except Exception as e:
        # Include the cause so a failure can be diagnosed from the output.
        print(f"Error: {f.name} - {e}")

0.97  camera photograph     Cat/0.jpg
0.96  camera photograph     Cat/1.jpg
0.97  camera photograph     Cat/10.jpg
0.95  camera photograph     Cat/100.jpg
0.99  camera photograph     Cat/1000.jpg
0.92  camera photograph     Cat/10000.jpg
0.97  camera photograph     Cat/10001.jpg
0.98  camera photograph     Cat/10002.jpg
0.98  camera photograph     Cat/10003.jpg
0.95  camera photograph     Cat/10004.jpg
0.98  camera photograph     Cat/10005.jpg
0.97  camera photograph     Cat/10006.jpg
0.96  camera photograph     Cat/10007.jpg
0.90  camera photograph     Cat/10008.jpg
0.98  camera photograph     Cat/10009.jpg
0.95  camera photograph     Cat/1001.jpg
0.97  camera photograph     Cat/10010.jpg
0.64  camera photograph     Cat/10011.jpg
0.98  camera photograph     Cat/10012.jpg
0.95  camera photograph     Cat/10013.jpg


## Test CLIP on the known bad images in the tiny-images folder and see how many it gets right

In [None]:
# Test on known bad images from tiny folder of images
# Change these paths to actual cartoon files you found

# Hand-picked bad images (copies stored in the tiny-images output folder).
test_bad = [
    Path(location)
    for location in (
        "../outputs/01_tiny_images/50x39_10747.jpg",  # the fence
        "../outputs/01_tiny_images/4x4_5673.jpg",  # 4x4pixels
        "../outputs/01_tiny_images/60x60_835.jpg",  # white only
        "../outputs/01_tiny_images/82x159_9517.jpg",  # SAVE ALIVE image
        "../outputs/01_tiny_images/75x80_8470.jpg",  # cat drawing
        "../outputs/01_tiny_images/85x95_4833.jpg",  # cat drawing
        "../outputs/01_tiny_images/90x162_1259.jpg",  # dog drawing
        "../outputs/01_tiny_images/99x125_9188.jpg",  # dog drawing
        "../outputs/01_tiny_images/100x93_7968.jpg",  # cat mail drawing
        "../outputs/01_tiny_images/145x39_9171.jpg",  # rose drawing
        "../outputs/01_tiny_images/194x83_2663.jpg",  # cat with stripes through it and babies feeding
        "../outputs/01_tiny_images/196x33_4367.jpg",  # yahoo mail logo
        # add a few more you know are cartoons
    )
]

labels = [
    "camera photograph",
    "digital artwork"
]

# Print: top score, top label, file name for each known-bad image.
for f in test_bad:
    if not f.exists():
        # Report missing fixtures rather than skipping them silently, so a
        # wrong path is not mistaken for a passing run.
        print(f"Not found: {f}")
        continue
    try:
        # Context manager releases the file handle once the classifier is done.
        with Image.open(f) as img:
            result = classifier(img, candidate_labels=labels)
        print(f"{result[0]['score']:.2f}  {result[0]['label']:20}  {f.name}")
    except Exception as e:
        # Include the cause so a failure can be diagnosed from the output.
        print(f"Error: {f.name} - {e}")

0.68  camera photograph     50x39_10747.jpg
0.74  digital artwork       4x4_5673.jpg
0.54  digital artwork       60x60_835.jpg
0.98  camera photograph     82x159_9517.jpg
0.60  digital artwork       75x80_8470.jpg
0.71  digital artwork       85x95_4833.jpg
0.86  digital artwork       90x162_1259.jpg
0.93  digital artwork       99x125_9188.jpg
0.93  digital artwork       100x93_7968.jpg
0.89  camera photograph     145x39_9171.jpg
0.94  digital artwork       194x83_2663.jpg
0.83  camera photograph     196x33_4367.jpg


In [None]:
# Test the multi-check approach on known bad images

# Rules - Digital art should be rejected. A real photo should be kept. An image with an animal should be kept, and an image with no animal should be rejected. Text should be rejected if the photograph is real and no animal is present. Otherwise we keep everything.


# Hand-picked bad images, annotated with what each one actually shows.
test_bad = [
    Path(location)
    for location in (
        "../outputs/01_tiny_images/50x39_10747.jpg",  # the fence - real photo but no animal
        "../outputs/01_tiny_images/4x4_5673.jpg",  # 4x4pixels - digital art; 4 pixels make up nothing
        "../outputs/01_tiny_images/60x60_835.jpg",  # white only - digital art
        "../outputs/01_tiny_images/82x159_9517.jpg",  # SAVE ALIVE image - real photo but is a label with text and no animal
        "../outputs/01_tiny_images/75x80_8470.jpg",  # cat drawing - not a real photo but has a cat
        "../outputs/01_tiny_images/85x95_4833.jpg",  # cat drawing - not a real photo but has a cat
        "../outputs/01_tiny_images/90x162_1259.jpg",  # dog drawing
        "../outputs/01_tiny_images/99x125_9188.jpg",  # dog drawing
        "../outputs/01_tiny_images/100x93_7968.jpg",  # cat mail drawing
        "../outputs/01_tiny_images/145x39_9171.jpg",  # rose drawing
        "../outputs/01_tiny_images/194x83_2663.jpg",  # cat with stripes through it and babies feeding
        "../outputs/01_tiny_images/196x33_4367.jpg",  # yahoo mail logo
    )
]

def check_image_multi(filepath):
    """Ask the CLIP classifier three two-label questions about one image.

    Args:
        filepath: path of the image, opened with PIL.

    Returns:
        (photo_score, animal_score, text_score): for each question, the score
        of its first ("positive") label.
    """
    def _positive_score(img, positive, negative):
        # result[0] is treated as the winning label; when the negative label
        # wins, the positive score is taken as the complement of the top score.
        top = classifier(img, candidate_labels=[positive, negative])[0]
        return top['score'] if top['label'] == positive else 1 - top['score']

    # Context manager closes the file handle after all three checks have run.
    with Image.open(filepath) as img:
        # Check 1: Real photo or artwork?
        photo_score = _positive_score(img, "camera photograph", "digital artwork")

        # Check 2: Contains an animal?
        animal_score = _positive_score(img, "an animal", "not an animal")

        # Check 3: Contains text?
        text_score = _positive_score(img, "text and words", "no text")

    return photo_score, animal_score, text_score

# One row per known-bad image: the three CLIP scores, then the file name.
print("photo  animal  text   file")
print("-" * 70)
for image_path in test_bad:
    if not image_path.exists():
        print(f"Not found: {image_path}")
        continue
    try:
        photo, animal, text = check_image_multi(image_path)
        print(f"{photo:.2f}   {animal:.2f}    {text:.2f}   {image_path.name}")
    except Exception as exc:
        print(f"Error: {image_path.name} - {exc}")

photo  animal  text   file
----------------------------------------------------------------------
0.68   0.16    0.39   50x39_10747.jpg
0.26   0.23    0.09   4x4_5673.jpg
0.46   0.13    0.31   60x60_835.jpg
0.98   0.14    0.68   82x159_9517.jpg
0.40   0.57    0.18   75x80_8470.jpg
0.29   0.63    0.16   85x95_4833.jpg
0.14   0.94    0.53   90x162_1259.jpg
0.07   0.92    0.07   99x125_9188.jpg
0.07   0.79    0.37   100x93_7968.jpg
0.89   0.15    0.13   145x39_9171.jpg
0.06   0.80    0.66   194x83_2663.jpg
0.83   0.33    0.51   196x33_4367.jpg


In [None]:
# Test the multi-check approach on known bad images - add reason for the rejection and change thresholds
# Rules - Digital art should be rejected. A real photo should be kept. An image with an animal should be kept, and an image with no animal should be rejected. Text should be rejected if the photograph is real and no animal is present. Otherwise we keep everything.

# Hand-picked bad images, annotated with what each one actually shows.
test_bad = [
    Path(location)
    for location in (
        "../outputs/01_tiny_images/50x39_10747.jpg",  # the fence - real photo but no animal
        "../outputs/01_tiny_images/4x4_5673.jpg",  # 4x4pixels - digital art
        "../outputs/01_tiny_images/60x60_835.jpg",  # white only - digital art
        "../outputs/01_tiny_images/82x159_9517.jpg",  # SAVE ALIVE image - real photo, text, no animal
        "../outputs/01_tiny_images/75x80_8470.jpg",  # cat drawing - not a real photo but has a cat
        "../outputs/01_tiny_images/85x95_4833.jpg",  # cat drawing - not a real photo but has a cat
        "../outputs/01_tiny_images/90x162_1259.jpg",  # dog drawing with Text
        "../outputs/01_tiny_images/99x125_9188.jpg",  # dog drawing
        "../outputs/01_tiny_images/100x93_7968.jpg",  # cat mail drawing with text
        "../outputs/01_tiny_images/145x39_9171.jpg",  # rose drawing
        "../outputs/01_tiny_images/194x83_2663.jpg",  # cat with stripes and babies feeding
        "../outputs/01_tiny_images/196x33_4367.jpg",  # yahoo mail logo
    )
]

# This multi-check function will return a tuple of three values: photo_score, animal_score, text_score and is used throughout the notebook
def check_image_multi(filepath):
    """Ask the CLIP classifier three two-label questions about one image.

    Args:
        filepath: path of the image, opened with PIL.

    Returns:
        (photo_score, animal_score, text_score): for each question, the score
        of its first ("positive") label.
    """
    def _positive_score(img, positive, negative):
        # result[0] is treated as the winning label; when the negative label
        # wins, the positive score is taken as the complement of the top score.
        top = classifier(img, candidate_labels=[positive, negative])[0]
        return top['score'] if top['label'] == positive else 1 - top['score']

    # Context manager closes the file handle after all three checks have run.
    with Image.open(filepath) as img:
        # Check 1: Real photo or artwork?
        photo_score = _positive_score(img, "camera photograph", "digital artwork")

        # Check 2: Contains an animal?
        animal_score = _positive_score(img, "an animal", "not an animal")

        # Check 3: Contains text?
        text_score = _positive_score(img, "text and words", "no text")

    return photo_score, animal_score, text_score

# Thresholds
PHOTO_THRESHOLD = 0.60
ANIMAL_THRESHOLD = 0.50
TEXT_THRESHOLD = 0.35

def should_keep(photo_score, animal_score, text_score):
    """Turn the three CLIP scores into a keep/reject decision.

    Only a real photo (photo_score at or above PHOTO_THRESHOLD) that contains
    an animal (animal_score at or above ANIMAL_THRESHOLD) is kept. For a real
    photo without an animal, text_score only selects which rejection reason
    is reported.

    Returns:
        (keep, reason): keep is a bool, reason a short explanation string.
    """
    if photo_score < PHOTO_THRESHOLD:
        # Rule 1: Not a real photo → REJECT
        verdict = (False, "not a real photo")
    elif animal_score >= ANIMAL_THRESHOLD:
        # Rule 2: Real photo + has animal → KEEP
        verdict = (True, "real photo with animal")
    elif text_score >= TEXT_THRESHOLD:
        # Rule 3: Real photo + no animal + has text → REJECT
        verdict = (False, "real photo, no animal, has text")
    else:
        # Rule 4: Real photo + no animal + no text → REJECT
        verdict = (False, "real photo but no animal")
    return verdict

# Run the test: one row per known-bad image with scores, decision and reason
print("photo  animal  text   KEEP?   reason                         file")
print("-" * 90)
for image_path in test_bad:
    if not image_path.exists():
        print(f"Not found: {image_path}")
        continue
    try:
        photo, animal, text = check_image_multi(image_path)
        keep, reason = should_keep(photo, animal, text)
        if keep:
            status = "KEEP"
        else:
            status = "REJECT"
        print(f"{photo:.2f}   {animal:.2f}    {text:.2f}   {status:6}  {reason:30} {image_path.name}")
    except Exception as exc:
        print(f"Error: {image_path.name} - {exc}")

    # Path("../outputs/01_tiny_images/50x39_10747.jpg"),  # the fence - real photo but no animal
    # Path("../outputs/01_tiny_images/4x4_5673.jpg"),  # 4x4pixels - digital art
    # Path("../outputs/01_tiny_images/60x60_835.jpg"),  # white only - digital art
    # Path("../outputs/01_tiny_images/82x159_9517.jpg"),  # SAVE ALIVE image - real photo, text, no animal
    # Path("../outputs/01_tiny_images/75x80_8470.jpg"),  # cat drawing - not a real photo but has a cat
    # Path("../outputs/01_tiny_images/85x95_4833.jpg"),  # cat drawing - not a real photo but has a cat
    # Path("../outputs/01_tiny_images/90x162_1259.jpg"),  # dog drawing with Text
    # Path("../outputs/01_tiny_images/99x125_9188.jpg"),  # dog drawing
    # Path("../outputs/01_tiny_images/100x93_7968.jpg"),  # cat mail drawing with text
    # Path("../outputs/01_tiny_images/145x39_9171.jpg"),  # rose drawing
    # Path("../outputs/01_tiny_images/194x83_2663.jpg"),  # cat with stripes and babies feeding
    # Path("../outputs/01_tiny_images/196x33_4367.jpg"),  # yahoo mail logo

photo  animal  text   KEEP?   reason                         file
------------------------------------------------------------------------------------------
0.68   0.16    0.39   REJECT  real photo, no animal, has text 50x39_10747.jpg
0.26   0.23    0.09   REJECT  not a real photo               4x4_5673.jpg
0.46   0.13    0.31   REJECT  not a real photo               60x60_835.jpg
0.98   0.14    0.68   REJECT  real photo, no animal, has text 82x159_9517.jpg
0.40   0.57    0.18   REJECT  not a real photo               75x80_8470.jpg
0.29   0.63    0.16   REJECT  not a real photo               85x95_4833.jpg
0.14   0.94    0.53   REJECT  not a real photo               90x162_1259.jpg
0.07   0.92    0.07   REJECT  not a real photo               99x125_9188.jpg
0.07   0.79    0.37   REJECT  not a real photo               100x93_7968.jpg
0.89   0.15    0.13   REJECT  real photo but no animal       145x39_9171.jpg
0.06   0.80    0.66   REJECT  not a real photo               194x83_2663.jpg
0

In [None]:
# Test on ALL tiny images to see false positives (good images wrongly rejected)

# Every .jpg that was copied into the tiny-images output folder.
tiny_folder = Path("../outputs/01_tiny_images")
all_tiny = [*tiny_folder.glob("*.jpg")]

# Known bad images from earlier manual review (file names only)
known_bad = [
    "50x39_10747.jpg",   # the fence - real photo but no animal
    "4x4_5673.jpg",      # 4x4pixels - digital art
    "60x60_835.jpg",     # white only - digital art
    "82x159_9517.jpg",   # SAVE ALIVE image - real photo, text, no animal
    "75x80_8470.jpg",    # cat drawing - not a real photo but has a cat
    "85x95_4833.jpg",    # cat drawing - not a real photo but has a cat
    "90x162_1259.jpg",   # dog drawing with text
    "99x125_9188.jpg",   # dog drawing
    "100x93_7968.jpg",   # cat mail drawing with text
    "145x39_9171.jpg",   # rose drawing
    "194x83_2663.jpg",   # cat with stripes and babies feeding
    "196x33_4367.jpg",   # yahoo mail logo
]
# Note:
# "88x131_11184.jpg",      # drawing of a husky dog - need to add to the list later from this check
n_tiny = len(all_tiny)
n_known_bad = len(known_bad)
print(f"Total images in folder: {n_tiny}")
print(f"Known bad images: {n_known_bad}")
print()

# Classify every tiny image; each record is
# (file name, photo, animal, text, reason, is_known_bad).
kept = []
rejected = []

print("photo  animal  text   KEEP?   KNOWN_BAD?  reason                         file")
print("-" * 105)

for image_path in sorted(all_tiny):
    try:
        photo, animal, text = check_image_multi(image_path)
        keep, reason = should_keep(photo, animal, text)
        listed_bad = image_path.name in known_bad
        status = "KEEP" if keep else "REJECT"
        is_known_bad = "YES" if listed_bad else ""
        print(f"{photo:.2f}   {animal:.2f}    {text:.2f}   {status:6}  {is_known_bad:10}  {reason:30} {image_path.name}")

        record = (image_path.name, photo, animal, text, reason, listed_bad)
        bucket = kept if keep else rejected
        bucket.append(record)
    except Exception as exc:
        print(f"Error: {image_path.name} - {exc}")

# Summary banner
banner = "=" * 105
print()
print(banner)
print(f"KEPT: {len(kept)}  |  REJECTED: {len(rejected)}")
print(banner)

# Check for problems: the last field of each record is the known-bad flag
kept_but_known_bad = [record for record in kept if record[-1]]
rejected_known_bad = [record for record in rejected if record[-1]]

print(f"\n✓ Known bad images correctly rejected: {len(rejected_known_bad)}/{len(known_bad)}")
if kept_but_known_bad:
    print("\n PROBLEM: Known bad images that were KEPT:")
    for record in kept_but_known_bad:
        print(f"  {record[0]}")

print("\n--- KEPT IMAGES (check these manually for false negatives) ---")
for name, photo, animal, text, _reason, was_known_bad in kept:
    if was_known_bad:
        flag = " KNOWN BAD"
    else:
        flag = ""
    print(f"  {photo:.2f}  {animal:.2f}  {text:.2f}  {name}{flag}")

print("\n--- REJECTED IMAGES ---")
for name, _photo, _animal, _text, reason, was_known_bad in rejected:
    if was_known_bad:
        flag = " KNOWN_BAD"
    else:
        flag = ""
    print(f"  {reason:30} {name}{flag}")

Total images in folder: 180
Known bad images: 12

photo  animal  text   KEEP?   KNOWN_BAD?  reason                         file
---------------------------------------------------------------------------------------------------------
0.65   0.63    0.62   KEEP                real photo with animal         100x37_11248.jpg
0.95   0.93    0.78   KEEP                real photo with animal         100x63_7765.jpg
1.00   0.82    0.92   KEEP                real photo with animal         100x74_7374.jpg
0.99   0.83    0.82   KEEP                real photo with animal         100x74_7893.jpg
0.92   0.83    0.61   KEEP                real photo with animal         100x75_10385.jpg
0.92   0.74    0.31   KEEP                real photo with animal         100x75_10893.jpg
0.99   0.90    0.79   KEEP                real photo with animal         100x75_1631.jpg
0.89   0.82    0.09   KEEP                real photo with animal         100x75_4629.jpg
0.94   0.81    0.56   KEEP                real phot

## Updated results for the tiny images: false positives found after manually reviewing the good images

In [36]:
# Updated known bad list - adds the husky drawing found while reviewing the
# kept images. Entries are file names inside the tiny-images folder.
known_bad = [
    "50x39_10747.jpg",   # the fence - real photo but no animal
    "4x4_5673.jpg",      # 4x4pixels - digital art
    "60x60_835.jpg",     # white only - digital art
    "82x159_9517.jpg",   # SAVE ALIVE image - real photo, text, no animal
    "75x80_8470.jpg",    # cat drawing - not a real photo but has a cat
    "85x95_4833.jpg",    # cat drawing - not a real photo but has a cat
    "90x162_1259.jpg",   # dog drawing with text
    "99x125_9188.jpg",   # dog drawing
    "100x93_7968.jpg",   # cat mail drawing with text
    "145x39_9171.jpg",   # rose drawing
    "194x83_2663.jpg",   # cat with stripes and babies feeding
    "196x33_4367.jpg",   # yahoo mail logo
    "88x131_11184.jpg",  # drawing of a husky dog - NEWLY FOUND
]

# False positives - good images incorrectly rejected
# These are real photos of animals that we need to KEEP
# Each entry is a 4-tuple of strings; the summary code below filters on
# substrings of the second (algo_reason) and fourth (notes) fields, so
# rewording them changes the grouping.
false_positives = [
    # (filename, algo_reason, true_state, notes)
    
    # Problem: Low animal score (animal not recognized)
    ("120x90_7630.jpg", "no animal, has text", "real, animal, no text", "animal not recognized"),
    ("140x93_9589.jpg", "no animal, has text", "real, animal, text", "animal not recognized"),
    ("142x93_7610.jpg", "no animal, has text", "real, animal, no text", "animal not recognized"),
    ("144x86_10807.jpg", "no animal, has text", "real, animal, no text", "animal not recognized"),
    ("150x97_9703.jpg", "no animal, has text", "real, animal, no text", "animal not recognized"),
    ("183x92_11263.jpg", "no animal, has text", "real, animal, no text", "animal not recognized"),
    ("200x94_3250.jpg", "no animal, has text", "real, animal, no text", "cats not showing faces"),
    ("300x94_5773.jpg", "no animal, has text", "real, animal, no text", "cats with eyes closed"),
    ("95x76_4134.jpg", "no animal, has text", "real, animal, no text", "fence in the way of dog"),
    ("96x65_3074.jpg", "no animal", "real, animal, no text", "very small and blurry dog"),
    ("96x71_8087.jpg", "no animal, has text", "real, animal, no text", "cat with human hands, blurred, no face visible"),
    ("96x72_9456.jpg", "no animal, has text", "real, animal, no text", "dog face behind chain link fence"),
    ("60x33_6402.jpg", "no animal", "real, animal, no text", "very small cat head looking away, held by human"),
    ("60x70_7314.jpg", "no animal, has text", "real, animal, no text", "blurry small image with two cats"),
    
    # Problem: Low photo score (real photo not recognized as photo)
    ("50x50_10392.jpg", "not a real photo", "real, animal, no text", "blurry small cat head only"),
    ("60x36_5534.jpg", "not a real photo", "real, animal, no text", "very extra blurry"),
    ("60x40_4821.jpg", "not a real photo", "real, animal, no text", "very blurry"),
    ("60x41_2433.jpg", "not a real photo", "real, animal, no text", "very blurry and small"),
    
    # Problem: False text detection
    ("60x39_9705.jpg", "has text", "real, animal, text", "very blurry - but actually has text so this one is tricky"),
]

print(f"Known bad images: {len(known_bad)}")
print(f"False positives (good images wrongly rejected): {len(false_positives)}")

print("\n--- FALSE POSITIVES BY PROBLEM TYPE ---")

no_animal_problem = [x for x in false_positives if "animal not recognized" in x[3] or "no animal" in x[1]]
no_photo_problem = [x for x in false_positives if "not a real photo" in x[1]]
text_problem = [x for x in false_positives if "has text" in x[3]]

print(f"\nAnimal not detected ({len(no_animal_problem)} images):")
for f, algo, true, note in no_animal_problem:
    print(f"  {f:25} - {note}")

print(f"\nPhoto not detected ({len(no_photo_problem)} images):")
for f, algo, true, note in no_photo_problem:
    print(f"  {f:25} - {note}")

Known bad images: 13
False positives (good images wrongly rejected): 19

--- FALSE POSITIVES BY PROBLEM TYPE ---

Animal not detected (14 images):
  120x90_7630.jpg           - animal not recognized
  140x93_9589.jpg           - animal not recognized
  142x93_7610.jpg           - animal not recognized
  144x86_10807.jpg          - animal not recognized
  150x97_9703.jpg           - animal not recognized
  183x92_11263.jpg          - animal not recognized
  200x94_3250.jpg           - cats not showing faces
  300x94_5773.jpg           - cats with eyes closed
  95x76_4134.jpg            - fence in the way of dog
  96x65_3074.jpg            - very small and blurry dog
  96x71_8087.jpg            - cat with human hands, blurred, no face visible
  96x72_9456.jpg            - dog face behind chain link fence
  60x33_6402.jpg            - very small cat head looking away, held by human
  60x70_7314.jpg            - blurry small image with two cats

Photo not detected (4 images):
  50x50_10392

## Address failures on blurry/small/obscured animals

In [39]:
# Updated known bad list
# Filenames are resolved against tiny_folder by the evaluation loops in this
# cell; every image listed here is expected to be REJECTED by the rules.
known_bad = [
    "50x39_10747.jpg",   # the fence - real photo but no animal
    "4x4_5673.jpg",      # 4x4pixels - digital art
    "60x60_835.jpg",     # white only - digital art
    "82x159_9517.jpg",   # SAVE ALIVE image - real photo, text, no animal
    "75x80_8470.jpg",    # cat drawing
    "85x95_4833.jpg",    # cat drawing
    "90x162_1259.jpg",   # dog drawing with text
    "99x125_9188.jpg",   # dog drawing
    "100x93_7968.jpg",   # cat mail drawing with text
    "145x39_9171.jpg",   # rose drawing
    "194x83_2663.jpg",   # cat with stripes and babies feeding
    "196x33_4367.jpg",   # yahoo mail logo
    "88x131_11184.jpg",  # drawing of a husky dog - NEWLY FOUND
]

# False positives - good images incorrectly rejected
# Each entry is a 4-tuple: (filename, algo_reason, true_state, notes).
#   filename    - image file name, resolved against tiny_folder
#   algo_reason - why the earlier rules rejected the image
#   true_state  - hand-labeled description of what the image really is
#   notes       - free-text remark about why detection is hard
# Every image listed here is expected to be KEPT by the rules.
false_positives = [
    ("120x90_7630.jpg", "no animal, has text", "real, animal, no text", "animal not recognized"),
    ("140x93_9589.jpg", "no animal, has text", "real, animal, no text", "animal not recognized"),
    ("142x93_7610.jpg", "no animal, has text", "real, animal, no text", "animal not recognized"),
    ("144x86_10807.jpg", "no animal, has text", "real, animal, no text", "animal not recognized"),
    ("150x97_9703.jpg", "no animal, has text", "real, animal, no text", "animal not recognized"),
    ("183x92_11263.jpg", "no animal, has text", "real, animal, no text", "animal not recognized"),
    ("200x94_3250.jpg", "no animal, has text", "real, animal, no text", "cats not showing faces"),
    ("300x94_5773.jpg", "no animal, has text", "real, animal, no text", "cats with eyes closed"),
    ("95x76_4134.jpg", "no animal, has text", "real, animal, no text", "fence in the way of dog"),
    ("96x65_3074.jpg", "no animal", "real, animal, no text", "very small and blurry dog"),
    ("96x71_8087.jpg", "no animal, has text", "real, animal, no text", "cat with human hands, blurred"),
    ("96x72_9456.jpg", "no animal, has text", "real, animal, no text", "dog face behind chain link fence"),
    ("60x33_6402.jpg", "no animal", "real, animal, no text", "very small cat head looking away"),
    ("60x70_7314.jpg", "no animal, has text", "real, animal, no text", "blurry small image with two cats"),
    ("60x39_9705.jpg", "no animal, has text", "real, animal, no text", "very blurry image"),
    ("50x50_10392.jpg", "not a real photo", "real, animal, no text", "blurry small cat head only"),
    ("60x36_5534.jpg", "not a real photo", "real, animal, no text", "very extra blurry"),
    ("60x40_4821.jpg", "not a real photo", "real, animal, no text", "very blurry"),
    ("60x41_2433.jpg", "not a real photo", "real, animal, no text", "very blurry and small"),
]

# Thresholds read by should_keep_v2
PHOTO_THRESHOLD = 0.60          # photo score for a confident "real photo"
PHOTO_THRESHOLD_LENIENT = 0.40  # photo score floor; anything lower is rejected
ANIMAL_THRESHOLD = 0.50         # animal score for a confident "has animal"
ANIMAL_THRESHOLD_HIGH = 0.70    # animal score that can offset a borderline photo score
TEXT_THRESHOLD = 0.35           # text score at/above which the reject reason mentions text

def should_keep_v2(photo_score, animal_score, text_score):
    """Decide whether an image is kept, given its three scores.

    Rules are evaluated in order and the first match wins:
      1. photo >= PHOTO_THRESHOLD and animal >= ANIMAL_THRESHOLD -> keep
      2. animal >= ANIMAL_THRESHOLD_HIGH and photo >= PHOTO_THRESHOLD_LENIENT -> keep
      3. photo < PHOTO_THRESHOLD_LENIENT -> reject (regardless of animal score)
      4. animal < ANIMAL_THRESHOLD -> reject; the reason notes text when
         text >= TEXT_THRESHOLD
      5. anything else -> reject

    Returns:
        A ``(keep, reason)`` tuple: ``keep`` is a bool, ``reason`` a short string.
    """
    if photo_score >= PHOTO_THRESHOLD and animal_score >= ANIMAL_THRESHOLD:
        verdict = (True, "real photo with animal")
    elif animal_score >= ANIMAL_THRESHOLD_HIGH and photo_score >= PHOTO_THRESHOLD_LENIENT:
        verdict = (True, "high animal confidence, borderline photo")
    elif photo_score < PHOTO_THRESHOLD_LENIENT:
        verdict = (False, "not a real photo")
    elif animal_score < ANIMAL_THRESHOLD:
        # text_score only changes the wording of the rejection, not the outcome
        if text_score >= TEXT_THRESHOLD:
            verdict = (False, "real photo, no animal, has text")
        else:
            verdict = (False, "real photo but no animal")
    else:
        # Borderline photo score with a moderate (not high) animal score
        verdict = (False, "did not meet criteria")
    return verdict

# Compare the previous rules (should_keep) with should_keep_v2 on both labeled lists
banner = "=" * 90
divider = "-" * 90
columns = "photo  animal  text   OLD       NEW       file"

print(banner)
print("FALSE POSITIVES (should be KEEP)")
print(banner)
print(columns)
print(divider)

fixed_count = 0
still_wrong = 0
for filename, algo_reason, true_state, notes in false_positives:
    f = tiny_folder / filename
    if not f.exists():
        # Files missing from tiny_folder are skipped without being counted
        continue
    photo, animal, text = check_image_multi(f)
    old_keep, _old_reason = should_keep(photo, animal, text)
    new_keep, _new_reason = should_keep_v2(photo, animal, text)
    old_status = "KEEP" if old_keep else "REJECT"
    new_status = "KEEP" if new_keep else "REJECT"
    if not new_keep:
        changed = "<- STILL WRONG"
        still_wrong += 1
    elif not old_keep:
        changed = "<- FIXED"
        fixed_count += 1
    else:
        changed = ""
    row = f"{photo:.2f}   {animal:.2f}    {text:.2f}   {old_status:6}    {new_status:6}    {filename} {changed}"
    print(row)

print(f"\nFixed: {fixed_count}/{len(false_positives)} | Still wrong: {still_wrong}/{len(false_positives)}")

print("\n" + banner)
print("KNOWN BAD IMAGES (should be REJECT)")
print(banner)
print(columns)
print(divider)

broken_count = 0
for filename in known_bad:
    f = tiny_folder / filename
    if not f.exists():
        continue
    photo, animal, text = check_image_multi(f)
    old_keep, _old_reason = should_keep(photo, animal, text)
    new_keep, _new_reason = should_keep_v2(photo, animal, text)
    old_status = "KEEP" if old_keep else "REJECT"
    new_status = "KEEP" if new_keep else "REJECT"
    if new_keep:
        # A known bad image that the new rules would keep is a regression
        changed = "<- BROKEN!"
        broken_count += 1
    else:
        changed = ""
    row = f"{photo:.2f}   {animal:.2f}    {text:.2f}   {old_status:6}    {new_status:6}    {filename} {changed}"
    print(row)

print(f"\nBroken: {broken_count}/{len(known_bad)}")

print("\n" + banner)
print("SUMMARY")
print(banner)
print(f"False positives fixed:    {fixed_count}/{len(false_positives)}")
print(f"False positives remaining:{still_wrong}/{len(false_positives)}")
print(f"Bad images broken:        {broken_count}/{len(known_bad)}")

FALSE POSITIVES (should be KEEP)
photo  animal  text   OLD       NEW       file
------------------------------------------------------------------------------------------
1.00   0.29    0.76   REJECT    REJECT    120x90_7630.jpg <- STILL WRONG
0.99   0.35    0.71   REJECT    REJECT    140x93_9589.jpg <- STILL WRONG
0.98   0.46    0.45   REJECT    REJECT    142x93_7610.jpg <- STILL WRONG
0.97   0.44    0.70   REJECT    REJECT    144x86_10807.jpg <- STILL WRONG
0.97   0.43    0.71   REJECT    REJECT    150x97_9703.jpg <- STILL WRONG
0.97   0.33    0.58   REJECT    REJECT    183x92_11263.jpg <- STILL WRONG
0.99   0.39    0.79   REJECT    REJECT    200x94_3250.jpg <- STILL WRONG
0.98   0.30    0.41   REJECT    REJECT    300x94_5773.jpg <- STILL WRONG
0.98   0.44    0.80   REJECT    REJECT    95x76_4134.jpg <- STILL WRONG
0.93   0.33    0.09   REJECT    REJECT    96x65_3074.jpg <- STILL WRONG
0.98   0.47    0.44   REJECT    REJECT    96x71_8087.jpg <- STILL WRONG
0.91   0.41    0.69   REJEC

In [40]:
# Idea 1: Blur detection using Laplacian variance
# Low variance = blurry, High variance = sharp

def get_blur_score(filepath):
    """Return the variance of the Laplacian of the image at ``filepath``.

    The image is read as grayscale. Lower variance indicates a blurrier
    image, higher variance a sharper one. Returns 0 when the image cannot
    be loaded.
    """
    gray = cv2.imread(str(filepath), cv2.IMREAD_GRAYSCALE)
    if gray is not None:
        laplacian = cv2.Laplacian(gray, cv2.CV_64F)
        return laplacian.var()
    return 0

# Print blur scores next to the photo/animal/text scores for both labeled lists
# Short descriptions shown next to each known bad image
bad_descriptions = {
    "50x39_10747.jpg": "fence - real photo but no animal",
    "4x4_5673.jpg": "4x4 pixels - digital art",
    "60x60_835.jpg": "white only - digital art",
    "82x159_9517.jpg": "SAVE ALIVE label - real photo, text, no animal",
    "75x80_8470.jpg": "cat drawing",
    "85x95_4833.jpg": "cat drawing",
    "90x162_1259.jpg": "dog drawing with text",
    "99x125_9188.jpg": "dog drawing",
    "100x93_7968.jpg": "cat mail drawing with text",
    "145x39_9171.jpg": "rose drawing",
    "194x83_2663.jpg": "cat with stripes drawing",
    "196x33_4367.jpg": "yahoo mail logo",
    "88x131_11184.jpg": "husky dog drawing",
}

banner = "=" * 100
divider = "-" * 100

print(banner)
print("BLUR SCORES - FALSE POSITIVES (good images wrongly rejected)")
print(banner)
print("photo  animal  text   blur      file                      notes")
print(divider)

for filename, algo_reason, true_state, notes in false_positives:
    f = tiny_folder / filename
    if not f.exists():
        # Files missing from tiny_folder are skipped
        continue
    photo, animal, text = check_image_multi(f)
    blur = get_blur_score(f)
    print(f"{photo:.2f}   {animal:.2f}    {text:.2f}   {blur:7.1f}   {filename:25} {notes}")

print("\n" + banner)
print("BLUR SCORES - KNOWN BAD IMAGES (should stay rejected)")
print(banner)
print("photo  animal  text   blur      file                      description")
print(divider)

for filename in known_bad:
    f = tiny_folder / filename
    if not f.exists():
        continue
    photo, animal, text = check_image_multi(f)
    blur = get_blur_score(f)
    # Fall back to an empty description for files without an entry
    desc = bad_descriptions.get(filename, "")
    print(f"{photo:.2f}   {animal:.2f}    {text:.2f}   {blur:7.1f}   {filename:25} {desc}")

BLUR SCORES - FALSE POSITIVES (good images wrongly rejected)
photo  animal  text   blur      file                      notes
----------------------------------------------------------------------------------------------------
1.00   0.29    0.76     630.1   120x90_7630.jpg           animal not recognized
0.99   0.35    0.71    1078.9   140x93_9589.jpg           animal not recognized
0.98   0.46    0.45     632.2   142x93_7610.jpg           animal not recognized
0.97   0.44    0.70     340.3   144x86_10807.jpg          animal not recognized
0.97   0.43    0.71     185.8   150x97_9703.jpg           animal not recognized
0.97   0.33    0.58    1636.1   183x92_11263.jpg          animal not recognized
0.99   0.39    0.79      43.6   200x94_3250.jpg           cats not showing faces
0.98   0.30    0.41     222.3   300x94_5773.jpg           cats with eyes closed
0.98   0.44    0.80    2083.6   95x76_4134.jpg            fence in the way of dog
0.93   0.33    0.09    2705.5   96x65_3074.jpg     

In [None]:
# With the threshold changes noted below (ANIMAL_THRESHOLD_LENIENT 0.25 -> 0.15, BLUR_THRESHOLD 4000 -> 5500),
# all labeled false positives are kept and all known bad images stay rejected.

# New logic with blur detection - thresholds read by should_keep_v3
PHOTO_THRESHOLD = 0.60          # photo score for a confident "real photo"
PHOTO_THRESHOLD_LENIENT = 0.40  # photo score floor; anything lower is rejected
ANIMAL_THRESHOLD = 0.50         # animal score for a confident "has animal"
ANIMAL_THRESHOLD_LENIENT = 0.15  # for blurry photos where CLIP struggles - moved from 0.25 down to 0.15
TEXT_THRESHOLD = 0.35           # text score at/above which the reject reason mentions text
BLUR_THRESHOLD = 5500  # below this = likely blurry photo, not artwork - moved from 4000 up to 5500

def should_keep_v3(photo_score, animal_score, text_score, blur_score):
    """Decide whether an image is kept, relaxing the animal check for blurry images.

    An image counts as blurry when ``blur_score < BLUR_THRESHOLD``. Rules are
    evaluated in order and the first match wins:
      1. photo >= PHOTO_THRESHOLD and animal >= ANIMAL_THRESHOLD -> keep
      2. blurry and photo >= PHOTO_THRESHOLD_LENIENT:
           animal >= ANIMAL_THRESHOLD_LENIENT -> keep
           animal <  ANIMAL_THRESHOLD_LENIENT -> reject (reason notes text
           when text >= TEXT_THRESHOLD)
      3. photo < PHOTO_THRESHOLD_LENIENT -> reject
      4. animal < ANIMAL_THRESHOLD -> reject (reason notes text when
         text >= TEXT_THRESHOLD)
      5. anything else -> reject

    Returns:
        A ``(keep, reason)`` tuple: ``keep`` is a bool, ``reason`` a short string.
    """
    is_blurry = blur_score < BLUR_THRESHOLD

    # Rule 1: confident real photo with a confident animal
    if photo_score >= PHOTO_THRESHOLD and animal_score >= ANIMAL_THRESHOLD:
        return True, "real photo with animal"

    # Rule 2: blurry image with at least a borderline photo score gets the
    # lenient animal threshold
    if is_blurry and photo_score >= PHOTO_THRESHOLD_LENIENT:
        if animal_score >= ANIMAL_THRESHOLD_LENIENT:
            return True, "blurry photo with animal (lenient)"
        if animal_score < ANIMAL_THRESHOLD_LENIENT:
            blurry_reason = (
                "blurry photo, no animal, has text"
                if text_score >= TEXT_THRESHOLD
                else "blurry photo but no animal"
            )
            return False, blurry_reason

    # Rule 3: photo score below the lenient floor
    if photo_score < PHOTO_THRESHOLD_LENIENT:
        return False, "not a real photo"

    # Rule 4: photo score is acceptable but the animal score is too low
    if animal_score < ANIMAL_THRESHOLD:
        sharp_reason = (
            "real photo, no animal, has text"
            if text_score >= TEXT_THRESHOLD
            else "real photo but no animal"
        )
        return False, sharp_reason

    return False, "did not meet criteria"

# Compare the previous rules (should_keep) with should_keep_v3 on both labeled lists
banner = "=" * 110
divider = "-" * 110
columns = "photo  animal  text   blur      OLD       NEW       file"

print(banner)
print("FALSE POSITIVES (should be KEEP)")
print(banner)
print(columns)
print(divider)

fixed_count = 0
still_wrong = 0
for filename, algo_reason, true_state, notes in false_positives:
    f = tiny_folder / filename
    if not f.exists():
        # Files missing from tiny_folder are skipped without being counted
        continue
    photo, animal, text = check_image_multi(f)
    blur = get_blur_score(f)
    old_keep, _old_reason = should_keep(photo, animal, text)
    new_keep, new_reason = should_keep_v3(photo, animal, text, blur)
    old_status = "KEEP" if old_keep else "REJECT"
    new_status = "KEEP" if new_keep else "REJECT"
    if not new_keep:
        changed = f"<- STILL WRONG ({new_reason})"
        still_wrong += 1
    elif not old_keep:
        changed = "<- FIXED"
        fixed_count += 1
    else:
        changed = ""
    row = f"{photo:.2f}   {animal:.2f}    {text:.2f}   {blur:7.1f}   {old_status:6}    {new_status:6}    {filename} {changed}"
    print(row)

print(f"\nFixed: {fixed_count}/{len(false_positives)} | Still wrong: {still_wrong}/{len(false_positives)}")

print("\n" + banner)
print("KNOWN BAD IMAGES (should be REJECT)")
print(banner)
print(columns)
print(divider)

broken_count = 0
for filename in known_bad:
    f = tiny_folder / filename
    if not f.exists():
        continue
    photo, animal, text = check_image_multi(f)
    blur = get_blur_score(f)
    old_keep, _old_reason = should_keep(photo, animal, text)
    new_keep, new_reason = should_keep_v3(photo, animal, text, blur)
    old_status = "KEEP" if old_keep else "REJECT"
    new_status = "KEEP" if new_keep else "REJECT"
    if new_keep:
        # A known bad image that the new rules would keep is a regression
        changed = f"<- BROKEN! ({new_reason})"
        broken_count += 1
    else:
        changed = ""
    row = f"{photo:.2f}   {animal:.2f}    {text:.2f}   {blur:7.1f}   {old_status:6}    {new_status:6}    {filename} {changed}"
    print(row)

print(f"\nBroken: {broken_count}/{len(known_bad)}")

print("\n" + banner)
print("SUMMARY")
print(banner)
print(f"False positives fixed:     {fixed_count}/{len(false_positives)}")
print(f"False positives remaining: {still_wrong}/{len(false_positives)}")
print(f"Bad images broken:         {broken_count}/{len(known_bad)}")

FALSE POSITIVES (should be KEEP)
photo  animal  text   blur      OLD       NEW       file
--------------------------------------------------------------------------------------------------------------
1.00   0.29    0.76     630.1   REJECT    KEEP      120x90_7630.jpg <- FIXED
0.99   0.35    0.71    1078.9   REJECT    KEEP      140x93_9589.jpg <- FIXED
0.98   0.46    0.45     632.2   REJECT    KEEP      142x93_7610.jpg <- FIXED
0.97   0.44    0.70     340.3   REJECT    KEEP      144x86_10807.jpg <- FIXED
0.97   0.43    0.71     185.8   REJECT    KEEP      150x97_9703.jpg <- FIXED
0.97   0.33    0.58    1636.1   REJECT    KEEP      183x92_11263.jpg <- FIXED
0.99   0.39    0.79      43.6   REJECT    KEEP      200x94_3250.jpg <- FIXED
0.98   0.30    0.41     222.3   REJECT    KEEP      300x94_5773.jpg <- FIXED
0.98   0.44    0.80    2083.6   REJECT    KEEP      95x76_4134.jpg <- FIXED
0.93   0.33    0.09    2705.5   REJECT    KEEP      96x65_3074.jpg <- FIXED
0.98   0.47    0.44     570.8