In [None]:
import os
from PIL import Image
import imagehash
from tqdm import tqdm

In [None]:
# ==============================================================================
# --- Configuration ---
# ==============================================================================

# Root folder to scan for duplicates; every subfolder below it is searched too.
TARGET_FOLDER = (
    "/Users/natalyagrokh/AI/ml_expressions/img_datasets/"
    "flick_dataset_archive/flickr_curated"
)

# Maximum perceptual-hash distance at which two images count as duplicates.
# Lower values are stricter.
#   0   -> only images whose perceptual hashes are exactly equal match
#          (this compares hashes, not raw pixels or file bytes).
#   1   -> also catches copies with extremely minor, imperceptible differences.
#   2-3 -> a good balance for visually identical images that differ only in
#          compression or colour profile.
HASH_THRESHOLD = 1

In [None]:
# ==============================================================================
# --- Main Duplicate Removal Function ---
# ==============================================================================
def _collect_image_paths(root_folder):
    """Return a sorted list of image file paths found under ``root_folder``.

    The folder (after ``~`` expansion) is walked recursively. A file counts as
    an image when its lower-cased name ends with ``.png``, ``.jpg`` or ``.jpeg``.
    """
    # Every suffix carries its leading dot so names such as "archivejpeg" do not match.
    image_extensions = ('.png', '.jpg', '.jpeg')
    found = [
        os.path.join(dir_path, file_name)
        for dir_path, _dir_names, file_names in os.walk(os.path.expanduser(root_folder))
        for file_name in file_names
        if file_name.lower().endswith(image_extensions)
    ]
    # os.walk order depends on the filesystem; sorting makes the choice of
    # which copy is kept (the first one seen) reproducible between runs.
    found.sort()
    return found


def remove_perceptual_duplicates(target_folder, hash_threshold, dry_run=False):
    """
    Finds and deletes visually similar images in a folder and its subfolders.

    Each image is hashed with ``imagehash.phash``. An image is treated as a
    duplicate when its hash differs from the hash of an already-kept image by
    at most ``hash_threshold``; the first image seen (in sorted path order) is
    the one that is kept.

    Args:
        target_folder: Folder to scan recursively. ``~`` is expanded.
        hash_threshold: Maximum hash difference at which two images are
            considered duplicates. Lower is stricter.
        dry_run: When True, only list the files that would be removed and
            delete nothing. Defaults to False (files are deleted).

    Returns:
        None. Progress, warnings and results are printed.
    """
    print("--- Starting Perceptual Duplicate Removal ---")
    print(f"  Target Folder: {target_folder}")
    print(f"  Strictness Threshold: {hash_threshold}")

    image_paths = _collect_image_paths(target_folder)

    if not image_paths:
        print("  No images found in the specified folder.")
        return

    print(f"  Found {len(image_paths)} images to analyze.")

    kept_hashes = []           # hashes of the images that are being kept
    duplicates_to_remove = []  # paths that matched an already-kept image

    # Hash every image and compare it against the hashes kept so far.
    for file_path in tqdm(image_paths, desc="Hashing images"):
        try:
            with Image.open(file_path) as img:
                current_hash = imagehash.phash(img)

            # Subtracting two hashes yields their difference; a small
            # difference means the images look alike.
            if any((current_hash - kept) <= hash_threshold for kept in kept_hashes):
                duplicates_to_remove.append(file_path)
            else:
                kept_hashes.append(current_hash)
        except Exception as e:
            # Best effort: an unreadable or corrupt file is reported and skipped.
            print(f"\n  WARNING: Could not process {file_path}. Error: {e}")

    if not duplicates_to_remove:
        print("\n  No duplicate files were found.")
    elif dry_run:
        print(f"\n  Found {len(duplicates_to_remove)} duplicate images. Dry run: nothing will be deleted.")
        for dup_path in duplicates_to_remove:
            print(f"    - Would remove: {dup_path}")
    else:
        print(f"\n  Found {len(duplicates_to_remove)} duplicate images. Removing...")
        for dup_path in duplicates_to_remove:
            try:
                print(f"    - Removing: {os.path.basename(dup_path)}")
                os.remove(dup_path)
            except OSError as e:
                print(f"  Failed to remove duplicate {dup_path}: {e}")

    print("--- Duplicate Removal Complete ---")

In [None]:
# ==============================================================================
# --- Main Execution Block ---
# ==============================================================================

if __name__ == "__main__":
    # Expand "~" before validating so this check looks at the same path that
    # remove_perceptual_duplicates() walks (it expands the folder itself).
    if os.path.isdir(os.path.expanduser(TARGET_FOLDER)):
        remove_perceptual_duplicates(TARGET_FOLDER, hash_threshold=HASH_THRESHOLD)
    else:
        print(f"ERROR: The specified TARGET_FOLDER does not exist: {TARGET_FOLDER}")