### The following function will look for duplicate images and removes them.

In [1]:
import os
import face_recognition

def remove_duplicate_images_according_face(folder_path, threshold=0.6):
    """
    Identifies and removes duplicate images from a specified folder based on facial similarity.

    Args:
        folder_path (str): Path to the folder containing images.
        threshold (float, optional): The distance threshold for considering two images as duplicates. Default is 0.6.
    """
    
    if not os.path.exists(folder_path):
        print(f"The folder '{folder_path}' does not exist.")
        return

    image_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
    encodings_seen = []

    for image_file in image_files:
        image_path = os.path.join(folder_path, image_file)

        try:
            # Load image and calculate encoding
            image = face_recognition.load_image_file(image_path)
            encodings = face_recognition.face_encodings(image)

            if not encodings:
                print(f"No face found in {image_file}. Skipping.")
                continue

            image_encoding = encodings[0]  # Assuming one face per image

        except Exception as e:
            print(f"Error processing {image_file}: {e}")
            continue

        # Compare this encoding with all seen encodings
        is_duplicate = False
        for seen_encoding in encodings_seen:
            distance = face_recognition.face_distance([seen_encoding], image_encoding)[0]
            if distance < threshold:
                print(f"Duplicate found: {image_file} is a duplicate.")
                is_duplicate = True
                os.remove(image_path)  # Remove the duplicate image
                break

        if not is_duplicate:
            encodings_seen.append(image_encoding)

    print("Duplicate image removal process is complete.")


In [2]:
folder_path = "assets/image_data/archive/pull"
remove_duplicate_images_according_face(folder_path)

Duplicate found: Akshay Kumar_49.jpg is a duplicate.
Duplicate found: Alexandra Daddario_3.jpg is a duplicate.
Duplicate found: Akshay Kumar_46.jpg is a duplicate.
Duplicate found: Akshay Kumar_45.jpg is a duplicate.
Duplicate found: Akshay Kumar_47.jpg is a duplicate.
Duplicate found: Alexandra Daddario_2.jpg is a duplicate.
Duplicate found: Alexandra Daddario_0 copy.jpg is a duplicate.
Duplicate found: Akshay Kumar_47 copy.jpg is a duplicate.
Duplicate found: Alexandra Daddario_3 copy.jpg is a duplicate.
Duplicate found: Akshay Kumar_49 copy.jpg is a duplicate.
Duplicate found: Alexandra Daddario_1.jpg is a duplicate.
Duplicate found: Akshay Kumar_48.jpg is a duplicate.
Duplicate found: Akshay Kumar_48 copy.jpg is a duplicate.
Duplicate found: Akshay Kumar_45 copy.jpg is a duplicate.
Duplicate found: Alexandra Daddario_1 copy.jpg is a duplicate.
Duplicate found: Alexandra Daddario_0.jpg is a duplicate.
Duplicate image removal process is complete.


In [4]:
!pip install imagehash

Defaulting to user installation because normal site-packages is not writeable
Collecting imagehash
  Downloading ImageHash-4.3.1-py2.py3-none-any.whl (296 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.5/296.5 KB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m[31m1.3 MB/s[0m eta [36m0:00:01[0m
Collecting PyWavelets
  Downloading pywavelets-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: PyWavelets, imagehash
Successfully installed PyWavelets-1.7.0 imagehash-4.3.1


In [5]:
import os
from PIL import Image
import imagehash

def remove_duplicate_images(folder_path):
    """
    Identifies and removes duplicate images from a specified folder based on image hash.

    Args:
        folder_path (str): Path to the folder containing images.
    """
    
    if not os.path.exists(folder_path):
        print(f"The folder '{folder_path}' does not exist.")
        return

    image_files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
    hashes_seen = {}

    for image_file in image_files:
        image_path = os.path.join(folder_path, image_file)

        try:
            # Load image and compute its hash
            with Image.open(image_path) as img:
                img_hash = imagehash.phash(img)  # Perceptual hash (pHash)

            # Check if this hash has already been seen
            if img_hash in hashes_seen:
                print(f"Duplicate found: {image_file} is a duplicate of {hashes_seen[img_hash]}")
                os.remove(image_path)  # Remove the duplicate image
            else:
                hashes_seen[img_hash] = image_file

        except Exception as e:
            print(f"Error processing {image_file}: {e}")
            continue

    print("Duplicate image removal process is complete.")


In [6]:
folder_path = "assets/image_data/archive/pull"
remove_duplicate_images(folder_path)

Duplicate found: Hrithik Roshan_97.jpg is a duplicate of Hrithik Roshan_97 copy.jpg
Duplicate found: Henry Cavill_2.jpg is a duplicate of Henry Cavill_2 copy.jpg
Duplicate found: Hrithik Roshan_99.jpg is a duplicate of Hrithik Roshan_99 copy.jpg
Duplicate found: Henry Cavill_0.jpg is a duplicate of Henry Cavill_0 copy.jpg
Duplicate found: Henry Cavill_3.jpg is a duplicate of Henry Cavill_3 copy.jpg
Duplicate found: Hrithik Roshan_100.jpg is a duplicate of Hrithik Roshan_100 copy.jpg
Duplicate found: Henry Cavill_4 copy.jpg is a duplicate of Henry Cavill_4.jpg
Duplicate found: Henry Cavill_1 copy.jpg is a duplicate of Henry Cavill_1.jpg
Duplicate found: Hrithik Roshan_98 copy.jpg is a duplicate of Hrithik Roshan_98.jpg
Duplicate image removal process is complete.
