## Description
This script detects and removes duplicate images in a dataset organized by class folders. It uses image hashing (PIL, hashlib) and parallel processing (ThreadPoolExecutor) to optimize performance.

## Library Imports
Loads libraries for file handling, image processing, hashing, and multithreading.

In [None]:
import os
import hashlib
from PIL import Image
from concurrent.futures import ThreadPoolExecutor

## Hash Calculation
Normalizes images and computes an MD5 hash to uniquely identify content.

In [None]:
def calculate_hash(image_path):
    try:
        with Image.open(image_path) as img:
            img = img.resize((256, 256)).convert("RGB")  # Normalize size and mode
            hash_md5 = hashlib.md5(img.tobytes()).hexdigest()
        return hash_md5
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return None

## Duplicate Handling
Functions to detect (find_duplicates_in_class) and delete (delete_duplicates) duplicate images, and manage per-class processing (process_class_folder).

In [None]:
def find_duplicates_in_class(class_folder):
    
    image_hashes = {}
    duplicates = []

    for root, _, files in os.walk(class_folder):
        for file in files:
            image_path = os.path.join(root, file)
            file_hash = calculate_hash(image_path)

            if file_hash:
                if file_hash in image_hashes:
                    duplicates.append(image_path)
                else:
                    image_hashes[file_hash] = image_path

    return duplicates

def delete_duplicates(duplicates):
    
    deleted_count = 0
    for file_path in duplicates:
        try:
            os.remove(file_path)
            deleted_count += 1
        except Exception as e:
            print(f"Failed to delete {file_path}: {e}")
    return deleted_count

def process_class_folder(class_name, dataset_path):
  
    class_folder = os.path.join(dataset_path, class_name)
    print(f"Processing class: {class_name}")
    duplicates = find_duplicates_in_class(class_folder)
    duplicate_count = len(duplicates)

    print(f"Found {duplicate_count} duplicate images in class '{class_name}'.")
    if duplicate_count > 0:
        deleted_count = delete_duplicates(duplicates)
        print(f"Deleted {deleted_count} duplicate images in class '{class_name}'.")
    else:
        deleted_count = 0

    return class_name, duplicate_count, deleted_count

## Main Execution
Uses multithreading to process class folders in parallel and summarizes results.

In [1]:
def main():
    dataset_path = r"C:\Users\jamee\Downloads\HealthyDataset"  # Path to your dataset folder
    class_folders = os.listdir(dataset_path)

    total_duplicates = 0
    total_deleted = 0

    # Use ThreadPoolExecutor to process classes in parallel
    with ThreadPoolExecutor() as executor:
        futures = []
        for class_name in class_folders:
            futures.append(executor.submit(process_class_folder, class_name, dataset_path))

        for future in futures:
            class_name, duplicate_count, deleted_count = future.result()
            total_duplicates += duplicate_count
            total_deleted += deleted_count
            print(f"Class '{class_name}' completed: {duplicate_count} duplicates, {deleted_count} deleted.")

    print("\nSummary:")
    print(f"Total duplicate images found: {total_duplicates}")
    print(f"Total duplicate images deleted: {total_deleted}")

if __name__ == "__main__":
    main()

Processing class: 0.jpg
Found 0 duplicate images in class '0.jpg'.
Processing class: 1.jpg
Found 0 duplicate images in class '1.jpg'.
Processing class: 10.jpg
Found 0 duplicate images in class '10.jpg'.
Processing class: 100.jpg
Processing class: 1000.jpg
Found 0 duplicate images in class '100.jpg'.
Processing class: 1001.jpg
Processing class: 1002.jpg
Found 0 duplicate images in class '1000.jpg'.
Found 0 duplicate images in class '1001.jpg'.
Found 0 duplicate images in class '1002.jpg'.
Processing class: 1003.jpg
Processing class: 1004.jpg
Processing class: 1005.jpg
Found 0 duplicate images in class '1003.jpg'.
Processing class: 1006.jpg
Found 0 duplicate images in class '1004.jpg'.
Found 0 duplicate images in class '1005.jpg'.
Found 0 duplicate images in class '1006.jpg'.
Processing class: 1007.jpg
Processing class: 1008.jpg
Processing class: 1009.jpg
Processing class: 101.jpg
Found 0 duplicate images in class '1007.jpg'.
Processing class: 1010.jpg
Found 0 duplicate images in class '