In [7]:
import hashlib
import os

In [8]:
def _file_sha256(path, chunk_size=65536):
    """Return the SHA-256 hex digest of the file at ``path``, read in ``chunk_size``-byte pieces."""
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        # Stream the content so a large file is never held in memory as a whole.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)
    return digest.hexdigest()


def remove_duplicates(target_folder):
    """
    Identify and remove duplicate files in the target folder based on file size and content hash.

    Every file under ``target_folder`` (all subdirectories, any extension) is
    keyed by ``(size in bytes, SHA-256 of content)``. The first file seen with a
    given key is kept; every later file with the same key is deleted from disk.
    Directories and file names are visited in sorted order, so the copy that
    survives is the same on every run.

    Files that cannot be read are reported and skipped; files that cannot be
    deleted are reported and left in place. Progress is printed to stdout.

    Args:
        target_folder (str): Path to the target folder where duplicates will be removed.

    Returns:
        None
    """
    seen_keys = set()
    duplicates = []

    # Walk through all subdirectories in the target folder
    for root, dirs, files in os.walk(target_folder):
        # Sorting in place fixes the traversal order of os.walk (top-down mode),
        # which is otherwise whatever order the filesystem returns.
        dirs.sort()
        for file in sorted(files):
            file_path = os.path.join(root, file)
            try:
                # A cryptographic digest is required here: the built-in hash() is a
                # short, per-process-salted hash, and a collision would delete a
                # file that is not actually a duplicate.
                file_key = (os.path.getsize(file_path), _file_sha256(file_path))
            except OSError as e:
                print(f"Failed to process {file_path}: {e}")
                continue

            if file_key in seen_keys:
                duplicates.append(file_path)
            else:
                seen_keys.add(file_key)

    # Remove duplicates
    if duplicates:
        print("\nRemoving duplicate files:")
        for dup in duplicates:
            print(f"Removing duplicate: {dup}")
            try:
                os.remove(dup)
            except OSError as e:
                print(f"Failed to remove {dup}: {e}")
    else:
        print("\nNo duplicate files found.")

In [9]:
# Dataset directory to deduplicate.
# NOTE: running this cell deletes files on disk under TARGET_FOLDER.
TARGET_FOLDER = "/Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/combined_datasets_small"
remove_duplicates(target_folder=TARGET_FOLDER)


Removing duplicate files:
Removing duplicate: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/combined_datasets_small/0 (118).jpg_face1.jpg
Removing duplicate: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/combined_datasets_small/0 (1264).jpg_face1.jpg
Removing duplicate: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/combined_datasets_small/0 (1294).jpg_face1.jpg
Removing duplicate: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/combined_datasets_small/0 (1322).jpg_face1.jpg
Removing duplicate: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/combined_datasets_small/0 (1330).jpg_face1.jpg
Removing duplicate: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/combined_datasets_small/0 (1345).jpg_face1.jpg
Removing duplicate: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/combined_datasets_small/0 (1467).jpg_face1.jpg
Removing duplicate: /Volumes/JavaAOT/Documents/AI/ml_expressions/img_datasets/combined_datasets_sma