In [1]:
import os
import hashlib
from PIL import Image
import shutil

def deduplicate_images(source_dir, clean_dir, duplicates_dir):
    """
    Finds and separates pixel-identical duplicate images from a source directory.

    The directory tree under ``source_dir`` is mirrored into both ``clean_dir``
    and ``duplicates_dir``. Every file is opened with PIL, converted to RGB and
    hashed over its dimensions plus its raw pixel bytes. The first file seen
    with a given hash is COPIED into ``clean_dir``; every later file with the
    same hash is MOVED out of ``source_dir`` into ``duplicates_dir``. Files that
    cannot be processed are reported, counted, and left where they are.

    Directories and files are visited in sorted order so the choice of which
    copy is kept as the "original" is the same on every run.

    Args:
        source_dir (str): The folder containing the original dataset (e.g., 'dataset_full').
        clean_dir (str): Where to save the unique images.
        duplicates_dir (str): Where to move the found duplicates.

    Returns:
        None. Progress and a final summary are printed.
    """
    os.makedirs(clean_dir, exist_ok=True)
    os.makedirs(duplicates_dir, exist_ok=True)

    # If an output folder lives inside source_dir it must not be walked,
    # otherwise freshly copied files would be re-scanned as duplicates.
    output_roots = {os.path.abspath(clean_dir), os.path.abspath(duplicates_dir)}

    hashes = {}  # content hash -> path of the first (kept) image with that hash
    total_files = 0
    duplicate_count = 0
    skipped_count = 0

    print(f"Starting de-duplication of '{source_dir}'...")

    # Walk through all subdirectories and files
    for dirpath, dirnames, filenames in os.walk(source_dir):
        # Editing dirnames in place steers os.walk: prune output folders and
        # fix the traversal order.
        dirnames[:] = sorted(
            d for d in dirnames
            if os.path.abspath(os.path.join(dirpath, d)) not in output_roots
        )

        # Create corresponding subdirectories in clean and duplicates folders
        rel_dir = os.path.relpath(dirpath, source_dir)
        structure_clean = os.path.join(clean_dir, rel_dir)
        structure_duplicates = os.path.join(duplicates_dir, rel_dir)
        os.makedirs(structure_clean, exist_ok=True)
        os.makedirs(structure_duplicates, exist_ok=True)

        for filename in sorted(filenames):
            total_files += 1
            path = os.path.join(dirpath, filename)
            try:
                with Image.open(path) as im:
                    rgb = im.convert("RGB")
                    # tobytes() carries no shape information, so the size is
                    # hashed too; otherwise e.g. a 2x3 and a 3x2 image with
                    # the same byte stream would be treated as duplicates.
                    hasher = hashlib.md5(("%dx%d:" % rgb.size).encode("ascii"))
                    hasher.update(rgb.tobytes())
                    file_hash = hasher.hexdigest()

                # The image is closed at this point, so moving it is safe.
                # If we've seen this hash before, it's a duplicate
                if file_hash in hashes:
                    print(f"  DUPLICATE: '{path}' is a copy of '{hashes[file_hash]}'")
                    shutil.move(path, os.path.join(structure_duplicates, filename))
                    duplicate_count += 1
                else:
                    # Copy first and register afterwards: if the copy fails,
                    # the hash must not be recorded, or later copies would be
                    # moved away while clean_dir holds no version at all.
                    shutil.copy2(path, os.path.join(structure_clean, filename))
                    hashes[file_hash] = path

            except Exception as e:
                # Best effort: non-image or unreadable files are reported and skipped.
                skipped_count += 1
                print(f"Could not process {path}: {e}")

    print("\n--- De-duplication Complete ---")
    print(f"Total files scanned: {total_files}")
    print(f"Unique images found: {len(hashes)}")
    print(f"Duplicates moved: {duplicate_count}")
    print(f"Files skipped (could not be processed): {skipped_count}")
    print(f"Clean dataset saved to: '{clean_dir}'")

# --- CONFIGURE AND RUN ---
# Input: the raw dataset, duplicates included.
source_dataset_folder = "Diabaticeye"

# Output: receives one copy of every unique image.
clean_dataset_folder = "dataset_clean"

# Output: receives the duplicates so they can be inspected afterwards.
duplicates_folder = "dataset_duplicates"

deduplicate_images(
    source_dir=source_dataset_folder,
    clean_dir=clean_dataset_folder,
    duplicates_dir=duplicates_folder,
)

Starting de-duplication of 'Diabaticeye'...
  DUPLICATE: 'Diabaticeye\Mild\19e350c7c83c.png' is a copy of 'Diabaticeye\Mild\19722bff5a09.png'
  DUPLICATE: 'Diabaticeye\Mild\3044022c6969.png' is a copy of 'Diabaticeye\Mild\04ac765f91a1.png'
  DUPLICATE: 'Diabaticeye\Mild\33105f9b3a04.png' is a copy of 'Diabaticeye\Mild\111898ab463d.png'
  DUPLICATE: 'Diabaticeye\Mild\595446774178.png' is a copy of 'Diabaticeye\Mild\36041171f441.png'
  DUPLICATE: 'Diabaticeye\Mild\63a03880939c.png' is a copy of 'Diabaticeye\Mild\05a5183c92d0.png'
  DUPLICATE: 'Diabaticeye\Mild\6b00cb764237.png' is a copy of 'Diabaticeye\Mild\64678182d8a8.png'
  DUPLICATE: 'Diabaticeye\Mild\77a9538b8362.png' is a copy of 'Diabaticeye\Mild\40e9b5630438.png'
  DUPLICATE: 'Diabaticeye\Mild\79ce83c07588.png' is a copy of 'Diabaticeye\Mild\71c1a3cdbe47.png'
  DUPLICATE: 'Diabaticeye\Mild\7bf981d9c7fe.png' is a copy of 'Diabaticeye\Mild\36677b70b1ef.png'
  DUPLICATE: 'Diabaticeye\Mild\94372043d55b.png' is a copy of 'Diabaticeye

In [2]:
import splitfolders

# Source: the single combined dataset folder that gets partitioned.
input_folder = "dataset_clean"

# Destination: where the partitioned dataset is written.
output_folder = "dataset_split"

# Partition into 80% training / 10% validation / 10% testing.
# NOTE(review): the split is expected to be disjoint and per-class without an
# explicit 'stratify' option - confirm against the split-folders documentation.
splitfolders.ratio(
    input_folder,
    ratio=(0.8, 0.1, 0.1),  # train, validation, test
    seed=42,                # fixed seed -> same split on every run
    output=output_folder,
)

print("Dataset has been split successfully!")

Copying files: 3534 files [00:01, 3310.48 files/s]

Dataset has been split successfully!





In [3]:
import os

def find_overlap(train_dir, test_dir):
    """
    Report files in ``test_dir`` whose name also occurs in ``train_dir``.

    Only bare filenames are compared; the sub-folder a file sits in and its
    contents are ignored. Each hit is printed as it is found.

    Args:
        train_dir (str): Root of the training split (walked recursively).
        test_dir (str): Root of the test split (walked recursively).

    Returns:
        list[str]: The overlapping filenames, one entry per test file that
        matched, in traversal order. Empty when there is no overlap, so the
        result can be used directly in an ``assert not find_overlap(...)``.
    """
    train_names = {
        name
        for _, _, files in os.walk(train_dir)
        for name in files
    }

    overlap = []
    for _, dirnames, files in os.walk(test_dir):
        dirnames.sort()  # in-place sort makes the traversal order deterministic
        for name in sorted(files):
            if name in train_names:
                print("Exact filename overlap:", name)
                overlap.append(name)
    return overlap
find_overlap(
    train_dir="dataset_split/train",
    test_dir="dataset_split/test",
)


In [4]:
from PIL import Image
import os
import hashlib

def img_hash(path):
    """
    Return a hex digest identifying the decoded RGB content of an image file.

    The image is opened with PIL and converted to RGB; the MD5 digest covers
    the image dimensions followed by the raw pixel bytes. The dimensions are
    included because ``tobytes()`` carries no shape information, so two images
    of different size but identical byte streams would otherwise collide.

    MD5 is used here purely as a content fingerprint, not for security.

    Args:
        path (str): Path of the image file to hash.

    Returns:
        str: 32-character hexadecimal digest.

    Raises:
        Whatever ``Image.open`` / ``convert`` raise for unreadable files.
    """
    with Image.open(path) as im:
        rgb = im.convert("RGB")
        size_tag = "%dx%d:" % rgb.size
        data = rgb.tobytes()
    digest = hashlib.md5(size_tag.encode("ascii"))
    digest.update(data)
    return digest.hexdigest()

def _try_img_hash(path):
    """Hash one image via img_hash; report and return None if it cannot be hashed."""
    try:
        return img_hash(path)
    except Exception as exc:
        # Best effort, but never silent: a check that skipped files without
        # saying so could report "no duplicates" having compared nothing.
        print(f"Could not process {path}: {exc}")
        return None


def find_pixel_duplicates(train_dir, test_dir):
    """
    Report test images whose content hash matches a training image.

    Every file under ``train_dir`` is hashed with ``img_hash`` and indexed by
    hash; every file under ``test_dir`` is then hashed and looked up in that
    index. Each hit is printed together with up to three matching training
    paths. Files that cannot be hashed are reported and skipped.

    Args:
        train_dir (str): Root of the training split (walked recursively).
        test_dir (str): Root of the test split (walked recursively).

    Returns:
        list[tuple[str, list[str]]]: One ``(test_path, train_paths)`` entry per
        test file that matched, in traversal order. Empty when nothing matched.
    """
    train_hashes = {}  # hash -> every training path with that hash
    for root, dirnames, files in os.walk(train_dir):
        dirnames.sort()  # in-place sort makes the traversal order deterministic
        for f in sorted(files):
            p = os.path.join(root, f)
            h = _try_img_hash(p)
            if h is not None:
                train_hashes.setdefault(h, []).append(p)

    matches = []
    for root, dirnames, files in os.walk(test_dir):
        dirnames.sort()
        for f in sorted(files):
            p = os.path.join(root, f)
            h = _try_img_hash(p)
            if h is not None and h in train_hashes:
                print("Pixel duplicate found:", p, "matches", train_hashes[h][:3])
                matches.append((p, list(train_hashes[h])))
    return matches

find_pixel_duplicates(
    train_dir="dataset_split/train",
    test_dir="dataset_split/test",
)
