In [1]:
!pip install imagehash

Collecting imagehash
  Downloading ImageHash-4.3.2-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting PyWavelets (from imagehash)
  Downloading pywavelets-1.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Downloading ImageHash-4.3.2-py2.py3-none-any.whl (296 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 296.7/296.7 kB 4.8 MB/s eta 0:00:00
Downloading pywavelets-1.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 4.5/4.5 MB 22.9 MB/s eta 0:00:00
Installing collected packages: PyWavelets, imagehash
Successfully installed PyWavelets-1.8.0 imagehash-4.3.2


In [4]:
import os
from pathlib import Path
import numpy as np
from PIL import Image
import imagehash
from collections import defaultdict
import pandas as pd

def _hash_image(img_path):
    """Open the image at ``img_path`` and return its average hash as a string."""
    with Image.open(img_path) as img:
        # Convert to RGB if necessary before hashing.
        rgb_img = img if img.mode == 'RGB' else img.convert('RGB')
        return str(imagehash.average_hash(rgb_img))


def check_duplicate_images(paths, splits=('train', 'valid', 'test'),
                           image_extensions=('.jpg', '.jpeg', '.png')):
    """
    Check for duplicate images across dataset splits using both filename
    and image content comparison.

    Each split directory is expected to contain one sub-directory per class;
    only files directly inside those class directories are examined.

    Args:
        paths (dict): Maps each split name to the path of its folder.
        splits (iterable of str): Split names to scan, in this order.
            Defaults to train, valid and test.
        image_extensions (iterable of str): File suffixes (case-insensitive)
            that identify image files.

    Returns:
        tuple: (filename_duplicates, content_duplicates)
            - filename_duplicates: dict mapping a filename to the list of
              locations (dicts with 'path', 'split', 'class') where it
              occurs, for filenames that occur more than once.
            - content_duplicates: dict mapping an average-hash string to the
              list of locations (dicts with 'path', 'split', 'class',
              'filename') sharing that hash, for hashes that occur more
              than once.

    Notes:
        A split whose directory is missing is reported on stdout and skipped,
        so an unmounted or mistyped dataset path is not mistaken for a clean
        "no duplicates" result. Images that cannot be opened or hashed are
        reported on stdout; they still take part in the filename comparison.
    """
    # Store all filenames
    filename_map = defaultdict(list)
    # Store image hashes
    hash_map = defaultdict(list)
    suffixes = tuple(ext.lower() for ext in image_extensions)

    for split in splits:
        split_dir = paths.get(split)
        if split_dir is None or not os.path.isdir(split_dir):
            print(f"Warning: split '{split}' directory not found, skipping: {split_dir}")
            continue

        # Sorted listings keep the order of reported locations stable
        # from one run to the next.
        for class_name in sorted(os.listdir(split_dir)):
            class_dir = os.path.join(split_dir, class_name)
            if not os.path.isdir(class_dir):
                continue

            for img_name in sorted(os.listdir(class_dir)):
                if not img_name.lower().endswith(suffixes):
                    continue

                img_path = os.path.join(class_dir, img_name)

                # Store full path and split info for filename check
                filename_map[img_name].append({
                    'path': img_path,
                    'split': split,
                    'class': class_name
                })

                # Calculate image hash for content comparison; one unreadable
                # file must not abort the whole scan.
                try:
                    img_hash = _hash_image(img_path)
                except Exception as e:
                    print(f"Error processing {img_path}: {e}")
                    continue

                hash_map[img_hash].append({
                    'path': img_path,
                    'split': split,
                    'class': class_name,
                    'filename': img_name
                })

    # Find duplicates by filename
    filename_duplicates = {
        filename: locations
        for filename, locations in filename_map.items()
        if len(locations) > 1
    }

    # Find duplicates by content
    content_duplicates = {
        hash_val: locations
        for hash_val, locations in hash_map.items()
        if len(locations) > 1
    }

    return filename_duplicates, content_duplicates

def print_duplicate_summary(filename_duplicates, content_duplicates):
    """Write a human-readable overview of the detected duplicates to stdout.

    Args:
        filename_duplicates (dict): filename -> list of location dicts, each
            providing 'split' and 'class'.
        content_duplicates (dict): hash string -> list of location dicts, each
            providing 'filename', 'split' and 'class'.
    """
    print("\n=== Duplicate Analysis Summary ===")

    # Section 1: files that share a name.
    print("\nDuplicates by filename:")
    if not filename_duplicates:
        print("No duplicate filenames found.")
    else:
        for name, hits in filename_duplicates.items():
            print(f"\nFilename: {name}")
            for hit in hits:
                print(f"- Found in {hit['split']}/{hit['class']}")

    # Section 2: files that share an image hash.
    print("\nDuplicates by content:")
    if not content_duplicates:
        print("No duplicate content found.")
    else:
        for digest, hits in content_duplicates.items():
            print(f"\nHash: {digest}")
            for hit in hits:
                print(f"- {hit['filename']} in {hit['split']}/{hit['class']}")

def generate_duplicate_report(filename_duplicates, content_duplicates):
    """Generate pandas DataFrames for detailed duplicate analysis.

    Args:
        filename_duplicates (dict): filename -> list of location dicts with
            'split', 'class' and 'path'.
        content_duplicates (dict): hash string -> list of location dicts with
            'filename', 'split', 'class' and 'path'.

    Returns:
        tuple: (filename_df, content_df), one row per duplicate location.
            filename_df has columns filename, split, class, full_path;
            content_df has columns hash, filename, split, class, full_path.
            The columns are present even when there are no rows, so callers
            can select columns without first checking ``.empty``.
    """
    filename_columns = ['filename', 'split', 'class', 'full_path']
    content_columns = ['hash', 'filename', 'split', 'class', 'full_path']

    # Flatten {filename: [locations]} into one record per location.
    filename_data = [
        {
            'filename': filename,
            'split': loc['split'],
            'class': loc['class'],
            'full_path': loc['path']
        }
        for filename, locations in filename_duplicates.items()
        for loc in locations
    ]

    # Flatten {hash: [locations]} into one record per location.
    content_data = [
        {
            'hash': hash_val,
            'filename': loc['filename'],
            'split': loc['split'],
            'class': loc['class'],
            'full_path': loc['path']
        }
        for hash_val, locations in content_duplicates.items()
        for loc in locations
    ]

    # Passing explicit columns fixes the schema for the empty case too.
    filename_df = pd.DataFrame(filename_data, columns=filename_columns)
    content_df = pd.DataFrame(content_data, columns=content_columns)

    return filename_df, content_df

def main(base_dir='/content/drive/MyDrive/CS471_AI/FinalProject/MangoDataset_Sorted_One'):
    """Run the duplicate check on a dataset and write CSV reports.

    Args:
        base_dir (str or Path): Dataset root expected to contain 'train',
            'valid' and 'test' sub-directories. Defaults to the project's
            Google Drive location.

    Returns:
        tuple: (filename_df, content_df) as produced by
            generate_duplicate_report.

    Side effects:
        Prints a summary to stdout. Writes 'duplicate_filenames_report.csv'
        and/or 'duplicate_content_report.csv' to the current working
        directory when the corresponding report is non-empty.
    """
    # Define base directory and paths
    base_dir = Path(base_dir)
    paths = {'base': base_dir}
    for split in ('valid', 'train', 'test'):
        paths[split] = os.path.join(str(base_dir), split)

    # A missing root (e.g. Drive not mounted) would otherwise look exactly
    # like a dataset without duplicates.
    if not base_dir.is_dir():
        print(f"Warning: dataset directory not found: {base_dir}")

    # Check for duplicates
    print("Checking for duplicate images...")
    filename_duplicates, content_duplicates = check_duplicate_images(paths)

    # Print summary
    print_duplicate_summary(filename_duplicates, content_duplicates)

    # Generate detailed report
    filename_df, content_df = generate_duplicate_report(filename_duplicates, content_duplicates)

    # Save reports if duplicates were found
    if not filename_df.empty:
        filename_df.to_csv('duplicate_filenames_report.csv', index=False)
        print("\nDuplicate filenames report saved to 'duplicate_filenames_report.csv'")

    if not content_df.empty:
        content_df.to_csv('duplicate_content_report.csv', index=False)
        print("\nDuplicate content report saved to 'duplicate_content_report.csv'")

    return filename_df, content_df

# Run the full duplicate check when this code executes as the top-level
# module. The two report DataFrames are bound to module-level names so they
# stay available for inspection after the run.
if __name__ == "__main__":
    filename_df, content_df = main()


Checking for duplicate images...

=== Duplicate Analysis Summary ===

Duplicates by filename:
No duplicate filenames found.

Duplicates by content:
No duplicate content found.
