In [None]:
# Duplicate detection: group near-duplicate images by perceptual hash, then move, rename, and optionally prune them.

import os
from PIL import Image, UnidentifiedImageError
import imagehash
import shutil

def find_duplicates(directory_path, hamming_threshold):
    """Group perceptually similar images found under ``directory_path``.

    Every image file is hashed with ``imagehash.phash`` and compared against
    the first image of each group found so far; it joins the first group
    whose hash differs by at most ``hamming_threshold`` bits, otherwise it
    starts a new group.

    WARNING: a file with an image extension that Pillow cannot identify is
    deleted from disk.

    Args:
        directory_path: Root directory to scan recursively.
        hamming_threshold: Largest hash difference (inclusive) at which two
            images are treated as duplicates.

    Returns:
        A list of groups; each group is a list of two or more image paths.
        Images without a duplicate are not included.
    """
    # NOTE: these markers are matched as substrings of the whole directory
    # path, not as exact folder names.
    skipped_markers = ('duplicates', 'collage_images', 'text_detected')
    image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')
    image_hashes = {}  # Hash of a group's first image -> paths in that group

    for root, dirs, files in os.walk(directory_path):
        if any(marker in root for marker in skipped_markers):
            # Every descendant path contains the same marker and would be
            # skipped as well, so do not bother walking into it.
            dirs.clear()
            continue

        for file in files:
            if not file.lower().endswith(image_extensions):
                continue

            image_path = os.path.join(root, file)
            try:
                # The context manager releases the file handle so the file
                # can be moved or deleted afterwards.
                with Image.open(image_path) as image:
                    image_hash = imagehash.phash(image)
            except UnidentifiedImageError:
                print(f"Cannot identify image file: {image_path}. Deleting file.")
                os.remove(image_path)  # Automatically delete the file
                continue
            except OSError as error:
                # Truncated or unreadable file: keep it on disk and keep
                # scanning instead of aborting the whole run.
                print(f"Cannot read image file: {image_path} ({error}). Skipping.")
                continue

            # Subtracting two hashes yields their Hamming distance.
            for existing_hash, existing_paths in image_hashes.items():
                if image_hash - existing_hash <= hamming_threshold:
                    existing_paths.append(image_path)
                    break
            else:
                # No group was close enough: this image starts a new one.
                image_hashes[image_hash] = [image_path]

    print("Duplicate detection result:")
    return [paths for paths in image_hashes.values() if len(paths) > 1]

def calculate_similarity(hamming_distance, max_distance):
    """Express a Hamming distance as a similarity percentage.

    Args:
        hamming_distance: Distance between the two hashes being compared.
        max_distance: Distance that corresponds to 0% similarity.

    Returns:
        ``(1 - hamming_distance / max_distance) * 100`` rounded to two
        decimal places.
    """
    distance_ratio = hamming_distance / max_distance
    return round((1 - distance_ratio) * 100, 2)

def _names_in(*folders):
    """Return the entry names currently present in the given folders."""
    names = []
    for folder in folders:
        if os.path.isdir(folder):
            names.extend(os.listdir(folder))
    return names


def move_and_rename_duplicates(duplicates):
    """Move each duplicate group into a 'duplicates' subfolder and rename it.

    The destination is the 'duplicates' folder next to the first file of the
    group. Files are renamed to
    ``<folder name>_DuplicateGroup<N>_<index><original extension>``.

    A group number is skipped when files carrying it already exist in the
    food folder or its 'duplicates' folder (left over from an earlier run),
    so previously processed files are never overwritten.

    Args:
        duplicates: List of groups, each a list of image paths.

    Returns:
        A list with one entry per non-empty group, holding the new paths of
        that group's files in their original order.
    """
    pair_counter = {}  # Next group number to try, per food folder name
    existing_names = {}  # Food folder -> names found there on first visit
    grouped_new_paths = []  # New paths grouped by duplicate group

    for group in duplicates:
        if not group:
            continue  # Nothing to move; also avoids indexing group[0]

        food_folder = os.path.dirname(group[0])
        food_name = os.path.basename(food_folder)  # Extract the food folder name
        duplicates_folder = os.path.join(food_folder, 'duplicates')

        # Ensure the duplicates folder exists
        if not os.path.exists(duplicates_folder):
            os.makedirs(duplicates_folder)
            print(f"Created 'duplicates' folder at: {duplicates_folder}")

        # Snapshot the names already in use once per folder. Files moved
        # during this call always get numbers below the running counter, so
        # the snapshot does not need refreshing.
        if food_folder not in existing_names:
            existing_names[food_folder] = _names_in(food_folder, duplicates_folder)
        taken = existing_names[food_folder]

        # Advance to the first group number no existing file is using.
        pair_number = pair_counter.get(food_name, 1)
        while any(
            name.startswith(f"{food_name}_DuplicateGroup{pair_number}_")
            for name in taken
        ):
            pair_number += 1

        # Move and rename each duplicate in the group
        new_group_paths = []
        for idx, dup in enumerate(group):
            file_name = f"{food_name}_DuplicateGroup{pair_number}_{idx + 1}{os.path.splitext(dup)[1]}"
            dest_path = os.path.join(duplicates_folder, file_name)
            shutil.move(dup, dest_path)
            print(f"Moved and renamed {dup} to {file_name}")
            new_group_paths.append(dest_path)

        grouped_new_paths.append(new_group_paths)

        # The next group in the same folder starts after this number.
        pair_counter[food_name] = pair_number + 1

    return grouped_new_paths

def _free_path(path):
    """Return ``path``, or a ``_kept<N>`` variant of it that does not exist yet."""
    if not os.path.lexists(path):
        return path
    stem, ext = os.path.splitext(path)
    counter = 1
    while os.path.lexists(f"{stem}_kept{counter}{ext}"):
        counter += 1
    return f"{stem}_kept{counter}{ext}"


def delete_smaller_duplicates_and_move_back(grouped_new_paths):
    """Ask for confirmation, then keep only the largest file of each group.

    When the user answers 'y' (case-insensitive), the largest file of every
    group is moved from its 'duplicates' folder to the parent folder and all
    other files of the group are deleted. Any other answer leaves everything
    untouched.

    If the parent folder already contains a file with the same name, the
    returned file gets a ``_kept<N>`` suffix instead of overwriting it.

    Args:
        grouped_new_paths: List of groups, each a list of file paths located
            inside a 'duplicates' folder.
    """
    choice = input("Do you want to delete all duplicates except the largest file? (y/n): ").strip().lower()
    if choice != 'y':
        print("No files were deleted.")
        return

    for group in grouped_new_paths:
        # Collect (path, size) for every file that still exists.
        group_sorted_by_size = []
        for file in group:
            try:
                group_sorted_by_size.append((file, os.path.getsize(file)))
            except FileNotFoundError:
                print(f"File not found: {file}. Skipping.")

        if not group_sorted_by_size:
            # If no valid files are found, skip the group
            print("No valid files found in this group. Skipping.")
            continue

        # Sort by file size in descending order; the sort is stable, so the
        # first of several equally large files wins.
        group_sorted_by_size.sort(key=lambda item: item[1], reverse=True)
        largest_file = group_sorted_by_size[0][0]

        # The file sits in <original folder>/duplicates/, so its original
        # folder is two levels up from the file itself.
        original_folder = os.path.dirname(os.path.dirname(largest_file))
        new_name = os.path.basename(largest_file)  # Keep the renamed file name
        # shutil.move would silently replace an existing file, e.g. one kept
        # by an earlier run under the same group name.
        new_path = _free_path(os.path.join(original_folder, new_name))
        shutil.move(largest_file, new_path)
        print(f"Moving largest file {largest_file} back to original folder: {new_path}")

        # Delete all smaller files
        for file, _ in group_sorted_by_size[1:]:
            try:
                print(f"Deleting smaller file: {file}")
                os.remove(file)
            except FileNotFoundError:
                print(f"File not found during deletion: {file}. It might have been deleted already.")


if __name__ == '__main__':
    # Largest Hamming distance at which two images still count as duplicates.
    hamming_threshold = 10
    # Point this at the directory that should be scanned for duplicate images.
    directory_path = r'C:/Users/YourUserName/Desktop/Food AI/images/Malaysian Dish/'

    if os.path.exists(directory_path):
        # Step 1: group images whose hashes fall within the threshold.
        duplicates = find_duplicates(directory_path, hamming_threshold)

        if not duplicates:
            print("No duplicates found.")
        else:
            # Step 2: collect every group in a 'duplicates' folder inside its
            # food item folder, under a systematic name.
            grouped_new_paths = move_and_rename_duplicates(duplicates)
            # Step 3: offer to keep only the largest file of each group.
            delete_smaller_duplicates_and_move_back(grouped_new_paths)
    else:
        print(f"Directory {directory_path} does not exist.")

