In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import requests
import io
from datetime import datetime as dt
from PIL import Image
import time
import os
from urllib.parse import quote_plus
import subprocess
import re

# Download the driver from the ChromeDriver website for the relevant OS i.e. MAC, Windows, Debian, etc.
# PATH below is a placeholder: point it at the chromedriver executable on this machine.
PATH = r'C:/path/to/your/chromedriver.exe'
service = Service(executable_path=PATH)

# Initialize the WebDriver with the Service object.
# NOTE: `wd` is created at cell level (not inside a function), so it exists as soon
# as this cell runs; the scraping script further down passes it to
# get_images_from_google() and calls wd.quit() when it is done.
wd = webdriver.Chrome(service=service)

def get_images_from_google(wd, delay, max_images, url):
    """Collect image URLs from a Google Images results page.

    The page at ``url`` is opened in ``wd``. Result thumbnails (elements with
    class ``mNsIhb``) are clicked one at a time, and after each click the
    ``src`` of every element with class ``sFlh5c`` is inspected.

    Args:
        wd: Selenium WebDriver, or any object exposing ``get``,
            ``execute_script`` and ``find_elements``.
        delay: Seconds to sleep after each scroll and after each click.
        max_images: Target number of URLs. A click whose first inspected
            ``src`` is already known is counted as a skip and does not use up
            the quota. One click may add several URLs, so the result can
            exceed this number.
        url: Search-results URL to open.

    Returns:
        set[str]: Collected ``src`` values that contain ``'http'``.
        The set may hold fewer than ``max_images`` entries when the page runs
        out of usable thumbnails.
    """
    # Consecutive scroll passes without any progress before giving up. Without
    # this guard the loop never ends once the page stops yielding thumbnails.
    max_stalled_passes = 3

    def scroll_down(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(delay)

    wd.get(url)

    image_urls = set()
    skips = 0
    limit = max_images  # grows by one per skip; the argument itself is left untouched
    stalled_passes = 0
    while len(image_urls) + skips < limit and stalled_passes < max_stalled_passes:
        progress_before = len(image_urls) + skips
        scroll_down(wd)
        thumbnails = wd.find_elements(By.CLASS_NAME, "mNsIhb")

        for img in thumbnails[len(image_urls) + skips:limit]:
            try:
                img.click()
                time.sleep(delay)
            except Exception:
                # Best effort: a thumbnail that cannot be clicked is ignored.
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                continue

            for image in wd.find_elements(By.CLASS_NAME, "sFlh5c"):
                src = image.get_attribute('src')  # read once; it can change while the page loads
                if src in image_urls:
                    # Already collected: count a skip and move to the next thumbnail.
                    limit += 1
                    skips += 1
                    break

                if src and 'http' in src:
                    image_urls.add(src)
                    ##print(f"Found {len(image_urls)}")

        if len(image_urls) + skips == progress_before:
            stalled_passes += 1
        else:
            stalled_passes = 0

    return image_urls

def download_image(down_path, url, file_name, image_type='JPEG', verbose=True, timeout=30):
    """Download one image, re-encode it with Pillow and write it to disk.

    Args:
        down_path: Destination prefix; ``file_name`` is appended to it verbatim,
            so a directory must end with a path separator.
        url: Address of the image to fetch.
        file_name: Name of the output file (appended to ``down_path``).
        image_type: Pillow format name used for encoding (default ``'JPEG'``).
        verbose: When true, print a confirmation line after a successful save.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        None. Any failure is reported with ``print`` and not raised, so one bad
        URL does not stop a batch of downloads.
    """
    try:
        started_at = dt.now()  # not named `time`, which would shadow the time module
        curr_time = started_at.strftime('%H:%M:%S')

        # Fetch the raw bytes; fail early on hung connections and HTTP errors
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()

        # Decode the bytes in memory with Pillow
        image = Image.open(io.BytesIO(response.content))

        # Pillow refuses to write modes such as RGBA or P (palette) as JPEG;
        # convert those to RGB so PNG/GIF/WebP sources can still be saved.
        jpeg_modes = ('1', 'L', 'RGB', 'RGBX', 'CMYK', 'YCbCr')
        if (image_type or '').upper() == 'JPEG' and image.mode not in jpeg_modes:
            image = image.convert('RGB')

        file_pth = down_path + file_name

        # Encode into memory first so a failed save never leaves a 0-byte file behind
        encoded = io.BytesIO()
        image.save(encoded, image_type)
        with open(file_pth, 'wb') as file:
            file.write(encoded.getvalue())

        if verbose:
            print(f'The image: {file_pth} downloaded successfully at {curr_time}.')
    except Exception as e:
        print(f'Unable to download image from Google Photos due to\n: {str(e)}')

def find_duplicates(directory_path):
    """Run the external ``find-dups`` command-line tool on ``directory_path``.

    The tool is invoked with ``--algorithm phash --on-equal delete-first
    --parallel 4``. The flags are passed through unchanged; presumably
    ``delete-first`` makes the tool delete files itself, so verify this
    against the tool's documentation before pointing it at valuable data.

    Args:
        directory_path: Directory handed to ``find-dups``.

    Returns:
        subprocess.CompletedProcess for the finished command, or ``None`` when
        the ``find-dups`` executable cannot be found. A missing tool is
        reported and skipped so it does not abort the calling scrape loop.
    """
    command = [
        'find-dups',
        directory_path,
        '--algorithm', 'phash',
        '--on-equal', 'delete-first',
        '--parallel', '4'
    ]
    try:
        return subprocess.run(command)
    except FileNotFoundError:
        print("Skipping duplicate removal: the 'find-dups' command was not found.")
        return None

if __name__ == '__main__':
    # The whole run is wrapped in try/finally so the browser session held by
    # `wd` is closed even when input is interrupted or scraping raises.
    try:
        # Ask user for the list of foods they want to search, separated by commas
        raw_names = input("Enter the names of the foods you want to search for, separated by commas: ").split(',')
        # Remove leading/trailing whitespace and drop empty entries (e.g. a trailing comma)
        food_list = [food.strip() for food in raw_names if food.strip()]

        # Loop to ensure valid integer input for the maximum number of images to scrape
        while True:
            try:
                max_images = int(input("Enter the maximum number of images to scrape for each food item: "))
                break  # Exit the loop if a valid integer is entered
            except ValueError:
                print("Invalid input. Please enter a valid integer for the number of images.")

        # Get the current date in the format YYYY-MM-DD
        current_date = dt.now().strftime('%Y-%m-%d')
        base_directory = f'C:/Users/YourUserName/Desktop/Food AI/images/Malaysian Dish/{current_date}/'

        # Make the base directory for the current date if it doesn't exist
        if not os.path.exists(base_directory):
            print(f'Making base directory for the current date: {base_directory}')
            os.makedirs(base_directory)

        for food_name in food_list:
            # Directory to save the images, named after the food
            # (characters that are invalid in Windows file names are removed)
            sanitized_name = re.sub(r'[\\/:*?"<>|]', '', food_name)
            if not sanitized_name.strip():
                # An empty folder name would resolve to base_directory itself, and the
                # de-duplication step would then run over every food's images.
                print(f'Skipping {food_name!r}: it contains no characters usable in a directory name.')
                continue
            save_directory = os.path.join(base_directory, sanitized_name)

            # URL encode the food name for the Google search URL
            search_query = quote_plus(food_name)
            google_url = f"https://www.google.com/search?q={search_query}&tbm=isch"

            # Make the directory for the specific food if it doesn't exist
            if not os.path.exists(save_directory):
                print(f'Making directory for {food_name}: {save_directory}')
                os.makedirs(save_directory)

            # Scrape images from Google
            urls = get_images_from_google(wd, delay=0.5, max_images=max_images, url=google_url)

            # Download the images, numbering the files 1.jpg, 2.jpg, ...
            for i, url in enumerate(urls):
                download_image(down_path=save_directory + '/',
                               url=url,
                               file_name=str(i+1) + '.jpg',
                               verbose=True)

            # Find and handle duplicates
            find_duplicates(save_directory)
    finally:
        # Close the webdriver
        wd.quit()


In [5]:
import os
from PIL import Image, UnidentifiedImageError
import imagehash

def find_duplicates(directory_path, hamming_threshold):
    """Scan a directory tree for near-duplicate images using perceptual hashes.

    Every file under ``directory_path`` with an image extension is hashed with
    ``imagehash.phash``. An image is reported as a duplicate of the first
    previously kept image whose hash differs by at most ``hamming_threshold``;
    otherwise it is kept and compared against later files.

    Side effect: a file that Pillow cannot identify as an image is deleted.

    Args:
        directory_path: Root directory, walked recursively.
        hamming_threshold: Largest Hamming distance between two hashes that
            still counts as a duplicate.

    Returns:
        list[tuple[str, str]]: ``(kept_path, duplicate_path)`` pairs in the
        order they were found.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')
    image_hashes = {}  # hash of each kept image -> its path
    duplicates = []    # (kept_path, duplicate_path) pairs
    distances = []     # Hamming distance for the pair at the same index in `duplicates`

    # Calculate image hashes for each image in the directory
    for root, _, files in os.walk(directory_path):
        for file in files:
            if not file.lower().endswith(image_extensions):
                continue

            image_path = os.path.join(root, file)
            try:
                # The context manager closes the file handle once the hash is computed,
                # so the file can be deleted afterwards.
                with Image.open(image_path) as image:
                    image_hash = imagehash.phash(image)
            except UnidentifiedImageError:
                print(f"Cannot identify image file: {image_path}. Deleting file.")
                os.remove(image_path)  # Automatically delete the file
                continue

            # Check for duplicates using Hamming distance
            for existing_hash, existing_path in image_hashes.items():
                hamming_distance = image_hash - existing_hash
                if hamming_distance <= hamming_threshold:
                    duplicates.append((existing_path, image_path))
                    distances.append(hamming_distance)
                    break
            else:
                # Only remember the hash if no duplicate was found
                image_hashes[image_hash] = image_path

    print("Duplicate detection result:")
    if duplicates:
        # Reuse the distances measured during the scan; no need to reopen and rehash
        for (dup1, dup2), distance in zip(duplicates, distances):
            similarity = calculate_similarity(distance, hamming_threshold)
            print(f"Duplicate found: {dup1} and {dup2} with Hamming distance: {distance} and similarity: {similarity}%")
    else:
        print("No duplicates found.")

    return duplicates

def calculate_similarity(hamming_distance, max_distance):
    """Express a Hamming distance as a similarity percentage of ``max_distance``.

    Args:
        hamming_distance: Measured distance between two hashes.
        max_distance: Distance that maps to 0% similarity.

    Returns:
        float: ``(1 - hamming_distance / max_distance) * 100`` rounded to two
        decimals. When ``max_distance`` is zero or negative the ratio is
        undefined; the result is then 100.0 for a distance of zero (or less)
        and 0.0 otherwise.
    """
    if max_distance <= 0:
        # A threshold of 0 means only identical hashes match; avoid dividing by zero
        return 100.0 if hamming_distance <= 0 else 0.0
    similarity_percentage = (1 - (hamming_distance / max_distance)) * 100
    return round(similarity_percentage, 2)

def delete_smallest_duplicates(directory_path, hamming_threshold):
    """Find duplicate images and, after confirmation, delete the smaller of each pair.

    Calls ``find_duplicates(directory_path, hamming_threshold)``, prints a note,
    then asks the user once whether to delete. Only the answer ``y`` (any case,
    surrounding whitespace ignored) triggers deletion.

    For each reported pair the file with the smaller size on disk is removed;
    on equal sizes the second file of the pair is removed. If one file of a
    pair was already removed while handling an earlier pair, the file that was
    kept in its place is compared instead.

    Args:
        directory_path: Root directory to scan.
        hamming_threshold: Largest hash distance that still counts as a duplicate.

    Returns:
        None.
    """
    duplicates = find_duplicates(directory_path, hamming_threshold)

    print("Note: It is preferable to check if the image similarity is below 50% before deciding to delete.")
    choice = input("Do you want to delete all the duplicate image:? (y/any key to skip) ").strip().lower()

    # `choice` is already lower-cased, so a single comparison covers 'y' and 'Y'.
    if choice != 'y':
        print("Skipping deletion for all possible duplicate images.")
        return

    replaced_by = {}  # deleted path -> path of the file that was kept instead
    for dup1, dup2 in duplicates:
        # The same kept image can appear in several pairs; follow the chain to
        # whichever file currently stands in for it.
        first, second = dup1, dup2
        while first in replaced_by:
            first = replaced_by[first]
        while second in replaced_by:
            second = replaced_by[second]

        if first == second or not (os.path.exists(first) and os.path.exists(second)):
            continue

        if os.path.getsize(first) < os.path.getsize(second):
            smaller, kept = first, second
        else:
            smaller, kept = second, first

        print(f"Deleting smaller file: {smaller}")
        os.remove(smaller)
        replaced_by[smaller] = kept

if __name__ == '__main__':
    # Largest Hamming distance between two image hashes that still counts as a duplicate
    hamming_threshold = 10
    # Root folder to scan; point this at the directory you want to clean up
    directory_path = r'C:/Users/YourUserName/Desktop/Food AI/images/Malaysian Dish/'

    if os.path.exists(directory_path):
        # Report duplicates, then offer to delete the smaller file of each pair
        delete_smallest_duplicates(directory_path, hamming_threshold)
    else:
        # Nothing to scan: tell the user which path is missing
        print(f"Directory {directory_path} does not exist.")


Duplicate detection result:
Duplicate found: C:\Users\Naluri - Akmal\Desktop\Food AI\images\Malaysian Dish\2024-08-31\Ayam Goreng\2.jpg and C:\Users\Naluri - Akmal\Desktop\Food AI\images\Malaysian Dish\2024-08-31\Ayam Goreng\24.jpg with Hamming distance: 2 and similarity: 80.0%
Duplicate found: C:\Users\Naluri - Akmal\Desktop\Food AI\images\Malaysian Dish\2024-08-31\Ayam Penyek\25.jpg and C:\Users\Naluri - Akmal\Desktop\Food AI\images\Malaysian Dish\2024-08-31\Ayam Penyek\33.jpg with Hamming distance: 2 and similarity: 80.0%
Duplicate found: C:\Users\Naluri - Akmal\Desktop\Food AI\images\Malaysian Dish\2024-08-31\Ayam Penyek\24.jpg and C:\Users\Naluri - Akmal\Desktop\Food AI\images\Malaysian Dish\2024-08-31\Ayam Penyek\46.jpg with Hamming distance: 2 and similarity: 80.0%
Duplicate found: C:\Users\Naluri - Akmal\Desktop\Food AI\images\Malaysian Dish\2024-08-31\Ayam Penyek\39.jpg and C:\Users\Naluri - Akmal\Desktop\Food AI\images\Malaysian Dish\2024-08-31\Ayam Penyek\65.jpg with Hamming

KeyboardInterrupt: Interrupted by user

Do you want to delete all the duplicate image:? (y/any key to skip)  n
