In [None]:
# Web Scraping
from PIL import Image
import os
import requests
import io
from urllib.parse import quote_plus
import re
import ctypes
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException, ElementNotInteractableException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
from datetime import datetime as dt

# Prevent sleep mode while the script is running.
# SetThreadExecutionState is a Win32 API, so the call is skipped on other
# platforms where `ctypes.windll` does not exist. Per the Win32 documentation,
# 0x80000002 is ES_CONTINUOUS (0x80000000) | ES_DISPLAY_REQUIRED (0x00000002).
if os.name == 'nt':
    ctypes.windll.kernel32.SetThreadExecutionState(0x80000002)

# Download the driver from the ChromeDriver website for the relevant OS i.e. MAC, Windows, etc.
# PATH is a placeholder and must point at the local chromedriver executable.
PATH = r'C:/path/to/your/chromedriver.exe'
service = Service(executable_path=PATH)

# Initialize the WebDriver with the Service object.
# NOTE: this launches a browser as soon as the cell/module is executed.
wd = webdriver.Chrome(service=service)

def get_images_from_google(wd, delay, max_images, url):
    """Collect full-size image URLs from an image search result page.

    Each thumbnail is clicked once and the enlarged image(s) that appear are
    inspected for an ``http`` ``src`` attribute.

    Args:
        wd: Selenium WebDriver used to drive the browser.
        delay: Seconds to wait after each click and after each page scroll.
        max_images: Budget of thumbnails to process. Failed clicks count
            against the budget; a thumbnail whose image was already collected
            extends the budget by one so that a replacement can be fetched.
        url: Search result URL to open.

    Returns:
        A set of image URLs (strings).
    """
    def scroll_down(wd):
        wd.execute_script("window.scrollBy(0, 200);")  # Scroll slightly to move past suggestions
        time.sleep(1)  # Allow time for suggestions to be cleared

    wd.get(url)
    scroll_down(wd)  # Scroll immediately after the search page loads

    image_urls = set()
    skips = 0
    # Index of the first thumbnail that has not been visited yet. Tracking it
    # explicitly (instead of deriving it from len(image_urls) + skips) ensures
    # that a thumbnail which yields no usable URL is never clicked again, so
    # the loop cannot spin forever on the same element.
    next_index = 0

    while len(image_urls) + skips < max_images:
        # Find the image thumbnails on the page (re-queried after every scroll
        # so newly loaded thumbnails are picked up)
        thumbnails = wd.find_elements(By.CLASS_NAME, "mNsIhb")

        # Stop when scrolling did not reveal any thumbnail we have not visited
        if next_index >= len(thumbnails):
            print(f"No more thumbnails found. Proceed....")
            break

        for img in thumbnails[next_index:]:
            if len(image_urls) + skips >= max_images:
                break
            next_index += 1

            try:
                # Ensure a thumbnail is clickable before clicking
                WebDriverWait(wd, 10).until(EC.element_to_be_clickable((By.CLASS_NAME, "mNsIhb")))

                # Scroll the image into view to ensure it's interactable
                wd.execute_script("arguments[0].scrollIntoView(true);", img)

                img.click()  # Click the image
                time.sleep(delay)

            except (ElementClickInterceptedException, TimeoutException, ElementNotInteractableException) as e:
                # Log the error and skip the problematic element
                print(f"Error clicking on thumbnail: {e}")
                skips += 1
                continue

            # Now grab the large image
            images = wd.find_elements(By.CSS_SELECTOR, "img.sFlh5c:not(.GKS7s):not(.bqW4cb)")
            for image in images:
                src = image.get_attribute('src')  # Read once; every call is a browser round trip
                if src in image_urls:
                    max_images += 1  # Adjust max_images to continue gathering more unique images
                    skips += 1
                    break

                if src and 'http' in src:
                    image_urls.add(src)

        # Scroll to the bottom so the page can load further thumbnails
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(delay)

    return image_urls


def download_image(down_path, url, file_name, image_type='JPEG', verbose=True):
    """Download one image and save it to disk in the requested format.

    Args:
        down_path: Directory in which the file is stored.
        url: URL of the image to download.
        file_name: Name of the file to create inside ``down_path``.
        image_type: Pillow format name used when saving (default ``'JPEG'``).
        verbose: When true, print a confirmation after a successful save.

    Returns:
        None. Failures are reported on stdout and never raised, so one bad
        URL does not abort a whole scraping run.
    """
    file_pth = os.path.join(down_path, file_name)
    try:
        # A timeout keeps a stalled server from hanging the whole run, and
        # raise_for_status stops an HTTP error page from being parsed as an image.
        response = requests.get(url, timeout=30)
        response.raise_for_status()

        # Keep the bytes in memory and let Pillow decode them
        with Image.open(io.BytesIO(response.content)) as image:
            is_webp = image.format == 'WEBP'
            if is_webp:
                print(f"Converting WebP image to {image_type} format...")

            # JPEG cannot store alpha or palette data, so such images must be
            # converted first. WebP sources are always converted to RGB.
            jpeg_incompatible = (image_type.upper() == 'JPEG'
                                 and image.mode not in ('RGB', 'L', 'CMYK'))
            to_save = image.convert("RGB") if (is_webp or jpeg_incompatible) else image

            # The file keeps its original name: the data written is
            # `image_type`, so renaming it to .webp would mislabel the content
            # and hide the file from the later image-processing steps.
            to_save.save(file_pth, image_type)

        if verbose:
            curr_time = dt.now().strftime('%H:%M:%S')
            print(f'The image: {file_pth} downloaded successfully at {curr_time}.')
    except Exception as e:
        # Best-effort download: report which URL failed and why, then carry on
        print(f'Unable to download image from Google Photos: {url} ({e})')

if __name__ == '__main__':
    # The try/finally guarantees that the browser is closed and the sleep
    # setting is restored even when scraping fails or the user interrupts it.
    try:
        # Ask user for the list of foods they want to search, separated by commas
        raw_foods = input("Enter the names of the foods you want to search for, separated by commas: ").split(',')
        # Remove surrounding whitespace and drop empty entries (e.g. from a trailing comma)
        food_list = [food.strip() for food in raw_foods if food.strip()]

        # Loop to ensure a valid positive integer for the maximum number of images to scrape
        while True:
            try:
                max_images = int(input("Enter the maximum number of images to scrape for each food item: "))
                if max_images < 1:
                    raise ValueError("the number of images must be positive")
                break  # Exit the loop if a valid integer is entered
            except ValueError:
                print("Invalid input. Please enter a valid integer for the number of images.")

        # Get the current date in the format YYYY-MM-DD
        current_date = dt.now().strftime('%Y-%m-%d')
        base_directory = f'C:/Users/YourUserName/Desktop/Food AI/images/Malaysian Dish/{current_date}/'

        # Make the base directory for the current date if it doesn't exist
        if not os.path.exists(base_directory):
            print(f'Making base directory for the current date: {base_directory}')
            os.makedirs(base_directory, exist_ok=True)

        for food_name in food_list:
            # URL encode the food name for the Google search URL
            search_query = quote_plus(food_name)
            google_url = f"https://www.google.com/search?q={search_query}&tbm=isch"

            # Directory to save the images, named after the food with the
            # characters that are illegal in Windows file names removed
            sanitized_name = re.sub(r'[\\/:*?"<>|]', '', food_name)
            if not sanitized_name:
                # Nothing usable is left to name the folder or the files after
                print(f'Skipping {food_name!r}: no valid characters for a folder name.')
                continue
            save_directory = os.path.join(base_directory, sanitized_name)

            # Make the directory for the specific food if it doesn't exist
            if not os.path.exists(save_directory):
                print(f'Making directory for {food_name}: {save_directory}')
                os.makedirs(save_directory, exist_ok=True)

            # Scrape images from Google
            urls = get_images_from_google(wd, delay=0.5, max_images=max_images, url=google_url)

            # Download the images, numbering the files from 1
            for i, url in enumerate(urls):
                download_image(down_path=save_directory + '/',
                               url=url,
                               file_name=f"{sanitized_name}_{i + 1}.jpg",
                               verbose=True)
    finally:
        # Close the webdriver
        wd.quit()

        # Allow sleep mode again after script execution (Win32-only API)
        if os.name == 'nt':
            ctypes.windll.kernel32.SetThreadExecutionState(0x80000000)


In [None]:
#Duplicate Detection

import os
from PIL import Image, UnidentifiedImageError
import imagehash
import shutil

def find_duplicates(directory_path, hamming_threshold):
    """Group visually similar images found below ``directory_path``.

    Every image is reduced to a perceptual hash (``imagehash.phash``). An
    image joins the first existing group whose representative hash is within
    ``hamming_threshold`` bits of its own hash; otherwise it starts a new group.

    Folders whose path contains ``duplicates``, ``collage_images`` or
    ``text_detected`` are ignored. Files that Pillow cannot identify as an
    image are deleted; files that cannot be read for another reason are skipped.

    Args:
        directory_path: Root directory that is walked recursively.
        hamming_threshold: Maximum Hamming distance for two images to count
            as duplicates.

    Returns:
        A list of groups; each group is a list of two or more file paths.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')
    image_hashes = {}  # Representative hash -> paths of the images in that group

    for root, _, files in os.walk(directory_path):
        # Ignore folders produced by the other processing steps
        if 'duplicates' in root or 'collage_images' in root or 'text_detected' in root:
            continue

        for file in files:
            if not file.lower().endswith(image_extensions):
                continue

            image_path = os.path.join(root, file)
            try:
                # The context manager releases the file handle right away, so
                # the file can be moved or deleted afterwards (open handles
                # block both operations on Windows).
                with Image.open(image_path) as image:
                    image_hash = imagehash.phash(image)
            except UnidentifiedImageError:
                print(f"Cannot identify image file: {image_path}. Deleting file.")
                os.remove(image_path)  # Automatically delete the file
                continue
            except OSError as e:
                # e.g. truncated or unreadable file: keep it, but do not abort the scan
                print(f"Cannot read image file: {image_path} ({e}). Skipping.")
                continue

            # Check for duplicates using Hamming distance
            for existing_hash, existing_paths in image_hashes.items():
                if image_hash - existing_hash <= hamming_threshold:
                    existing_paths.append(image_path)  # Add to existing group of duplicates
                    break
            else:
                # No sufficiently similar image seen so far: start a new group
                image_hashes[image_hash] = [image_path]

    print("Duplicate detection result:")
    return [paths for paths in image_hashes.values() if len(paths) > 1]

def calculate_similarity(hamming_distance, max_distance):
    """Express a Hamming distance as a similarity percentage.

    Args:
        hamming_distance: Number of differing bits between two hashes.
        max_distance: Largest possible distance (must be non-zero).

    Returns:
        The similarity in percent, rounded to two decimals: 100 for a
        distance of 0 and 0 for a distance equal to ``max_distance``.
    """
    distance_ratio = hamming_distance / max_distance
    percentage = (1 - distance_ratio) * 100
    return round(percentage, 2)

def move_and_rename_duplicates(duplicates):
    """Move each duplicate group into a ``duplicates`` subfolder and rename it.

    The folder of the first file of a group decides which food folder the
    group belongs to. Files are renamed to
    ``<food>_DuplicateGroup<N>_<index><ext>``. ``N`` is chosen so that no file
    with that group prefix already exists in the food folder or in its
    ``duplicates`` folder, which keeps files left over from an earlier run
    from being overwritten.

    Args:
        duplicates: List of groups, each a list of file paths.

    Returns:
        A list with one entry per non-empty group containing the new paths of
        the moved files, in the same order as the input group.
    """
    pair_counter = {}  # Next group number to try, per food folder name
    grouped_new_paths = []  # New paths, grouped like the input

    for group in duplicates:
        if not group:
            continue  # Nothing to move, and group[0] would raise IndexError

        food_folder = os.path.dirname(group[0])
        food_name = os.path.basename(food_folder)  # Extract the food folder name
        duplicates_folder = os.path.join(food_folder, 'duplicates')

        # Ensure the duplicates folder exists
        if not os.path.exists(duplicates_folder):
            os.makedirs(duplicates_folder)
            print(f"Created 'duplicates' folder at: {duplicates_folder}")

        # Skip group numbers whose file names are already taken. shutil.move
        # would otherwise silently replace files from a previous run.
        taken_names = os.listdir(food_folder or '.') + os.listdir(duplicates_folder)
        pair_number = pair_counter.get(food_name, 1)
        while any(name.startswith(f"{food_name}_DuplicateGroup{pair_number}_")
                  for name in taken_names):
            pair_number += 1

        # Move and rename each duplicate in the group
        new_group_paths = []
        for idx, dup in enumerate(group):
            file_name = f"{food_name}_DuplicateGroup{pair_number}_{idx + 1}{os.path.splitext(dup)[1]}"
            dest_path = os.path.join(duplicates_folder, file_name)
            shutil.move(dup, dest_path)
            print(f"Moved and renamed {dup} to {file_name}")
            new_group_paths.append(dest_path)

        # Store the grouped paths of this duplicate group
        grouped_new_paths.append(new_group_paths)

        # The next group of the same food starts looking at the following number
        pair_counter[food_name] = pair_number + 1

    return grouped_new_paths

def delete_smaller_duplicates_and_move_back(grouped_new_paths):
    """Keep the largest file of each duplicate group and delete the others.

    The user is asked for confirmation first. For every group the largest
    file (by size on disk) is moved from the ``duplicates`` folder back to its
    parent folder and the remaining files of the group are deleted. The file
    moved back never replaces an existing file: if its name is taken, a
    numeric suffix is appended.

    Args:
        grouped_new_paths: List of groups, each a list of file paths located
            in a ``duplicates`` folder.

    Returns:
        None.
    """
    choice = input("Do you want to delete all duplicates except the largest file? (y/n): ").strip().lower()
    if choice != 'y':
        print("No files were deleted.")
        return

    for group in grouped_new_paths:
        # Pair every file that still exists with its size
        sized_files = []
        for file in group:
            try:
                sized_files.append((file, os.path.getsize(file)))
            except FileNotFoundError:
                print(f"File not found: {file}. Skipping.")

        if not sized_files:
            # If no valid files are found, skip the group
            print("No valid files found in this group. Skipping.")
            continue

        # Sort by file size in descending order; the first entry is the keeper
        sized_files.sort(key=lambda item: item[1], reverse=True)
        largest_file = sized_files[0][0]

        # Move the largest file back to the original folder, i.e. the parent
        # of the 'duplicates' folder, keeping the renamed file name
        original_folder = os.path.dirname(os.path.dirname(largest_file))
        new_path = _non_clobbering_path(original_folder, os.path.basename(largest_file))
        shutil.move(largest_file, new_path)
        print(f"Moving largest file {largest_file} back to original folder: {new_path}")

        # Delete all smaller files
        for file, _ in sized_files[1:]:
            try:
                print(f"Deleting smaller file: {file}")
                os.remove(file)
            except FileNotFoundError:
                print(f"File not found during deletion: {file}. It might have been deleted already.")


def _non_clobbering_path(folder, file_name):
    """Return a path for ``file_name`` inside ``folder`` that does not exist yet."""
    candidate = os.path.join(folder, file_name)
    stem, extension = os.path.splitext(file_name)
    suffix = 1
    while os.path.exists(candidate):
        candidate = os.path.join(folder, f"{stem}_{suffix}{extension}")
        suffix += 1
    return candidate


if __name__ == '__main__':
    # Largest Hamming distance at which two images still count as duplicates
    hamming_threshold = 10
    # Root directory that is searched for duplicate images; adjust as needed
    directory_path = r'C:/Users/YourUserName/Desktop/Food AI/images/Malaysian Dish/'

    if os.path.exists(directory_path):
        # Group images whose hashes lie within the Hamming distance threshold
        duplicates = find_duplicates(directory_path, hamming_threshold)

        if not duplicates:
            print("No duplicates found.")
        else:
            # Collect each group in the 'duplicates' folder of its food folder,
            # then let the user decide whether the smaller copies are deleted
            grouped_new_paths = move_and_rename_duplicates(duplicates)
            delete_smaller_duplicates_and_move_back(grouped_new_paths)
    else:
        print(f"Directory {directory_path} does not exist.")



In [None]:
# Count the remaining images in every food folder
import os

def count_non_duplicate_images(directory_path):
    """Print the number of images in every food folder.

    The expected layout is ``directory_path/<date folder>/<food folder>/``.
    Only these two levels are inspected, so helper subfolders inside a food
    folder (``duplicates``, ``text_detected``, ``collage_images``) are neither
    counted nor reported as food folders.

    Args:
        directory_path: Root directory that contains the date folders. A
            missing directory produces no output.

    Returns:
        None. One line per food folder is printed to stdout.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')
    excluded_folders = ('duplicates', 'text_detected', 'collage_images')

    if not os.path.isdir(directory_path):
        return

    # Iterate over the date folders first (sorted for a stable report order)
    for date_folder in sorted(os.listdir(directory_path)):
        date_folder_path = os.path.join(directory_path, date_folder)
        if not os.path.isdir(date_folder_path):
            continue

        # Go one level deeper to get the food item folders
        for food_folder in sorted(os.listdir(date_folder_path)):
            food_folder_path = os.path.join(date_folder_path, food_folder)
            if not os.path.isdir(food_folder_path) or food_folder in excluded_folders:
                continue

            # Count only the images directly inside the food folder
            image_count = sum(1 for file in os.listdir(food_folder_path)
                              if file.lower().endswith(image_extensions))
            print(f"({date_folder}) Folder: {food_folder}: {image_count}")

# Usage example: print the image count of every food folder below the dataset root.
# NOTE: these statements run at module level, i.e. whenever this cell/module is executed.
directory_path = r'C:/Users/YourUserName/Desktop/Food AI/images/Malaysian Dish/'
count_non_duplicate_images(directory_path)


In [None]:
#Detecting text in images
# NOTE: text detection still does not perform well

import os
import cv2
import numpy as np
from PIL import Image
import shutil
import easyocr
#import pytesseract

def preprocess_image(image_path, target_size=(800, 800)):
    """Load an image and turn it into a binarized image for OCR.

    Steps: optional downscaling, grayscale conversion, histogram equalization
    for dark images, Otsu thresholding and sharpening of low-detail results.

    Args:
        image_path: Path of the image file to load.
        target_size: Maximum ``(width, height)`` of the processed image.
            Larger images are scaled down to fit inside this box while
            keeping their aspect ratio; smaller images are left untouched.

    Returns:
        The thresholded single-channel image as returned by OpenCV.

    Raises:
        ValueError: If OpenCV cannot read the file.
    """
    # Load image using OpenCV; imread signals failure by returning None
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Downscale only images that exceed the target box. The scale factor is
    # shared by both axes so that text is not stretched or squashed, which
    # would make recognition harder.
    height, width = img.shape[:2]
    max_width, max_height = target_size
    scale = min(max_width / width, max_height / height)
    if scale < 1:
        new_size = (max(1, round(width * scale)), max(1, round(height * scale)))
        img = cv2.resize(img, new_size, interpolation=cv2.INTER_AREA)

    # Convert to grayscale
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Apply histogram equalization for contrast enhancement only to dark images
    if np.mean(gray) < 100:
        gray = cv2.equalizeHist(gray)

    # Apply a global (Otsu) threshold instead of adaptive thresholding
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    # Sharpen only when the Laplacian variance is low, i.e. the image is blurry
    if cv2.Laplacian(thresh, cv2.CV_64F).var() < 100:
        kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
        thresh = cv2.filter2D(thresh, -1, kernel)

    return thresh

def detect_text_in_images(directory_path):
    """Move every image that contains text into a ``text_detected`` subfolder.

    The directory tree is walked recursively. Folders whose path contains
    ``duplicates``, ``collage_images`` or ``text_detected`` are skipped. Each
    image is preprocessed and passed to EasyOCR (English and Malay); when any
    non-empty text is recognized, the image is moved to the ``text_detected``
    folder next to it. That folder is created only when it is needed.

    Args:
        directory_path: Root directory to scan.

    Returns:
        None. Progress and per-image errors are printed to stdout.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')
    # The OCR reader is created once, on first use, and shared by all images.
    # Building it per image would repeat the expensive setup for every file.
    reader = None

    for root, _, files in os.walk(directory_path):
        # Skip 'text_detected' folder and other excluded folders
        if 'duplicates' in root or 'collage_images' in root or 'text_detected' in root:
            continue

        text_detected_folder = os.path.join(root, 'text_detected')

        for file in files:
            if not file.lower().endswith(image_extensions):
                continue

            image_path = os.path.join(root, file)
            try:
                if reader is None:
                    reader = easyocr.Reader(['en', 'ms'], gpu=True)

                # Preprocess the image before OCR
                processed_img = preprocess_image(image_path)

                # Perform text detection using EasyOCR on the preprocessed image;
                # each result is a (bounding box, text, confidence) tuple
                result = reader.readtext(processed_img)

                # Move the image once as soon as any non-empty text was found.
                # Moving it once per detection would fail on the second hit,
                # because the source file is gone by then.
                if any(text.strip() for _, text, _ in result):
                    os.makedirs(text_detected_folder, exist_ok=True)
                    dest_path = os.path.join(text_detected_folder, file)
                    shutil.move(image_path, dest_path)
                    print(f"Moved image with text: {file} to 'text_detected' folder")

            except Exception as e:
                # Best effort: report the failing image and continue with the next one
                print(f"Error processing {image_path}: {e}")

if __name__ == '__main__':
    # Root folder that is scanned recursively for images containing text;
    # adjust it to the local dataset location before running.
    directory_path = r'C:/Users/YourUserName/Desktop/Food AI/images/Malaysian Dish/'
    detect_text_in_images(directory_path)
