<a href="https://colab.research.google.com/github/StarMindz/Google-Image-Scrapper/blob/main/Google_Image_Scrapper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Google Image Scraper

This script scrapes both high-quality and low-quality images from the internet using a set of search terms. It includes code for scraping full-size images from Google. You can specify the number of images you wish to scrape and the directory on your computer where you'd like to store them. For each search term in your list, a folder with the same name as the search term is created automatically, and the specified number of images is downloaded into it.

Here, a list of popular Nigerian foods is used, making this script useful for gathering image datasets for machine learning training and general AI model training.

In [None]:
!pip install selenium
!pip install selenium_stealth
!import pillow

In [None]:
import io
import os
import time
from datetime import datetime
from urllib.parse import quote_plus

import requests
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Search terms for the scraper: one entry per dish.
# Every entry needs its own trailing comma; adjacent string literals without
# one are silently concatenated by Python into a single (wrong) search term.
nigerian_dishes = [
    "Jollof Rice",
    "Egusi Soup",
    "Efo Riro",
    "Banga Soup",
    "ofada rice and ofada sauce",
    "Edikang Ikong Soup",
    "Amala and gbegiri with ewedu soup",
    "Ogbono Soup",
    "Nkwobi",
    "Afang Soup",
    "Tuwo",
    "Waina masa",
    "Fried Plantain",
    "Miyan Taushe",
    "Oha Soup",
    "Beans porridge and plantain",
    "Bitterleaf Soup",
    "Ofe Nsala",
    "Suya",
    "Yam Porridge",
    "Okra soup",
    "Pepper Soup",
    "Pounded yam",
    "Eba",
    "Garri and Groundnut",
    "Moi Moi",
    "Abacha and Ugba",
    "Adalu",
]

In [None]:
def prepare_browser():
    """Build and return a Chrome WebDriver configured with ``start-maximized``."""
    options = webdriver.ChromeOptions()
    options.add_argument("start-maximized")
    return webdriver.Chrome(options=options)

In [None]:
def download_single_image(image_url, search_term, save_dir, timeout=30):
    """Download one image and store it as a timestamped ``.jpg`` in ``save_dir``.

    Args:
        image_url: URL of the image. Empty values and anything that is not an
            ``http://`` / ``https://`` URL (e.g. inline ``data:`` URIs) are
            skipped without a network request.
        search_term: Used as the filename prefix.
        save_dir: Target directory; created if it does not exist yet.
        timeout: Seconds passed to ``requests.get`` so a stalled server cannot
            hang the whole scrape.

    Returns:
        The path of the written file, or ``None`` when the URL was skipped or
        the download/write failed (the failure is printed, not raised).
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists.
    os.makedirs(save_dir, exist_ok=True)

    if not image_url or not image_url.startswith(("http://", "https://")):
        return None

    try:
        response = requests.get(image_url, timeout=timeout)
        # Without this, an HTML error page would be saved as a ".jpg".
        response.raise_for_status()
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S%f")
        img_file = os.path.join(save_dir, f"{search_term}_{timestamp}.jpg")
        with open(img_file, 'wb') as handler:
            handler.write(response.content)
    except (requests.RequestException, OSError) as e:
        # Best effort: one bad image must not abort the whole scrape.
        print(f"Could not download {image_url}: {e}")
        return None

    print(f"Image downloaded: {img_file}")
    return img_file

In [None]:
def scrape_google_images(search_term, save_dir, max_images=1000):
    """Scrape full-size Google Images results for ``search_term`` into ``save_dir``.

    Opens a browser with ``prepare_browser``, clicks each displayed thumbnail,
    waits for the full-size preview image and passes its ``src`` to
    ``download_single_image``. After every pass the page is scrolled to the
    bottom so further thumbnails can load. The browser is always closed on
    exit.

    Args:
        search_term: Query text; it is URL-encoded before being placed in the
            search URL.
        save_dir: Directory handed to ``download_single_image``.
        max_images: Stop after this many images have been handed to
            ``download_single_image``.

    Returns:
        The number of images handed to ``download_single_image``. This counts
        attempts: a download that fails inside that helper is still counted.
    """
    # Give up after this many consecutive passes that found no new thumbnail,
    # otherwise the loop never ends when fewer than max_images results exist.
    max_idle_passes = 3
    thumbnail_selector = "img[class='YQ4gaf']"
    full_image_xpath = '//img[contains(@class, "sFlh5c pT0Scc iPVvYb")]'

    driver = prepare_browser()
    try:
        search_url = f'https://www.google.com/search?hl=en&tbm=isch&q={quote_plus(search_term)}'
        driver.get(search_url)
        print(f"Searching for {search_term} on Google Images")
        time.sleep(2)

        downloaded_count = 0
        pins_seen = set()  # WebElement ids of thumbnails already clicked
        urls_seen = set()  # full-size URLs already passed to the downloader
        idle_passes = 0

        while downloaded_count < max_images and idle_passes < max_idle_passes:
            thumbnails = driver.find_elements(By.CSS_SELECTOR, thumbnail_selector)
            print("here are the thumbnails", thumbnails)
            found_new = False

            for img in thumbnails:
                if downloaded_count >= max_images:
                    break

                try:
                    if img.id in pins_seen:
                        continue
                    # Hidden thumbnails are not marked as seen, so they are
                    # retried on a later pass once they become visible.
                    if not img.is_displayed():
                        continue
                    pins_seen.add(img.id)
                    found_new = True

                    driver.execute_script("arguments[0].click();", img)
                    print("Collected Image ", img)
                    time.sleep(3)  # Wait for the preview panel to open

                    # Re-locate the full image element for this thumbnail.
                    full_image = WebDriverWait(driver, 10).until(
                        EC.presence_of_element_located((By.XPATH, full_image_xpath))
                    )
                    full_image_url = full_image.get_attribute('src')
                except WebDriverException as e:
                    # Covers wait timeouts and stale elements; skip this
                    # thumbnail instead of reusing a previous full_image.
                    print(f"Skipping thumbnail ({type(e).__name__})")
                    continue

                if not full_image_url or full_image_url in urls_seen:
                    continue
                urls_seen.add(full_image_url)

                print("The full image src", full_image_url)
                download_single_image(full_image_url, search_term, save_dir)
                downloaded_count += 1

            idle_passes = 0 if found_new else idle_passes + 1

            if downloaded_count < max_images:
                # Scroll to the bottom so the page can load more thumbnails.
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)

        return downloaded_count
    finally:
        # Always release the browser process, even on an unexpected error.
        driver.quit()


In [None]:
def main(base_dir=r'C:\Users\STARMINDS\Desktop\Projects\Data', dishes=None, max_images=300):
    """Scrape images for each dish into its own sub-folder of ``base_dir``.

    Calling ``main()`` with no arguments behaves exactly as before.

    Args:
        base_dir: Parent directory; each dish is saved under
            ``os.path.join(base_dir, dish)``.
        dishes: Iterable of search terms. ``None`` selects the default
            single-item list ``["Jollof Rice"]``.
        max_images: Per-dish limit passed to ``scrape_google_images``.
    """
    if dishes is None:
        dishes = ["Jollof Rice"]

    for dish in dishes:
        save_dir = os.path.join(base_dir, dish)
        scrape_google_images(dish, save_dir, max_images)


In [None]:
# Entry point: call main() only when this code runs as the top-level program
# (its __name__ is '__main__'), not when it is imported as a module.
if __name__ == '__main__':
    main()