In [50]:
import requests
import lxml
import re
import base64
from bs4 import BeautifulSoup 
from io import BytesIO
from PIL import Image
import os
import time
import hashlib
import selenium

from selenium import webdriver
from selenium.webdriver.common.by import By

In [102]:
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:webdriver, sleep_between_interactions:int=1):
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)    
    
    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        tries = 1
        
        thumbnail_results = wd.find_elements(By.CSS_SELECTOR, "img.Q4LuWd")
        number_results = len(thumbnail_results)
        
        while (tries < 10 and number_results == results_start):
            thumbnail_results = wd.find_elements(By.CSS_SELECTOR, "img.Q4LuWd")
            time.sleep(2 ** (tries // 2))
            tries += 1
            
        if (number_results == results_start):
            print("Could not find any more images!")
            
            return image_urls
        
        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")
        
        for img in thumbnail_results[results_start:number_results]:
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls    
            actual_images = wd.find_elements(By.CSS_SELECTOR, 'img.rg_i')
            
            for actual_image in actual_images:
                if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(30)

            load_more_button = wd.find_elements(By.CSS_SELECTOR, ".mye4qd")
            
            if load_more_button:
                wd.execute_script("document.querySelector('.mye4qd').click();")

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls

In [97]:
def persist_image(folder_path:str, url:str):
    try:
        image_content = requests.get(url).content

    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")

    try:
        image_file = BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path, hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=100)
            
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")

In [98]:
def search_and_download(search_term:str, wd:webdriver, save_folder, number_images=5):
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.05)
        
    for elem in res:
        persist_image(save_folder, elem)

## Scrape images

In [104]:
DRIVER_PATH = './driver/chromedriver.exe'
wd = webdriver.Chrome()

In [None]:
# search_terms = ['hariatiki salad', 'caprese salad', 'caesar salad', 'tabbouleh salad', 'nicoise salad', 'larb salad',
#                'black beans and corn salad', 'cobb salad', 'fattoush salad', 'waldorf salad']

search_terms = ['apple pie', 'banofee pie', 'creme brulee', 'baklava', 'waffles', 'gulab jamun', 'nanaimo bar',
               'brownie', 'donuts', 'medovik']

start = time.time()

with wd as wd:
    for search_term in search_terms[2:]:
        search_and_download(search_term, wd, './images/desserts/{}'.format(search_term.replace(' ', '_')), 250)

end = time.time()

print((end - start) / 500)

Found: 100 search results. Extracting links from 0:100
Found: 100 image links, looking for more ...
Found: 212 search results. Extracting links from 100:212
Found: 199 image links, looking for more ...
Found: 312 search results. Extracting links from 212:312
Found: 209 image links, looking for more ...
Found: 100 search results. Extracting links from 312:100
Found: 209 image links, looking for more ...
Found: 200 search results. Extracting links from 100:200
Found: 263 image links, done!
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ-fkaGou3kV1r8byBkSWuhqtAJwA8iyqF8nw&usqp=CAU - as ./images/desserts/creme_brulee\c37bcda3fc.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSsxJzqYZ5izIpiqjOuxUXpbL4pUKvYHVvQs5fd52iurp-U0vOIc40i4GHGSWAaVpNsPPo&usqp=CAU - as ./images/desserts/creme_brulee\3b9d473e81.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSgOSrMWYGNuG3kM_WMUfP-flzYHPf6alj5Og&usqp=CAU - as ./images/desserts/c

SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRuVFh-HC3nTE8Cu6zVcTLB1l1FrB1DEpQmJA&usqp=CAU - as ./images/desserts/creme_brulee\f128212349.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSj53yFfPea3F6HRpeXSPVGBhwjlx9b8KBd6lwFaGprcFr9UrO9S5WrS2Rj63O-tDvWxn0&usqp=CAU - as ./images/desserts/creme_brulee\5dc1172fed.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT15cggn3daTC6QJA9-hUmwI1atyRsLXtaMJA&usqp=CAU - as ./images/desserts/creme_brulee\ce0cb5566f.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT7frIzTRK8Ui3BXgwzwgcQfnLFlpVMDep1sA&usqp=CAU - as ./images/desserts/creme_brulee\de56f4ae05.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTted4kWax-WKT87ChQwRnnuDMsCctqkrAf6Q&usqp=CAU - as ./images/desserts/creme_brulee\deb6ade1a2.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSIHD4Qw-OWkGdQ9jJHs5eQsXpFRDp2JPS16KcCuumdYUSUCFfLjX-FwTk8eMsl

SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRxu5gCiWEOrnNWfHkK_Jdma7FefX4u-pXY7Q&usqp=CAU - as ./images/desserts/creme_brulee\2945682cae.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT5fOzvPCQmj36i-DoRW78qqpQgbhXyMB06IOOA0C35gv1nEcIFoEthbjRY-xUKzM01V88&usqp=CAU - as ./images/desserts/creme_brulee\4303360137.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSHrAhiK34r3tSfkKktjGXGzxHrqrb26NswIw&usqp=CAU - as ./images/desserts/creme_brulee\9b2e73a6fa.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSNR9jx18HuIWGlvUJovZGpiRAr91FXwWH2ig&usqp=CAU - as ./images/desserts/creme_brulee\7b9ebb4241.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSRHqVAd6MR1SCwHLT3ZU1ZoHcGcFp-xwQ32g&usqp=CAU - as ./images/desserts/creme_brulee\3361c7238f.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS2H9A4l_CQ5iD5BHTq7mVgP5fCtBOoQ6cuYiV_wLkZpIDxpLsSfQnl7WPMr2xq

SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSDe3a1V2tA7cR4WpwHC4FhcW-3taITEX8Qrw&usqp=CAU - as ./images/desserts/creme_brulee\d2b49d9c86.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ3YSpLQ1pkJQcjD6qkWLhtBYOamjaR6zzFLg&usqp=CAU - as ./images/desserts/creme_brulee\bcdab33aeb.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQyf_Ctf0Y2Sh5XIxgCj4csQxjOvwSbII5eVg&usqp=CAU - as ./images/desserts/creme_brulee\a7d3799d29.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQqZnaQ4OqzT1yNMNpxHml-97ZVUlbDa0xgxg&usqp=CAU - as ./images/desserts/creme_brulee\9944c96f31.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTyGT1E7d1grkdF72v22rXtG-PV_XwQOxF6SA&usqp=CAU - as ./images/desserts/creme_brulee\a1069d35b0.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQif2iMGyyJL0hvUvxEQGVNKFW_oRvz7OdoYg&usqp=CAU - as ./images/desserts/creme_brulee\d6c2707c97.jp

SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcScko9wZiL2VJqgVub53eAcy8oK1vnTnoGXMw&usqp=CAU - as ./images/desserts/creme_brulee\2e769c9836.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRNIlF98zdjWbjy9__6wC7M3WzKigEjKk4g3w&usqp=CAU - as ./images/desserts/creme_brulee\e197d247c3.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSNV3L7Qf6kcL7NDgVogU5zFTrE7F45jQMw1Q&usqp=CAU - as ./images/desserts/creme_brulee\90ad99ec83.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS0JObt4wv_SAtYeGzOT62Nu6a1VVnRCCPn3w&usqp=CAU - as ./images/desserts/creme_brulee\eb26bd4c02.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTKr6yQRGuB5I-uIJTuqkxa55jMlWm_5YAUxg&usqp=CAU - as ./images/desserts/creme_brulee\6e9522a68c.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRAT7qJZC_U7W19cBD_gISUoNyAQ5hjq36XIw&usqp=CAU - as ./images/desserts/creme_brulee\759aa61723.jp

SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQk_1TIqO6XwOrjvkOHei9IxWPSFlJs9aNKGMuXrMp0tm7SqtbKWKHtWdVOGNFruWGf694&usqp=CAU - as ./images/desserts/creme_brulee\fd90430740.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRcrX98fXwTf8PJMkxJAsXlFVLIfrH7x8kGEQ&usqp=CAU - as ./images/desserts/creme_brulee\e5f21e4026.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTA2zQD5byyYDNMqDdXGSsx_VvcabsyVWlcAQ&usqp=CAU - as ./images/desserts/creme_brulee\ffaf9a4bda.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQUqf7ljxDdjmdqXjS8J0W3RIG-W4U-mO_aOmPgL2JZGGbUjPPrIcZVjW15w6z0bqSxx8s&usqp=CAU - as ./images/desserts/creme_brulee\77912d4d73.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQoeFNvTmn9vsfai63dajF-vB5Z0O6z46Z7VA&usqp=CAU - as ./images/desserts/creme_brulee\e236de9e96.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTbLW3GSedzbNkD65iObJKvyX5fbk4

SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTgIoGXknlXAbafMHxIiZMAb0d4ZvDT_SZZiw&usqp=CAU - as ./images/desserts/baklava\5ad15eecae.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRyKsxTzhLgVV4_5RKwQx8A0hw3z8s-Ls5KQw&usqp=CAU - as ./images/desserts/baklava\68ba9c1748.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRwhiZE5h3WN1E2AE2c7Sctrd-KcQc6Z-cfDA&usqp=CAU - as ./images/desserts/baklava\89d7dc4cb2.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRucLzGocLVXF8LIU17D_YFN-wHr3rb-Sebtg&usqp=CAU - as ./images/desserts/baklava\4f328a62ae.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTAIF76LWY0k3LzBzkwcrWpGCSdR8IcKT0TM58j7BSUwpLrqWN4AhlcViFjlfP7zfq4FBk&usqp=CAU - as ./images/desserts/baklava\69ea329f08.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRXig_OUylmotkcImzDvL4e_3YkuyDE4TkIwyt4kVTXRcrsufT1bm7yUib-k9CuPR7ayNI&usqp=CAU - as ./i

SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcShbIxn5x2DNxN17lZGKVUxWSFqEN_uoYd9rA&usqp=CAU - as ./images/desserts/baklava\9e6f1c4018.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRpAzqCmVqUNklRZu6bGV-3VpP0Eh6ieDp6xg&usqp=CAU - as ./images/desserts/baklava\9432bc83a5.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTmJKUOs0ZujwBe3MN5erBY9Y-EWd06Fnd3zg&usqp=CAU - as ./images/desserts/baklava\6b628fb1e1.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTZ7LW-QY7EuF4PoHeM8Ia63oigCrKjnc_C1w&usqp=CAU - as ./images/desserts/baklava\20e8110838.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQR4A-No4AaZwmsT5BHge3bviS-AZF0gYmO7Q&usqp=CAU - as ./images/desserts/baklava\2eaf7d82a7.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQwCwL0_FqsncWR-VEUInFw4kSR_1GD1NPhxw&usqp=CAU - as ./images/desserts/baklava\599956b2d6.jpg
SUCCESS - saved https://encr

SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcShg8ZnrV93gpv4Htp0wSbJpcrgjhd9lOhY7PhGVOTB64dqzU0No7WB_Oa-QZiFriqpIOY&usqp=CAU - as ./images/desserts/baklava\256d40d961.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTfJSL-Lavgh5hBmYgFYA-MYq7IVjTj4fkuVreYkdTf5STjP7q5784B3XnwPFcHC15mNbY&usqp=CAU - as ./images/desserts/baklava\7081fe4578.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSym8-uzp93WPA3LQxdsJ1DO9kFE7v7djz6PQ&usqp=CAU - as ./images/desserts/baklava\52691f0b66.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTqp-rBQE0Olh4IFeFpRTehQAObU_nkvWBmRg&usqp=CAU - as ./images/desserts/baklava\59d7ad74d6.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTsUPwRMBg2ln8wmSaamWXBMOEFILFPizW7yQ&usqp=CAU - as ./images/desserts/baklava\2606946dc4.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSfULAiNLYSl3I1OQfBisoawII3_SZ1wmyWvuXSowRxVphrS9y2BuF-

SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRndfuey0ma9dIKRkslgcf246O9yEtOUB3FcQ&usqp=CAU - as ./images/desserts/baklava\221e3a57ff.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT__NlVeGCiQa4KofCbE0cm_ajCB_lOduRNGg&usqp=CAU - as ./images/desserts/baklava\e0bf13019c.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSd4BMBSKIdexF1L2tqD6fpgXqKUxA6adCrjQ&usqp=CAU - as ./images/desserts/baklava\a7e4570bb3.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTUSoHPJYwEsfVQtWcmd_VWjSB5cyKoE21YXg&usqp=CAU - as ./images/desserts/baklava\d5e95fa9b9.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT5X0NC_bvM8aXMy7vSELncO0NkjIHY5zjSbQD14vV5tk9SF10Y9mgda7tX5Fbq_EYs3cc&usqp=CAU - as ./images/desserts/baklava\bee209c930.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSTJ5M8GwhOmVX45vS5kEWQPQyFZ_RfZRaNKw&usqp=CAU - as ./images/desserts/baklava\777c9da6f2

SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSdRmsTbdiG20inJj2j32ylLisG-jwZh4P_6eaQjoWZ52wPEWLGCkfHLbG2aeVi0yx-FUM&usqp=CAU - as ./images/desserts/baklava\daaff9f57d.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTKm-4QCOTMZ81uT5E4CZoZzPf2_nX4Eya_N1R52zu7ApC-9L4UIySxUv2Lf_gwjejsBvQ&usqp=CAU - as ./images/desserts/baklava\0b45695e9f.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRLT2KkP6ywpYP7jCTjtOmfY67G7VD62kjoPQ&usqp=CAU - as ./images/desserts/baklava\2c77d37c59.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSq0iDNmXr6cxavy2xj6fOCGOiyqW0TstzV2g&usqp=CAU - as ./images/desserts/baklava\51b01ab67a.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQy2rshL3-PMi3gdtXh-rb1nLXEZ2qPPNeVpQ&usqp=CAU - as ./images/desserts/baklava\1ae59bdbe9.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSkMktgBgwePxIrAMejzaicEUxnVmWVK7hW116pea1h0MbYi1ZdDZgA