In [27]:
import requests
import io
import hashlib
import os

import time
from selenium import webdriver
from PIL import Image
from matplotlib import pyplot as plt

In [28]:
def scroll_to_end(wd: "webdriver.Chrome", pause: float = 1):
    """Scroll the browser window to the bottom of the current page.

    Args:
        wd: Selenium driver whose current page should be scrolled. The
            annotation is quoted so it is not evaluated at definition time.
        pause: Seconds to sleep after scrolling. Defaults to 1, the delay that
            was previously hard-coded.

    Returns:
        None.
    """
    wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    # Presumably gives lazily loaded content time to appear before the caller
    # inspects the page again.
    time.sleep(pause)

In [29]:
def get_images(wd: "webdriver.Chrome", element_count: int, element_class: str, url: str):
    """Collect up to ``element_count`` distinct image URLs from a page.

    The page at ``url`` is opened in ``wd`` and scrolled to the bottom
    repeatedly. After every scroll, each element carrying ``element_class`` is
    inspected; its ``src`` attribute is recorded when it is non-empty, contains
    ``'http'`` and has not been collected yet.

    Collection stops as soon as ``element_count`` URLs were gathered, or when a
    full scroll pass produced no new URL (the page is considered exhausted).
    The previous loop condition kept looping forever in the second case.

    Args:
        wd: Selenium driver used for browsing. It is always quit before this
            function returns or raises.
        element_count: Maximum number of URLs to collect.
        element_class: Class name used to locate the image elements.
        url: Address of the page to scrape.

    Returns:
        A set with at most ``element_count`` URL strings.
    """
    img_urls = set()
    try:
        wd.get(url=url)

        while len(img_urls) < element_count:
            scroll_to_end(wd=wd)

            found_new = False
            # NOTE(review): find_elements_by_class_name is the legacy locator
            # API (removed in Selenium 4.3); switch to
            # find_elements(By.CLASS_NAME, ...) when upgrading Selenium.
            for elem in wd.find_elements_by_class_name(element_class):
                if len(img_urls) >= element_count:
                    break
                # Read the attribute once; every call is a driver round-trip.
                src = elem.get_attribute('src')
                if src and 'http' in src and src not in img_urls:
                    img_urls.add(src)
                    found_new = True

            if not found_new:
                # Nothing new appeared after scrolling: stop instead of
                # spinning on a page that has no more results.
                break
    finally:
        # Release the browser even if navigation or element lookup failed.
        wd.quit()
    return img_urls

In [30]:
def preprocess_image(img: "Image.Image", img_size: int):
    """Fit an image into a square black canvas and convert it to grayscale.

    The image is resized so that its longer side equals ``img_size`` while the
    aspect ratio is kept, pasted centred onto an ``img_size`` x ``img_size``
    black RGB canvas, and the canvas is converted to mode ``'L'``.

    Args:
        img: Source image (anything exposing ``width``, ``height`` and
            ``resize`` like a PIL image).
        img_size: Edge length of the square output in pixels.

    Returns:
        The ``img_size`` x ``img_size`` canvas converted to mode ``'L'``.
    """
    aspect = img.width / img.height
    # The shorter side is clamped to at least one pixel: for very elongated
    # images the truncating int() would otherwise yield 0 and make the resize
    # fail.
    if aspect > 1:
        target = (img_size, max(1, int(img_size / aspect)))
    else:
        target = (max(1, int(img_size * aspect)), img_size)
    img = img.resize(target)

    canvas = Image.new('RGB', (img_size, img_size), (0, 0, 0))
    # Top-left corner that centres the resized image on the canvas.
    offset = ((img_size - img.width) // 2, (img_size - img.height) // 2)
    canvas.paste(img, offset)

    return canvas.convert('L')

In [31]:
def save_img(path: str, url: str, timeout: float = 10.0):
    """Download an image, normalise it and store it as a BMP file.

    The downloaded bytes are opened as an image, converted to RGB, passed
    through ``preprocess_image`` with a size of 100 and written to ``path``
    under a file name made of the SHA-1 hex digest of the downloaded bytes plus
    ``.bmp``.

    Failures are reported with ``print`` and never raised, so one bad URL does
    not abort a batch of downloads.

    Args:
        path: Directory the BMP file is written to.
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block indefinitely.

    Returns:
        None.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error responses as download failures instead of trying to
        # decode an error page as an image.
        response.raise_for_status()
        img = response.content
    except Exception as e:
        print("Error while trying to download: {0} \n".format(url))
        # Nothing was downloaded; previously execution fell through and hit an
        # unbound ``img``, producing a second, misleading "store" error.
        return

    try:
        img_data = io.BytesIO(img)

        image = Image.open(img_data).convert('RGB')
        image = preprocess_image(image, 100)
        # Content hash as file name: identical downloads map to the same file.
        file_path = os.path.join(path, hashlib.sha1(img).hexdigest() + '.bmp')
        with open(file_path, 'wb') as f:
            image.save(f, "BMP")
    except Exception as e:
        print("Error {1} while trying to store: {0}".format(url, e))

In [32]:
# Directory that receives the scraped images (absolute path on the author's machine).
path = (
    "/Users/robert/Documents/RepositorysLocal"
    "/BerlinerPfannkuchenKlassifikatior"
    "/BerlinerPfannkuchenKlassifikator/data"
)

# Google search URL for the query "Berliner (Gebäck)", split per query
# parameter for readability; the pieces concatenate to the original string.
url = (
    "https://www.google.com/search"
    "?q=Berliner+(Geb%C3%A4ck)"
    "&tbm=isch"
    "&ved=2ahUKEwikusWEnOP8AhWcxrsIHa14AdYQ2-cCegQIABAA"
    "&oq=Berliner+(Geb%C3%A4ck)"
    "&gs_lcp=CgNpbWcQAzIECAAQHjIECAAQHjIECAAQHjoFCAAQgAQ6BggAEAgQHlDyAliYH2DrIWgAcAB4AIABgwGIAbMFkgEEMTAuMZgBAKABAaoBC2d3cy13aXotaW1nwAEB"
    "&sclient=img"
    "&ei=PmPRY6SjApyN7_UPrfGFsA0"
    "&bih=957"
    "&biw=1200"
)

def scrape_images(
    path: str,
    url: str,
    count: int,
    driver_path: str = '/Users/robert/Documents/RepositorysLocal/BerlinerPfannkuchenKlassifikatior/chromedriver_mac_arm64/chromedriver',
):
    """Scrape image URLs from a Google image search page and save the images.

    A Chrome driver is started, ``get_images`` collects the image URLs from
    ``url`` and every URL is handed to ``save_img`` together with ``path``.

    Args:
        path: Directory passed to ``save_img`` as the storage location.
        url: Search result page to scrape.
        count: Maximum number of images to collect. This value was previously
            ignored in favour of a hard-coded 2.
        driver_path: Location of the chromedriver executable. Defaults to the
            path that used to be hard-coded in the body.

    Returns:
        None.
    """
    driver = webdriver.Chrome(driver_path)
    # "rg_i.Q4LuWd" is the class of the images in the Google search results.
    img_urls = get_images(wd=driver, element_count=count, element_class="rg_i.Q4LuWd", url=url)

    for img_url in img_urls:
        save_img(path, img_url)


In [None]:
# Run the scraper with the configured target directory and search URL.
scrape_images(path=path, url=url, count=2)