# Import Libraries

In [2]:
import os
import time
import urllib
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# Script

Helper functions

In [3]:
# Scrolls to the bottom of Google Images results
import urllib.request


def scroll_to_bottom(driver, pause_time=2):
    """Scroll the current page down until its height stops growing.

    After each scroll the function waits ``pause_time`` seconds, makes a
    best-effort attempt to click the 'More results' link, and then compares
    the document height with the previous measurement. It returns as soon as
    two consecutive measurements are equal.

    Args:
        driver: Selenium WebDriver (any object exposing ``execute_script``
            and ``find_element``) that is already showing the results page.
        pause_time: Seconds to wait after each scroll and after a successful
            'More results' click. Increase if the connection is slow.
    """
    last_height = driver.execute_script('return document.body.scrollHeight')

    while True:
        driver.execute_script('window.scrollTo(0,document.body.scrollHeight)')

        # Wait for results to load
        time.sleep(pause_time)

        # Check for 'More results' button and click it. The button is absent
        # on most iterations, so a failed lookup/click is expected and ignored.
        # `except Exception` (not a bare `except`) keeps Ctrl-C / kernel
        # interrupt working while the loop is running.
        try:
            more_results_button = driver.find_element(By.XPATH, "//a[@class='T7sFge sW9g3e VknLRd']")
            more_results_button.click()
            time.sleep(pause_time)
        except Exception:
            pass

        # Check if we have reached the bottom of the page
        new_height = driver.execute_script('return document.body.scrollHeight')

        if new_height == last_height:
            break

        last_height = new_height

# Downloads an image from a URL
def download_image(image_url, prefix, save_folder, image_count):
    """Download one image to ``<save_folder>/<prefix>_<image_count>.jpg``.

    Any exception raised while retrieving the URL is caught and reported on
    stdout rather than propagated, so one bad URL does not stop a batch.
    The file is always named with a ``.jpg`` extension, whatever the actual
    format of the downloaded data is.

    Args:
        image_url: URL passed to ``urllib.request.urlretrieve``.
        prefix: File-name prefix.
        save_folder: Existing directory the file is written into.
        image_count: Number appended to the prefix and used in log messages.

    Returns:
        bool: True if the retrieval completed, False if it raised.
    """
    target_path = os.path.join(save_folder, f'{prefix}_{image_count}.jpg')

    try:
        urllib.request.urlretrieve(image_url, target_path)
    except Exception as e:
        print(f'Could not download image {image_count}: {e}')
        return False

    print(f'Downloaded image {image_count}')
    return True

# Scraper function
def scrape_images(query, num_images, prefix, save_folder):
    """Search Google Images for ``query`` and save matching images to disk.

    Opens Chrome, submits the query on https://images.google.com, scrolls
    until the page stops growing, then parses the page source and downloads
    every ``<img class="YQ4gaf">`` that has a ``src`` and a ``width``
    attribute greater than 200. Files are written by ``download_image`` and
    numbered from 1. The browser is closed on exit, including when an error
    is raised part-way through.

    Args:
        query: Text typed into the search box.
        num_images: Maximum number of images to hand to the downloader.
        prefix: File-name prefix passed to ``download_image``.
        save_folder: Output directory; created if it does not exist.
    """
    # exist_ok avoids the separate exists() check (and its race)
    os.makedirs(save_folder, exist_ok=True)

    # Configure Chrome options
    chrome_options = Options()
    chrome_options.add_argument('--start-maximized')
    # Service() without an explicit path: driver lookup is left to Selenium
    chrome_service = Service()
    # Instantiate ChromeDriver
    driver = webdriver.Chrome(service=chrome_service, options=chrome_options)

    # From here on the browser must be closed no matter what happens.
    try:
        # Go to Google Images
        driver.get('https://images.google.com')
        # Find search box element, fill in the query and press ENTER
        searchbox = driver.find_element(By.NAME, 'q')
        searchbox.send_keys(query)
        searchbox.send_keys(Keys.RETURN)

        # Scroll to load more images
        scroll_to_bottom(driver=driver)

        # Parse HTML and collect candidate <img> tags.
        # NOTE(review): 'YQ4gaf' is presumably the result-thumbnail class in
        # Google's current markup; it may need updating if the page changes.
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        candidates = soup.find_all('img', {'class': 'YQ4gaf'})

        # Download images. The num_images cap is applied AFTER filtering so
        # that skipped tags (no src, small/unknown width) do not use up quota.
        image_count = 1
        for img in candidates:
            if image_count > num_images:
                break

            img_url = img.get('src')
            if not img_url:
                continue

            # Tags without a numeric width attribute are skipped instead of
            # aborting the whole run with KeyError/ValueError.
            try:
                width = int(img.get('width'))
            except (TypeError, ValueError):
                continue

            if width > 200:
                download_image(img_url, prefix, save_folder, image_count)
                image_count += 1
    finally:
        driver.quit()

**Run scraper**

Search queries, listed as `prefix`: query text (set both in the cell below and re-run it for each entry)
- posture_arched_back: cat arched back real image
- posture_stretch: cat stretching real image
- tail_straight_up: cat tail straight up real image

In [6]:
# Root folder for scraped images; the path is relative, so it resolves against the current working directory
dataset_dir = '../data/raw/emeowtions-scraped'
# Folder this run writes into (presumably an unsorted staging area, going by the name - confirm)
save_dir = f'{dataset_dir}/_other/dump'
# Text that scrape_images types into the Google Images search box
query = 'cat arched back real image'

print(f'Scraping images for: {query}')
# Opens a Chrome window and blocks until scrolling and downloading are finished.
# num_images is an upper bound, not a guarantee; files are named posture_arched_back_<n>.jpg inside save_dir.
scrape_images(query=query, num_images=1500, prefix='posture_arched_back', save_folder=save_dir)

Scraping images for: cat arched back real image
Downloaded image 1
Downloaded image 2
Downloaded image 3
Downloaded image 4
Downloaded image 5
Downloaded image 6
Downloaded image 7
Downloaded image 8
Downloaded image 9
Downloaded image 10
Downloaded image 11
Downloaded image 12
Downloaded image 13
Downloaded image 14
Downloaded image 15
Downloaded image 16
Downloaded image 17
Downloaded image 18
Downloaded image 19
Downloaded image 20
Downloaded image 21
Downloaded image 22
Downloaded image 23
Downloaded image 24
Downloaded image 25
Downloaded image 26
Downloaded image 27
Downloaded image 28
Downloaded image 29
Downloaded image 30
Downloaded image 31
Downloaded image 32
Downloaded image 33
Downloaded image 34
Downloaded image 35
Downloaded image 36
Downloaded image 37
Downloaded image 38
Downloaded image 39
Downloaded image 40
Downloaded image 41
Downloaded image 42
Downloaded image 43
Downloaded image 44
Downloaded image 45
Downloaded image 46
Downloaded image 47
Downloaded image 48
D