**Problem**
<p>Using HTML web scrapers to scrape Google Images results is limited to the first 100 images. This is because the rest of the images are only loaded by JavaScript as the user scrolls. Using Selenium, we can simulate a user scrolling to the bottom of the page and <strong>THEN</strong> scrape for images.</p>

**Useful links**
- https://simply-python.com/2015/05/18/saving-images-from-google-search-using-selenium-and-python/
- http://stackoverflow.com/questions/20716842/python-download-images-from-google-image-search

**Installation**
- pip3 install -U selenium
- download Firefox's geckodriver and add it to your PATH
- make sure you're using Firefox 53 or later!
- test everything works in python shell

In [None]:
# tests
# binary = FirefoxBinary('/path/to/your/firefox_executable')
# browser = webdriver.Firefox(firefox_binary = binary)

In [1]:
# standard
from collections import defaultdict
import json
import os
import shutil
import sys
import time
import urllib

# vendors
from selenium import webdriver
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary

In [2]:
class GoogleImageExtractor:
    """Collect image links from Google Images with Selenium and download them.

    Typical use is three steps: construct the extractor with a list of
    search queries, call ``exec_queries()`` to drive Firefox and gather the
    image links, then call ``save_imgs()`` to download every gathered link
    into one sub-directory per query.
    """

    # google images search url
    URL = 'https://www.google.co.in/search?q={0}&source=lnms&tbm=isch'
    # request header sent with every image download
    HEADERS = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
    }
    # appropriate file extensions
    EXTS = {'jpg', 'jpeg', 'png', 'gif'}
    # seconds to wait for a single image download before giving up on it
    TIMEOUT = 30

    def __init__(self, queries,
                 firefox_exe,
                 req_num = 200,
                 dl_path = 'google_search',
                 allow_dups = False,
                 overwrite_dirs = False,
                 debug = False):
        """Store the settings and derive the search urls and save paths.

        Args:
            queries: iterable of search strings; a single string is treated
                as one query.
            firefox_exe: path to the firefox executable.
            req_num: maximum number of links to keep per query.
            dl_path: root directory the images are saved under.
            allow_dups: allow the same link to appear in several query
                buckets.
            overwrite_dirs: delete ``dl_path`` in ``save_imgs`` if it
                already exists.
            debug: enable progress print statements.
        """
        # local import: the notebook only does a bare `import urllib`
        from urllib.parse import quote_plus

        # a lone string would otherwise be iterated character by character,
        # and a generator would be exhausted after the first pass below
        if isinstance(queries, str):
            queries = [queries]
        else:
            queries = list(queries)

        # excess print statements enabled/disabled
        self.debug = debug
        # path to firefox executable
        self.firefox_exe = firefox_exe
        # allow duplicates across queries
        self.allow_dups = allow_dups
        # overwrite image dirs if found
        self.overwrite_dirs = overwrite_dirs
        # download path
        self.dl_path = dl_path
        # number of images to download
        self.req_num = req_num
        # number of times we should scroll
        self.scroll_num = int(self.req_num / 400) + 1

        # used to determine unique links
        self.links = set()

        # query buckets filled with links
        # e.g. {query_path: {link: ext}}
        self.query_links = defaultdict(dict)

        # build urls with the queries; quote_plus turns spaces into '+' and
        # percent-encodes characters such as '&', '#' and '+' that would
        # otherwise corrupt the query string
        stripped = [query.strip() for query in queries]
        self.urls = [self.URL.format(quote_plus(query)) for query in stripped]

        # build save paths with the queries
        self.query_paths = [os.path.join(dl_path, query.replace(' ', '_'))
                            for query in stripped]

        if self.debug:
            print('download path: {0}'.format(self.dl_path))
            print('request number: {0}'.format(self.req_num))
            print('scroll number: {0}'.format(self.scroll_num))
            print('overwrite image dirs: {0}'.format(self.overwrite_dirs))
            print('allow duplicates: {0}'.format(self.allow_dups))
            print('search urls: {0}'.format(self.urls))
            print('save paths: {0}'.format(self.query_paths))

    def scroll_to_bottom(self):
        """Scroll the current results page so more images get loaded.

        Each round scrolls ten times with short pauses and then tries to
        click the element with id ``smb``. Scrolling stops after
        ``self.scroll_num`` rounds or as soon as that click fails.
        """
        # multiple scrolls needed to show all images
        for _ in range(self.scroll_num):
            for __ in range(10):
                if self.debug:
                    print('scrolling...')
                self.browser.execute_script('window.scrollBy(0, 1000000)')
                time.sleep(0.2)
            time.sleep(0.5)
            try:
                if self.debug:
                    print('looking for "Show more results" button...')
                self.browser.find_element_by_id('smb').click()
            except Exception:
                # button not found or not clickable; `Exception` rather than
                # a bare except so Ctrl-C still interrupts the run
                if self.debug:
                    print('button not found, done scrolling')
                break

    def get_img_links(self, query_path):
        """Read image links off the current page into ``query_path``'s bucket.

        Only links whose type is in ``EXTS`` are kept, up to ``req_num`` per
        bucket. When ``allow_dups`` is false, a link that was already seen
        is removed from every bucket instead of being added again.
        """
        assert query_path not in self.query_links

        # get all image meta data tags
        metas = self.browser.find_elements_by_xpath('//div[@class="rg_meta"]')
        if self.debug:
            print('Total image metas found: ', len(metas))

        for meta in metas:
            # parse the embedded json once and read both fields from it
            info = json.loads(meta.get_attribute('innerHTML'))
            img_url = info['ou']
            img_type = info['ity']
            bucket = self.query_links[query_path]

            if self.allow_dups:
                # dups allowed between buckets, but not inside one bucket
                if img_url not in bucket and img_type in self.EXTS:
                    bucket[img_url] = img_type
            elif img_url in self.links:
                # link exists already, remove it from every query bucket
                if self.debug:
                    print('found duplicate, removing from all query buckets')
                for links in self.query_links.values():
                    links.pop(img_url, None)
            elif img_type in self.EXTS:
                # link does not exist, add it
                bucket[img_url] = img_type

            self.links.add(img_url)

            # max link count
            if len(bucket) >= self.req_num:
                if self.debug:
                    print('found max request count, exiting...')
                break

    def exec_queries(self):
        """Start Firefox, run every search url and collect its image links."""
        if self.debug:
            print('creating firefox binary...')
        self.binary = FirefoxBinary(self.firefox_exe)
        if self.debug:
            print('generating firefox browser...')
        self.browser = webdriver.Firefox(firefox_binary = self.binary)

        try:
            for url, q_path in zip(self.urls, self.query_paths):
                self.browser.get(url)
                self.scroll_to_bottom()
                self.get_img_links(q_path)
        finally:
            # always shut the browser down, even when a query fails
            self.browser.quit()

    def save_imgs(self):
        """Download every collected link into its query directory.

        ``dl_path`` is created fresh: it is deleted first when
        ``overwrite_dirs`` is true and must not exist otherwise. A download
        that fails is skipped (and reported when ``debug`` is on) so the
        remaining images are still fetched.

        Raises:
            AssertionError: if no links were collected yet, or ``dl_path``
                exists while ``overwrite_dirs`` is false.
        """
        # local import: a bare `import urllib` does not guarantee that the
        # `urllib.request` submodule is loaded
        import urllib.request

        assert self.links, 'no links collected, run exec_queries() first'
        assert self.query_links, 'no query buckets, run exec_queries() first'
        assert self.query_paths, 'no queries were given'

        if self.overwrite_dirs:
            if os.path.exists(self.dl_path):
                shutil.rmtree(self.dl_path)
        else:
            assert not os.path.exists(self.dl_path), \
                '{0} already exists'.format(self.dl_path)

        # makedirs so that a nested download path works too
        os.makedirs(self.dl_path)

        # for every query bucket
        for q_path in self.query_paths:
            os.makedirs(q_path)

            # for every link in that bucket
            for i, (link, ext) in enumerate(self.query_links[q_path].items()):
                try:
                    # send the browser-like headers; the timeout keeps one
                    # stalled host from blocking the whole run
                    request = urllib.request.Request(link, headers=self.HEADERS)
                    with urllib.request.urlopen(request, timeout=self.TIMEOUT) as resp:
                        raw_img = resp.read()
                    save_path = os.path.join(q_path, '{0}.{1}'.format(i, ext))
                    with open(save_path, 'wb') as f:
                        f.write(raw_img)
                    if self.debug:
                        print('{0} at {1}'.format(save_path, link))
                except Exception as e:
                    # best effort: skip this image and keep going
                    if self.debug:
                        print('{0} could not load {1}'.format(i, link))
                        print(e)
                

In [3]:
# one search (and one download sub-directory) per entry
queries = [
    'minecraft',
    'minecraft gameplay',
    'minecraft screenshots',
    'minecraft pig',
]
# keep at most 10 links per query and print progress while running
extr = GoogleImageExtractor(
    queries,
    firefox_exe='/home/jarmentr/bin/firefox-53',
    req_num=10,
    debug=True,
)

download path: google_search
request number: 10
scroll number: 1
overwrite image dirs: False
allow duplicates: False
search urls: ['https://www.google.co.in/search?q=minecraft&source=lnms&tbm=isch', 'https://www.google.co.in/search?q=minecraft+gameplay&source=lnms&tbm=isch', 'https://www.google.co.in/search?q=minecraft+screenshots&source=lnms&tbm=isch', 'https://www.google.co.in/search?q=minecraft+pig&source=lnms&tbm=isch']
save paths: ['google_search/minecraft', 'google_search/minecraft_gameplay', 'google_search/minecraft_screenshots', 'google_search/minecraft_pig']


In [4]:
# open Firefox, load each search url, scroll the page and collect image links
extr.exec_queries()

creating firefox binary...
generating firefox browser...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
looking for "Show more results" button...
Total image metas found:  400
found max request count, exiting...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
looking for "Show more results" button...
Total image metas found:  400
found max request count, exiting...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
looking for "Show more results" button...
Total image metas found:  400
found max request count, exiting...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
scrolling...
looking for "Show more results" button...
Total image metas found:  400
found max request count, ex

In [5]:
# let save_imgs() delete an existing download directory instead of asserting
extr.overwrite_dirs = True
# download every collected link into its query's directory
extr.save_imgs()

google_search/minecraft/0.png at https://media.mojang.com/blog-image/2c34ca1217c7d95e76a6f8d646adf9208f78145a/blogmcnet.png
google_search/minecraft/1.png at https://ragezone.com/wp-content/uploads/2017/02/minecraft.png
google_search/minecraft/2.jpg at https://minecraft.net/static/pages/img/minecraft-hero-og.8192ae9ca275.jpg
google_search/minecraft/3.jpg at https://minecraft.net/static/pages/img/index-hero-og.0757cc783ca4.jpg
google_search/minecraft/4.jpg at https://screenshots.en.sftcdn.net/en/scrn/189000/189271/minecraft-10-700x393.jpg
google_search/minecraft/5.jpg at https://i.ytimg.com/vi/mVyI-dzPawY/maxresdefault.jpg
google_search/minecraft/6.png at https://minecraft.net/static/pages/img/minecraft-hero-cta.4fd6713be829.png
google_search/minecraft/7.png at https://media.mojang.com/blog-image/0db18353862a2f2d4d029b757914935a930311a4/0_17_Update_Mojang_Blog_1024x576.png
google_search/minecraft/8.jpg at https://minecraft.net/static/pages/img/minecraft-hero.df1112867f04.jpg
9 could not 

In [None]:
# extr.links is a set (no .keys()); show an arbitrary sample of 10 links
# for link in list(extr.links)[:10]:
#     print(link)