# Web-Scraping Data for tflite model
Web-Scraping data for android plushie classification tflite model using BeautifulSoup selenium webdriver

In [28]:
import requests
from bs4 import BeautifulSoup
# import selenium
from selenium import webdriver

import os
import time
import io
from PIL import Image
import hashlib

In [8]:
DRIVER_PATH = '../chromedriver'
wd = webdriver.Chrome(executable_path=DRIVER_PATH)

## Scraping
Here are the functions to search and download on Google using webdriver


Referenced and edited to fit my own use: https://towardsdatascience.com/image-scraping-with-python-a96feda8af2d

In [20]:
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:webdriver, sleep_between_interactions:int=1):
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)    
    
    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    image_count = 0
    results_start = 0
    while image_count < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.rg_ic")
        number_results = len(thumbnail_results)
        
        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")
        
        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls    
            actual_images = wd.find_elements_by_css_selector('img.irc_mi')
            for actual_image in actual_images:
                if actual_image.get_attribute('src'):
                    image_urls.add(actual_image.get_attribute('src'))

            image_count = len(image_urls)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(1)
            try:
                load_more_button = wd.find_element_by_css_selector(".ksb")
                wd.execute_script("document.querySelector('.ksb').click();")
            except:
                pass

        # move the result startpoint further down
        results_start = len(thumbnail_results)

    return image_urls

def persist_image(folder_path:str,url:str):
    try:
        image_content = requests.get(url).content

    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")

    try:
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
            
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")

In [14]:
def search_and_download(search_term:str,driver_path:str,target_folder='./images',number_images=5):

    if not os.path.exists(target_folder):
        os.makedirs(target_folder)

    with webdriver.Chrome(executable_path=driver_path) as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5)
        
    for elem in res:
        persist_image(target_folder,elem)

## Downloading Pictures 

#### Is Android Plushie Image searches

- 'android plush' (75 imgs)

- 'android plushie' (35 images)

In [40]:
search_and_download('android plush', DRIVER_PATH, target_folder='./images/is_android_plushie', number_images=75)

Found: 200 search results. Extracting links from 0:200
Found: 75 image links, done!
SUCCESS - saved https://www.dhresource.com/albu_393586365_00/1.600x600.jpg - as ./images/is_android_plushie/45e8b5ca55.jpg
SUCCESS - saved https://di2ponv0v5otw.cloudfront.net/posts/2019/01/17/5c4118f52e14783dd0bd66e8/m_5c411902aa87706eeec57ec0.jpg - as ./images/is_android_plushie/62a50d73aa.jpg
SUCCESS - saved https://i.etsystatic.com/6940002/r/il/d21cb0/1786561941/il_570xN.1786561941_q4sp.jpg - as ./images/is_android_plushie/0e2640b390.jpg
SUCCESS - saved https://img.letgo.com/images/7c/99/2e/5b/7c992e5bb649c6dd87ae2e5bcf3e10c3.jpeg?impolicy=img_600 - as ./images/is_android_plushie/051ac6c9c3.jpg
SUCCESS - saved https://i.ytimg.com/vi/PMsbacpk9IA/hqdefault.jpg - as ./images/is_android_plushie/7e212c9080.jpg
SUCCESS - saved http://sc02.alicdn.com/kf/HTB1KdvydbSYBuNjSspiq6xNzpXae/Authorization-custom-plush-android-plush-toy-stuffed.jpg - as ./images/is_android_plushie/0e0d24fd6d.jpg
SUCCESS - saved http

SUCCESS - saved https://www.promoplace.com/ws/ws.dll/QPic?SN=69740&P=314759939&RS=300 - as ./images/is_android_plushie/e0f04195a5.jpg
SUCCESS - saved https://media.karousell.com/media/photos/products/2016/11/26/android_plush_toy_1480140650_b12478c0.jpg - as ./images/is_android_plushie/583f03a518.jpg
SUCCESS - saved https://live.staticflickr.com/7291/8802170700_d33baf2751_b.jpg - as ./images/is_android_plushie/b0d17b05a9.jpg
ERROR - Could not download https://www.spotsound.fr/7066-large_default/android-robot-mascot-green-green-robot-costume.jpg - HTTPSConnectionPool(host='www.spotsound.fr', port=443): Max retries exceeded with url: /7066-large_default/android-robot-mascot-green-green-robot-costume.jpg (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x7f11b9961e90>: Failed to establish a new connection: [Errno 101] Network is unreachable'))
ERROR - Could not save https://www.spotsound.fr/7066-large_default/android-robot-mascot-green-green-robot-costum

In [41]:
search_and_download('android plushie', DRIVER_PATH, target_folder='./images/is_android_plushie', number_images=35)

Found: 200 search results. Extracting links from 0:200
Found: 35 image links, done!
SUCCESS - saved https://i.ytimg.com/vi/PMsbacpk9IA/hqdefault.jpg - as ./images/is_android_plushie/7e212c9080.jpg
SUCCESS - saved https://images-na.ssl-images-amazon.com/images/I/61d4KRK2bjL._AC_SL1500_.jpg - as ./images/is_android_plushie/60cfe86a8a.jpg
SUCCESS - saved https://i.ebayimg.com/images/g/pW4AAOSwz21c82Ak/s-l500.jpg - as ./images/is_android_plushie/53212311b3.jpg
SUCCESS - saved https://www.picclickimg.com/d/l400/pict/292683884502_/Dragonball-Z-8-ANDROID-17-PLUSH-FIGURE.jpg - as ./images/is_android_plushie/1ec7ef1816.jpg
SUCCESS - saved https://i.pinimg.com/originals/2e/97/24/2e9724dfdd1b5a94085114612cc01fc1.jpg - as ./images/is_android_plushie/98edd37767.jpg
SUCCESS - saved https://i.ebayimg.com/images/g/TFoAAOSwrfVZOhKC/s-l300.jpg - as ./images/is_android_plushie/4538019ea2.jpg
SUCCESS - saved https://cdn.instructables.com/FFD/E4SJ/IRXHLMKA/FFDE4SJIRXHLMKA.LARGE.jpg?auto=webp&width=1024&hei

#### Not android Plushie searches

- 'green toy' (55 imgs)
- 'soft toy' (25 imgs)
- 'toy' (30 imgs)

In [37]:
search_and_download('green toy', DRIVER_PATH, target_folder='./images/not_android_plushie', number_images=55)

Found: 200 search results. Extracting links from 0:200
Found: 55 image links, done!
SUCCESS - saved https://www.eco-business.com/media/cache/4a/04/4a04edeb1bcb42e57a35e52b11a0f977.jpg - as ./images/not_android_plushie/6d3b04fbb8.jpg
SUCCESS - saved https://www.lamkins.com.sg/images/leapfrog%20lettersaurus%20blue.jpg - as ./images/not_android_plushie/bac8703942.jpg
SUCCESS - saved https://upload.wikimedia.org/wikipedia/commons/thumb/2/24/Yo-Yo-Plastic-Toy-Green.jpg/1200px-Yo-Yo-Plastic-Toy-Green.jpg - as ./images/not_android_plushie/5b8f99d9fa.jpg
SUCCESS - saved https://cf.shopee.sg/file/350e85b9d6c5b20b62d13192b51620f1 - as ./images/not_android_plushie/3f97a11297.jpg
SUCCESS - saved https://cf.shopee.sg/file/d996efd177e74d5d2b0fa4c8ba5967cd - as ./images/not_android_plushie/e334c27993.jpg
SUCCESS - saved https://gd.image-gmkt.com/li/300/442/1222442300.g_400-w-st_g.jpg - as ./images/not_android_plushie/dad436de56.jpg
SUCCESS - saved https://images-na.ssl-images-amazon.com/images/I/61aD

SUCCESS - saved https://www.kissnature.co.uk/wp-content/uploads/2019/04/T-rex-teether.jpg - as ./images/not_android_plushie/c3e80ee777.jpg
SUCCESS - saved https://cdn.shopify.com/s/files/1/0004/0855/1482/products/140618_helicopter_green_with_bear_re.jpg?v=1571746629 - as ./images/not_android_plushie/858f230c56.jpg
SUCCESS - saved https://www.greendiaperstore.com/images/detailed/19/PullToy_LQ_20171026B.crop__(1).jpg - as ./images/not_android_plushie/18342f49fd.jpg
SUCCESS - saved https://cdn.shopify.com/s/files/1/0149/8336/4708/products/594669_1024x.jpg?v=1573159946 - as ./images/not_android_plushie/ba147eb420.jpg
SUCCESS - saved https://images-na.ssl-images-amazon.com/images/I/61E49Wl-7iL._SY355_.jpg - as ./images/not_android_plushie/975502632e.jpg
SUCCESS - saved https://ae01.alicdn.com/kf/HTB1aFRAIXXXXXXqXpXXq6xXFXXXc/New-Volkswagen-1-32-Beetle-Coupe-Diecast-Model-Car-Green-Toy-collection-B153c.jpg - as ./images/not_android_plushie/85b3450de6.jpg


In [38]:
search_and_download('soft toy', DRIVER_PATH, target_folder='./images/not_android_plushie', number_images=25)

Found: 200 search results. Extracting links from 0:200
Found: 25 image links, done!
SUCCESS - saved https://static.zara.net/photos///2018/I/0/3/p/5946/591/064/2/w/560/5946591064_1_1_1.jpg?ts=1542305057555 - as ./images/not_android_plushie/9ff0caaab9.jpg
SUCCESS - saved https://diapercakes.sg/wp-content/uploads/2019/01/18323Pooh.jpg - as ./images/not_android_plushie/609e18253e.jpg
SUCCESS - saved https://www.ikea.com/sg/en/images/products/kramig-soft-toy__0162448_PE317642_S5.JPG - as ./images/not_android_plushie/1db4a5dd85.jpg
SUCCESS - saved https://www.pupsikstudio.com/media/catalog/product/cache/1/image/450x/9df78eab33525d08d6e5fb8d27136e95/p/i/pinkfong-baby-shark-official-song-cube-baby-shark_2.jpg - as ./images/not_android_plushie/24f98933e9.jpg
SUCCESS - saved https://www.natures-collection.com/wp-content/uploads/2017/12/G4053934-Gund-Animated-Flappy-12-Inches-high-res.jpg - as ./images/not_android_plushie/3dfa0354fc.jpg
SUCCESS - saved https://noodoll.com/media/catalog/product/ca

In [39]:
search_and_download('toy', DRIVER_PATH, target_folder='./images/not_android_plushie', number_images=30)

Found: 200 search results. Extracting links from 0:200
Found: 30 image links, done!
SUCCESS - saved https://heavyeditorial.files.wordpress.com/2019/08/51-best-toys-for-10-year-old-girls-the-ultimate-list-2019.jpg?quality=65&strip=all&w=780 - as ./images/not_android_plushie/38e9234ab2.jpg
SUCCESS - saved https://shop.qoobee.com/wp-content/uploads/3-QooBee-Plushy-2019-Eating-600.png - as ./images/not_android_plushie/7d47130999.jpg
SUCCESS - saved https://assets.bwbx.io/images/users/iqjWHBFdfxIU/iQ.B92_.0SRs/v0/360x-1.jpg - as ./images/not_android_plushie/094aea7ae6.jpg
SUCCESS - saved https://images-na.ssl-images-amazon.com/images/I/61EYVtXVHEL._AC_SX355_.jpg - as ./images/not_android_plushie/a6e9b786e3.jpg
SUCCESS - saved https://assets.bwbx.io/images/users/iqjWHBFdfxIU/ivReUEOyfpKM/v0/1000x-1.jpg - as ./images/not_android_plushie/9edf1af014.jpg
SUCCESS - saved https://hips.hearstapps.com/vader-prod.s3.amazonaws.com/1574281130-race-track-1574281117.jpg - as ./images/not_android_plushie/

## Scraping done

The images have been scraped:

- Is android plushie: 110 Images
- Not android plushie: 110 Images

The images now will be cleaned manually, where images that do not fit the description will be removed.

After cleaning:

- Is android Plushie: 90 Images
- Not android Plushie: 105 Images