## Web Scraping for Nigerian Fashion Images
This exercise involves collecting images of men's and women's fashion in different colours, based on the colour library CSV.
The end task should be:
1. 50,000 labeled images (30k women's styles/colours, 20k men's styles/colours)
2. Images should be labeled by colour
3. Image dimensions should be 128px × 128px
Data is collected from Google Images using Selenium.

#### import libraries

In [1]:
import os
import time
import requests
import io
import pathlib
import numpy as np
import pandas as pd
import selenium
import PIL
import hashlib
from PIL import Image
from selenium import webdriver

#### load datasets

In [2]:
# Paths used throughout the notebook: the ChromeDriver binary and the root
# folders that receive the men's / women's images.
dvr, men_dir, women_dir = (
    "dataset/chromedriver.exe",
    "dataset/image_library/men",
    "dataset/image_library/women",
)

# Colour library: its 'Color_name' column drives the per-colour folders below.
color_lib = pd.read_csv("dataset/color_dataset.csv")
color_lib

Unnamed: 0,Color_name
0,Red
1,Green
2,Blue
3,Yellow
4,Orange
5,Pink
6,Purple
7,Brown
8,Grey
9,Black


In [3]:
#create one folder per colour in the library, under both the men and women roots
for x in color_lib['Color_name']:
    # Strip surrounding whitespace: a value such as "Blue " would otherwise
    # produce a directory name with a trailing space, which is awkward to
    # reference and is not handled consistently across file systems.
    color_folder = x.strip()
    # parents/exist_ok make the cell safe to re-run.
    pathlib.Path(men_dir,color_folder).mkdir(parents=True, exist_ok=True)
    pathlib.Path(women_dir,color_folder).mkdir(parents=True, exist_ok=True)
    print("\nFolder created in: ", os.path.join(men_dir, color_folder))
    print("\nFolder created in: ", os.path.join(women_dir, color_folder))


Folder created in:  dataset/image_library/men\Red

Folder created in:  dataset/image_library/women\Red

Folder created in:  dataset/image_library/men\Green

Folder created in:  dataset/image_library/women\Green

Folder created in:  dataset/image_library/men\Blue 

Folder created in:  dataset/image_library/women\Blue 

Folder created in:  dataset/image_library/men\Yellow

Folder created in:  dataset/image_library/women\Yellow

Folder created in:  dataset/image_library/men\Orange

Folder created in:  dataset/image_library/women\Orange

Folder created in:  dataset/image_library/men\Pink

Folder created in:  dataset/image_library/women\Pink

Folder created in:  dataset/image_library/men\Purple

Folder created in:  dataset/image_library/women\Purple

Folder created in:  dataset/image_library/men\Brown

Folder created in:  dataset/image_library/women\Brown

Folder created in:  dataset/image_library/men\Grey

Folder created in:  dataset/image_library/women\Grey

Folder created in:  dataset/i

### Web scraping for men's and women's wear using Selenium

##### load chrome driver
wd.get('https://google.com')

search_box = wd.find_element_by_css_selector('input.gLFyf')

search_box.send_keys('Dogs')

In [4]:
# Start the Chrome session that the scraping functions below drive.
wd = webdriver.Chrome(executable_path=dvr)

In [5]:
def fetch_image_urls(query:str, max_links_to_fetch:int, wd:"webdriver.Remote", sleep_between_interactions:float=1, load_more_wait:float=30):
    """Collect image URLs from a Google Images search.

    Loads the image-search results page for ``query``, clicks each thumbnail
    (``img.Q4LuWd``) and records the ``src`` of every ``img.n3VNCb`` element
    whose ``src`` contains ``http``. When a batch of thumbnails is exhausted
    without reaching the target, the "load more" button (``.mye4qd``) is
    clicked if present and the newly appended thumbnails are processed.

    Args:
        query: Search term; it is URL-encoded before being placed in the URL.
        max_links_to_fetch: Stop once at least this many URLs are collected.
        wd: Selenium driver used to load and interact with the page.
        sleep_between_interactions: Seconds to wait after each scroll/click.
        load_more_wait: Seconds to wait before asking for more results.

    Returns:
        A set of image URLs. It may hold fewer than ``max_links_to_fetch``
        entries when the page yields no further thumbnails, and slightly
        more when a single click exposes several images.
    """
    from urllib.parse import quote_plus

    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page; encode the query so spaces, '&', '#' etc. cannot break the URL
    wd.get(search_url.format(q=quote_plus(query)))

    image_urls = set()
    results_start = 0
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        # No thumbnails beyond those already processed: the results are
        # exhausted, so return what we have instead of looping forever.
        if number_results <= results_start:
            print(f"Found: {len(image_urls)} image links, no more results available.")
            break

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                # best effort: a thumbnail that cannot be clicked is skipped
                continue

            # extract image urls
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Batch finished without reaching the target: ask for more results.
            print("Found:", len(image_urls), "image links, looking for more ...")
            time.sleep(load_more_wait)
            # Guard against a missing button inside the script so an absent
            # element does not raise.
            wd.execute_script(
                "var b = document.querySelector('.mye4qd'); if (b) { b.click(); }"
            )

        # move the result startpoint further down
        results_start = number_results

    return image_urls

In [6]:
def persist_image(folder_path:str,url:str,timeout:float=10):
    """Download the image at ``url`` and save it as a JPEG in ``folder_path``.

    The file name is the first 10 hex characters of the SHA-1 digest of the
    downloaded bytes plus ``.jpg``, so identical content maps to the same
    file. Failures are reported on stdout rather than raised, so one bad URL
    does not abort a batch.

    Args:
        folder_path: Existing directory that receives the file.
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Treat HTTP error statuses as download failures instead of trying
        # to decode an error page as an image.
        response.raise_for_status()
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # Nothing was downloaded, so there is nothing to save.
        return

    try:
        image_file = io.BytesIO(image_content)
        # Convert to RGB so images with alpha/palette modes can be written as JPEG.
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")

#### search and download target images to image_library folder for hand labeling later

In [7]:
def search_and_download(search_term:str,driver_path:str,target_path='./dataset/image_library/men/scraped_images',number_images=500):
    """Search Google Images for ``search_term`` and save the results to disk.

    Images are written to a sub-folder of ``target_path`` named after the
    lower-cased search term with spaces replaced by underscores.

    Args:
        search_term: Query passed to ``fetch_image_urls``.
        driver_path: Accepted for interface compatibility but not used; the
            function drives the module-level ``wd`` browser session.
        target_path: Parent directory for the per-search folder.
        number_images: Number of image links requested from the search.

    Returns:
        None.
    """
    target_folder = os.path.join(target_path,'_'.join(search_term.lower().split(' ')))

    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(target_folder, exist_ok=True)

    res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.1)

    # Tolerate a search that produced no result set (None or empty).
    for elem in res or ():
        persist_image(target_folder,elem)

In [8]:
search_term = 'men ankara styles'
search_and_download(search_term = search_term, driver_path = dvr)

Found: 100 search results. Extracting links from 0:100
Found: 51 image links, done!
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS02U7s16E1a8vORwFerUEU82ASMTu9KN77rQ&usqp=CAU - as ./dataset/image_library/men/scraped_images\men_ankara_styles\b9ee53bb5a.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcR-cc_Uiq62EMoL0EsUGvfqaZezy_jAdFZpcQ&usqp=CAU - as ./dataset/image_library/men/scraped_images\men_ankara_styles\a330887166.jpg
SUCCESS - saved https://i.ytimg.com/vi/rlBlyh1Z03E/maxresdefault.jpg - as ./dataset/image_library/men/scraped_images\men_ankara_styles\1e26248ba3.jpg
SUCCESS - saved https://2.bp.blogspot.com/-FCG598NX5DE/W52spm-h6hI/AAAAAAAAAtg/01xoKxmpdRI4s-4dOXHTZrmv9iHfbE5igCLcBGAs/s1600/ankara-styles-for-men-41_2.jpg - as ./dataset/image_library/men/scraped_images\men_ankara_styles\66d2d3a046.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTpbey76QRxMm8c8tDqDvK3xpsquUUzNlw1NA&usqp=CAU - as ./dataset/ima

SUCCESS - saved https://i.ytimg.com/vi/1qHv3l3fuDc/maxresdefault.jpg - as ./dataset/image_library/men/scraped_images\men_ankara_styles\1f19d4cbba.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQnAdrOLnpoJ-_AXnp5NS8Og0UHEmuWWSUEiQ&usqp=CAU - as ./dataset/image_library/men/scraped_images\men_ankara_styles\00268cd39b.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRziwfngIkflqUUmJT1KLeuRY3zg2JKWMqQxw&usqp=CAU - as ./dataset/image_library/men/scraped_images\men_ankara_styles\5a4f38f794.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcT65tPyZAgq3xuZWjFOV9aT-RY6zenCqjETEQ&usqp=CAU - as ./dataset/image_library/men/scraped_images\men_ankara_styles\46de22e369.jpg
SUCCESS - saved https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSXGSEC3Yu0DwU55t_A8x5xQ-LKEBVip3_vsA&usqp=CAU - as ./dataset/image_library/men/scraped_images\men_ankara_styles\f93e7eb605.jpg
SUCCESS - saved https://netstorage-yen.akamaized.net/images/4e

#### close chrome window when done

In [None]:
#wd.quit()