## Make your own face dataset (Week 12) - Step 1


#### **Designed by Joon Son Chung, November 2020**

This script downloads images from Bing Image Search. At the time of writing, the API is free for up to 1,000 search queries per month.

Modify the following parameters, then click `Runtime > Run all`.

In [None]:
from google.colab import drive
import os, glob, sys, numpy, cv2, random, requests, shutil
from requests import exceptions

# mount Google Drive
drive.mount('/content/drive', force_remount=True)

# path of the data directory relative to the home folder of Google Drive
GDRIVE_HOME = '/content/drive/My Drive'
FOLDER      = 'MLVU/your_dataset' # This is the directory where your files will be saved

# this is the folder to write to
data_dir        = os.path.join(GDRIVE_HOME,FOLDER)
# images are first downloaded here (local disk) and zipped to data_dir later
temp_path       = './downloaded_images'

# Fail early, with an actionable message, if the target folder is missing.
# (An explicit raise is used instead of `assert`, which gives no hint about
# what went wrong and is skipped entirely when Python runs with -O.)
if not os.path.isdir(data_dir):
  raise FileNotFoundError(
    "Data directory '%s' does not exist. Create it in Google Drive or "
    "change FOLDER above."%data_dir)

# your Bing API key (Bing Search V7)
# NOTE(review): a literal key stored in a shared notebook can be used by
# anyone who can read the file. Prefer supplying your own key through the
# BING_API_KEY environment variable; the literal below is only a fallback.
API_KEY = os.environ.get('BING_API_KEY', "7fbee1b51ba24fa1b08c7e93360a0182")

# keywords to search (names of people)
words = ['유재석','강호동',
         '박근혜','손흥민',
         '추미애','안철수']

# number of images per identity
max_results = 100

print('We are going to search and download images for',len(words),'identities')

This is the tool for searching and downloading from Bing. **You do not need to change this section.**

In [None]:
# Adapted from https://www.pyimagesearch.com

def _url_extension(url, default='.jpg'):
  """Return a filename-safe, lower-case extension taken from an image URL.

  Only the path component of the URL is inspected, so query strings and
  fragments ('a.jpg?w=100#x') do not leak into the file name. When the last
  path segment has no short alphanumeric extension, `default` is returned.
  """
  from urllib.parse import urlsplit

  last_segment = urlsplit(url).path.rsplit('/', 1)[-1]
  ext = os.path.splitext(last_segment)[1].lower()
  if 1 < len(ext) <= 5 and ext[1:].isalnum():
    return ext
  return default


def search_and_download(term,tgt_dir,API_KEY,MAX_RESULTS=250,GROUP_SIZE=50):
  """Search Bing Image Search for `term` and save the images it returns.

  Images are written to `tgt_dir/term` as zero-padded sequential names
  (00000000.jpg, 00000001.png, ...). A file is kept only if `cv2.imread`
  can decode it; anything else is deleted again.

  Args:
    term: search query; also used as the name of the output sub-directory.
    tgt_dir: directory under which the per-term sub-directory is created.
    API_KEY: Bing Search V7 subscription key.
    MAX_RESULTS: upper bound on the number of search results examined.
    GROUP_SIZE: number of results requested per API call.

  Returns:
    The number of images that were downloaded and successfully decoded.

  Raises:
    requests.exceptions.RequestException: if a call to the search API itself
      fails (failures for individual images are skipped, not raised).
  """

  # Saved at tgt_dir/term
  save_dir = os.path.join(tgt_dir,term)

  # Make directory if missing
  if not os.path.exists(save_dir):
    print('Creating directory %s'%save_dir)
    os.makedirs(save_dir)

  URL = "https://api.bing.microsoft.com/v7.0/images/search"

  headers = {"Ocp-Apim-Subscription-Key" : API_KEY}
  params = {"q": term, "offset": 0, "count": GROUP_SIZE}

  # make the first search; it tells us how many results exist and already
  # contains the first group of results
  print("[INFO] searching Bing API for '{}'".format(term))
  search = requests.get(URL, headers=headers, params=params, timeout=30)
  search.raise_for_status()
  results = search.json()
  estNumResults = min(results.get("totalEstimatedMatches", 0), MAX_RESULTS)
  print("[INFO] {} total results for '{}'".format(estNumResults,
    term))

  # total number of images saved so far (also the next file index)
  total = 0

  # loop over the estimated number of results in `GROUP_SIZE` groups
  for offset in range(0, estNumResults, GROUP_SIZE):
    # The group at offset 0 was already fetched above; re-requesting it
    # would spend one of the metered API queries for nothing.
    if offset > 0:
      print("[INFO] making request for group {}-{} of {}...".format(
        offset, offset + GROUP_SIZE, estNumResults))
      params["offset"] = offset
      search = requests.get(URL, headers=headers, params=params, timeout=30)
      search.raise_for_status()
      results = search.json()
    print("[INFO] saving images for group {}-{} of {}...".format(
      offset, offset + GROUP_SIZE, estNumResults))

    # never look at more than `estNumResults` results overall, even when the
    # last group would overshoot that limit
    candidates = results.get("value", [])[:estNumResults - offset]

    for v in candidates:
      url = v.get("contentUrl")
      if not url:
        continue

      # build the path to the output image
      p = os.path.join(save_dir, "{}{}".format(
        str(total).zfill(8), _url_extension(url)))

      # try to download the image and write it to disk
      try:
        print("[INFO] fetching: {}".format(url))
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        with open(p, "wb") as f:
          f.write(r.content)
      except Exception:
        # Best-effort download: whatever went wrong for this one image
        # (network error, bad URL, disk error, ...), skip it and move on
        # without counting it.
        print("[INFO] skipping: {}".format(url))
        continue

      # if the image cannot be decoded it is not a usable image file,
      # so remove it and do not count it
      image = cv2.imread(p)
      if image is None:
        print("[INFO] deleting: {}".format(p))
        if os.path.exists(p):
          os.remove(p)
        continue

      # update the counter
      total += 1

  return total

This part executes the download script. 

In [None]:
# Run the downloader once per identity; MAX_RESULTS bounds how many search
# results are considered for each search term.
for identity in words:
  search_and_download(
    identity,
    temp_path,
    API_KEY,
    MAX_RESULTS=max_results,
  )

Check the output files. Then zip and save to Google Drive.

In [None]:
# Count every downloaded file regardless of extension: the downloader keeps
# the extension found in the source URL (.jpeg, .png, ...), and the archive
# below contains all of them, so counting only '*.jpg' would under-report.
output_files = [path for path in glob.glob(os.path.join(temp_path, '*', '*'))
                if os.path.isfile(path)]

print('%d downloaded images found. Now zipping. '%len(output_files))

# writes <data_dir>/original_data.zip containing everything under temp_path
shutil.make_archive(os.path.join(data_dir, 'original_data'), 'zip', root_dir=temp_path)