In [4]:
import os, string
import ggl_img_scraper as ggl
from tqdm import tqdm
from utils import *

In [5]:
# Necessary inputs
# Google Custom Search credentials. The API key is taken from the
# GOOGLE_API_KEY environment variable so that it never has to be typed into
# (and accidentally committed with) the notebook. It falls back to '' when
# the variable is not set.
ggl_api_key = os.environ.get('GOOGLE_API_KEY', '')
search_engine_id = 'f1aca5d66c8d4435c'

# Data folders. Each path ends with a separator because other cells build
# per-bird sub-folders by plain string concatenation.
raw_dir = '..\\data\\raw\\'
training_dir = '..\\data\\training\\'
validation_dir = '..\\data\\validation\\'

# Fraction of num_images moved from each training folder to validation.
validation_split = .15

num_images = 75  # images requested per bird
birds_txt = '..\\data\\bird_lists\\bay_area_birds.txt'  # newline-delimited bird names
db_name = ".\\bird_im_urls.db"  # database file handed to the image scraper

buffer = 5  # a bird may be this many images short of num_images before it is retried
clear_dirs = True  # empty the raw/training/validation folders before starting
remove_db = True  # delete db_name (and its "-journal" file) before starting

In [None]:
## Clear dir
# Optionally start from a clean slate: empty the three data folders and
# delete the URL database left over from a previous run.
if clear_dirs:
    # clear_dir comes from utils; it is called with an empty list as its
    # second argument for every folder.
    for directory in (raw_dir, training_dir, validation_dir):
        clear_dir(directory, [])
if remove_db:
    # The database and its "-journal" side file are checked independently, so
    # a stale journal is still removed when the database itself is already
    # gone (presumably an SQLite rollback journal, which must not be paired
    # with a freshly created database -- confirm against the scraper).
    for db_file in (db_name, db_name + "-journal"):
        if os.path.exists(db_file):
            os.remove(db_file)

In [None]:
# Read-in txt file of bird names
# File must be new-line delimited (one bird name per line).
# open() already raises FileNotFoundError / OSError when the file cannot be
# read, so no separate readability check is needed.
with open(birds_txt) as f:
    stripped_names = (line.strip() for line in f)
    # Blank lines are skipped: an empty name would otherwise create an empty
    # folder name and an empty search query in the cells that follow.
    # capwords normalises the capitalisation ("western gull" -> "Western Gull").
    birds = sorted(string.capwords(name) for name in stripped_names if name)

In [None]:
# Give every bird in the list its own sub-folder under the raw, training and
# validation roots (the roots already end with a path separator).
for bird in birds:
    for root in (raw_dir, training_dir, validation_dir):
        bird_dir = root + bird + '\\'
        if os.path.exists(bird_dir):
            continue  # already there, nothing to create
        os.makedirs(bird_dir)

In [None]:
# Birds whose first download came up short, keyed by bird name; the value is
# whatever the download call returned for that bird. Despite the name this is
# a dict, not a list.
retry_lst = dict()

In [None]:
# Get Images for each bird from Google Images
for bird in tqdm(birds):
    # The "-term" words exclude non-photographic results (drawings, maps,
    # logos, ...). The query text is unchanged, only split for readability.
    search_query = (
        f"real '{bird}' bird "
        "-drawing -map -cartoon -logo -baby -egg -painting -pattern "
        "-illustration -art -similar -information -creative -general "
        "-book -math -product -food -feed -help -zoologist -list "
        "-bingo -tattoo -ranch -cowboy"
    )
    # raw_dir already ends with a separator; adding another one produced a
    # doubled "\\\\" that did not match the folder created for this bird.
    save_dir = raw_dir + bird + '\\'
    saved = ggl.google_image_download(
        query=search_query,
        save_directory=save_dir,
        api_key=ggl_api_key,
        cx=search_engine_id,
        n=num_images,
        name=bird,
        db_name=db_name,
        delay=None,
        mute=True,
    )
    # Remember birds that ended up more than `buffer` images short so that a
    # later cell can retry them.
    if len(saved) + buffer < num_images:
        retry_lst[bird] = saved

In [None]:
# Check the folders under raw_dir for the expected number of images
# (count_jpg_images comes from utils; raise_e=False presumably makes it report
# shortfalls instead of raising -- confirm in utils).
count_jpg_images(
    raw_dir,
    num_images,
    buffer=buffer,
    raise_e=False,
)

In [None]:
# Show how many birds need a retry, then the retry mapping itself.
print(len(retry_lst), retry_lst, sep="\n")

### MANUALLY REMOVE BAD TRAINING DATA
### Fix any errors that could have occurred

In [None]:
# Retry getting images for birds in the list with a slightly different query
# (the plain bird name, without the exclusion terms).
for bird in tqdm(retry_lst):
    search_query = str(bird)
    # raw_dir already ends with a separator; do not add a second one.
    save_dir = raw_dir + str(bird) + '\\'
    saved = ggl.google_image_download(
        query=search_query,
        save_directory=save_dir,
        api_key=ggl_api_key,
        cx=search_engine_id,
        n=num_images,
        name=bird,
        db_name=db_name,
        delay=None,
        # Whatever the first attempt returned is passed on as exclusions.
        exclude_urls=retry_lst[bird],
    )
    # NOTE(review): only this attempt's result is counted here, not the
    # images saved by the first pass -- confirm that this is intended.
    if len(saved) + buffer < num_images:
        print(saved)
        print(f"{bird} saved {len(saved)} not {num_images}")
    

In [6]:
# Normalize Every Image to RGB
# confirm_image_readability comes from utils (star import). Presumably it reads
# the images under raw_dir and writes the usable ones into training_dir --
# TODO confirm against utils.
confirm_image_readability(raw_dir, training_dir)

In [None]:
# Make Training and Validation Split
# The number of files to move is derived from the configured num_images, so
# it is the same for every bird and is computed once up front.
n_for_validation = round(num_images * validation_split)
for bird in birds:
    bird_training_dir = os.path.join(training_dir, bird)
    bird_validation_dir = os.path.join(validation_dir, bird)
    move_random_files(bird_training_dir, bird_validation_dir, n_for_validation)

In [None]:
# Run the utils readability check on the validation images.
# NOTE(review): only one directory is passed here, unlike the two-argument
# call on the raw images -- presumably the check then works in place; confirm.
confirm_image_readability(validation_dir)