In [2]:
################################ Import Packages################################################

from selenium import webdriver
from bs4 import BeautifulSoup
import urllib.request
import requests
import shutil
import random
import math
import uuid
import time
import os


##################################################################################################
######################################## Scraping Image ##########################################
##################################################################################################

def scroll_webpage(driver, n_images, n_scrolls=5, pause=5):
    """Scroll the results page so that more thumbnails are present in the DOM.

    When more than 20 images are requested the page is scrolled ``n_scrolls``
    times; after every scroll the "show more results" button is clicked if it
    can be found, and the function waits ``pause`` seconds. Otherwise the page
    is scrolled exactly once, without clicking or waiting.

    Args:
        driver: Selenium WebDriver that already has the results page loaded.
        n_images: Number of images the caller wants to collect.
        n_scrolls: Scroll/click rounds to perform for large requests.
        pause: Seconds to sleep after each round.

    Returns:
        None
    """
    print("Scrolling the web page...")

    scroll_script = "window.scrollBy(0,document.body.scrollHeight)"
    show_more_xpath = (
        "/html/body/div[2]/c-wiz/div[3]/div[1]/div/div/div/div/div[5]/input")

    if n_images > 20:
        for _ in range(n_scrolls):
            driver.execute_script(scroll_script)

            try:
                # find_element(by, value) is available in Selenium 3 and 4;
                # the find_element_by_* helpers were removed in Selenium 4,
                # and calling them here failed silently inside this `try`.
                driver.find_element("xpath", show_more_xpath).click()
            except Exception:
                # Best effort: the button is not always present or clickable,
                # and a missing button must not abort the scrolling.
                pass

            time.sleep(pause)
    else:
        driver.execute_script(scroll_script)

    print("Scrolling End!")

    return None

############################################################################################


def scrap_images(url, n_images, driver_path):
    """Open ``url`` in Chrome, scroll the page and collect thumbnail tags.

    Args:
        url: Address of the image search results page to load.
        n_images: Maximum number of ``<img>`` tags to return; also passed to
            ``scroll_webpage`` to decide how much scrolling is done.
        driver_path: Path to the chromedriver executable.

    Returns:
        The BeautifulSoup result of searching the page for ``img`` tags with
        class ``rg_i``, limited to ``n_images`` entries.
    """
    driver = webdriver.Chrome(driver_path)

    try:
        print("Opening the browser...")
        driver.get(url)

        scroll_webpage(driver, n_images)
        # Grab the HTML while the session is still alive.
        page_source = driver.page_source
    finally:
        # quit() ends the whole driver session (close() only closes the
        # current window), and `finally` makes sure the browser does not stay
        # open when loading or scrolling fails.
        driver.quit()
        print("Browser Closed!")

    # Parsing
    soup = BeautifulSoup(page_source, 'html.parser')

    return soup.find_all("img", class_="rg_i", limit=n_images)

##########################################################################################################


def download_images(img_tags, save_directory):
    """Download the image behind every tag's ``src`` into ``save_directory``.

    Each file is stored under a random UUID name with a ``.jpg`` extension
    (the extension is fixed; the real image format is not inspected). Tags
    without a ``src`` and downloads that fail are skipped, so one bad
    thumbnail never aborts the whole run.

    Args:
        img_tags: Iterable of tag-like objects exposing ``.get("src")``.
        save_directory: Target directory; created (including parents) when it
            does not exist yet.

    Returns:
        None. Progress and the number of images downloaded by this call are
        printed.
    """
    # makedirs also handles nested paths and an already existing directory.
    os.makedirs(save_directory, exist_ok=True)

    # Downloading images
    print("Start downloading.....")

    n_downloaded = 0

    for img in img_tags:
        src = img.get("src")
        if not src:
            # Nothing to download for this tag.
            continue

        file_path = os.path.join(save_directory, str(uuid.uuid4()) + ".jpg")

        try:
            urllib.request.urlretrieve(src, file_path)
        except (OSError, ValueError) as err:
            # OSError covers network/URL errors, ValueError covers malformed
            # or unsupported URLs.
            print("Skipping one image ({}).".format(type(err).__name__))
            # Do not leave a truncated file behind.
            if os.path.exists(file_path):
                os.remove(file_path)
            continue

        n_downloaded += 1

    # Report only what this call downloaded, not whatever was already there.
    print("Download finished!")
    print("{} image downloaded.".format(n_downloaded))

    return None


####################################################################################################
###################################### Creating Dataset ############################################
####################################################################################################

def train_labels_csv(dataset_directory, image_name, label):
    """Add one ``file,label`` row per training image to ``labels.csv``.

    The image names are read from ``<dataset_directory>/train/<image_name>``.
    A missing CSV is created with an ``Image,Label`` header first; an existing
    one is appended to. The file is reshuffled afterwards.

    Args:
        dataset_directory: Root directory of the dataset.
        image_name: Name of the class sub-folder inside ``train``.
        label: Label written next to every image (converted with ``str``).

    Returns:
        None
    """
    label_text = str(label)

    csv_path = os.path.join(dataset_directory, "labels.csv")
    class_directory = os.path.join(dataset_directory, "train", image_name)

    # A new file needs the header; an existing one only receives extra rows.
    is_new_file = not os.path.exists(csv_path)

    with open(csv_path, "w" if is_new_file else "a") as csv_file:
        if is_new_file:
            csv_file.write("Image,Label\n")

        csv_file.writelines(
            "{},{}\n".format(file_name, label_text)
            for file_name in os.listdir(class_directory)
        )

    shuffle_csv_dataset(csv_path)

    return None

############################################################################################################


def test_labels_csv(dataset_directory, image_name, label):
    """Add one ``file,label`` row per test image to ``test_labels.csv``.

    The image names are read from ``<dataset_directory>/test/<image_name>``.
    A missing CSV is created with an ``Image,Label`` header first; an existing
    one is appended to. The file is reshuffled afterwards.

    Args:
        dataset_directory: Root directory of the dataset.
        image_name: Name of the class sub-folder inside ``test``.
        label: Label written next to every image (converted with ``str``).

    Returns:
        None
    """
    label_text = str(label)

    csv_path = os.path.join(dataset_directory, "test_labels.csv")
    class_directory = os.path.join(dataset_directory, "test", image_name)

    # A new file needs the header; an existing one only receives extra rows.
    is_new_file = not os.path.exists(csv_path)

    with open(csv_path, "w" if is_new_file else "a") as csv_file:
        if is_new_file:
            csv_file.write("Image,Label\n")

        csv_file.writelines(
            "{},{}\n".format(file_name, label_text)
            for file_name in os.listdir(class_directory)
        )

    shuffle_csv_dataset(csv_path)

    return None

############################################################################################################


def shuffle_csv_dataset(file_path, seed=42):
    """Shuffle the data rows of a CSV file in place, keeping the header first.

    The first line is treated as the header and written back unchanged; all
    following lines are shuffled with a generator seeded by ``seed``. An
    empty file is left untouched.

    Args:
        file_path: Path of the CSV file to rewrite.
        seed: Seed for the shuffle, so the same input gives the same order.

    Returns:
        None
    """
    with open(file_path, "r") as f:
        lines = f.readlines()

    if not lines:
        # Nothing to shuffle, and there is no header to preserve.
        return None

    header, rows = lines[0], lines[1:]

    # Only the last line of a file can lack a newline; add it so the row is
    # not glued to its successor once it is moved into the middle.
    if rows and not rows[-1].endswith("\n"):
        rows[-1] += "\n"

    # A private generator yields the same order as seeding the module-level
    # one, without resetting the global random state for other callers.
    random.Random(seed).shuffle(rows)

    with open(file_path, "w") as f:
        f.write(header)
        f.writelines(rows)

    return None

############################################################################################################


def train_test_split(data_directory, split=0.8, seed=42):
    """Split the entries of ``data_directory`` into train and test name lists.

    The directory listing is sorted, shuffled with a generator seeded by
    ``seed``, and cut after ``ceil(len * split)`` entries.

    Args:
        data_directory: Directory whose entries are split.
        split: Fraction of entries assigned to the training list (0 to 1).
        seed: Seed for the shuffle.

    Returns:
        A ``(train, test)`` tuple of lists of entry names.

    Raises:
        ValueError: If ``split`` is not between 0 and 1.
    """
    if not 0 <= split <= 1:
        raise ValueError(
            "split must be between 0 and 1, got {!r}".format(split))

    # os.listdir returns entries in arbitrary order; sorting first makes the
    # seeded shuffle reproducible across runs and machines.
    image_list = sorted(os.listdir(data_directory))

    # A private generator avoids resetting the global random state.
    random.Random(seed).shuffle(image_list)

    train_length = math.ceil(len(image_list) * split)

    return image_list[:train_length], image_list[train_length:]

##################################################################################################


def make_dataset(dataset_directory, image_directory, image_name, split=0.8, seed=42):
    """Copy the images of one class into the dataset's train/test folders.

    The entries of ``image_directory`` are split with ``train_test_split`` and
    copied to ``<dataset_directory>/train/<image_name>`` and
    ``<dataset_directory>/test/<image_name>``; both folders are created when
    missing. The source files are left in place.

    Args:
        dataset_directory: Root directory of the dataset.
        image_directory: Directory holding the downloaded images.
        image_name: Name of the class sub-folder to create/fill.
        split: Fraction of images that goes to the training folder.
        seed: Seed forwarded to ``train_test_split``.

    Returns:
        None
    """
    train_path = os.path.join(dataset_directory, "train", image_name)
    test_path = os.path.join(dataset_directory, "test", image_name)

    for path in (train_path, test_path):
        os.makedirs(path, exist_ok=True)

    train, test = train_test_split(image_directory, split, seed)

    for target_directory, images in ((train_path, train), (test_path, test)):
        for image in images:
            # copyfile streams the content instead of loading each image
            # fully into memory.
            shutil.copyfile(
                os.path.join(image_directory, image),
                os.path.join(target_directory, image),
            )

    return None

##########################################################################################################


def dataset_information(dataset_directory):
    """Build a printable summary of the dataset stored in ``dataset_directory``.

    The summary lists the entries of the ``train`` folder numbered from 0, the
    number of rows (header excluded) in ``labels.csv`` and
    ``test_labels.csv``, and the entries of ``dataset_directory`` itself.

    Args:
        dataset_directory: Root directory of the dataset.

    Returns:
        The formatted multi-line summary string.

    Raises:
        OSError: If the directory, its ``train`` folder or one of the two CSV
            files cannot be read (e.g. ``FileNotFoundError``).
    """
    def _count_rows(csv_path):
        # Every line after the first (header) line counts as one row.
        with open(csv_path, "r") as handle:
            return len(handle.readlines()[1:])

    entries = os.listdir(dataset_directory)

    n_train = _count_rows(os.path.join(dataset_directory, "labels.csv"))
    n_test = _count_rows(os.path.join(dataset_directory, "test_labels.csv"))

    # Map a running index to each entry found in the train folder.
    classes = dict(enumerate(os.listdir(os.path.join(dataset_directory, "train"))))

    template = ''' ---------------------------------------
                                    Info:
                                    ---------------------------------------
                                    Data : {}
                                    Training set length : {}
                                    Test set length : {}
                                    Folders in the directory : {}
                                    ---------------------------------------'''

    return template.format(classes, n_train, n_test, entries)

##########################################################################################################


def main(dataset=False, del_img_dir=False, dataset_info=True):
    """Interactively scrape images for one search term and build the dataset.

    Asks for the search term, the number of images and the numeric label,
    downloads the thumbnails into ``readonly/images_<term>`` and then, as
    requested by the flags, copies them into ``readonly/Data_Set``, removes
    the download folder and prints a dataset summary.

    Args:
        dataset: Add the downloaded images to the dataset (train/test folders
            and both label CSV files).
        del_img_dir: Delete the download folder afterwards.
        dataset_info: Print the dataset summary at the end.

    Returns:
        None

    Raises:
        ValueError: If the image count or the label typed by the user is not
            an integer.
    """
    # Imported locally so this function does not depend on a file-level
    # import of urllib.parse.
    from urllib.parse import quote_plus

    image_name = input("What do you want to download? ")
    n_images = int(input("How many images? "))
    label = int(input("Label of the image? "))
    # Encode the term so that spaces and characters such as '&' or '#' stay
    # part of the query instead of breaking the URL.
    url = 'https://www.google.com/search?tbm=isch&q=' + quote_plus(image_name)
    driver_path = "readonly/chromedriver.exe"
    image_directory = "readonly/images" + "_" + image_name
    dataset_directory = "readonly/Data_Set"

    tic = time.time()

    img_tags = scrap_images(url, n_images, driver_path)

    download_images(img_tags=img_tags, save_directory=image_directory)

    toc = time.time()

    elapsed_time = toc - tic

    print("Time elapsed {} seconds".format(elapsed_time))

    if dataset:
        make_dataset(dataset_directory, image_directory, image_name)
        train_labels_csv(dataset_directory, image_name, label)
        test_labels_csv(dataset_directory, image_name, label)
        print("Dataset Created!")
    else:
        print("Dataset is not created!")

    if del_img_dir:
        try:
            shutil.rmtree(image_directory)
        except OSError as err:
            # Report the failure instead of hiding it; a leftover folder is
            # not a reason to abort the run.
            print("Image_directory is not deleted! ({})".format(err))
        else:
            print("Image_directory is deleted!")
    else:
        print("Image_directory is not deleted!")

    if dataset_info:
        try:
            information = dataset_information(dataset_directory)
        except FileNotFoundError:
            # Happens when no dataset has been created yet (for example on a
            # first run with dataset=False).
            print("Dataset information is not available!")
        else:
            print(information)

    return None

#############################################################################################################
#############################################################################################################
#############################################################################################################


# Script entry point: run the scraping pipeline with dataset creation enabled
# only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main(dataset=True)

What do you want to download? cat
How many images? 60
Label of the image? 1
Opening the browser...
Scrolling the web page...
Scrolling End!
Browser Closed!
Start downloading.....
Download finished!
55 image downloaded.
Time elapsed 42.68992042541504 seconds
Dataset Created!
Image_directory is not deleted!
 ---------------------------------------
                                    Info:
                                    ---------------------------------------
                                    Data : {0: 'car', 1: 'cat'}
                                    Training set length : 82
                                    Test set length : 20
                                    Folders in the directory : ['labels.csv', 'test', 'test_labels.csv', 'train']
                                    ---------------------------------------
