# VGG16 Deep Net for Image Recognition using Web Scraping from Selenium - Transfer Learning & Augmentation (Advanced Deep Learning)


### Loading the VGG16 Model Weights - We don't include the top layer.

In [None]:
from keras.applications import vgg16

In [None]:
# Build the convolutional base: VGG16 initialised with ImageNet weights,
# without its classifier head (include_top=False), for 224x224 3-channel inputs.
model = vgg16.VGG16(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
)

### Here we freeze all the intermediate layers so that we do not have to retrain the model.

In [None]:

# Freeze every layer of the pre-trained base so its weights stay fixed in training.
for base_layer in model.layers:
    base_layer.trainable = False

# Show each layer's index, class name and trainable flag to confirm the freeze.
for index, base_layer in enumerate(model.layers):
    print(f"{index} {base_layer.__class__.__name__}", base_layer.trainable)

### This function returns the new fully connected head that we add on top of the frozen VGG16 base.

In [None]:
def fcl(bottom_model, num_classes):
    """Build the classification head stacked on top of `bottom_model`.

    The head flattens `bottom_model.output`, applies three ReLU Dense layers
    (1024, 1024 and 512 units), a Dropout of 0.3 and a final softmax Dense
    layer with `num_classes` units.

    Returns the output tensor of the final layer (not a Model).
    """
    head = Flatten(name="flatten")(bottom_model.output)
    # Same layer order as before: Dense(1024) -> Dense(1024) -> Dense(512).
    for units in (1024, 1024, 512):
        head = Dense(units, activation='relu')(head)
    head = Dropout(0.3)(head)
    return Dense(num_classes, activation='softmax')(head)

### Now we add our Fully Connected Layer to the pretrained VGG16 model.

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, GlobalAveragePooling2D
from keras.layers import Conv2D, MaxPooling2D, ZeroPadding2D
from keras.layers.normalization import BatchNormalization
from keras.models import Model

# Number of units in the softmax output layer of the new head.
num_classes = 4

# Build the new head on the frozen VGG16 base and wrap both into one model
# that maps the base's input to the head's softmax output.
FC_Head = fcl(model, num_classes)
newModel = Model(inputs=model.input, outputs=FC_Head)

# summary() prints the layer table itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
newModel.summary()

## Acquiring our Dataset by Scraping Google Images with Selenium (Chrome WebDriver)

In [5]:
import selenium
import os
import time
import requests
import io
import hashlib
from selenium import webdriver
from PIL import Image


# Machine-specific location of the ChromeDriver executable used by Selenium.
DRIVER_PATH= 'C://Users//KIIT//Desktop//Scraping//chromedriver'
# NOTE(review): this starts a Chrome session at module level. search_and_download
# opens its own driver from driver_path, so this instance appears to be unused
# and is never quit - confirm and consider removing it.
wd = webdriver.Chrome(executable_path=DRIVER_PATH)

def fetch_image_urls(query:str, max_links_to_fetch:int, wd:webdriver, sleep_between_interactions:int=1):
    """Collect full-size image URLs from a Google Images search.

    Args:
        query: Search text inserted into the Google Images search URL.
        max_links_to_fetch: Stop as soon as this many distinct URLs are collected.
        wd: Selenium WebDriver used to drive the browser.
        sleep_between_interactions: Seconds to pause after each scroll or click
            so the page has time to load (fractions of a second are accepted).

    Returns:
        A set of image URLs. It can hold fewer than `max_links_to_fetch`
        entries (possibly none) when the results page runs out of thumbnails;
        it is never None.
    """
    def scroll_to_end(wd):
        wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(sleep_between_interactions)

    # build the google query
    search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"

    # load the page
    wd.get(search_url.format(q=query))

    image_urls = set()
    results_start = 0
    while len(image_urls) < max_links_to_fetch:
        scroll_to_end(wd)

        # get all image thumbnail results
        thumbnail_results = wd.find_elements_by_css_selector("img.Q4LuWd")
        number_results = len(thumbnail_results)

        # Scrolling (and any "load more" click) produced nothing new: stop
        # instead of looping forever, and hand back what was collected.
        if number_results <= results_start:
            print(f"Found: {len(image_urls)} image links, no more search results to process.")
            break

        print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")

        for img in thumbnail_results[results_start:number_results]:
            # try to click every thumbnail such that we can get the real image behind it;
            # a thumbnail that cannot be clicked is skipped on purpose (best effort)
            try:
                img.click()
                time.sleep(sleep_between_interactions)
            except Exception:
                continue

            # extract image urls
            for actual_image in wd.find_elements_by_css_selector('img.n3VNCb'):
                src = actual_image.get_attribute('src')
                if src and 'http' in src:
                    image_urls.add(src)

            if len(image_urls) >= max_links_to_fetch:
                print(f"Found: {len(image_urls)} image links, done!")
                break
        else:
            # Every new thumbnail was processed without reaching the target.
            print("Found:", len(image_urls), "image links, looking for more ...")
            # find_elements_* returns an empty list (rather than raising) when
            # the "load more" button is absent.
            if wd.find_elements_by_css_selector(".mye4qd"):
                wd.execute_script("document.querySelector('.mye4qd').click();")
                time.sleep(sleep_between_interactions)

        # move the result startpoint further down
        results_start = number_results

    return image_urls


def persist_image(folder_path:str,url:str):
    """Download the image at `url` and save it as a JPEG inside `folder_path`.

    The file name is the first 10 hex characters of the SHA-1 of the downloaded
    bytes plus '.jpg'. Failures are reported with an "ERROR - ..." line on
    stdout and never raised; the function always returns None.

    Args:
        folder_path: Existing directory that receives the JPEG file.
        url: Address of the image to download.
    """
    try:
        # A timeout keeps one unresponsive host from stalling the whole run;
        # raise_for_status turns HTTP error responses into download failures.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        image_content = response.content
    except Exception as e:
        print(f"ERROR - Could not download {url} - {e}")
        # Nothing was downloaded, so there is nothing to save.
        return

    try:
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path,hashlib.sha1(image_content).hexdigest()[:10] + '.jpg')
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SUCCESS - saved {url} - as {file_path}")
    except Exception as e:
        print(f"ERROR - Could not save {url} - {e}")



def search_and_download(search_term:str,driver_path:str,target_path='C://Users//KIIT//Desktop//face_Recog_VGG//faces//validation',number_images=5,class_folder='n_1'):
    """Scrape images for `search_term` and save them under `target_path`.

    Args:
        search_term: Query passed to fetch_image_urls.
        driver_path: Path of the ChromeDriver executable.
        target_path: Directory that contains the per-class sub-folders.
        number_images: Number of image links to request from fetch_image_urls.
        class_folder: Name of the sub-folder of `target_path` that receives the
            images. The default 'n_1' is the value the previously hard-coded
            expression '_'.join('n1') evaluated to.
    """
    target_folder = os.path.join(target_path, class_folder)

    # exist_ok avoids the race between checking for the folder and creating it.
    os.makedirs(target_folder, exist_ok=True)

    # The context manager quits the browser even if scraping raises.
    with webdriver.Chrome(executable_path=driver_path) as wd:
        res = fetch_image_urls(search_term, number_images, wd=wd, sleep_between_interactions=0.5)

    # Tolerate a fetcher that yields nothing (None or an empty collection).
    for elem in res or ():
        persist_image(target_folder,elem)

In [6]:

# Scrape images for one person using the defaults of search_and_download
# (default target folder and default number of images).
search_term = 'Tim Cook'
search_and_download(search_term=search_term, driver_path=DRIVER_PATH)

Found: 200 search results. Extracting links from 0:200
Found: 5 image links, done!
SUCCESS - saved https://thumbor.forbes.com/thumbor/960x0/https%3A%2F%2Fspecials-images.forbesimg.com%2Fdam%2Fimageserve%2F32e32455fee0481796f20d7a5a224fff%2F960x0.jpg%3Ffit%3Dscale - as C://Users//KIIT//Desktop//face_Recog_VGG//faces//validation\n_1\183c7ea95e.jpg
SUCCESS - saved https://i.gadgets360cdn.com/large/tim_cook_reuters_full_1570163405539.JPG - as C://Users//KIIT//Desktop//face_Recog_VGG//faces//validation\n_1\e9a65bc11d.jpg
SUCCESS - saved https://i.insider.com/5dd48479fd9db244be318c88?width=1100&format=jpeg&auto=webp - as C://Users//KIIT//Desktop//face_Recog_VGG//faces//validation\n_1\8d33d04ee5.jpg
SUCCESS - saved https://image.cnbcfm.com/api/v1/image/105608434-1543945658496rts28qzc.jpg?v=1554921416&w=1400&h=950 - as C://Users//KIIT//Desktop//face_Recog_VGG//faces//validation\n_1\6ef7e1aec8.jpg
SUCCESS - saved https://pbs.twimg.com/profile_images/1194113737092935681/63O1znGw.jpg - as C://Use

##  Loading our Faces Dataset

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# Root folders of the training and validation images.
train_data_dir = 'faces/train/'
validation_data_dir = 'faces/validation/'

# Batch size shared by both generators (16-32 is typical on mid-tier systems).
batch_size = 16

# Training images are rescaled to [0, 1] and randomly augmented
# (rotation, horizontal/vertical shifts and horizontal flips).
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=45,
    width_shift_range=0.3,
    height_shift_range=0.3,
    horizontal_flip=True,
    fill_mode='nearest',
)

# Validation images are only rescaled, never augmented.
validation_datagen = ImageDataGenerator(rescale=1./255)

train_generator = train_datagen.flow_from_directory(
    train_data_dir,
    target_size=(224, 224),
    batch_size=batch_size,
    class_mode='categorical',
)

# shuffle=False keeps the validation batches in a fixed order.
validation_generator = validation_datagen.flow_from_directory(
    validation_data_dir,
    target_size=(224, 224),
    batch_size=batch_size,
    class_mode='categorical',
    shuffle=False,
)

##  Training our Model with RMSprop Optimizer, ModelCheckpoint & EarlyStopping Callbacks

In [None]:
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint, EarlyStopping

                     
# Keep only the best model (lowest validation loss) on disk.
checkpoint = ModelCheckpoint("Faces_VGG16.h5",
                             monitor="val_loss",
                             mode="min",
                             save_best_only = True,
                             verbose=1)

# Stop once val_loss has not improved for 3 epochs and roll back to the best weights.
earlystop = EarlyStopping(monitor = 'val_loss',
                          min_delta = 0,
                          patience = 3,
                          verbose = 1,
                          restore_best_weights = True)

# we put our call backs into a callback list
callbacks = [earlystop, checkpoint]

# RMSprop with a learning rate of 0.001
newModel.compile(loss = 'categorical_crossentropy',
              optimizer = RMSprop(lr = 0.001),
              metrics = ['accuracy'])

# Enter the number of training and validation samples here
nb_train_samples = 280
nb_validation_samples = 120

# We only train 5 EPOCHS
epochs = 5

# The step counts must use the batch size the generators actually yield.
# Hard-coding batch_size = 1 here made every "epoch" run nb_train_samples
# batches, i.e. many passes over the data instead of one.
batch_size = train_generator.batch_size

history = newModel.fit_generator(
    train_generator,
    steps_per_epoch = nb_train_samples // batch_size,
    epochs = epochs,
    callbacks = callbacks,
    validation_data = validation_generator,
    validation_steps = nb_validation_samples // validation_generator.batch_size)

# No explicit save to "Faces_VGG16.h5" here: ModelCheckpoint already wrote the
# best model to that file, and saving the final-epoch weights over it would
# discard the best checkpoint whenever the last epoch was not the best one.

### Loading our Trained Model Classifier

In [None]:
from keras.models import load_model

# Use the exact file name written during training ("Faces_VGG16.h5"); a
# different letter case fails on case-sensitive filesystems.
classifier = load_model('Faces_VGG16.h5')

### Testing our Classifier

In [None]:
import cv2
import numpy as np
from os import listdir
from os.path import isfile, join

# Person names in class-index order; both lookup tables below derive from it.
_CLASS_NAMES = ("Elon_Musk", "Tim_Cook", "Jeff_Bezos", "Bill_Gates")

# Keyed by the string form of a one-element prediction array, e.g. "[0]".
faces_dict = {f"[{index}]": person for index, person in enumerate(_CLASS_NAMES)}

# Keyed by class folder name, e.g. "_n0".
faces_dict_n = {f"_n{index}": person for index, person in enumerate(_CLASS_NAMES)}

def draw_test(name, pred, im):
    """Show `im` in window `name` with the predicted person's name drawn on it.

    `pred` is converted with str() and looked up in faces_dict. The image gets
    a black border (80 px on top, 100 px on the right) and the name is written
    in red inside the top border.
    """
    label = faces_dict[str(pred)]
    border_colour = [0, 0, 0]
    text_colour = (0, 0, 255)
    text_origin = (20, 60)

    canvas = cv2.copyMakeBorder(im, 80, 0, 0, 100, cv2.BORDER_CONSTANT, value=border_colour)
    cv2.putText(canvas, label, text_origin, cv2.FONT_HERSHEY_SIMPLEX, 1, text_colour, 2)
    cv2.imshow(name, canvas)

def getRandomImage(path):
    """Load a random image from a random class sub-folder of `path`.

    Prints "Class - <name>", where the name comes from faces_dict_n, or is the
    raw folder name when the folder has no entry there.

    Args:
        path: Directory whose sub-folders each hold the images of one class;
            a trailing slash is optional.

    Returns:
        Whatever cv2.imread returns for the chosen file (per the OpenCV
        documentation, None when the file cannot be decoded).
    """
    folders = [entry for entry in os.listdir(path) if os.path.isdir(join(path, entry))]
    path_class = folders[np.random.randint(0, len(folders))]

    # Folders missing from faces_dict_n are reported by name instead of
    # aborting the whole run with a KeyError.
    print("Class - " + faces_dict_n.get(str(path_class), str(path_class)))

    # join() works whether or not `path` ends with a separator.
    file_path = join(path, path_class)
    file_names = [f for f in listdir(file_path) if isfile(join(file_path, f))]
    image_name = file_names[np.random.randint(0, len(file_names))]
    return cv2.imread(join(file_path, image_name))

# Classify 10 randomly chosen validation images and display each prediction.
for i in range(0,10):
    input_im = getRandomImage("faces/validation/")
    if input_im is None:
        # The chosen file could not be read as an image; try the next one.
        print("Skipping a file that could not be read as an image")
        continue

    # Half-size copy used only for display.
    input_original = cv2.resize(input_im, None, fx=0.5, fy=0.5, interpolation = cv2.INTER_LINEAR)

    input_im = cv2.resize(input_im, (224, 224), interpolation = cv2.INTER_LINEAR)
    # OpenCV loads images as BGR, while the Keras generators used for training
    # feed RGB; convert so the network sees the channel order it was trained on.
    input_im = cv2.cvtColor(input_im, cv2.COLOR_BGR2RGB)
    input_im = input_im / 255.
    input_im = input_im.reshape(1,224,224,3)

    # Get Prediction
    res = np.argmax(classifier.predict(input_im, 1, verbose = 0), axis=1)

    # Show image with predicted class
    draw_test("Prediction", res, input_original)
    cv2.waitKey(0)

cv2.destroyAllWindows()