# Imports

In [2]:
import imutils
from tensorflow.keras.applications import ResNet50                      
from tensorflow.keras.applications.resnet import preprocess_input
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.applications import imagenet_utils
from imutils.object_detection import non_max_suppression                
import numpy as np
import argparse
import time
import cv2
from google.colab.patches import cv2_imshow

# Helper Functions

1. Sliding Window
2. Image Pyramid

In [3]:
def sliding_window(image, step, ws):
	"""Slide a fixed-size window over an image and yield each crop.

	The window moves left-to-right (column-wise) and then top-to-bottom
	(row-wise), which is how the detector locates *where* in the image an
	object is.

	Args:
		image: array indexed as image[row, column, ...] that exposes `.shape`.
		step: number of pixels between consecutive window origins, used for
			both the horizontal and the vertical direction.
		ws: window size as (width, height).

	Yields:
		Tuples (x, y, window) where (x, y) is the top-left corner of the
		window inside `image` and `window` is the corresponding slice.
		Every yielded window has the full requested size; nothing is yielded
		when the image is smaller than the window.
	"""
	win_w, win_h = ws[0], ws[1]

	# The "+ 1" makes the last admissible origin inclusive, so the window that
	# ends exactly on the right/bottom border is visited too (and an image
	# that is exactly window-sized produces one window instead of none).
	last_y = image.shape[0] - win_h + 1
	last_x = image.shape[1] - win_w + 1

	for y in range(0, last_y, step):
		for x in range(0, last_x, step):
			# yield the current window
			yield (x, y, image[y:y + win_h, x:x + win_w])

def image_pyramid(image, scale=1.5, minSize=(224, 224)):
	"""Generate progressively smaller copies of an image.

	Producing the image at several scales lets a fixed-size sliding window
	find objects of different sizes.

	Args:
		image: the starting image; it is always yielded first, unchanged.
		scale: factor by which the width is divided at every step.
		minSize: (width, height) below which no further layer is produced.

	Yields:
		The original image, followed by each down-scaled layer whose width
		and height are both at least `minSize`.
	"""
	layer = image

	# the first layer of the pyramid is the untouched input
	yield layer

	while True:
		# shrink the previous layer; imutils.resize is given the new width only
		next_width = int(layer.shape[1] / scale)
		layer = imutils.resize(layer, width=next_width)

		# stop as soon as either dimension drops under the requested minimum
		too_short = layer.shape[0] < minSize[1]
		too_narrow = layer.shape[1] < minSize[0]
		if too_short or too_narrow:
			return

		yield layer

# Detector 

### Initialization and Loading the Pre-trained Model

In [63]:
# initialize variables used for the object detection procedure
print("initializing variables...")

# width the test image is resized to, to keep inputs consistent
WIDTH = 600
# image pyramid scale factor
PYR_SCALE = 1.5
# sliding window step size
WIN_STEP = 16
# classification CNN input dimensions
INPUT_SIZE = (224, 224)
# test image
IMAGE_NAME = "images/hummingbird.jpg"
# window size; controls the aspect ratio of objects we want to detect
# (test-image specific)
ROI_SIZE = (250, 250)
# minimum confidence/probability threshold to mark an RoI positive
# (test-image specific)
MIN_CONF = 0.9
# set to True to visualize the sliding window in action
VISUALIZE = False

# load the network with its pre-trained ImageNet weights
print("loading network...")
model = ResNet50(weights="imagenet", include_top=True)

# load the input image from disk
orig = cv2.imread(IMAGE_NAME)

# cv2.imread reports failure by returning None instead of raising, which
# would otherwise surface later as an unrelated error inside the resize call
if orig is None:
	raise OSError(
		"could not read image '{}' (missing file or unsupported format)".format(IMAGE_NAME))

# resize it so that it has the supplied width, then grab its dimensions
orig = imutils.resize(orig, width=WIDTH)
(H, W) = orig.shape[:2]

initializing variables...
loading network...


### Retrieving RoIs

In [64]:
# lazily generate the down-scaled copies of the input image
pyramid = image_pyramid(orig, scale=PYR_SCALE, minSize=ROI_SIZE)

# rois: pre-processed crops ready to be fed to the classifier
# locs: the matching (startX, startY, endX, endY) box of every crop,
#       expressed in the coordinates of the *original* image
rois = []
locs = []

# time the whole pyramid / sliding-window sweep
start = time.time()

for layer in pyramid:
	# ratio that maps coordinates of this layer back onto the original image
	ratio = W / float(layer.shape[1])

	for (winX, winY, roiOrig) in sliding_window(layer, WIN_STEP, ROI_SIZE):
		# project the window corner and size onto the original image
		x, y = int(winX * ratio), int(winY * ratio)
		w, h = int(ROI_SIZE[0] * ratio), int(ROI_SIZE[1] * ratio)

		# resize the crop to the network input size and apply the
		# Keras/TensorFlow pre-processing so it can be classified later
		roi = preprocess_input(img_to_array(cv2.resize(roiOrig, INPUT_SIZE)))

		# record the crop together with its bounding box
		rois.append(roi)
		locs.append((x, y, x + w, y + h))

		# nothing more to do unless each window should be displayed
		if not VISUALIZE:
			continue

		# draw the current region on a copy of the original image and show
		# it next to the raw crop
		clone = orig.copy()
		cv2.rectangle(clone, (x, y), (x + w, y + h), (0, 255, 0), 2)
		cv2_imshow(clone)
		cv2_imshow(roiOrig)
		cv2.waitKey(0)

# report how long the sweep took
end = time.time()
print("[INFO] looping over pyramid/windows took {:.5f} seconds".format(end - start))

[INFO] looping over pyramid/windows took 0.10964 seconds


### Producing the Output

In [None]:
# stack the collected ROIs into a single float32 NumPy array
rois = np.array(rois, dtype="float32")

# run every proposal through the network and report how long it took
print("classifying ROIs...")
start = time.time()
preds = model.predict(rois)
end = time.time()
print("classifying ROIs took {:.5f} seconds".format(end - start))

# keep only the top-1 decoded prediction per ROI
preds = imagenet_utils.decode_predictions(preds, top=1)

# labels maps a class label to the list of (box, probability) pairs of all
# ROIs that were confidently assigned that label
labels = {}

for (i, p) in enumerate(preds):
	# unpack the single (top-1) prediction of the current ROI
	(imagenetID, label, prob) = p[0]

	# keep the detection only when its probability reaches the threshold
	# (the comparison is inclusive)
	if prob >= MIN_CONF:
		# locs[i] is the bounding box recorded for the i-th ROI
		labels.setdefault(label, []).append((locs[i], prob))


# for every detected label, show its boxes before and after non-maxima
# suppression
for label, detections in labels.items():
	print("showing results for '{}'".format(label))

	# "before" view: every box that passed the confidence filter, drawn on a
	# copy of the original image
	before = orig.copy()
	for ((startX, startY, endX, endY), _prob) in detections:
		cv2.rectangle(before, (startX, startY), (endX, endY), (0, 255, 0), 2)
	print("Before")
	cv2_imshow(before)

	# split the detections into parallel arrays of boxes and probabilities
	# and apply non-maxima suppression
	boxes = np.array([box for (box, _prob) in detections])
	proba = np.array([prob for (_box, prob) in detections])
	boxes = non_max_suppression(boxes, proba)

	# "after" view: only the boxes kept by the suppression, each with its label
	clone = orig.copy()
	for (startX, startY, endX, endY) in boxes:
		cv2.rectangle(clone, (startX, startY), (endX, endY), (0, 255, 0), 2)

		# place the text above the box, or below its top edge when the box
		# sits too close to the top of the image
		if startY - 10 > 10:
			labelY = startY - 10
		else:
			labelY = startY + 10
		cv2.putText(clone, label, (startX, labelY), cv2.FONT_HERSHEY_SIMPLEX, 0.45, (0, 255, 0), 2)

	print("After")
	cv2_imshow(clone)