# Original Model Inference

This part of code performs inference with the original object detection model. The output frame with bounding box will be saved separately from the frame without person or false positive where model failed to detect.<br>
model_type variable has to be updated with corrected model - ssd or rcnn. <br>
model_name variable should contain the model to be used for inference.<br>
The output folder to be created will be rcnn_output_detected_images and rcnn_output_images respectively.

In [1]:
import numpy as np
import os
import six.moves.urllib as urllib
import sys
import tarfile
import tensorflow as tf
import zipfile
from collections import defaultdict
from io import StringIO
import cv2 as cv2
import pathlib
import time

In [2]:
# Playing video from file
cap = cv2.VideoCapture('Pedestrian_Detect_2_1_1.mp4')

In [3]:
def load_model(model_name):
    base_url = 'http://download.tensorflow.org/models/object_detection/'
    model_file = model_name + '.tar.gz'
    model_dir = tf.keras.utils.get_file(
    fname=model_name, 
    origin=base_url + model_file,
    untar=True)

    model_dir = pathlib.Path(model_dir)/"saved_model"

    model = tf.saved_model.load(str(model_dir))
    model = model.signatures['serving_default']

    return model

In [4]:
# ssd or rcnn
model_type = "rcnn"
#model_name = 'ssd_mobilenet_v2_coco_2018_03_29'
model_name = 'faster_rcnn_inception_v2_coco_2018_01_28'
HEIGHT = 600
WIDTH = 1024
#load the model
detection_model = load_model(model_name)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [5]:
# frame counter
count = 0
print(detection_model.inputs)
detection_model.output_dtypes


[<tf.Tensor 'image_tensor:0' shape=(None, None, None, 3) dtype=uint8>]


{'detection_scores': tf.float32,
 'detection_classes': tf.float32,
 'num_detections': tf.float32,
 'detection_boxes': tf.float32}

In [6]:
def perform_inference(model, frame):
    # The input needs to be a tensor, convert it using `tf.convert_to_tensor`.
    input_tensor = tf.convert_to_tensor(image)
    # The model expects a batch of images, so add an axis with `tf.newaxis`.
    input_tensor = input_tensor[tf.newaxis,...]
    
    # Run inference
    output_dict = model(input_tensor)
    
    #print(output_dict)
    num_detections = int(output_dict.pop('num_detections'))
        
    output_dict = {key:value[0, :num_detections].numpy() 
                 for key,value in output_dict.items()}
    output_dict['num_detections'] = num_detections

    # detection_classes should be ints.
    output_dict['detection_classes'] = output_dict['detection_classes'].astype(np.int64)
    
    return output_dict

In [9]:
# Grab the shape of the input 
width = int(cap.get(3))
height = int(cap.get(4))
print("Started video inference....")
total_inf_start = time.time()
while(cap.isOpened()):
    # Capture frame-by-frame
    ret, frame = cap.read()
    if not ret:
        break
    image = cv2.resize(frame, (HEIGHT, WIDTH))
    # Actual inference.    
    # Start asynchronous inference for specified request.
    frame_inf_start = time.time()
    output_dict = perform_inference(detection_model, image)
    frame_det_time = time.time() - frame_inf_start
    inf_time_message = "Inference time: {:.3f}ms"\
                               .format(frame_det_time * 1000)
    cv2.putText(frame, inf_time_message, (15, 15),
                        cv2.FONT_HERSHEY_COMPLEX, 0.5, (200, 10, 10), 1)
    if 1 in output_dict['detection_classes']:
        if len(output_dict['detection_classes']) > 1:
            a, cnts = np.unique(output_dict['detection_classes'], return_counts=True)
            if a[cnts > 1] and a[cnts.argmax()] == 1:
                print("Error")
                print(output_dict['detection_classes'])
                # store the images
                cv2.imwrite(model_type+"_output_images\image"+str(count)+".jpg", frame)
            else:
                # get the index for person class
                index = np.where(output_dict['detection_classes'] == 1)
                # filter based on probability score
                if output_dict['detection_scores'][index] >= 0.4:
                    values = output_dict['detection_boxes'][index]
                    for value in values:
                        xmin = int(value[1] * width)
                        ymin = int(value[0] * height)
                        xmax = int(value[3] * width)
                        ymax = int(value[2] * height)
                        cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 55, 255), 1)
                        # store the images with bounding box
                        cv2.imwrite(model_type+"_output_detected_images\image"+str(count)+".jpg", frame)
                else:
                    # store the images - probability is less
                    cv2.imwrite(model_type+"_output_images\image"+str(count)+".jpg", frame)
        else:
            if output_dict['detection_scores'] >= 0.4 and output_dict['detection_classes'] == 1:
                xmin = int(output_dict['detection_boxes'][0][1] * width)
                ymin = int(output_dict['detection_boxes'][0][0] * height)
                xmax = int(output_dict['detection_boxes'][0][3] * width)
                ymax = int(output_dict['detection_boxes'][0][2] * height)
                cv2.rectangle(frame, (xmin, ymin), (xmax, ymax), (0, 55, 255), 1)
                # store the images with bounding box
                cv2.imwrite(model_type+"_output_detected_images\image"+str(count)+".jpg", frame)
            else:
                # store the images - probability is less
                print(count + output_dict['detection_scores'])
                cv2.imwrite(model_type+"_output_images\image"+str(count)+".jpg", frame)
    else:
        # store the images
         cv2.imwrite(model_type+"_output_images\image"+str(count)+".jpg", frame)
                
    
    count = count + 1    

# When everything done, release the video capture and video write objects
cap.release()


# Closes all the frames
cv2.destroyAllWindows()    
print("Total Frame processed:",count)
total_det_time = time.time() - total_inf_start
total_inf_time_message = "Inference time: {:.3f}ms"\
                               .format(total_det_time * 1000)
print(total_inf_time_message)

Started video inference....




[380.33328]
Error
[87  1  1]
Error
[ 1 87  1]
Error
[ 1 87  1 28]


KeyboardInterrupt: 