# Real-time Multi-object detection and tracking

# Imports

In [1]:
import numpy as np
import os
import six.moves.urllib as urllib
import sys
import tarfile
import tensorflow as tf
import zipfile
import cv2
import time
from multiprocessing import Queue, Pool
from distutils.version import StrictVersion
from collections import defaultdict
from io import StringIO
from matplotlib import pyplot as plt
from PIL import Image

# Fail fast when the installed TensorFlow is older than the release this
# notebook was written against.
if not StrictVersion(tf.__version__) >= StrictVersion('1.9.0'):
  raise ImportError('Please upgrade your TensorFlow installation to v1.9.* or later!')
%matplotlib inline

## Object detection imports
Here are the imports from the object detection module.

In [2]:
from utils import label_map_util

from utils import visualization_utils as vis_util
#from imutils.video import WebcamVideoStream
from imutils.video import FPS

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

The backend was *originally* set to 'module://ipykernel.pylab.backend_inline' by the following code:
  File "/usr/lib/python2.7/runpy.py", line 174, in _run_module_as_main
    "__main__", fname, loader, pkg_name)
  File "/usr/lib/python2.7/runpy.py", line 72, in _run_code
    exec code in run_globals
  File "/home/nvidia/.local/lib/python2.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/nvidia/.local/lib/python2.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/nvidia/.local/lib/python2.7/site-packages/ipykernel/kernelapp.py", line 499, in start
    self.io_loop.start()
  File "/home/nvidia/.local/lib/python2.7/site-packages/tornado/ioloop.py", line 1073, in start

# Model preparation 

## Variables

In [3]:
# Directory holding the exported detection model.
# ('ssd_mobilenet_v1_coco_2017_11_17' was the previously configured value.)
MODEL_NAME = 'ssd_mobilenet_v11'

# Frozen detection graph: the actual model that is used for object detection.
PATH_TO_FROZEN_GRAPH = '/'.join((MODEL_NAME, 'frozen_inference_graph.pb'))

# Label map: the strings used to attach the correct label to each box.
PATH_TO_LABELS = os.path.join(*('data', 'mscoco_label_map.pbtxt'))

In [4]:
# Session configuration shared by every tf.Session created in this notebook
# (worker() passes it as `config=`).
# allow_soft_placement=True is requested on the ConfigProto itself.
_tf_config = tf.ConfigProto(allow_soft_placement=True)
# Turn on the GPU option `allow_growth` (per the TensorFlow docs this makes GPU
# memory get allocated on demand instead of all at once).
_tf_config.gpu_options.allow_growth=True

## Load a (frozen) TensorFlow model into memory.

In [5]:
# Read the serialized (frozen) GraphDef from disk and parse it.
with tf.gfile.GFile(PATH_TO_FROZEN_GRAPH, 'rb') as fid:
  serialized_graph = fid.read()
od_graph_def = tf.GraphDef()
od_graph_def.ParseFromString(serialized_graph)

# Import the parsed definition into a dedicated graph; name='' keeps the
# tensor names unprefixed.
detection_graph = tf.Graph()
with detection_graph.as_default():
  tf.import_graph_def(od_graph_def, name='')

## Loading label map

In [6]:
# Build `category_index`, the mapping from numeric class id to category info
# that is used when labelling detections.
# (label_map_util.create_category_index_from_labelmap(PATH_TO_LABELS,
#  use_display_name=True) was tried earlier as a one-call alternative.)
NUM_CLASSES = 90

label_map = label_map_util.load_labelmap(PATH_TO_LABELS)
categories = label_map_util.convert_label_map_to_categories(
    label_map,
    max_num_classes=NUM_CLASSES,
    use_display_name=True,
)
category_index = label_map_util.create_category_index(categories)

## Helper functions

In [7]:
from threading import Thread, current_thread


class WebcamVideoStream:
    """Grab frames from a cv2.VideoCapture on a background thread.

    The background thread keeps overwriting ``self.frame`` with the newest
    frame so that ``read()`` never blocks on the camera.

    Args:
        src: capture source handed to ``cv2.VideoCapture``.
        width: frame width requested via ``CAP_PROP_FRAME_WIDTH``.
        height: frame height requested via ``CAP_PROP_FRAME_HEIGHT``.
            NOTE(review): the size is only *requested*; check the shape of the
            frames actually returned if the exact size matters.
    """

    # Upper bound (seconds) that stop() waits for the reader thread, so a
    # capture blocked inside read() cannot hang the caller forever.
    _JOIN_TIMEOUT = 2.0

    def __init__(self, src, width, height):
        # initialize the video camera stream and read the first frame
        # from the stream
        self.stream = cv2.VideoCapture(src)
        self.stream.set(cv2.CAP_PROP_FRAME_WIDTH, width)
        self.stream.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
        (self.grabbed, self.frame) = self.stream.read()

        # flag polled by the reader thread to know when to shut down
        self.stopped = False
        # reader thread; stays None until start() is called
        self._thread = None

    def start(self):
        """Start the background reader thread and return ``self``."""
        reader = Thread(target=self.update, args=())
        # Daemon thread: a crash in the main cell must not leave a thread
        # behind that keeps the process (and the camera) alive.
        reader.daemon = True
        self._thread = reader
        reader.start()
        return self

    def update(self):
        """Reader loop: fetch frames until ``stop()`` is requested.

        The capture is released by this thread right before it exits.
        """
        while True:
            # if the thread indicator variable is set, stop the thread
            if self.stopped:
                self.stream.release()
                return

            # otherwise, read the next frame from the stream
            (self.grabbed, self.frame) = self.stream.read()

    def read(self):
        """Return the frame most recently read (whatever the capture gave)."""
        return self.frame

    def stop(self):
        """Ask the reader thread to stop and wait for the capture release.

        If the reader thread was never started the capture is released here,
        otherwise nobody would release it.
        """
        self.stopped = True
        reader = self._thread
        if reader is None:
            self.stream.release()
        elif reader is not current_thread():
            # Wait (bounded) so the capture is released when stop() returns.
            reader.join(self._JOIN_TIMEOUT)

In [8]:
def conv_detect2track(box, width, height):
    """Turn a normalized detector box into an absolute tracker box.

    Args:
        box: four values ``(ymin, xmin, ymax, xmax)`` in normalized coords.
        width: factor applied to the x values (frame width).
        height: factor applied to the y values (frame height).

    Returns:
        list ``[xmin, ymin, boxwidth, boxheight]`` in absolute coordinates.
    """
    top, left, bottom, right = box

    # scale each edge from normalized to absolute coordinates
    top_abs = top * height
    left_abs = left * width
    bottom_abs = bottom * height
    right_abs = right * width

    return [left_abs, top_abs, right_abs - left_abs, bottom_abs - top_abs]

def conv_track2detect(box, width, height):
    """Turn an absolute tracker box into a normalized detector box.

    Args:
        box: four values ``(x, y, boxwidth, boxheight)`` in absolute coords.
        width: frame width used to normalize the x values.
        height: frame height used to normalize the y values.

    Returns:
        numpy array ``[ymin, xmin, ymax, xmax]`` in normalized coordinates.
    """
    left, top, box_w, box_h = box

    # reciprocal scale factors, applied by multiplication
    inv_w = 1. / width
    inv_h = 1. / height

    return np.array([
        top * inv_h,
        left * inv_w,
        (top + box_h) * inv_h,
        (left + box_w) * inv_w,
    ])

In [9]:
def worker(input_q, output_q):
    """Detect and track objects on frames from input_q, publish to output_q.

    Runs forever (the notebook uses it as a ``multiprocessing.Pool``
    initializer). The loop alternates between two modes:

    * detection: run the frozen graph on the frame, draw the result, and, if
      few enough detections were reported, create one KCF tracker per
      non-empty box;
    * tracking: for a fixed number of frames update every tracker, write the
      tracked box back into ``boxes`` and draw the same classes/scores.

    Every processed frame is put on ``output_q`` with the boxes drawn on it.

    Args:
        input_q: queue delivering frames (image arrays); ``None`` entries are
            skipped.
        output_q: queue receiving the annotated frames.

    Relies on module-level names: PATH_TO_FROZEN_GRAPH, _tf_config,
    category_index, vis_util, FPS, conv_detect2track, conv_track2detect.
    """
    import sys
    kcf_dir = os.path.join(os.getcwd(), 'KCFcpp')
    if kcf_dir not in sys.path:
        sys.path.append(kcf_dir)
    import KCF

    max_tracked_objects = 5             # enter tracking only if num <= this
    tracked_frames_per_detection = 20   # tracked frames between detections
    print_every = 10                    # print one detection every N frames
    min_score_thresh = .5               # score threshold used when drawing

    def new_tracker():
        # Same positional flags as the single tracker used previously.
        # NOTE(review): their meaning is defined by the KCFcpp wrapper.
        return KCF.kcftracker(False, True, False, False)

    def draw(image, boxes, classes, scores):
        # Draws boxes and labels onto `image` (the call's return value is not
        # used; the annotated array itself is what gets published).
        vis_util.visualize_boxes_and_labels_on_image_array(
            image,
            boxes,
            classes,
            scores,
            category_index,
            use_normalized_coordinates=True,
            line_thickness=3,
            min_score_thresh=min_score_thresh)

    detection_graph = tf.Graph()
    with detection_graph.as_default():
        od_graph_def = tf.GraphDef()
        with tf.gfile.GFile(PATH_TO_FROZEN_GRAPH, 'rb') as fid:
            serialized_graph = fid.read()
        od_graph_def.ParseFromString(serialized_graph)
        tf.import_graph_def(od_graph_def, name='')

        sess = tf.Session(graph=detection_graph, config=_tf_config)

    # The tensor handles never change, so look them up once, not per frame.
    image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')
    fetches = [
        detection_graph.get_tensor_by_name('detection_boxes:0'),
        detection_graph.get_tensor_by_name('detection_scores:0'),
        detection_graph.get_tensor_by_name('detection_classes:0'),
        detection_graph.get_tensor_by_name('num_detections:0'),
    ]

    fps = FPS().start()
    frame_counter = 0
    tracker_counter = 0
    printer = False  # set every `print_every` frames, consumed on detection
    track = False
    trackers = []    # (row index into `boxes`, tracker) pairs
    boxes = classes = scores = None

    try:
        while True:
            frame = input_q.get()
            if frame is None:
                # The capture can hand out None when a read failed; skip it
                # instead of crashing the worker.
                continue

            frame_counter += 1
            if frame_counter % print_every == 0:
                printer = True

            height, width = frame.shape[:2]

            if not track:
                # Detection. The model input has shape [1, None, None, 3].
                (boxes, scores, classes, num) = sess.run(
                    fetches,
                    feed_dict={image_tensor: np.expand_dims(frame, axis=0)})
                # Drop only the batch axis (a batch of one frame was fed).
                boxes = np.squeeze(boxes, axis=0)
                scores = np.squeeze(scores, axis=0)
                classes = np.squeeze(classes, axis=0).astype(np.int32)

                # Activate Tracker
                if float(np.squeeze(num)) <= max_tracked_objects:
                    # One tracker per non-empty box, initialised on the frame
                    # *before* anything is drawn on it, and remembered with
                    # the row it belongs to so updates land in the right row.
                    trackers = []
                    for row in np.flatnonzero(~np.all(boxes == 0, axis=1)):
                        box_tracker = new_tracker()
                        box_tracker.init(
                            conv_detect2track(boxes[row], width, height),
                            frame)
                        trackers.append((row, box_tracker))
                    track = True

                # Visualization
                draw(frame, boxes, classes, scores)

                if printer and len(boxes) > 0:
                    # Only the first detection is printed.
                    label = category_index.get(classes[0], {}).get('name', 'N/A')
                    print("label: {}\nscore: {}\nbox: {}".format(
                        label, scores[0], boxes[0]))
                    printer = False

                output_q.put(frame)

            # Tracking
            else:
                for row, box_tracker in trackers:
                    tracked_box = box_tracker.update(frame)
                    boxes[row, :] = conv_track2detect(tracked_box, width, height)

                draw(frame, boxes, classes, scores)

                tracker_counter += 1
                if tracker_counter >= tracked_frames_per_detection:
                    track = False
                    tracker_counter = 0

                output_q.put(frame)

            fps.update()
    finally:
        # Reached when the loop is left through an exception.
        fps.stop()
        sess.close()

# Detection

In [None]:
# Capture settings and limits for this run.
FRAME_WIDTH, FRAME_HEIGHT = 480, 360   # size requested from the camera
OUTPUT_FPS = 5.0                       # frame rate written to output.avi
MAX_OUTPUT_FRAMES = 1000               # For testing.. limiting the number of frames

input_q = Queue(maxsize=5)
output_q = Queue(maxsize=5)
# worker() never returns; it runs as the initializer of each pool process.
# NOTE(review): with 2 processes each worker only sees part of the frame
# sequence and results may come back out of order -- confirm this is intended.
pool = Pool(2, worker, (input_q, output_q))

print('[INFO] sampling THREADED frames from webcam...')
vs = WebcamVideoStream(src=0, width=FRAME_WIDTH, height=FRAME_HEIGHT).start()

fourcc = cv2.VideoWriter_fourcc(*"MJPG")
# The writer is opened on the first processed frame so that its frame size
# always matches the frames handed to write().
out = None

f_count = 0
fps = FPS().start()

try:
    while True:
        frame = vs.read()
        if frame is not None:
            # Blocks while the queue is full, throttling the capture loop.
            input_q.put(frame)

        if not output_q.empty():
            output_rgb = output_q.get()
            if out is None:
                out_height, out_width = output_rgb.shape[:2]
                out = cv2.VideoWriter('output.avi', fourcc, OUTPUT_FPS,
                                      (out_width, out_height), True)
            out.write(output_rgb)
            cv2.imshow('Video', output_rgb)
            f_count += 1
            if f_count >= MAX_OUTPUT_FRAMES:
                break

        fps.update()

        if cv2.waitKey(1) & 0xFF == ord('q'):
            break

    fps.stop()
    print('[INFO] elapsed time (total): {:.2f}'.format(fps.elapsed()))
    print('[INFO] approx. FPS: {:.2f}'.format(fps.fps()))
finally:
    # Always give back the worker processes, camera, file and window, even
    # when the loop above fails.
    pool.terminate()
    vs.stop()
    if out is not None:
        out.release()
    cv2.destroyAllWindows()

[INFO] sampling THREADED frames from webcam...
label: person
score: 0.787391662598
box: [0.09397298 0.30995643 0.9922058  0.993363  ]
label: person
score: 0.790426909924
box: [0.1174548  0.2920087  0.9933994  0.99393636]
label: person
score: 0.807669401169
box: [0.12734544 0.34864846 0.9957831  0.99904966]
label: person
score: 0.895899713039
box: [0.16184935 0.09494008 0.99676144 0.49622512]
label: person
score: 0.850935637951
box: [0.1495997  0.12336215 0.99738336 0.52386606]
label: person
score: 0.761521577835
box: [0.2262091 0.4102541 0.9885536 0.9978684]
label: person
score: 0.807542681694
box: [0.23116577 0.41477644 0.98890626 0.9987943 ]
label: person
score: 0.753902256489
box: [0.17485568 0.39481848 0.9898746  0.99578756]
label: person
score: 0.808250248432
box: [0.20639667 0.4437852  0.9663031  0.9956881 ]
label: person
score: 0.875904262066
box: [0.21151507 0.43510184 0.98342717 0.9979001 ]
label: person
score: 0.880375444889
box: [0.22131747 0.43350065 0.9824597  0.99450004]
