# The goal of this notebook is to detect objects through the Intel RealSense camera, and output a stream showing boxes (and distances) of the detected objects.

In [1]:
# Import Statements
import pyrealsense2 as rs
import numpy as np
import cv2
import math
import time

RealSense Configuration Code:

In [2]:
# Configure depth and color streams
pipeline = rs.pipeline()
config = rs.config()
config.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 30) # depth stream
config.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30) # color stream
align = rs.align(rs.stream.color) # align both streams to same pov

In [3]:
# Enable visualizer and filters for later use
colorizer = rs.colorizer()
spatial = rs.spatial_filter()
spatial.set_option(rs.option.filter_magnitude, 5)
spatial.set_option(rs.option.filter_smooth_alpha, 1)
spatial.set_option(rs.option.filter_smooth_delta, 50)
spatial.set_option(rs.option.holes_fill, 3)
hole_filling = rs.hole_filling_filter()
depth_to_disparity = rs.disparity_transform(True)
disparity_to_depth = rs.disparity_transform(False)

OpenCV Detection Configration Code:

In [4]:
# Image detection size
expected = 300
inScaleFactor = 0.007843
meanVal = 127.53

In [5]:
net = cv2.dnn.readNetFromTensorflow('frozen_inference_graph.pb', 'graph.pbtxt') # pretrained net

swapRB = True
classNames = { 0: 'background',
    1: 'person', 2: 'bicycle', 3: 'car', 4: 'motorcycle', 5: 'airplane', 6: 'bus',
    7: 'train', 8: 'truck', 9: 'boat', 10: 'traffic light', 11: 'fire hydrant',
    13: 'stop sign', 14: 'parking meter', 15: 'bench', 16: 'bird', 17: 'cat',
    18: 'dog', 19: 'horse', 20: 'sheep', 21: 'cow', 22: 'elephant', 23: 'bear',
    24: 'zebra', 25: 'giraffe', 27: 'backpack', 28: 'umbrella', 31: 'handbag',
    32: 'tie', 33: 'suitcase', 34: 'frisbee', 35: 'skis', 36: 'snowboard',
    37: 'sports ball', 38: 'kite', 39: 'baseball bat', 40: 'baseball glove',
    41: 'skateboard', 42: 'surfboard', 43: 'tennis racket', 44: 'bottle',
    46: 'wine glass', 47: 'cup', 48: 'fork', 49: 'knife', 50: 'spoon',
    51: 'bowl', 52: 'banana', 53: 'apple', 54: 'sandwich', 55: 'orange',
    56: 'broccoli', 57: 'carrot', 58: 'hot dog', 59: 'pizza', 60: 'donut',
    61: 'cake', 62: 'chair', 63: 'couch', 64: 'potted plant', 65: 'bed',
    67: 'dining table', 70: 'toilet', 72: 'tv', 73: 'laptop', 74: 'mouse',
    75: 'remote', 76: 'keyboard', 77: 'cell phone', 78: 'microwave', 79: 'oven',
    80: 'toaster', 81: 'sink', 82: 'refrigerator', 84: 'book', 85: 'clock',
    86: 'vase', 87: 'scissors', 88: 'teddy bear', 89: 'hair drier', 90: 'toothbrush' }  

Final Stream Output:

In [6]:
# Start streaming
time.sleep(3)

profile = pipeline.start(config)

try:
    while True:

        # Wait for a coherent pair of frames: depth and color
        frames = pipeline.wait_for_frames()
        frames = align.process(frames)
        
        depth_frame = frames.get_depth_frame()
        color_frame = frames.get_color_frame()
        
        if not depth_frame or not color_frame:
            continue

        # filter depth stream: depth2disparity -> spatial -> disparity2depth -> hole_filling
        depth_frame = depth_to_disparity.process(depth_frame)
        depth_frame = spatial.process(depth_frame)
        depth_frame = disparity_to_depth.process(depth_frame)
        depth_frame = hole_filling.process(depth_frame)

        # get intrinsics
        depth_intrin = depth_frame.profile.as_video_stream_profile().intrinsics
        color_intrin = color_frame.profile.as_video_stream_profile().intrinsics
        
        # Convert images to numpy arrays
        depth_image = np.asanyarray(colorizer.colorize(depth_frame).get_data())
        color_image = np.asanyarray(color_frame.get_data())
        
        # crop color image for detection
        height, width = color_image.shape[:2]
        expected = 300
        aspect = width / height
        resized_color_image = cv2.resize(color_image, (round(expected * aspect), expected))
        crop_start = round(expected * (aspect - 1) / 2)
        crop_color_img = resized_color_image[0:expected, crop_start:crop_start+expected]

        # Perform object detection through net
        blob = cv2.dnn.blobFromImage(crop_color_img, inScaleFactor, (expected, expected), meanVal, False)
        net.setInput(blob)
        detections = net.forward("detection_out")
        
        label = detections[0,0,0,1]
        conf  = detections[0,0,0,2]
        xmin  = detections[0,0,0,3]
        ymin  = detections[0,0,0,4]
        xmax  = detections[0,0,0,5]
        ymax  = detections[0,0,0,6]
        
        className = classNames[int(label)]

        # Calculate box coordinates of detected object
        scale = height / expected
        xmin_depth = int((xmin * expected + crop_start) * scale)
        ymin_depth = int((ymin * expected) * scale)
        xmax_depth = int((xmax * expected + crop_start) * scale)
        ymax_depth = int((ymax * expected) * scale)
        xmin_depth,ymin_depth,xmax_depth,ymax_depth
        
        # Calculate depth of object
        depth = np.asanyarray(depth_frame.get_data())
        # Crop depth data:
#         depth = depth[xmin_depth:xmax_depth,ymin_depth:ymax_depth].astype(float)
        depth = depth[math.floor((xmax_depth+xmin_depth)/2-1):math.ceil((xmax_depth+xmin_depth)/2+1),math.floor((ymax_depth+ymin_depth)/2-1):math.ceil((ymax_depth+ymin_depth)/2+1)].astype(float)

        # Get data scale from the device and convert to meters
        depth_scale = profile.get_device().first_depth_sensor().get_depth_scale()
        depth = depth * depth_scale
        dist,_,_,_ = cv2.mean(depth)        
        
        # Get 3d point of object detected
        depth_point = rs.rs2_deproject_pixel_to_point(depth_intrin, [int((xmax_depth+xmin_depth)/2), int((ymax_depth+ymin_depth)/2)], dist)
        print(depth_point)
        
        # Draw square on depth and color streams
        cv2.rectangle(depth_image, (xmin_depth, ymin_depth), 
            (xmax_depth, ymax_depth), (255, 255, 255), 2)
        cv2.rectangle(color_image, (xmin_depth, ymin_depth), 
            (xmax_depth, ymax_depth), (255, 255, 255), 2)
        cv2.putText(color_image, className+" @ "+"{:.2f}".format(dist)+"meters away", 
            (xmin_depth, ymin_depth),
            cv2.FONT_HERSHEY_COMPLEX, 0.5, (255,255,255))
        
        # Stack both images horizontally
        images = np.hstack((color_image, depth_image))

        # Show images
        cv2.namedWindow('RealSense', cv2.WINDOW_AUTOSIZE)
        cv2.imshow('RealSense', images)
        key = cv2.waitKey(1)
        
        # Press esc or 'q' to close the image window
        if key & 0xFF == ord('q') or key == 27:
            cv2.destroyAllWindows()
            break
        
finally:

    # Stop streaming
    pipeline.stop()

[-0.11164698749780655, -0.012419451028108597, 0.5315000414848328]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[-0.09985727071762085, -0.00039929308695718646, 0.5281667113304138]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, -0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[-0.09903547167778015, -0.0012527074432000518, 0.5283333659172058]
[-0.09995179623365402, -0.001253497670404613, 0.5286666750907898]
[-0.10004632920026779, -0.0004000490589533001, 0.5291666984558105]
[0.0, 0.0, 0.0]
[0.0, -0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[-0.09982574731111526, -0.00039916703826747835, 0.527999997138977]
[-0.09906671196222305, -0.0012531025568023324, 0.5285000205039978]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, 0.0, 0.0]
[0.0, -0.0, 0.0]
[0.0, -0.0, 0.0]
[0.0, 0.0, 0.0]
[-0.09916044026613235, -0.001254288130439818, 0.5290000438690186

[-0.653068482875824, 0.010595045052468777, 2.5910000801086426]
[-0.5726942420005798, 0.05488578975200653, 2.188000202178955]
[-0.5524696111679077, 0.10281293839216232, 2.0850000381469727]
[-0.43559467792510986, 0.09224522113800049, 1.6540000438690186]
[-0.5547204613685608, 0.06644457578659058, 2.1063334941864014]
[-0.6077607870101929, 0.009379197843372822, 2.2936668395996094]
[-0.6728398203849792, -0.0019080555066466331, 2.5238890647888184]
[-0.6698528528213501, -0.0019111635629087687, 2.5280001163482666]
[-0.6578996181488037, 0.002132947091013193, 2.48288893699646]
[-0.3258838951587677, 0.026083702221512794, 1.19350004196167]
[-0.3357279598712921, 0.03152376413345337, 1.1806666851043701]
[0.0, -0.0, 0.0]
[-0.24460799992084503, 0.012063788250088692, 4.87600040435791]
[-0.17053137719631195, -0.004805802833288908, 0.8580000400543213]
[-0.07925537973642349, 0.0006193812587298453, 0.7210000157356262]
[-0.07488976418972015, 0.0005939913098700345, 0.691444456577301]
[-0.03516480326652527, -0

[-0.12482679635286331, -0.006258435547351837, 0.7086666822433472]
[-0.1263001561164856, 0.001790435053408146, 0.7236667275428772]
[-0.13602620363235474, 0.0005927504971623421, 0.690000057220459]
[-0.13865576684474945, -0.008346485905349255, 0.6920000314712524]
[-0.12439939379692078, 0.0005904596182517707, 0.687333345413208]
[-0.1218205913901329, -0.0016549963038414717, 0.6980000138282776]
[-0.12493301182985306, 0.0017710543470457196, 0.7158333659172058]
[-0.10819555819034576, 0.0006319807725958526, 0.7356666922569275]
[-0.10531577467918396, 0.0006289741140790284, 0.7321667075157166]
[-0.09873490780591965, 0.0006322671542875469, 0.7360000610351562]
[-0.0924600288271904, 0.0006381851271726191, 0.7428889274597168]
[-0.08001088351011276, 0.0006162314093671739, 0.7173333764076233]
[-0.08998466283082962, 0.0006210993742570281, 0.7230000495910645]
[-0.0767911896109581, 0.0006090725655667484, 0.7090000510215759]
[-0.052301421761512756, 0.0028138046618551016, 0.6881111264228821]
[-0.05705493688