## Team 5 members: Ang Boon Yew (A0096966E), Tea Lee Seng (A0198538J), Yang Xiaoyan (A0056720L)

In [1]:
# -*- coding: utf-8 -*-
import cv2
import numpy as np

In [2]:
# --- Input / output locations -------------------------------------------
videopath = 'ironman.mp4'
outpath = 'ssd_ironman.mp4'

# --- Caffe model definition and weights ---------------------------------
prototxt = 'MobileNetSSD_deploy.prototxt'
caffemodel = 'MobileNetSSD_deploy.caffemodel'

# --- Thresholds handed to cv2.dnn.NMSBoxes in pltDetect -----------------
scoreThres = 0.6
nmsThres = 0.4

# --- Class id -> label lookup -------------------------------------------
# The position of a label in this list is its integer class id
# (0 = 'background', ..., 20 = 'tvmonitor').
_labels = ['background',
           'aeroplane', 'bicycle', 'bird', 'boat',
           'bottle', 'bus', 'car', 'cat', 'chair',
           'cow', 'diningtable', 'dog', 'horse',
           'motorbike', 'person', 'pottedplant',
           'sheep', 'sofa', 'train', 'tvmonitor']
classNames = dict(enumerate(_labels))

In [3]:
# Load the Caffe network: first argument is the .prototxt definition,
# second the .caffemodel weights (both paths come from the config cell).
net = cv2.dnn.readNetFromCaffe(prototxt, caffemodel)

In [4]:
print("Analyzing video ...")
vs              = cv2.VideoCapture(videopath)

# Fail here with a clear message instead of letting the processing loop
# exit immediately and the closing cell crash on an uninitialised writer.
if not vs.isOpened():
    raise IOError("Cannot open input video: " + str(videopath))

# Stream properties reused when the output writer is created.
fps             = vs.get(cv2.CAP_PROP_FPS)
W               = int(vs.get(cv2.CAP_PROP_FRAME_WIDTH))
H               = int(vs.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Created lazily by the processing loop on the first frame.
writer          = None

Analyzing video ...


## Draw bounding boxes and object names after applying non-maximum suppression

In [5]:
def pltDetect(img,boxes,classId,confidences, scoreThres = scoreThres,nmsThres=nmsThres):
    """Draw the detections kept by non-maximum suppression onto ``img``.

    The image is modified in place; nothing is returned.

    Parameters
    ----------
    img : image array accepted by ``cv2.rectangle`` / ``cv2.putText``
        Frame to annotate.
    boxes : list of [x, y, w, h]
        Candidate boxes (top-left corner plus width and height).
    classId : list of int
        Class id of each candidate; used as a key into ``classNames``.
    confidences : list of float
        Score of each candidate.
    scoreThres : float
        Passed to ``cv2.dnn.NMSBoxes`` as ``score_threshold``.
    nmsThres : float
        Passed to ``cv2.dnn.NMSBoxes`` as ``nms_threshold``.
    """
    selected = cv2.dnn.NMSBoxes(bboxes=boxes,
                                scores=confidences,
                                score_threshold=scoreThres,
                                nms_threshold=nmsThres)

    # The shape of the NMSBoxes result differs between OpenCV releases
    # (column vector, flat array, or an empty tuple when nothing is kept).
    # Flattening makes the loop work for all of them.
    kept = np.array(selected, dtype=int).reshape(-1)

    for j in kept:
        j = int(j)
        box = boxes[j]
        txtlbl = str(classNames[classId[j]])
        confid = str(round(confidences[j],2))
        label = txtlbl + confid

        # getTextSize returns ((width, height), baseline) for the label.
        txtsize = cv2.getTextSize(label,
                                  cv2.FONT_HERSHEY_SIMPLEX,
                                  0.5,
                                  1)
        bsize = txtsize[0]
        bsline = txtsize[1]

        x = int(box[0])
        y = int(box[1])
        w = int(box[2])
        h = int(box[3])

        # Object outline.
        cv2.rectangle(img,
                      (x,y),
                      (x+w,y+h),
                      (0, 255, 0),
                      2)

        # Filled background for the label. Drawn on the image that was
        # passed in, not on a global frame.
        cv2.rectangle(img,
                      (x-1,y),
                      (x+bsize[0],y+bsize[1]+bsline),
                      (0, 255, 0),
                      -1)

        cv2.putText(img,
                    label,
                    (x-1,y+bsize[1]),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    (0, 0, 0),
                    1,
                    cv2.LINE_AA)

## Detect objects in the frame and return the bounding boxes, class IDs and confidence scores

In [6]:
def ssdDetect(img,scFactor=1/127.5,nrMean=(127.5,127.5,127.5),RBSwap=False):
    """Run the module-level ``net`` on ``img`` and collect every raw detection.

    No score filtering happens here; all rows of the network output are
    returned so the caller can threshold / suppress them.

    Parameters
    ----------
    img : array with ``shape[:2] == (height, width)``
        Frame to analyse.
    scFactor : float
        ``scalefactor`` forwarded to ``cv2.dnn.blobFromImage``.
    nrMean : tuple
        ``mean`` forwarded to ``cv2.dnn.blobFromImage``.
    RBSwap : bool
        ``swapRB`` forwarded to ``cv2.dnn.blobFromImage``.

    Returns
    -------
    list
        ``[boxes, classIds, confidences]`` where ``boxes`` holds
        ``[x, y, w, h]`` integer pixel boxes in the coordinates of ``img``.
    """
    blob = cv2.dnn.blobFromImage(image=img,
                                 scalefactor=scFactor,
                                 size=(300, 300),
                                 mean=nrMean,
                                 swapRB=RBSwap,
                                 crop=False)
    net.setInput(blob)
    pred = net.forward()

    # Scale by the size of the image actually passed in, rather than by
    # module-level video dimensions and a hard-coded network input size.
    imgH, imgW = img.shape[:2]

    classIds = []
    confidences = []
    boxes = []

    # Each row is read as [_, class id, score, x1, y1, x2, y2]; the corner
    # values are treated as fractions of the image width / height.
    for det in pred[0, 0]:
        classId     = int(det[1])           # Class label
        confidence  = float(det[2])         # Confidence score

        # Convert straight to pixels with a single truncation.
        x1          = int(det[3] * imgW)
        y1          = int(det[4] * imgH)
        x2          = int(det[5] * imgW)
        y2          = int(det[6] * imgH)

        classIds.append(classId)
        confidences.append(confidence)
        boxes.append([x1, y1, x2 - x1, y2 - y1])
    return [boxes,classIds,confidences]

In [7]:
# Read the video frame by frame, annotate each frame with the SSD
# detections, append it to the output file and show it in a preview window.
try:
    while True:
        grabbed, frame = vs.read()

        if not grabbed:                     # end of stream or read failure
            break

        output      = frame.copy()
        boxes,classId,confidences = ssdDetect(output)
        pltDetect(output,boxes,classId,confidences,scoreThres=scoreThres,nmsThres=nmsThres)

        if writer is None:
            # Create the writer lazily on the first frame.
            # The "X264" fourcc requests an H.264 encoder.
            fourcc = cv2.VideoWriter_fourcc(*"X264")
            writer = cv2.VideoWriter(outpath,
                                     fourcc,
                                     fps,
                                     (W, H),
                                     True)
            if not writer.isOpened():
                # Make a failed open visible instead of silently
                # producing no usable output file.
                print("Warning: could not open video writer for " + str(outpath))

        # Write the output frame to disk
        writer.write(output)
        cv2.imshow("SSD detection",output)

        # waitKey returns a negative value when no key was pressed,
        # so ANY key press stops the processing (not only ESC).
        if cv2.waitKey(1) >= 0:
            break
finally:
    # Always close the preview window, even if a frame raised an error.
    cv2.destroyAllWindows()

In [8]:
print("Closing ...")
# writer is still None when no frame was ever read (e.g. the input could
# not be decoded), so only release it if it was actually created.
if writer is not None:
    writer.release()
vs.release()

Closing ...
