### Install Dependencies

In [None]:
!pip install opencv-python==3.4.2.16
!pip install imutils
!pip install scikit-image
!pip install matplotlib
!pip install numpy

### Import packages

In [385]:
import cv2
import numpy as np
import os
import imutils
import copy
import time
import math
from skimage.measure import compare_ssim
import matplotlib.pyplot as plt
from scipy.spatial import distance as dist
from collections import OrderedDict
%matplotlib inline

### Load yolo

In [2]:
net=cv2.dnn.readNet("../Yolo/yolov3.weights","../Yolo/yolov3.cfg")
net.setPreferableBackend(cv2.dnn.DNN_BACKEND_OPENCV)
net.setPreferableTarget(cv2.dnn.DNN_TARGET_OPENCL)
layer_names = net.getLayerNames()
outputlayers = [layer_names[i[0]-1] for i in net.getUnconnectedOutLayers()]

with open("../Yolo/coco.names","r") as f:
    classes = [line.strip() for line in f.readlines()]

### Convert video to frames

In [408]:
cap = cv2.VideoCapture("../NMPS-CD/Toys3/Track3.mp4")

counter=1
while (cap.isOpened()):
    ret, frame = cap.read()
    frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
    #frame = cv2.rotate(frame, cv2.ROTATE_90_COUNTERCLOCKWISE)
    #frame = cv2.rotate(frame, cv2.ROTATE_180)
    if ret==0:
        break
    #cahnge path and file name
    FrameNo = '../NMPS-CD/Toys3/Track3/Frame'+str(counter)+'.jpg'
    cv2.imwrite(FrameNo,frame)
    counter = counter + 1
cap.release()

### Load frames

In [477]:
ref_dir = '../NMPS-CD/Toys3/Ref'
video_dir = '../NMPS-CD/Toys3/Track3'
#video_dir = '../NMPS-CD/Toys/Track2'
ref_data = []
video_data = []
n_ref_frames=0
n_video_frames=0
i=1
for f1 in os.listdir(ref_dir):
    img = cv2.imread(os.path.join(ref_dir,'Frame'+str(i)+'.jpg'))
    img = cv2.resize(img, (720,1280), interpolation = cv2.INTER_AREA)
    ref_data.append(img)
    n_ref_frames+=1
    i+=1
i=1
for f1 in os.listdir(video_dir):
    img = cv2.imread(os.path.join(video_dir,'Frame'+str(i)+'.jpg'))
    img = cv2.resize(img, (720,1280), interpolation = cv2.INTER_AREA)
    video_data.append(img)
    n_video_frames+=1
    i+=1

### Find objects in each frame

In [478]:
CONF_THRESHOLD = 0.7
NMS_THRESHOLD = 0.4
def post_process(frame, outs, conf_threshold, nms_threshold):
    frame_height = frame.shape[0]
    frame_width = frame.shape[1]

    # Scan through all the bounding boxes output from the network and keep only
    # the ones with high confidence scores. Assign the box's class label as the
    # class with the highest score.
    confidences = []
    boxes = []
    class_ids = []
    final_boxes = []
    final_classes=[]
    final_confidences=[]
    for out in outs:
        for detection in out:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > conf_threshold:
                center_x = int(detection[0] * frame_width)
                center_y = int(detection[1] * frame_height)
                width = int(detection[2] * frame_width)
                height = int(detection[3] * frame_height)
                left = int(center_x - width / 2)
                top = int(center_y - height / 2)
                confidences.append(float(confidence))
                boxes.append([left, top, width, height])
                class_ids.append(class_id)

    # Perform non maximum suppression to eliminate redundant
    # overlapping boxes with lower confidences.
    indices = cv2.dnn.NMSBoxes(boxes, confidences, conf_threshold,
                               nms_threshold)
    for i in indices:
        i = i[0]
        box = boxes[i]
        left = box[0]
        top = box[1]
        width = box[2]
        height = box[3]
        final_boxes.append(box)
        final_classes.append(class_ids[i])
        final_confidences.append(confidences[i])
    return zip(final_boxes, final_confidences, final_classes)

In [479]:
IMG_WIDTH = 416
IMG_HEIGHT = 416
def yolo_get_objects(data):
    objs = []
    for image in data:
        # Create a 4D blob from a frame.
        blob = cv2.dnn.blobFromImage(image, 1 / 255, (IMG_WIDTH, IMG_HEIGHT),[0, 0, 0], 1, crop=False)

        # Sets the input to the network
        net.setInput(blob)

        # Runs the forward pass to get output of the output layers
        outs = net.forward(outputlayers)

        # Remove the bounding boxes with low confidence
        objs.append(list(post_process(image, outs, CONF_THRESHOLD, NMS_THRESHOLD)))
    return objs

In [480]:
ref_objs = yolo_get_objects(ref_data)
video_objs = yolo_get_objects(video_data)

In [481]:
class CentroidTracker():
    def __init__(self, maxDisappeared=0):
        self.nextObjectID = 0
        self.objects = OrderedDict()
        self.disappeared = OrderedDict()
        self.maxDisappeared = maxDisappeared
        self.appearing = OrderedDict()
        self.lifetime = []
        self.frameNo = 0

    def register(self, centroid, box):
        self.objects[self.nextObjectID] = (centroid, box)
        self.disappeared[self.nextObjectID] = 0
        self.appearing[self.nextObjectID]=(self.frameNo, (centroid, box))
        self.nextObjectID += 1

    def deregister(self, objectID):
        self.lifetime.append((objectID, self.appearing[objectID], 
                             (self.frameNo-self.maxDisappeared,self.objects[objectID])))
        del self.objects[objectID]
        del self.disappeared[objectID]

    def deregister_all(self):
        objs = list(self.objects.keys())
        for objectID in objs:
            self.deregister(objectID)
    
    def update(self, rects):
        self.frameNo+=1
        if len(rects) == 0:
            for objectID in list(self.disappeared.keys()):
                self.disappeared[objectID] += 1
                if self.disappeared[objectID] > self.maxDisappeared:
                    self.deregister(objectID)
            return self.objects
        inputCentroids = np.zeros((len(rects), 2), dtype="int")
        inputBoxes = {}
        for (i, box) in enumerate(rects):
            (startX, startY, endX, endY) = box
            cX = int((2*startX + endX) / 2.0)
            cY = int((2*startY + endY) / 2.0)
            inputCentroids[i] = (cX, cY)
            inputBoxes[tuple(inputCentroids[i])] = box
        if len(self.objects) == 0:
            for i in range(0, len(inputCentroids)):
                self.register(inputCentroids[i], inputBoxes[tuple(inputCentroids[i])])
        else:
            objectIDs = list(self.objects.keys())
            objectCentroids = [l for l,b in list(self.objects.values())]
            D = dist.cdist(np.array(objectCentroids), inputCentroids)
            rows = D.min(axis=1).argsort()
            cols = D.argmin(axis=1)[rows]
            usedRows = set()
            usedCols = set()
            for (row, col) in zip(rows, cols):
                if row in usedRows or col in usedCols:
                    continue
                objectID = objectIDs[row]
                self.objects[objectID] = (inputCentroids[col], inputBoxes[tuple(inputCentroids[col])])
                self.disappeared[objectID] = 0
                usedRows.add(row)
                usedCols.add(col)
            unusedRows = set(range(0, D.shape[0])).difference(usedRows)
            unusedCols = set(range(0, D.shape[1])).difference(usedCols)
            if D.shape[0] >= D.shape[1]:
                for row in unusedRows:
                    objectID = objectIDs[row]
                    self.disappeared[objectID] += 1
                    if self.disappeared[objectID] > self.maxDisappeared:
                        self.deregister(objectID)
            else:
                for col in unusedCols:
                    self.register(inputCentroids[col], inputBoxes[tuple(inputCentroids[i])])
        return self.objects

In [540]:
ref_ct = CentroidTracker(20)
ref_obj_track = []
#frameNo = 1
for i in range(0,len(ref_data)):
    objects = ref_ct.update([box[0] for box in ref_objs[i]])
    ref_obj_track.append(dict(objects.items()))
    #frameNo+=1
ref_ct.deregister_all()
ref_obj_lifetime = ref_ct.lifetime

In [541]:
video_ct = CentroidTracker(20)
video_obj_track = []
#frameNo = 1
for i in range(0,len(video_data)):
    objects = video_ct.update([box[0] for box in video_objs[i]])
    video_obj_track.append(dict(objects.items()))
    #frameNo+=1
video_ct.deregister_all()
video_obj_lifetime = video_ct.lifetime

In [542]:
ref_obj_lifetime

[(0,
  (3, (array([262, 691]), [199, 572, 127, 239])),
  (116, (array([708, 667]), [698, 520, 21, 294]))),
 (1,
  (208, (array([ 47, 668]), [0, 571, 95, 195])),
  (292, (array([632, 677]), [546, 579, 172, 196]))),
 (2,
  (474, (array([ 36, 925]), [0, 880, 73, 90])),
  (565, (array([668, 889]), [620, 845, 97, 88]))),
 (3,
  (615, (array([ 72, 894]), [3, 819, 138, 150])),
  (656, (array([490, 808]), [371, 703, 239, 211])))]

In [543]:
ref_obj_track

[{},
 {},
 {0: (array([262, 691]), [199, 572, 127, 239])},
 {0: (array([262, 691]), [199, 572, 127, 239])},
 {0: (array([262, 691]), [199, 572, 127, 239])},
 {0: (array([263, 692]), [199, 573, 128, 238])},
 {0: (array([263, 692]), [199, 573, 128, 238])},
 {0: (array([262, 691]), [198, 574, 129, 235])},
 {0: (array([261, 691]), [197, 574, 129, 235])},
 {0: (array([261, 683]), [198, 561, 126, 245])},
 {0: (array([261, 684]), [198, 560, 127, 248])},
 {0: (array([262, 683]), [199, 559, 127, 249])},
 {0: (array([263, 684]), [199, 561, 128, 246])},
 {0: (array([262, 684]), [199, 561, 127, 246])},
 {0: (array([263, 684]), [200, 561, 127, 246])},
 {0: (array([263, 683]), [200, 563, 127, 241])},
 {0: (array([264, 684]), [201, 564, 127, 240])},
 {0: (array([265, 684]), [202, 567, 127, 234])},
 {0: (array([266, 685]), [203, 566, 127, 238])},
 {0: (array([267, 684]), [205, 565, 125, 238])},
 {0: (array([268, 684]), [206, 564, 125, 240])},
 {0: (array([269, 684]), [207, 565, 125, 238])},
 {0: (arra

In [544]:
video_obj_lifetime

[(0,
  (2, (array([241, 741]), [163, 601, 156, 280])),
  (115, (array([706, 772]), [694, 619, 25, 306]))),
 (1,
  (210, (array([ 49, 796]), [0, 704, 98, 185])),
  (307, (array([629, 805]), [540, 710, 178, 191]))),
 (2,
  (373, (array([ 32, 866]), [2, 755, 60, 223])),
  (448, (array([700, 865]), [681, 762, 38, 206]))),
 (3,
  (646, (array([ 91, 995]), [14, 889, 155, 213])),
  (685, (array([594, 907]), [482, 798, 225, 218])))]

In [545]:
video_obj_track

[{},
 {0: (array([241, 741]), [163, 601, 156, 280])},
 {0: (array([240, 742]), [162, 603, 157, 278])},
 {0: (array([240, 742]), [162, 603, 157, 278])},
 {0: (array([240, 742]), [162, 603, 157, 278])},
 {0: (array([240, 739]), [161, 601, 159, 277])},
 {0: (array([241, 740]), [161, 602, 160, 277])},
 {0: (array([241, 740]), [161, 602, 160, 277])},
 {0: (array([241, 740]), [161, 602, 160, 277])},
 {0: (array([241, 740]), [161, 602, 160, 277])},
 {0: (array([241, 740]), [161, 602, 160, 277])},
 {0: (array([241, 740]), [161, 602, 160, 277])},
 {0: (array([241, 740]), [161, 602, 160, 277])},
 {0: (array([241, 740]), [161, 602, 160, 277])},
 {0: (array([241, 740]), [161, 602, 160, 277])},
 {0: (array([241, 740]), [161, 602, 160, 277])},
 {0: (array([241, 740]), [161, 602, 160, 277])},
 {0: (array([239, 739]), [161, 601, 156, 277])},
 {0: (array([239, 739]), [161, 601, 156, 277])},
 {0: (array([239, 739]), [161, 601, 156, 277])},
 {0: (array([239, 739]), [161, 601, 156, 277])},
 {0: (array([23

In [529]:
def compare_objs(img1, box1, img2, box2):
    width = img1.shape[0]
    height = img1.shape[1]
    
    left1, top1, width1, height1 = box1
    mask = np.zeros((width, height),np.uint8)
    mask = cv2.rectangle(mask, (left1, top1), (left1+width1, top1+height1), 1, thickness=-1)
    img1_masked = cv2.bitwise_and(img1, img1, mask=mask)
    
    left2, top2, width2, height2 = box2
    mask = np.zeros((width, height),np.uint8)
    mask = cv2.rectangle(mask, (left2, top2), (left2+width2, top2+height2), 1, thickness=-1)
    img2_masked = cv2.bitwise_and(img2, img2, mask=mask)
    
    orb_detector = cv2.ORB_create(5000)
    kp1, d1 = orb_detector.detectAndCompute(img2_masked, None) 
    kp2, d2 = orb_detector.detectAndCompute(img1_masked, None)
    matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck = True)
    matches = matcher.match(d1, d2)
    matches.sort(key = lambda x: x.distance)
    matches = matches[:int(len(matches)*90)] 
    no_of_matches = len(matches)
    p1 = np.zeros((no_of_matches, 2)) 
    p2 = np.zeros((no_of_matches, 2))
    for k in range(len(matches)): 
        p1[k, :] = kp1[matches[k].queryIdx].pt 
        p2[k, :] = kp2[matches[k].trainIdx].pt
    homography, mask = cv2.findHomography(p1, p2, cv2.RANSAC)
    if(type(homography)==type(None)):
        return (None, None)
    transformed_img = cv2.warpPerspective(img2_masked, homography, (height, width))
    (score, diff) = compare_ssim(cv2.cvtColor(img1_masked, cv2.COLOR_BGR2GRAY),cv2.cvtColor(transformed_img, cv2.COLOR_BGR2GRAY), full=True)
    return (score,diff)

In [546]:
mid_frames1 = [round((start[0]+end[0])/2) for objID, start, end in ref_obj_lifetime]
mid_frames2 = [round((start[0]+end[0])/2) for objID, start, end in video_obj_lifetime]
print(mid_frames1, mid_frames2)

[60, 250, 520, 636] [58, 258, 410, 666]


In [531]:
def couple_objectIDs(data1, data2, lifetime1, lifetime2, track1, track2, mid_frames1, mid_frames2):
    width = data1[0].shape[0]
    height = data1[0].shape[1]
    obj_pairs = {}
    score_history = {}
    threshold = 50
    for i in range(0,len(lifetime1)):
        score_history[i] = OrderedDict()
        offset = 0
        #print("t1",track1[mid_frames1[i]])
        mid1 = mid_frames1[i]
        #print(mid1)
        while i not in list(track1[mid1].keys()):
            offset+=1
            if mid1+offset < len(track1) and i in list(track1[mid1+offset].keys()): mid1=mid1+offset
            if mid1-offset > 0 and i in list(track1[mid1-offset].keys()): mid1=mid1-offset
        #print(mid1)
        obj1 = track1[mid1][lifetime1[i][0]][1]
        obj1_img = data1[mid1]
        max_ssim = 0
        for j in range(0,len(lifetime2)):
            offset = 0
            #print("t2",track2[mid_frames2[j]])
            mid2 = mid_frames2[j]
            #print(mid2)
            while j not in list(track2[mid2].keys()):
                offset+=1
                if mid2+offset < len(track2) and j in list(track2[mid2+offset].keys()): mid2=mid2+offset
                if mid2-offset > 0 and j in list(track2[mid2-offset].keys()): mid2=mid2-offset
            #print(mid2)
            obj2 = track2[mid2][lifetime2[j][0]][1]
            left2, top2, width2, height2 = obj2
            obj2_img = data2[mid2]
            
            (score, diff) = compare_objs(obj1_img, obj1, obj2_img, obj2)
            if score:
                if score >= 0.7:
                    score_history[i][j] = score
    
    def get_key(dict, val):
        for key, value in dict.items():
            if val == value:
                return key
    
    def assign_objs(objid, ignore_list):
        ignore_list.sort()
        matched_objects = list(score_history[objid].keys())
        matched_objects.sort()
        if ignore_list == matched_objects:
            return
        ignored_score_history = copy.deepcopy(score_history)
        for i in ignore_list:
            del ignored_score_history[objid][i]
        obj_with_max_score = max(ignored_score_history[objid], key=ignored_score_history[objid].get)
        if(obj_with_max_score not in obj_pairs.values()):
            obj_pairs[objid] = obj_with_max_score
        else:
            initial_assignment = get_key(obj_pairs, obj_with_max_score)
            if score_history[initial_assignment][obj_with_max_score] > score_history[objid][obj_with_max_score]:
                ignore_list.append(obj_with_max_score)
                assign_objs(objid, ignore_list)
            else:
                obj_pairs[objid] = obj_with_max_score
                ignore_list = [obj_with_max_score]
                del obj_pairs[initial_assignment]
                assign_objs(initial_assignment, ignore_list)
    
    for i in score_history.keys():
        assign_objs(i, [])
    return (obj_pairs, score_history)

In [547]:
obj_pairs = {}
score_history = {}
if len(ref_obj_lifetime) < len(video_obj_lifetime):
    obj_pairs,score_history = couple_objectIDs(ref_data, video_data, ref_obj_lifetime, video_obj_lifetime, ref_obj_track, video_obj_track, mid_frames1,mid_frames2)
else:
    obj_pairs, score_history = couple_objectIDs(video_data, ref_data, video_obj_lifetime, ref_obj_lifetime, video_obj_track, ref_obj_track, mid_frames2, mid_frames1)
    obj_pairs = dict([(value, key) for key, value in obj_pairs.items()])
print(obj_pairs)



{0: 0, 1: 1, 2: 2, 3: 3}


In [548]:
score_history

{0: OrderedDict([(0, 0.9840135438103852),
              (1, 0.9046708357851139),
              (2, 0.9505718476961165)]),
 1: OrderedDict([(1, 0.9851720928327147)]),
 2: OrderedDict([(0, 0.8838467599109967), (2, 0.892703255458985)]),
 3: OrderedDict([(3, 0.9697060495461012)])}

In [549]:
missing_objs = [x for x in range(0,len(ref_obj_lifetime)) if x not in obj_pairs.keys()]
new_objs = [x for x in range(0,len(video_obj_lifetime)) if x not in obj_pairs.values()]
print(missing_objs, new_objs)

[] []


In [550]:
approx_missing = []
for obj in missing_objs:
    i = 0
    while(ref_obj_lifetime[i][0] != obj): i+=1
    
    i_before_ref = 0
    while(obj!=0 and ref_obj_lifetime[i_before_ref][0] != obj-1): i_before_ref=+1
    i_after_ref = 0
    while(ref_obj_lifetime[i_after_ref][0] != obj): i_after_ref+=1
    i_after_ref+=1
    if i_after_ref>=len(ref_obj_lifetime): i_after_ref=-1
    
    i_before_video = 0
    while(obj!=0 and video_obj_lifetime[i_before_video][0] != obj_pairs[obj]-1): i_before_video=+1
    i_after_video = 0
    while(video_obj_lifetime[i_after_video][0] != obj_pairs[ref_obj_lifetime[i_after_ref][0]]): i_after_video+=1
    
    approx_start_frame = math.ceil((ref_obj_lifetime[i][1][0] * video_obj_lifetime[i_before_video][2][0]) / ref_obj_lifetime[i_before_ref][2][0])
    approx_end_frame = math.floor((ref_obj_lifetime[i][2][0] * video_obj_lifetime[i_after_video][1][0]) / ref_obj_lifetime[i_after_ref][1][0])
    ref_step_incr = (ref_obj_lifetime[i][2][0] - ref_obj_lifetime[i][1][0]) / (approx_end_frame - approx_start_frame)
    
    approx_missing.append([obj, approx_start_frame, approx_end_frame, ref_obj_lifetime[i][1][0], ref_obj_lifetime[i][2][0], ref_step_incr])
print(approx_missing)

[]


In [536]:
output_frames = copy.deepcopy(video_data)

for mobj in approx_missing:
    video_ptr = mobj[1]
    ref_ptr = mobj[1]
    while(video_ptr < mobj[2]):
        if mobj[0] in ref_obj_track[math.floor(ref_ptr)].keys():
            left, top, width, height = ref_obj_track[round(ref_ptr)][mobj[0]][1]
            cv2.rectangle(output_frames[video_ptr], (left, top), (left + width,top + height),(0, 0, 255),2)
            cv2.putText(output_frames[video_ptr], "Missing", (left, top), cv2.FONT_HERSHEY_SIMPLEX,  
                    0.5, (255,0,0), 1, cv2.LINE_AA)
            
        ref_ptr+=mobj[5]
        video_ptr+=1

for i in range(0,len(video_obj_track)):
    for objID in video_obj_track[i].keys():
        if objID in new_objs:
            left, top, width, height = video_obj_track[i][objID][1]
            cv2.rectangle(output_frames[i], (left, top), (left + width,top + height),(0, 255, 0),2)
            cv2.putText(output_frames[i], "New", (left, top), cv2.FONT_HERSHEY_SIMPLEX,  
                    0.5, (255,0,0), 1, cv2.LINE_AA)

    cv2.namedWindow("output", cv2.WINDOW_NORMAL)
    cv2.imshow("output", output_frames[i])
    
    #time.sleep(0.05)
    
    keyboard = cv2.waitKey(30) & 0xFF
    if keyboard == 'q' or keyboard == 27:
        break
    
cv2.destroyAllWindows()
del output_frames[:]

In [551]:
import time
import copy

for i in range(0,len(ref_obj_track)):
    output_frames = copy.deepcopy(ref_data[i])
    for objID in ref_obj_track[i].keys():
        left, top, width, height = ref_obj_track[i][objID][1]
        cv2.rectangle(output_frames, (left, top), (left + width,top + height),(0, 255, 0),2)
        cv2.putText(output_frames, str(objID), (left, top), cv2.FONT_HERSHEY_SIMPLEX,  
                    1, (255,0,0), 2, cv2.LINE_AA)

    cv2.namedWindow("output", cv2.WINDOW_NORMAL)
    cv2.imshow("output", output_frames)
    
    #time.sleep(0.05)
    
    keyboard = cv2.waitKey(30) & 0xFF
    if keyboard == 'q' or keyboard == 27:
        break
    
cv2.destroyAllWindows()

In [554]:
import time
import copy

for i in range(0,len(video_obj_track)):
    output_frames = copy.deepcopy(video_data[i])
    for objID in video_obj_track[i].keys():
        left, top, width, height = video_obj_track[i][objID][1]
        cv2.rectangle(output_frames, (left, top), (left + width,top + height),(0, 255, 0),2)
        cv2.putText(output_frames, str(objID), (left, top), cv2.FONT_HERSHEY_SIMPLEX,  
                    1, (255,0,0), 2, cv2.LINE_AA)

    cv2.namedWindow("output", cv2.WINDOW_NORMAL)
    cv2.imshow("output", output_frames)
    
    time.sleep(0.05)
    
    keyboard = cv2.waitKey(30) & 0xFF
    if keyboard == 'q' or keyboard == 27:
        break
    
cv2.destroyAllWindows()

In [110]:
import time
output_frames = ref_data.copy()
for i in range(0,len(ref_obj_track)):
    cv2.namedWindow("output", cv2.WINDOW_NORMAL)
    cv2.imshow("output", output_frames[i])
    
    time.sleep(0.05)
    
    keyboard = cv2.waitKey(30) & 0xFF
    if keyboard == 'q' or keyboard == 27:
        break
    
cv2.destroyAllWindows()