### Q3. Implement an algorithm using OpenCV that detects and tracks multiple objects of different shapes and sizes in a real-time video stream, while accurately estimating their positions and orientations

In [52]:
import cv2
import time
import numpy as np

In [53]:
# Load the YOLOv3 network (original, full-size model) from its weights and config files
net = cv2.dnn.readNet("yolov3.weights", "yolov3.cfg")

# Read the class labels from coco.names, one label per line, with surrounding whitespace removed
classes = []
with open("coco.names", "r") as names_file:
    classes = list(map(str.strip, names_file))

In [63]:
# Optional: uncomment to replace the network loaded above with the Tiny YOLO weights/config.
#net = cv2.dnn.readNet("yolov3-tiny.weights","yolov3-tiny.cfg") #Tiny Yolo

In [54]:
# Show the class labels that were read from coco.names
print(classes)

['person', 'bicycle', 'car', 'motorbike', 'aeroplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']


In [55]:
# Earlier manual approach, kept for reference: map the indices returned by
# getUnconnectedOutLayers() back to layer names.
#layer_names = net.getLayerNames()
#outputlayers = [layer_names[i[0] - 1] for i in net.getUnconnectedOutLayers()]
# Ask the network directly for the names of its unconnected output layers;
# these names are what the detection loop passes to net.forward().
outputlayers = net.getUnconnectedOutLayersNames()
# Display the layer names as the cell output
outputlayers

('yolo_82', 'yolo_94', 'yolo_106')

In [56]:
# One random 3-component colour per class label (values drawn uniformly from [0, 255)),
# looked up by class id when drawing that class's bounding box
colors = np.random.uniform(low=0, high=255, size=(len(classes), 3))

In [64]:
# Real-time detection loop: grab frames from the default camera, run the YOLO
# network on each frame, suppress overlapping boxes, then draw and print the
# surviving detections. Press Esc in the preview window to stop.

# --- Settings (numeric values are unchanged from the original cell) ---
INPUT_SIZE = (320, 320)        # spatial size of the blob fed to the network
SCALE_FACTOR = 0.00392         # ~1/255, multiplier applied to pixel values
CONF_THRESHOLD = 0.3           # minimum class score for a raw detection to be kept
NMS_SCORE_THRESHOLD = 0.4      # score threshold handed to NMSBoxes
NMS_OVERLAP_THRESHOLD = 0.6    # overlap threshold handed to NMSBoxes
ESC_KEY = 27                   # key code that ends the loop

# The label font must be defined here: previously it only existed in a
# commented-out cell, so drawing the first label raised NameError on a fresh kernel.
font = cv2.FONT_HERSHEY_PLAIN

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    cap.release()
    raise RuntimeError("Could not open video capture device 0")

print("Detected Object positions: ")
print("Center Points: ")

try:
    while True:
        grabbed, frame = cap.read()
        if not grabbed or frame is None:
            # No frame available (camera unplugged / stream ended): stop cleanly
            # instead of crashing on frame.shape.
            break
        height, width = frame.shape[:2]

        # Detecting objects: convert the frame to a network input blob and run a forward pass
        blob = cv2.dnn.blobFromImage(
            frame, SCALE_FACTOR, INPUT_SIZE, (0, 0, 0), swapRB=True, crop=False
        )
        net.setInput(blob)
        outs = net.forward(outputlayers)

        class_ids = []
        confidences = []
        boxes = []
        centerPos = []
        for out in outs:
            for detection in out:
                # Entries from index 5 onward are the per-class scores
                scores = detection[5:]
                class_id = int(np.argmax(scores))
                confidence = float(scores[class_id])
                if confidence <= CONF_THRESHOLD:
                    continue

                # The first four entries are centre x/y and width/height,
                # expressed relative to the frame size
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)

                # Top-left corner of the rectangle
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)

                centerPos.append([center_x, center_y])
                boxes.append([x, y, w, h])
                confidences.append(confidence)   # confidence score
                class_ids.append(class_id)       # index into `classes`

        # Non-maximum suppression drops overlapping boxes
        indexes = cv2.dnn.NMSBoxes(
            boxes, confidences, NMS_SCORE_THRESHOLD, NMS_OVERLAP_THRESHOLD
        )
        # NMSBoxes may return an empty tuple, a flat array or an Nx1 array depending
        # on the OpenCV version; flatten it and visit the kept boxes in ascending
        # index order (the same order the original range() scan produced).
        kept = sorted(int(k) for k in np.array(indexes).reshape(-1))

        for i in kept:
            x, y, w, h = boxes[i]
            center_x, center_y = centerPos[i]
            label = str(classes[class_ids[i]])
            confidence = confidences[i]
            # Plain tuple of floats so the colour is accepted regardless of OpenCV build
            color = tuple(float(c) for c in colors[class_ids[i]])
            cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
            cv2.putText(frame, label + " " + str(round(confidence, 2)), (x, y + 30), font, 1, (255, 255, 255), 2)
            print(label, "-->", (center_x, center_y))
            if label != "person":
                # Mark the centre point of every non-person object
                cv2.circle(frame, (center_x, center_y), 8, (0, 255, 0), 2)

        cv2.imshow("Image", frame)
        if cv2.waitKey(1) & 0xFF == ESC_KEY:  # esc key stops the process
            break
finally:
    # Always free the camera and close the preview window, even if an
    # exception interrupts the loop.
    cap.release()
    cv2.destroyAllWindows()

Detected Object positions: 
Center Points: 
person --> (302, 260)
person --> (305, 271)
person --> (333, 267)
person --> (329, 261)
person --> (343, 253)
person --> (301, 252)
cell phone --> (117, 318)
person --> (301, 252)
cell phone --> (114, 313)
person --> (301, 252)
cell phone --> (120, 315)
person --> (302, 253)
cell phone --> (130, 320)
person --> (306, 253)
cell phone --> (158, 314)
person --> (325, 253)
cell phone --> (167, 317)
person --> (330, 252)
cell phone --> (171, 326)
person --> (329, 253)
cell phone --> (155, 324)
person --> (331, 255)
cell phone --> (165, 326)


In [None]:
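# Optional FPS overlay (currently disabled). The three setup lines belong before the
# detection loop; the last three lines belong inside it.
# NOTE(review): frame_id is never incremented in this sketch — it would need
# `frame_id += 1` once per frame for the FPS value to be meaningful.
#font = cv2.FONT_HERSHEY_PLAIN
#starting_time= time.time()
#frame_id = 0
#while True:
    #elapsed_time = time.time() - starting_time
    #fps=frame_id/elapsed_time
    #cv2.putText(frame,"FPS:"+str(round(fps,2)),(10,50),font,2,(0,0,0),1)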