In [10]:
import cv2
#openCV
import numpy as np
from scipy.stats import kurtosis,skew
#scipy is the statistical elder brother of numpy
import os
#os helps in fetching files from different directories i.e. communication with the operating system

In [11]:
class Feature_vector:
    #As given in  https://arxiv.org/abs/2004.07941, create the motion, location and appearance metric for the objects
    motion=[]
    location=[]
    appearance=[]
    
    #weight corresponding to the motion metric
    w1=1
    
    #weight corresponding to the location metric
    w2=0.4
    
    #weight corresponding to the appearance metric
    w3=0.9
    
    #constructor in python
    def __init__(self,motion,location,appearance):
        self.motion=motion
        self.location=location
        self.appearance=appearance
        
    #change weighting  
    def set_weights(self,w11,w22,w33):
        self.w1=w11
        self.w2=w22
        self.w3=w33

In [12]:
def get_patch(img,boxes,idx):
    #Extract a patch in the image corresponding to the object's bounding box in the frame.
    imgs=[]
    if len(idx)>0:
        for i in idx.flatten():
            center_x,center_y,w,h=boxes[i]
            # Using the center x, y coordinates to derive the top
            # and the left corner of the bounding box
            x=int(center_x-(w/2))
            y=int(center_y-(h/2))
            ##
            #The following is done to deal with typical situation when x or y are going out of frame
            #i.e not whole bounding box in the frame
            
            #stores part of the width to be cropped which is out of frame
            subx=0
            #stores part of the height to be cropped which is out of frame
            suby=0
            
            if x<0 or x>=img.shape[1]:
                subx=x if x<0 else -1*x
                x=0 if x<0 else img.shape[1]-1
            if y<0 or y>=img.shape[0]:
                suby=y if y<0 else -1*y
                y=0 if y<0 else img.shape[0]-1
            ##
            imgs.append(img[y:y+int(h)+suby,x:x+int(w)+subx,:])
    return imgs       

In [13]:
def frames_process(frames_path, feature_matrix):
    frame_prev=None
    for img in os.listdir(frames_path):
        if img.endswith(".jpg"):
            # RUNNING YOLOv4 OBJECT DETECTION FIRST
            # returns a deep learning network using the yolov4 format
            
            net = cv2.dnn.readNet('yolov4.weights', 'yolov4.cfg')
            # cv2.dnn.readNet=>https://docs.opencv.org/3.4/d6/d0f/group__dnn.html#ga3b34fe7a29494a6a4295c169a7d32422
            # type net=Net object=>https://docs.opencv.org/3.4/db/d30/classcv_1_1dnn_1_1Net.html
            
            # for running optical flow algorithm we need previous frame as well
            if frame_prev is None:
                frame_prev=cv2.imread(img)
                continue
            
            frame=cv2.imread(img)
            
            (height, width, _) = frame.shape
            
            # preprocessing the frame before feeding it to the neural net.
            # scale the pixel values to 1/255=>1/255
            # Resizing frame to (416,416) pixels as yolov4 architecture works on frame of that size.
            # No mean supplied to the three R,G,B channels=>(0,0,0)
            # OpenCV assumes images are in BGR channel order, thus we must swap the R and B channels of the original RGB frame=> swapRB=true
            # No cropping of the frame=>crop=False
            blob = cv2.dnn.blobFromImage(frame,1 / 255,(416, 416),(0, 0, 0),swapRB=True,crop=False)
            # above parameters are needed for yolov4 detection!
            # cv2.dnn.blobFromImage=> creates processed 4-dimensional blob for use in our neural net.Further info=> https://www.pyimagesearch.com/2017/11/06/deep-learning-opencvs-blobfromimage-works/

            # setting input to neural net
            net.setInput(blob)

            # net.getUnconnectedOutLayers(): It gives you the final layers number in the list from net.getLayerNames().
            output_layer_names = net.getUnconnectedOutLayersNames()

            # Runs forward pass to compute output of layer with name outputName
            layerOutputs = net.forward(output_layer_names)

            # model output=>https://stackoverflow.com/questions/57112038/yolo-v3-model-output-clarification-with-keras
            # There are 3 output layers in YOLO for 3 different resolutions of grid boxes over which object is detected(13,13)(26,26)(52,52)

            # boxes stores the location properties of detected objects
            # confidences stores the confidence score of detecting that object

            boxes = []
            confidences = []
            appearance = []

            # traversing through the 3 outputs at varying resolution

            for output in layerOutputs:
                for detection in output:
                    # detection holds the location(first 5 elements) and class probabilities(rest 80 elements)
                    scores = detection[5:]

                    # selecting the index of maximum class probabilty of total 80 classes and its probability value as well

                    class_id = np.argmax(scores)
                    confidence = scores[class_id]

                    # check to filter objects we are entirely sure belong to some class

                    if confidence > 0.6:

                        # Extracting the center coordinates of the bounding box
                        # we need to convert back to original dimensions thus multiply by width,height is important
                        # print(detection[0])

                        center_x = detection[0] * width
                        center_y = detection[1] * height

                        # extracting the width and height of bounding box

                        w = detection[2] * width
                        h = detection[3] * height
                        boxes.append([center_x, center_y, w, h])
                        confidences.append(float(confidence))
                        appearance.append(scores)

            indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.6, 0.4)

            # NMSBoxes=>https://towardsdatascience.com/non-maximum-suppression-nms-93ce178e177c

            i = 0
            if boxes != []:

                # extract image corresponsing to bounding box coordinates for previous frame

                patch_prev = get_patch(frame_prev, boxes, indexes)

                # extract image corresponding to bounding box coordinates for current frame

                patch = get_patch(frame, boxes, indexes)

                if patch_prev != [] and patch != []:

                    # looping through objects detected to find their OPTICAL FLOW

                    for (prev, actual) in zip(patch_prev, patch):

                        # Preprocessing to gray scale

                        prvs = cv2.cvtColor(prev, cv2.COLOR_BGR2GRAY)
                        next_ = cv2.cvtColor(actual, cv2.COLOR_BGR2GRAY)

                        # ##TRIAL--flow = np.zeros([prev.shape[0],prev.shape[1],2])
                        # ##TRIAL--cv2.optflow.calcOpticalFlowDenseRLOF(prev, actual,flow,None)

                        # Dense optical flow algorithm

                        flow = cv2.calcOpticalFlowFarneback(prvs,next_,None,0.5,3,15,3,5,1.2,0,)

                        # getting magnitude of velocity vectors of optical flow

                        (mag, _) = cv2.cartToPolar(flow[..., 0],flow[..., 1])

                        # computing mean, variance, kurtosis and skew of the magnitude of velocity vectors

                        mean = np.mean(mag)
                        variance = np.var(mag)
                        kurtosis_ = kurtosis(mag, None)
                        skew_ = skew(mag, None)

                        # now creating feature vector for the object
                        # motion metric

                        motion = [mean, variance, kurtosis_, skew_]
                        (cx, cy, wi, hi) = boxes[indexes.flatten()[i]]

                        # location metric

                        location = [cx, cy, wi * hi]

                        # appearance metric

                        appear = appearance[indexes.flatten()[i]]

                        # Appending feature vector of given object to feature matrix

                        feature_matrix.append(Feature_vector(motion,location, appear))
                        #print(feature_matrix)

                        i = i + 1

            # updating previous frame

            frame_prev = frame
        
    cv2.destroyAllWindows()

        

In [14]:
def vid_process(video_path,feature_matrix):
    #get video capture object for the camera for the specified video file 
    cap=cv2.VideoCapture(video_path)
    
    #RUNNING YOLOv4 OBJECT DETECTION FIRST
    
    #returns a deep learning network using the yolov4 format 
    net=cv2.dnn.readNet('yolov4.weights','yolov4.cfg')
    #cv2.dnn.readNet=>https://docs.opencv.org/3.4/d6/d0f/group__dnn.html#ga3b34fe7a29494a6a4295c169a7d32422
    #type net=Net object=>https://docs.opencv.org/3.4/db/d30/classcv_1_1dnn_1_1Net.html
    
    #Sometimes, cap may not have initialized the capture. 
    #You can check whether it is initialized or not by the method cap.isOpened().
    if(cap.isOpened()==False): 
        print("Error connecting to camera")
        return -1
    
    #for running optical flow algorithm we need previous frame as well
    ret,frame_prev=cap.read()
    #cap.read() returns a bool (True/False) which is stored in "ret" here. 
    #If the frame is read correctly, it will be True    
    
    if ret==False:
        print("Error loading frame")
        return
    while cap.isOpened():
        
        ret,frame=cap.read()
        
        if ret:
            #storing original frame dimensions
            height,width,_=frame.shape
            
            #preprocessing the frame before feeding it to the neural net. 
            #scale the pixel values to 1/255=>1/255
            #Resizing frame to (416,416) pixels as yolov4 architecture works on frame of that size.
            #No mean supplied to the three R,G,B channels=>(0,0,0)
            #OpenCV assumes images are in BGR channel order, thus we must swap the R and B channels of the original RGB frame=> swapRB=true
            #No cropping of the frame=>crop=False
            blob=cv2.dnn.blobFromImage(frame,1/255,(416,416),(0,0,0),swapRB=True,crop=False)
            #above parameters are needed for yolov4 detection!
            #cv2.dnn.blobFromImage=> creates processed 4-dimensional blob for use in our neural net.Further info=> https://www.pyimagesearch.com/2017/11/06/deep-learning-opencvs-blobfromimage-works/
            
            #setting input to neural net
            net.setInput(blob)
            
            #net.getUnconnectedOutLayers(): It gives you the final layers number in the list from net.getLayerNames().
            output_layer_names=net.getUnconnectedOutLayersNames()
            
            #Runs forward pass to compute output of layer with name outputName
            layerOutputs=net.forward(output_layer_names)
            #model output=>https://stackoverflow.com/questions/57112038/yolo-v3-model-output-clarification-with-keras
            #There are 3 output layers in YOLO for 3 different resolutions of grid boxes over which object is detected(13,13)(26,26)(52,52)
            
            #boxes stores the location properties of detected objects
            #confidences stores the confidence score of detecting that object
            boxes=[]
            confidences=[]
            appearance=[]
            
            #traversing through the 3 outputs at varying resolution
            for output in layerOutputs:
                for detection in output:
                    #detection holds the location(first 5 elements) and class probabilities(rest 80 elements)
                    scores=detection[5:]
                    #print(scores)
                    #selecting the index of maximum class probabilty of total 80 classes and its probability value as well
                    class_id=np.argmax(scores)
                    confidence=scores[class_id]
                    #check to filter objects we are entirely sure belong to some class
                    if(confidence>0.6):
                        #Extracting the center coordinates of the bounding box
                        #we need to convert back to original dimensions thus multiply by width,height is important
                        #print(detection[0])
                        center_x=detection[0]*width
                        center_y=detection[1]*height
                        #extracting the width and height of bounding box
                        w=detection[2]*width
                        h=detection[3]*height
                        boxes.append([center_x,center_y,w,h])
                        confidences.append(float(confidence))
                        appearance.append(scores)
            
            indexes=cv2.dnn.NMSBoxes(boxes,confidences,0.6,0.4)
            #NMSBoxes=>https://towardsdatascience.com/non-maximum-suppression-nms-93ce178e177c
            i=0
            if boxes!=[]:
                #extract image corresponsing to bounding box coordinates for previous frame
                patch_prev=get_patch(frame_prev,boxes,indexes)
                #extract image corresponding to bounding box coordinates for current frame
                patch=get_patch(frame,boxes,indexes)
                
                if patch_prev!=[] and patch!=[]:
                    #looping through objects detected to find their OPTICAL FLOW
                    for prev,actual in zip(patch_prev,patch):
                        #Preprocessing to gray scale
                        prvs  = cv2.cvtColor(prev, cv2.COLOR_BGR2GRAY)
                        next_ = cv2.cvtColor(actual, cv2.COLOR_BGR2GRAY)
                        
                        ###TRIAL--flow = np.zeros([prev.shape[0],prev.shape[1],2])
                        ###TRIAL--cv2.optflow.calcOpticalFlowDenseRLOF(prev, actual,flow,None)
                        
                        #Dense optical flow algorithm
                        flow=cv2.calcOpticalFlowFarneback(prvs, next_, None, 0.5, 3, 15, 3, 5, 1.2, 0)
                        #getting magnitude of velocity vectors of optical flow 
                        mag, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1])
                        #computing mean, variance, kurtosis and skew of the magnitude of velocity vectors
                        mean=np.mean(mag)
                        variance=np.var(mag)
                        kurtosis_=kurtosis(mag,None)
                        skew_=skew(mag,None)
                        #now creating feature vector for the object
                        
                        #motion metric
                        motion=[mean,variance,kurtosis_,skew_]
                        cx,cy,wi,hi=boxes[indexes.flatten()[i]]
                        #location metric
                        location=[cx,cy,wi*hi]
                        
                        #appearance metric
                        appear=appearance[indexes.flatten()[i]]
                        
                        #Appending feature vector of given object to feature matrix
                        feature_matrix.append(Feature_vector(motion,location,appear))
                        print(feature_matrix)
                       
                        i=i+1
                        
                        
            #updating previous frame            
            frame_prev=frame
            key=cv2.waitKey(1)
            
            #cv2.waitKey([delay])=>The function waitKey waits for a key event infinitely and the delay is in milliseconds. waitKey(0) means forever.
            #For more details=>https://stackoverflow.com/questions/57690899/how-cv2-waitkey1-0xff-ordq-works
            
            #if pressed key has ASCII value 27 i.e q
            if(key==27):
                break
            
            #if len(indexes)>0:
            #for i in indexes.flatten():                        
            
        else:
            print("Error loading frame")
            return
    cap.release()
    cv2.destroyAllWindows()
        

In [15]:
#TRAINING THE DATASET
train_vector=[]
#Getting the parent directory
parent_directory=os.path.dirname(os.getcwd())
#os.path.dirname() method in Python is used to get directory name from the specified path 
#i.e the directory that hold the current file
#Python method os.getcwd() returns current working directory of a process.
#For more info->https://www.geeksforgeeks.org/python-os-path-dirname-method/

#accesing the DATASET directory
direc=os.path.join(parent_directory,'Dataset')
#os.path.join-> https://www.geeksforgeeks.org/python-os-path-join-method/

#first using the AVENUE DATASET training videos for training
curr_direc=os.path.join(direc,'Avenue Dataset','training_videos')
for filename in os.listdir(curr_direc):
    #os.listdir->https://www.geeksforgeeks.org/python-os-listdir-method/
    if filename.endswith(".avi"):
        vid_process(filename,train_vector)

#Using the SHANGAI TECH DATASET training videos
curr_direc=os.path.join(direc,'ShangaiTech_training','videos')
for filename in os.listdir(curr_direc):
    if filename.endswith(".avi"):
        vid_process(filename,train_vector)

#using the PEDESTRIAN training dataset
curr_direc=os.path.join(direc,'ped2','training','frames')
for subdirec in os.listdir(curr_direc):
    video_frames=os.path.join(curr_direc,subdirec)
    frames_process(video_frames,train_vector)
    

print(train_vector)

[<__main__.Feature_vector object at 0x000001C07D060310>]
[<__main__.Feature_vector object at 0x000001C07D060310>, <__main__.Feature_vector object at 0x000001C07D06A7F0>]
[<__main__.Feature_vector object at 0x000001C07D060310>, <__main__.Feature_vector object at 0x000001C07D06A7F0>, <__main__.Feature_vector object at 0x000001C07D06AC10>]
[<__main__.Feature_vector object at 0x000001C07D060310>, <__main__.Feature_vector object at 0x000001C07D06A7F0>, <__main__.Feature_vector object at 0x000001C07D06AC10>, <__main__.Feature_vector object at 0x000001C07F4B3F10>]
[<__main__.Feature_vector object at 0x000001C07D060310>, <__main__.Feature_vector object at 0x000001C07D06A7F0>, <__main__.Feature_vector object at 0x000001C07D06AC10>, <__main__.Feature_vector object at 0x000001C07F4B3F10>, <__main__.Feature_vector object at 0x000001C07F47A520>]
[<__main__.Feature_vector object at 0x000001C07D060310>, <__main__.Feature_vector object at 0x000001C07D06A7F0>, <__main__.Feature_vector object at 0x00000

KeyboardInterrupt: 