## Imports

In [None]:
import math
import os
import re
import sys
import tkinter as tk
from os import listdir
from tkinter.filedialog import askopenfilename

import keras as k
import numpy as np
import pandas as pd
from keras.models import model_from_json
from PIL import ImageTk,Image

# NOTE(review): cv2 (OpenCV) is used throughout this notebook but is not imported
# in any visible cell — confirm where it is expected to come from.

## Global Variables

In [None]:
# Human-readable names of the classifier's output classes.
# NOTE(review): presumably list position == model output index; confirm against training.
CLASS_LABELS = [
    'non-violent',
    'violent',
]

## Loading Model Architecture & Weights

In [None]:
# Read the serialized model architecture. The context manager closes the
# handle even if read() raises, which the manual open()/close() pair did not.
with open("ViolentModel/violent_model_json", 'r') as file:
    model_json = file.read()

# Rebuild the network from its JSON description, then load the stored weights into it.
loaded_model = model_from_json(model_json)
loaded_model.load_weights("ViolentModel/violent_model_weights.h5")

## Globals & Prepare Model

---

Object Detection

---

In [None]:
# Locations of the object-detection model files
OBJ_config = "Model/yolov4.cfg"
OBJ_weights = "Model/yolov4.weights"
OBJ_class_names_file = "Model/coco.names"

# Class labels: one whitespace-stripped entry per line of the names file
with open(OBJ_class_names_file, 'r') as f:
    OBJ_class_names = [raw_label.strip() for raw_label in f]

# Minimum class score for a detection to be kept; also handed to NMS as its score threshold
OBJ_threshold = 0.1

---

Object Tracking

---

In [None]:
# Module-level multi-object tracker. start_tracking_from_boxes() replaces it with a
# freshly populated one, and track_using_trackers() updates it on every frame.
TRACK_list = cv2.legacy.MultiTracker_create()

---

Pose Estimation

---

In [None]:
# Locations of the pose-estimation model files
POSE_proto = "Model/pose_deploy_linevec.prototxt"
POSE_weights = "Model/pose_iter_440000.caffemodel"

# Body-part name -> heatmap channel index. The index is simply the position
# of the name in the tuple below.
POSE_body_parts = {
    part_name: channel
    for channel, part_name in enumerate((
        "Nose", "Neck", "RShoulder", "RElbow", "RWrist",
        "LShoulder", "LElbow", "LWrist", "RHip", "RKnee",
        "RAnkle", "LHip", "LKnee", "LAnkle", "REye",
        "LEye", "REar", "LEar", "Background",
    ))
}

# Pairs of body parts that get joined by a line when the skeleton is drawn
POSE_part_pairs = [
    list(limb)
    for limb in (
        ("Neck", "RShoulder"), ("Neck", "LShoulder"), ("RShoulder", "RElbow"),
        ("RElbow", "RWrist"), ("LShoulder", "LElbow"), ("LElbow", "LWrist"),
        ("Neck", "RHip"), ("RHip", "RKnee"), ("RKnee", "RAnkle"), ("Neck", "LHip"),
        ("LHip", "LKnee"), ("LKnee", "LAnkle"), ("Neck", "Nose"), ("Nose", "REye"),
        ("REye", "REar"), ("Nose", "LEye"), ("LEye", "LEar"),
    )
]

# Minimum heatmap peak value for a keypoint to be accepted
POSE_threshold = 0.1

In [None]:
# Instantiate both networks from their model files
OBJ_net = cv2.dnn.readNet(OBJ_config, OBJ_weights)
POSE_net = cv2.dnn.readNetFromCaffe(POSE_proto, POSE_weights)

# Ask each network (detector first, then pose) to run on the CUDA backend/target
for _cuda_net in (OBJ_net, POSE_net):
    _cuda_net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
    _cuda_net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)
del _cuda_net  # do not leave the loop variable behind in the notebook namespace


## Prediction Video

In [None]:
# Path of a single test clip.
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
PREDICTION_VIDEO = "Dataset/videos_test/violent/cam1/VideoDemo.mp4"


## Detect People

In [None]:
def get_output_layers(net):
    """Return the names of the network's unconnected (output) layers.

    Args:
        net: object exposing getLayerNames() and getUnconnectedOutLayers()
            (here, a cv2.dnn network).

    Returns:
        list of layer names, in the order reported by getUnconnectedOutLayers().
    """
    layer_names = net.getLayerNames()

    # The reported ids are 1-based. Depending on the OpenCV version they come
    # back either as a flat array or as an Nx1 array; flattening makes both
    # shapes index the name list with plain ints.
    output_ids = np.asarray(net.getUnconnectedOutLayers()).flatten()

    return [layer_names[int(layer_id) - 1] for layer_id in output_ids]

def detect_individuals_from_image(OBJ_img):
    """Detect people in an image with the object-detection network.

    Args:
        OBJ_img: image array; shape[0] is used as the height and shape[1] as
            the width.

    Returns:
        list of [x, y, w, h] boxes (top-left corner, width, height, scaled to
        the size of OBJ_img) for the class-id-0 detections that survive
        non-max suppression. x and y are computed as centre - size / 2, so
        they can be fractional or negative.
    """
    OBJ_width = OBJ_img.shape[1]
    OBJ_height = OBJ_img.shape[0]
    OBJ_scale = 0.00392  # approximately 1/255; scale factor handed to blobFromImage

    OBJ_blob = cv2.dnn.blobFromImage(OBJ_img, OBJ_scale, (416, 416), (0,0,0), True, crop=False)

    OBJ_net.setInput(OBJ_blob)
    outs = OBJ_net.forward(get_output_layers(OBJ_net))

    class_ids = []
    confidences = []
    boxes = []
    nms_threshold = 0.4

    # For each detection from each output layer: take the best-scoring class
    # and keep the detection only if that score exceeds OBJ_threshold.
    for out in outs:
        for detection in out:
            # Entries 0-3 are the box (centre x, centre y, width, height) as
            # fractions of the image size; entries from 5 on are class scores.
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > OBJ_threshold:
                center_x = int(detection[0] * OBJ_width)
                center_y = int(detection[1] * OBJ_height)
                w = int(detection[2] * OBJ_width)
                h = int(detection[3] * OBJ_height)
                x = center_x - w / 2
                y = center_y - h / 2
                class_ids.append(class_id)
                confidences.append(float(confidence))
                boxes.append([x, y, w, h])

    # Apply non-max suppression
    indices = cv2.dnn.NMSBoxes(boxes, confidences, OBJ_threshold, nms_threshold)

    # Depending on the OpenCV version the kept indices arrive as a flat array,
    # an Nx1 array, or an empty tuple; flattening handles all three.
    kept_indices = [int(i) for i in np.asarray(indices).flatten()]

    # Only class id 0 is returned (treated as "person" by this pipeline)
    return [boxes[i] for i in kept_indices if class_ids[i] == 0]

## Track People

In [None]:
def start_tracking_from_boxes(TRACK_BBoxes, TRACK_img):
    """Replace the global TRACK_list with one CSRT tracker per bounding box.

    Args:
        TRACK_BBoxes: iterable of bounding boxes, each handed unchanged to the
            multi-tracker's add().
        TRACK_img: frame on which the trackers are initialised.
    """
    global TRACK_list

    # Build the replacement first; the global is swapped only once it is complete
    fresh_trackers = cv2.legacy.MultiTracker_create()
    for bbox in TRACK_BBoxes:
        fresh_trackers.add(cv2.legacy.TrackerCSRT_create(), TRACK_img, bbox)

    TRACK_list = fresh_trackers

def track_using_trackers(TRACK_img, FINAL_img):
    """Advance every tracker in the global TRACK_list to the given frame.

    Args:
        TRACK_img: the new frame passed to the multi-tracker's update().
        FINAL_img: accepted but not used by this function.

    Returns:
        None; the (success, boxes) pair returned by update() is discarded.
    """
    # TRACK_list is only read here, so no `global` declaration is required
    tracking_ok, updated_boxes = TRACK_list.update(TRACK_img)

def expand_tracking_box(image, FINAL_img):
    """Draw the tracked boxes and group fresh detections with their nearest tracker.

    Args:
        image: frame on which a new person detection is run.
        FINAL_img: image that the tracker rectangles and index labels are
            drawn onto (modified in place).

    Returns:
        list with one entry per tracker. Each entry is a list of (x, y, w, h)
        int tuples: the tracker's own box first, followed by every newly
        detected box whose centre is closest to that tracker's centre.
        Returns an empty list when nothing is being tracked.
    """
    tracked_boxes = TRACK_list.getObjects()
    tracker_count = len(tracked_boxes)

    # Without trackers there is nothing to attach detections to. The original
    # code indexed an empty list with -1 here and raised IndexError.
    if tracker_count == 0:
        return []

    # Run detection before any drawing, as the original did
    new_OBJ_boxes = detect_individuals_from_image(image)

    TRACK_expanded_boxes = []
    tracker_centers = []
    for tracker_index in range(tracker_count):
        tracked_box = tracked_boxes[tracker_index]
        x = int(tracked_box[0])
        y = int(tracked_box[1])
        w = int(tracked_box[2])
        h = int(tracked_box[3])

        # Outline the tracked box and label it with its tracker index
        cv2.rectangle(FINAL_img, (x,y), (x+w,y+h), (0,0,0), 2)
        cv2.putText(FINAL_img, str(tracker_index), (x+10,y+10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0,0,255), 2)

        # Each tracker's group starts with its own box
        TRACK_expanded_boxes.append([(x, y, w, h)])
        # Centres do not change between detections, so compute them once
        tracker_centers.append(get_center_of_box(tracked_box))

    for new_box in new_OBJ_boxes:
        new_x, new_y = get_center_of_box(new_box)

        def squared_distance(index, cx=new_x, cy=new_y):
            # Squared Euclidean distance orders candidates exactly like the
            # true distance and needs no math import.
            dx = tracker_centers[index][0] - cx
            dy = tracker_centers[index][1] - cy
            return dx * dx + dy * dy

        # min() returns the first minimum, matching the original
        # strict "<" comparison on ties.
        nearest = min(range(tracker_count), key=squared_distance)
        TRACK_expanded_boxes[nearest].append(
            (int(new_box[0]), int(new_box[1]), int(new_box[2]), int(new_box[3]))
        )

    return TRACK_expanded_boxes

def get_center_of_box(boundingBox):
    """Return the (x, y) centre of an (x, y, w, h) box, truncated to ints."""
    left, top = boundingBox[0], boundingBox[1]
    width, height = boundingBox[2], boundingBox[3]

    # Midpoint of each pair of opposite edges; int() truncates toward zero
    center = (
        int((left + (left + width)) / 2),
        int((top + (top + height)) / 2),
    )
    return center

## Estimate Human Poses

In [None]:
def seperate_person(person_BB, image, finalImage):
    """Copy the pixels inside the given boxes onto an otherwise black canvas.

    Args:
        person_BB: iterable of (x, y, w, h) boxes (top-left corner, width,
            height).
        image: source frame that the pixels are copied from.
        finalImage: only its shape is used, to size the canvas.

    Returns:
        uint8 array with the shape of finalImage: zeros everywhere except
        inside the boxes, where it holds the matching pixels of `image`.
    """
    POSE_IMG = np.zeros(finalImage.shape, dtype=np.uint8)
    frame_height, frame_width = POSE_IMG.shape[0], POSE_IMG.shape[1]

    for curr_BB in person_BB:
        # Clamp the box to the frame. A negative origin would otherwise be
        # read as a from-the-end slice index and copy the wrong (usually
        # empty) region.
        left = max(0, int(curr_BB[0]))
        top = max(0, int(curr_BB[1]))
        right = min(frame_width, int(curr_BB[0]) + int(curr_BB[2]))
        bottom = min(frame_height, int(curr_BB[1]) + int(curr_BB[3]))

        # Skip boxes that lie entirely outside the frame
        if right <= left or bottom <= top:
            continue

        POSE_IMG[top:bottom, left:right] = image[top:bottom, left:right]

    return POSE_IMG

def get_human_pose_from_img(POSE_img_box, FINAL_img):
    """Estimate one pose in an image and draw its skeleton.

    Args:
        POSE_img_box: image fed to the pose network.
        FINAL_img: image that the limbs and joints are drawn onto (modified
            in place).

    Returns:
        list with one entry per POSE_body_parts item, in index order: an
        (x, y) int tuple in POSE_img_box coordinates, or None when the
        heatmap peak does not exceed POSE_threshold.
    """
    src_height, src_width = POSE_img_box.shape[0], POSE_img_box.shape[1]

    # Network input: fixed height, width scaled to keep the aspect ratio
    net_height = 368
    net_width = int((net_height / src_height) * src_width)

    blob = cv2.dnn.blobFromImage(POSE_img_box, 1.0/255, (net_width, net_height), (0,0,0), swapRB=True, crop=False)
    POSE_net.setInput(blob)

    # Forward pass; only the first 19 output channels are used
    heatmaps = POSE_net.forward()[:, :19, :, :]

    assert(len(POSE_body_parts) == heatmaps.shape[1])

    map_height, map_width = heatmaps.shape[2], heatmaps.shape[3]

    points = []
    for part_index in range(len(POSE_body_parts)):
        # Only the global maximum of each heatmap is taken, so a single
        # location per body part is reported.
        _, peak_value, _, peak_location = cv2.minMaxLoc(heatmaps[0, part_index, :, :])

        if peak_value > POSE_threshold:
            # Rescale from heatmap coordinates to source-image coordinates
            scaled_x = (src_width * peak_location[0]) / map_width
            scaled_y = (src_height * peak_location[1]) / map_height
            points.append((int(scaled_x), int(scaled_y)))
        else:
            points.append(None)

    for part_from, part_to in POSE_part_pairs:
        assert(part_from in POSE_body_parts)
        assert(part_to in POSE_body_parts)

        start_point = points[POSE_body_parts[part_from]]
        end_point = points[POSE_body_parts[part_to]]

        # Draw a limb only when both of its end points were found
        if start_point and end_point:
            cv2.line(FINAL_img, start_point, end_point, (0, 255, 0), 3)
            cv2.ellipse(FINAL_img, start_point, (3, 3), 0, 0, 360, (0, 0, 255), cv2.FILLED)
            cv2.ellipse(FINAL_img, end_point, (3, 3), 0, 0, 360, (0, 0, 255), cv2.FILLED)

    return points


## Prepare Data

In [None]:
def augment_data(coord_loc_list):
    """Replace every missing (None) keypoint with (0, 0), in place.

    Args:
        coord_loc_list: nested lists indexed as [person][frame][keypoint];
            the innermost lists are modified.

    Returns:
        None.
    """
    for person_frames in coord_loc_list:
        for frame_keypoints in person_frames:
            for slot, keypoint in enumerate(frame_keypoints):
                # Identity test: `== None` would be evaluated element-wise
                # (and then fail) on array-like keypoints.
                if keypoint is None:
                    frame_keypoints[slot] = (0, 0)

## Export to CSV

In [None]:
def export_to_csv(keypoints_list):
    """Append labelled 3-frame keypoint windows to the CSV at DATASET_LOC_TRAIN.

    Each person's frames are cut into consecutive, non-overlapping windows of
    three frames; an incomplete trailing window is dropped. Every window
    becomes one row: the label, then for each keypoint its X and Y in frame
    1, frame 2 and frame 3 of the window.

    Args:
        keypoints_list: nested sequence indexed as
            [person][frame][keypoint][0 = x, 1 = y].

    Reads the globals check_var_list (the label is "violent" when
    check_var_list[person].get() == 1, otherwise "non-violent") and
    DATASET_LOC_TRAIN (output path). The header row is written only when the
    file does not exist yet. Nothing is written when there are no rows.

    Raises:
        ValueError: if a row does not have one value per expected column.
    """
    frames_per_row = 3

    part_names = [
        "Nose", "Neck", "RShoulder", "RElbow", "RWrist",
        "LShoulder", "LElbow", "LWrist", "RHip", "RKnee",
        "RAnkle", "LHip", "LKnee", "LAnkle", "REye",
        "LEye", "REar", "LEar", "Background",
    ]

    # Same order as the values below: keypoint -> frame -> axis,
    # i.e. "Nose_X_1", "Nose_Y_1", "Nose_X_2", ...
    columns = ["Action"] + [
        f"{part}_{axis}_{frame}"
        for part in part_names
        for frame in range(1, frames_per_row + 1)
        for axis in ("X", "Y")
    ]

    csv_rows = []

    for person_index, person_frames in enumerate(keypoints_list):
        # Stepping only over complete windows replaces the IndexError that
        # the original relied on to stop at the end of the frames.
        for start in range(0, len(person_frames) - frames_per_row + 1, frames_per_row):
            window = person_frames[start:start + frames_per_row]
            try:
                features = [
                    frame[keypoint_index][axis]
                    for keypoint_index in range(len(window[0]))
                    for frame in window
                    for axis in range(2)
                ]
            except (IndexError, TypeError):
                # A missing (None) or malformed keypoint: stop exporting this
                # person, as the original did. Unlike the former bare
                # `except`, unrelated errors are no longer hidden.
                break

            label = "violent" if check_var_list[person_index].get() == 1 else "non-violent"
            csv_rows.append([label] + features)

    # An empty frame cannot be written with a 115-column header
    if not csv_rows:
        return

    df = pd.DataFrame(csv_rows, columns=columns)

    # Write the header once, when the file is created; append afterwards
    file_exists = os.path.isfile(DATASET_LOC_TRAIN)
    df.to_csv(DATASET_LOC_TRAIN, index=False, mode='a' if file_exists else 'w', header=not file_exists)

## Main Pipeline

In [None]:
def num_sort(received_string):
    """Sort key: the first run of decimal digits in the string, as an int.

    Raises IndexError when the string contains no digits.
    """
    digit_runs = re.findall(r'\d+', received_string)
    return int(digit_runs[0])

def DataExtraction(VIDEO_LOC):
    """Run detection, tracking and pose estimation over every frame of a video.

    People are detected once, on the first frame, and tracked from then on.
    For every frame the tracked boxes are combined with a fresh detection,
    and a pose is estimated for each tracked person. The annotated frame is
    shown in a window titled with VIDEO_LOC. Processing stops at the end of
    the video or as soon as a key is pressed.

    Args:
        VIDEO_LOC: path (or other source) handed to cv2.VideoCapture.

    Returns:
        list indexed as [person][frame] holding the keypoint lists returned
        by get_human_pose_from_img, with missing keypoints replaced by
        (0, 0). The original returned None and discarded this data.
    """
    frameCount = 0
    OBJ_boxes = []
    poseLoc = []

    # Initialize the video stream
    MEDIA_RAW = cv2.VideoCapture(VIDEO_LOC)

    try:
        # Loop over the frames until a key is pressed
        while cv2.waitKey(1) < 0:

            hasFrame, image = MEDIA_RAW.read()

            if not hasFrame:
                # End of stream: wait for a key press before closing the
                # windows (closing itself happens in the `finally` clause).
                cv2.waitKey()
                break

            finalImage = image.copy()
            frameCount += 1

            # Only detect people and start tracking on the first frame
            if frameCount == 1:
                OBJ_boxes = detect_individuals_from_image(image)
                start_tracking_from_boxes(OBJ_boxes, image)
                # One list of per-frame poses for every detected person
                poseLoc = [[] for _ in range(len(OBJ_boxes))]

            # Sync trackers with the new frame
            track_using_trackers(image, finalImage)

            # Combine each tracker box with the nearest freshly detected boxes
            POSE_EST_LOC = expand_tracking_box(image, finalImage)

            # Apply pose estimation to the boxes of every person
            for person_index, person_boxes in enumerate(POSE_EST_LOC):

                # Image that shows only this person's boxes
                POSE_IMG = seperate_person(person_boxes, image, finalImage)

                poseLoc[person_index].append(get_human_pose_from_img(POSE_IMG, finalImage))

            # Resize to a height of 700 px, keeping the aspect ratio
            finalImage = cv2.resize(finalImage, (int((700/finalImage.shape[0])*finalImage.shape[1]),700))

            # Show the output frame
            cv2.imshow(VIDEO_LOC, finalImage)

            if frameCount % 3 == 0:
                # TODO: not implemented yet — get the poses of the last 3
                # frames, predict on them, then empty the pose list.
                # The `pass` is required: an `if` whose body holds only
                # comments is a syntax error, which made this cell fail.
                pass
    finally:
        # Release the stream and close the windows even if a frame fails
        MEDIA_RAW.release()
        cv2.destroyAllWindows()

    # Fill in any missing keypoints
    augment_data(poseLoc)

    return poseLoc

## Main Method

In [None]:
# Run the pipeline on every video of every class / camera folder
for dataset_class in DATASET_CLASSES:
    for camera in DATASET_CAMS:

        # NOTE(review): plain string concatenation — presumably the class and
        # camera entries already carry their path separators; confirm.
        CURRENT_DIR = "Dataset/videos_test/" + dataset_class + camera

        # Videos ordered by the first number found in their file name
        list_video_names = sorted(listdir(CURRENT_DIR), key=num_sort)

        for video_name in list_video_names:
            DataExtraction(CURRENT_DIR + video_name)