# Extract features from a video, frame by frame

## Feature extraction function

In [1]:
import math

import numpy as np
from scipy.ndimage.filters import gaussian_filter
import cv2

import util

# Names paired positionally with the per-part peak lists built by
# extract_parts(): 18 body parts followed by 'background'. extract_parts()
# only scans the first 18 heatmap channels, so 'background' never receives
# an entry in its `body_parts` result.
COCO_BODY_PARTS = [
    'nose', 'neck',
    'right_shoulder', 'right_elbow', 'right_wrist',
    'left_shoulder', 'left_elbow', 'left_wrist',
    'right_hip', 'right_knee', 'right_ankle',
    'left_hip', 'left_knee', 'left_ankle',
    'right_eye', 'left_eye', 'right_ear', 'left_ear',
    'background',
]


def extract_parts(input_image, params, model, model_params):
    """Detect body-part keypoints in one image and group them into people.

    The image is run through ``model`` at every scale in
    ``params['scale_search']``; the resulting heatmaps and part-affinity
    fields (PAFs) are averaged, heatmap peaks are extracted per body part,
    peaks are paired into limbs using the PAFs, and limbs are assembled
    into per-person rows.

    Args:
        input_image: Image array; ``shape[0]`` / ``shape[1]`` are used as
            the row and column counts, and the array is passed to
            ``cv2.resize``.
        params: Mapping read for ``'scale_search'`` (iterable of scale
            factors), ``'thre1'`` (minimum smoothed heatmap value for a
            peak) and ``'thre2'`` (minimum PAF alignment score per sample
            point along a candidate limb).
        model: Object whose ``predict(batch)`` returns a sequence in which
            element 0 holds the PAFs and element 1 the heatmaps.
        model_params: Mapping read for ``'boxsize'``, ``'stride'`` and
            ``'padValue'``.

    Returns:
        A tuple ``(body_parts, all_peaks, subset, candidate)``:

        * ``body_parts`` -- dict mapping each of the first 18 names in
          ``COCO_BODY_PARTS`` to the ``(x, y)`` of the *first* peak found
          for that part, or ``(None, None)`` when the part has no peak.
          The first peak is taken regardless of which person it belongs to.
        * ``all_peaks`` -- list of 18 lists (one per part) of
          ``(x, y, score, peak_id)`` tuples.
        * ``subset`` -- array with 20 columns and one row per retained
          person: columns 0-17 hold the ``peak_id`` chosen for each part
          (``-1`` if missing), column -2 the accumulated score and
          column -1 the number of parts.
        * ``candidate`` -- array of every peak from ``all_peaks``
          flattened in order, so that row ``peak_id`` is that peak.
    """
    # Imported locally so the function does not rely on a module-level
    # ``import time`` that only appears further down the file.
    import time

    multiplier = [x * model_params['boxsize'] / input_image.shape[0] for x in params['scale_search']]

    # Body parts location heatmap, one per part (19)
    heatmap_avg = np.zeros((input_image.shape[0], input_image.shape[1], 19))
    # Part affinities, one per limb (38)
    paf_avg = np.zeros((input_image.shape[0], input_image.shape[1], 38))

    # Keeps the final timing report valid even when no scale is evaluated.
    tic = time.time()

    for scale in multiplier:
        tic = time.time()
        image_to_test = cv2.resize(input_image, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
        image_to_test_padded, pad = util.pad_right_down_corner(image_to_test, model_params['stride'],
                                                               model_params['padValue'])

        # Append a trailing axis and move it to the front: the batch has
        # shape (1, rows, cols, channels).
        input_img = np.transpose(np.float32(image_to_test_padded[:, :, :, np.newaxis]), (3, 0, 1, 2))

        toc = time.time()
        print('image rescaling time is %.5f' % (toc - tic))

        # model
        tic = time.time()
        output_blobs = model.predict(input_img)
        toc = time.time()
        print('cnn model time is %.5f' % (toc - tic))

        tic = time.time()
        # extract outputs, resize, and remove padding
        # NOTE(review): pad[2] / pad[3] are presumably the bottom / right
        # padding added by util.pad_right_down_corner -- confirm in util.
        heatmap = np.squeeze(output_blobs[1])  # output 1 is heatmaps
        heatmap = cv2.resize(heatmap, (0, 0), fx=model_params['stride'], fy=model_params['stride'],
                             interpolation=cv2.INTER_CUBIC)
        heatmap = heatmap[:image_to_test_padded.shape[0] - pad[2], :image_to_test_padded.shape[1] - pad[3], :]
        heatmap = cv2.resize(heatmap, (input_image.shape[1], input_image.shape[0]), interpolation=cv2.INTER_CUBIC)

        paf = np.squeeze(output_blobs[0])  # output 0 is PAFs
        paf = cv2.resize(paf, (0, 0), fx=model_params['stride'], fy=model_params['stride'],
                         interpolation=cv2.INTER_CUBIC)
        paf = paf[:image_to_test_padded.shape[0] - pad[2], :image_to_test_padded.shape[1] - pad[3], :]
        paf = cv2.resize(paf, (input_image.shape[1], input_image.shape[0]), interpolation=cv2.INTER_CUBIC)

        # Running mean over all scales.
        heatmap_avg = heatmap_avg + heatmap / len(multiplier)
        paf_avg = paf_avg + paf / len(multiplier)

    # ---- Step 1: per-part peak detection -------------------------------
    all_peaks = []
    peak_counter = 0  # next globally unique peak id

    for part in range(18):
        hmap_ori = heatmap_avg[:, :, part]
        hmap = gaussian_filter(hmap_ori, sigma=3)

        # Copies of the smoothed map shifted by one pixel in each of the
        # four axis directions (zero-filled at the border), used to find
        # pixels that are >= all four of their direct neighbours.
        hmap_left = np.zeros(hmap.shape)
        hmap_left[1:, :] = hmap[:-1, :]
        hmap_right = np.zeros(hmap.shape)
        hmap_right[:-1, :] = hmap[1:, :]
        hmap_up = np.zeros(hmap.shape)
        hmap_up[:, 1:] = hmap[:, :-1]
        hmap_down = np.zeros(hmap.shape)
        hmap_down[:, :-1] = hmap[:, 1:]

        # reduce needed because there are > 2 arguments
        peaks_binary = np.logical_and.reduce(
            (hmap >= hmap_left, hmap >= hmap_right, hmap >= hmap_up, hmap >= hmap_down, hmap > params['thre1']))
        # np.nonzero yields (rows, cols); swap them so peaks are (x, y).
        peaks = list(zip(np.nonzero(peaks_binary)[1], np.nonzero(peaks_binary)[0]))  # note reverse
        # The score comes from the unsmoothed heatmap.
        peaks_with_score = [x + (hmap_ori[x[1], x[0]],) for x in peaks]  # add a third element to tuple with score
        idx = range(peak_counter, peak_counter + len(peaks))
        peaks_with_score_and_id = [peaks_with_score[i] + (idx[i],) for i in range(len(idx))]

        all_peaks.append(peaks_with_score_and_id)
        peak_counter += len(peaks)

    # ---- Step 2: pair peaks into limbs using the PAFs ------------------
    connection_all = []
    special_k = []  # limb indices for which one endpoint has no peaks
    mid_num = 10  # number of sample points along each candidate limb

    for k in range(len(util.hmapIdx)):
        # The two PAF channels (index 0 and 1 below) belonging to limb k.
        score_mid = paf_avg[:, :, [x - 19 for x in util.hmapIdx[k]]]
        # util.limbSeq stores part numbers offset by one relative to all_peaks.
        cand_a = all_peaks[util.limbSeq[k][0] - 1]
        cand_b = all_peaks[util.limbSeq[k][1] - 1]
        n_a = len(cand_a)
        n_b = len(cand_b)
        if n_a != 0 and n_b != 0:
            connection_candidate = []
            for i in range(n_a):
                for j in range(n_b):
                    vec = np.subtract(cand_b[j][:2], cand_a[i][:2])
                    norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
                    # failure case when 2 body parts overlaps
                    if norm == 0:
                        continue
                    vec = np.divide(vec, norm)

                    # Evenly spaced (x, y) sample points from peak a to peak b.
                    startend = list(zip(np.linspace(cand_a[i][0], cand_b[j][0], num=mid_num),
                                        np.linspace(cand_a[i][1], cand_b[j][1], num=mid_num)))

                    vec_x = np.array(
                        [score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 0]
                         for I in range(len(startend))])
                    vec_y = np.array(
                        [score_mid[int(round(startend[I][1])), int(round(startend[I][0])), 1]
                         for I in range(len(startend))])

                    # Dot product of the PAF with the unit a->b direction
                    # at every sample point.
                    score_midpts = np.multiply(vec_x, vec[0]) + np.multiply(vec_y, vec[1])
                    # Mean alignment, penalised when the limb is longer
                    # than half the image height.
                    score_with_dist_prior = sum(score_midpts) / len(score_midpts) + min(
                        0.5 * input_image.shape[0] / norm - 1, 0)
                    # More than 80% of the sample points must exceed thre2.
                    criterion1 = len(np.nonzero(score_midpts > params['thre2'])[0]) > 0.8 * len(
                        score_midpts)
                    criterion2 = score_with_dist_prior > 0
                    if criterion1 and criterion2:
                        connection_candidate.append([i, j, score_with_dist_prior,
                                                     score_with_dist_prior + cand_a[i][2] + cand_b[j][2]])

            # Greedy assignment: best-scoring pairs first, each peak used
            # at most once per limb type.
            connection_candidate = sorted(connection_candidate, key=lambda x: x[2], reverse=True)
            # Columns: peak id of a, peak id of b, score, index i, index j.
            connection = np.zeros((0, 5))
            for c in range(len(connection_candidate)):
                i, j, s = connection_candidate[c][0:3]
                if i not in connection[:, 3] and j not in connection[:, 4]:
                    connection = np.vstack([connection, [cand_a[i][3], cand_b[j][3], s, i, j]])
                    if len(connection) >= min(n_a, n_b):
                        break

            connection_all.append(connection)
        else:
            special_k.append(k)
            connection_all.append([])

    # ---- Step 3: assemble limbs into people ----------------------------
    # last number in each row is the total parts number of that person
    # the second last number in each row is the score of the overall configuration
    subset = np.empty((0, 20))
    candidate = np.array([item for sublist in all_peaks for item in sublist])

    for k in range(len(util.hmapIdx)):
        if k not in special_k:
            part_as = connection_all[k][:, 0]
            part_bs = connection_all[k][:, 1]
            index_a, index_b = np.array(util.limbSeq[k]) - 1

            for i in range(len(connection_all[k])):  # = 1:size(temp,1)
                # Find up to two existing people that already own either
                # endpoint of this limb.
                found = 0
                subset_idx = [-1, -1]
                for j in range(len(subset)):  # 1:size(subset,1):
                    if subset[j][index_a] == part_as[i] or subset[j][index_b] == part_bs[i]:
                        subset_idx[found] = j
                        found += 1

                if found == 1:
                    j = subset_idx[0]
                    if subset[j][index_b] != part_bs[i]:
                        subset[j][index_b] = part_bs[i]
                        subset[j][-1] += 1
                        subset[j][-2] += candidate[part_bs[i].astype(int), 2] + connection_all[k][i][2]
                elif found == 2:  # if found 2 and disjoint, merge them
                    j1, j2 = subset_idx
                    membership = ((subset[j1] >= 0).astype(int) + (subset[j2] >= 0).astype(int))[:-2]
                    if len(np.nonzero(membership == 2)[0]) == 0:  # merge
                        # Missing parts are stored as -1, so adding
                        # (other + 1) copies j2's parts into j1's gaps.
                        subset[j1][:-2] += (subset[j2][:-2] + 1)
                        subset[j1][-2:] += subset[j2][-2:]
                        subset[j1][-2] += connection_all[k][i][2]
                        subset = np.delete(subset, j2, 0)
                    else:  # as like found == 1
                        subset[j1][index_b] = part_bs[i]
                        subset[j1][-1] += 1
                        subset[j1][-2] += candidate[part_bs[i].astype(int), 2] + connection_all[k][i][2]

                # if find no partA in the subset, create a new subset
                # NOTE(review): limb types with k >= 17 never start a new
                # person; which limbs those are depends on util.limbSeq.
                elif not found and k < 17:
                    row = -1 * np.ones(20)
                    row[index_a] = part_as[i]
                    row[index_b] = part_bs[i]
                    row[-1] = 2
                    row[-2] = sum(candidate[connection_all[k][i, :2].astype(int), 2]) + connection_all[k][i][2]
                    subset = np.vstack([subset, row])

    # delete some rows of subset which has few parts occur
    # (fewer than 4 parts, or a mean score per part below 0.4)
    delete_idx = []
    for i in range(len(subset)):
        if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
            delete_idx.append(i)
    subset = np.delete(subset, delete_idx, axis=0)

    # First peak of every part as a simple name -> (x, y) summary.
    points = []
    for peak in all_peaks:
        try:
            points.append((peak[0][:2]))
        except IndexError:
            points.append((None, None))
    # zip stops at the shorter sequence (the 18 peak lists).
    body_parts = dict(zip(COCO_BODY_PARTS, points))

    toc = time.time()
    # Elapsed time since the last `tic` (end minus start, so non-negative).
    print(f"Post-processing time {toc - tic}")

    return body_parts, all_peaks, subset, candidate

## Running on a video

In [2]:
import os
import sys
import argparse
import time
from config_reader import config_reader

from model.cmu_model import get_testing_model

#sys.path.append(os.path.join(os.path.dirname(__file__), ".."))

# Local time captured once at start-up and formatted as "-MM-DD-HH-MM-SS";
# the leading dash lets it be appended directly to an output file name.
currentDT = time.localtime()
start_datetime = time.strftime("-%m-%d-%H-%M-%S", currentDT)

Using TensorFlow backend.


In [10]:
# constants
keras_weights_file = 'model/keras/model.h5'
frame_rate_ratio = 5  # only frames whose index is a multiple of this are processed
process_speed = 1  # how many leading entries of scale_search are used
ending_frame = None  # None is replaced by the video's frame count after opening it

# Video input
video = 'smoking2hd.mp4'
video_path = 'videos/'
video_file = video_path + video

# Output location
output_path = 'videos/outputs/'
output_format = '.mp4'
# Strip the input file's extension so the timestamp and output_format are
# appended to the bare name rather than producing "name.mp4-<time>.mp4".
video_output = output_path + os.path.splitext(video)[0] + str(start_datetime) + output_format

In [11]:
# load model
# authors of original model don't use
# vgg normalization (subtracting mean) on input images
model = get_testing_model()
model.load_weights(keras_weights_file)

# load config
# `params` and `model_params` are the two mappings later passed to
# extract_parts().
params, model_params = config_reader()

# Video reader
cam = cv2.VideoCapture(video_file)
input_fps = cam.get(cv2.CAP_PROP_FPS)
# Pre-fetch the first frame; ret_val reports whether a frame was read.
ret_val, orig_image = cam.read()
video_length = int(cam.get(cv2.CAP_PROP_FRAME_COUNT))

# No explicit limit configured: use the frame count reported by the container.
if ending_frame is None:
    ending_frame = video_length

In [12]:
# Candidate image scales; only the first `process_speed` of them are used.
scale_search = [1, .5, 1.5, 2]  # [.5, 1, 1.5, 2]
scale_search = scale_search[0:process_speed]

params['scale_search'] = scale_search

# One dict per processed frame, in frame order.
output_data = []
i = 0  # default is 0
try:
    while cam.isOpened() and ret_val is True and i < ending_frame:
        # Only every `frame_rate_ratio`-th frame is analysed; the others
        # are still read below so the frame index stays in sync.
        if i % frame_rate_ratio == 0:
            # NOTE(review): OpenCV documents VideoCapture frames as BGR, so
            # this conversion hands the model a channel-swapped image --
            # confirm this is what the network expects.
            input_image = cv2.cvtColor(orig_image, cv2.COLOR_RGB2BGR)

            # detect body parts for this frame
            body_parts, all_peaks, subset, candidate = extract_parts(input_image, params, model, model_params)

            print('Processing frame: ', i)

            # create dictionary for each frame
            frame_dict = {'frame_id': i, 'body_parts': body_parts, 'all_peaks': all_peaks, 'subset': subset, 'candidate': candidate}
            output_data.append(frame_dict)

        ret_val, orig_image = cam.read()

        i += 1
finally:
    # Always free the capture handle, including on interruption or error.
    # The capture has to be re-opened before this loop can run again.
    cam.release()

image rescaling time is 0.00342
cnn model time is 2.07086
Processing frame:  0
image rescaling time is 0.00371
cnn model time is 0.31377
Processing frame:  5
image rescaling time is 0.00490
cnn model time is 0.15369
Processing frame:  10
image rescaling time is 0.00349
cnn model time is 0.15525
Processing frame:  15
image rescaling time is 0.00373
cnn model time is 0.15456
Processing frame:  20
image rescaling time is 0.00374
cnn model time is 0.15420
Processing frame:  25
image rescaling time is 0.00339
cnn model time is 0.15545
Processing frame:  30
image rescaling time is 0.00505
cnn model time is 0.15429
Processing frame:  35
image rescaling time is 0.00371
cnn model time is 0.15485
Processing frame:  40
image rescaling time is 0.00466
cnn model time is 0.15479
Processing frame:  45
image rescaling time is 0.00372
cnn model time is 0.15393
Processing frame:  50
image rescaling time is 0.00374
cnn model time is 0.15383
Processing frame:  55
image rescaling time is 0.00341
cnn model 

In [7]:
import pickle
import os
# Persist the per-frame results so they can be reloaded without re-running
# the model.
# NOTE(review): the file name differs from the `video` constant
# ('smoking2hd.mp4') -- confirm this is the intended target.
pickle_path = 'videos/video_data/smokin2mp4.pkl'
with open(pickle_path, 'wb') as pkl_file:
    pickle.dump(output_data, pkl_file)

In [9]:
# Display the record of the second processed frame as a sanity check.
output_data[1]

{'frame_id': 5,
 'body_parts': {'nose': (234, 87),
  'neck': (170, 99),
  'right_shoulder': (230, 96),
  ' right_elbow': (227, 114),
  'right_wrist': (177, 121),
  'left_shoulder': (178, 98),
  'left_elbow': (251, 124),
  'left_wrist': (244, 142),
  'right_hip': (169, 134),
  'right_knee': (224, 155),
  'right_ankle': (223, 175),
  'left_hip': (179, 132),
  'left_knee': (177, 155),
  'left_ankle': (173, 174),
  'right_eye': (233, 84),
  'left_eye': (237, 85),
  'right_ear': (175, 90),
  'left_ear': (244, 85)},
 'all_peaks': [[(234, 87, 0.8884656429290771, 0),
   (182, 95, 0.676986575126648, 1)],
  [(170, 99, 0.7536784410476685, 2),
   (241, 100, 0.8877450823783875, 3),
   (195, 119, 0.8514382243156433, 4)],
  [(230, 96, 0.8524492979049683, 5),
   (164, 101, 0.8598827123641968, 6),
   (209, 124, 0.7947495579719543, 7)],
  [(227, 114, 0.6775968670845032, 8),
   (159, 121, 0.922542154788971, 9),
   (217, 152, 0.8079068064689636, 10)],
  [(177, 121, 0.5875738859176636, 11),
   (225, 129, 0