In [35]:
# tutorial from https://huggingface.co/docs/transformers/en/model_doc/vitpose
from transformers import AutoProcessor, RTDetrForObjectDetection, VitPoseForPoseEstimation
import torch
import requests
from PIL import Image
import numpy as np
import cv2
import h5py
from datetime import datetime

In [36]:
import sys

In [3]:
def fill_in_res(res: list, key: str, size: tuple, top_k: int, box=0):
    """Pack up to ``top_k`` per-person results into one flat, NaN-padded vector.

    Parameters
    ----------
    res
        With ``box == 0``: a sequence of per-person dicts; ``res[i][key]`` must
        expose ``.cpu().numpy()`` (e.g. a torch tensor).
        With ``box != 0``: a mapping whose ``res[key]`` is an iterable of
        scalar tensors (one per detection); each becomes a length-1 array.
    key
        Name of the entry to extract.
    size
        Shape of one entry; used to build the all-NaN placeholders for the
        missing persons. It must match the shape of the real entries,
        otherwise ``np.stack`` raises.
    top_k
        Number of persons kept in the output. Fewer entries are padded with
        NaN, extra entries are dropped (the first ``top_k`` are kept).
    box
        ``0`` for the list-of-dicts layout, anything else for the
        mapping-of-scalars layout described above.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``prod(size) * top_k``. Entries are stacked along
        a new last axis and flattened column-major (MATLAB-like), so the
        values of each person form one contiguous chunk of the output.
    """
    if box == 0:
        data = [entry[key].cpu().numpy() for entry in res]
    else:
        # Wrap every scalar in a length-1 array so it can be stacked like the other features.
        data = [np.array([value.cpu().numpy()]) for value in res[key]]

    missing = top_k - len(data)
    if missing > 0:  # fewer than top_k persons: pad with NaN placeholders
        data.extend(np.full(size, np.nan) for _ in range(missing))

    # Honour top_k when truncating (this used to be hard-coded to 5).
    data = data[:top_k]
    data = np.stack(data, axis=-1)  # persons along the last dimension
    return data.flatten(order='F')  # column-major (Fortran/MATLAB) vectorisation
# end of fill_in_res

In [34]:
# 1 - paths
path2mod = "/Volumes/TIZIANO/models"  # directory the per-run HDF5 feature files are written to

# 2 - load models
device = "cuda" if torch.cuda.is_available() else "cpu"
# Person detector (RT-DETR) and its matching preprocessing pipeline.
person_image_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365", device_map=device)
# Keypoint estimator (ViTPose) and its matching preprocessing pipeline.
image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-base-simple")
model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple", device_map=device)

# One list per feature; each processed frame appends one flat vector.
# NOTE: "head_kpts" and "score_heads" are never filled in this cell and are still empty when saved.
feats = {
    "kpts" : [],
    "boxes" : [],
    "score_boxes" : [],
    "score_kpts" : [],
    "head_kpts" : [],
    "score_heads" : []
}

# 3 - read frames and preprocess them
runs = [1] #,2,3]
for irun in runs:
    # Start every run from empty lists, otherwise the file of run N would also
    # contain the frames of runs 1..N-1.
    for store in feats.values():
        store.clear()

    path2vid = f"/Volumes/TIZIANO/stimuli/Project1917_movie_part{irun}_24Hz.mp4"
    reader = cv2.VideoCapture(path2vid)
    try:
        count = 0
        # Debug setting: jump to frame index 2947 and process at most 2 frames.
        reader.set(cv2.CAP_PROP_POS_FRAMES, 2947)
        for i in range(2):
            count += 1
            print(datetime.now().strftime("%H:%M:%S")," - frame", count, flush=True)
            ret, frame = reader.read()

            if not ret:  # no frame could be read -> stop this run
                break

            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)  # OpenCV decodes BGR; convert to RGB

            # The detector lives on `device`, so its inputs must be moved there as well.
            inputs = person_image_processor(frame_rgb, return_tensors="pt").to(device)

            # 4 - detect people
            with torch.no_grad():
                outputs = person_model(**inputs)

            # 5 - get box predictions
            # Convert the raw outputs into boxes/scores/labels in pixel coordinates of the frame;
            # [0] because a single image is processed per call.
            result = person_image_processor.post_process_object_detection(
                outputs, target_sizes=torch.tensor([(frame_rgb.shape[0], frame_rgb.shape[1])]), threshold=0.3
            )[0]
            print(result)

            # Keep only detections with label 0 (treated as "person" by this notebook).
            person_mask = result["labels"] == 0
            person_boxes = result["boxes"][person_mask]
            print(person_boxes)
            if person_boxes.numel() == 0:
                # No person in this frame: store all-NaN vectors with the same lengths that
                # fill_in_res produces for top_k = 5 persons.
                person_boxes_store = np.full((20,), np.nan)   # 4 box coords x 5 persons
                score_boxes_store = np.full((5,), np.nan)     # 1 score x 5 persons
                kpts_store = np.full((170,), np.nan)          # 17 keypoints x 2 coords x 5 persons
                kpts_scores_store = np.full((85,), np.nan)    # 17 keypoint scores x 5 persons
                print("true")
            else:
                # Scores filtered with the same mask so they stay aligned with the person boxes.
                score_boxes = result["scores"][person_mask]

                # Work on a CPU NumPy copy (as in the ViTPose tutorial) so the box handling
                # below does not depend on the device the detector ran on.
                person_boxes = person_boxes.cpu().numpy()
                # VOC (x1, y1, x2, y2) -> COCO (x, y, width, height); one row per detected person.
                person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
                person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]

                # 6 - preprocess for keypoint detection
                # The pose processor receives the frame together with the person boxes.
                inputs = image_processor(frame_rgb, boxes=[person_boxes], return_tensors="pt").to(device)
                with torch.no_grad():
                    outputs = model(**inputs)  # runs ViTPose
                # List with one dict per person; [0] because a single image is processed per call.
                pose_results = image_processor.post_process_pose_estimation(outputs, boxes=[person_boxes])[0]
                kpts_store = fill_in_res(pose_results, "keypoints", (17, 2), 5)
                kpts_scores_store = fill_in_res(pose_results, "scores", (17,), 5)
                person_boxes_store = fill_in_res(pose_results, "bbox", (4,), 5)
                # Person-only detection scores (previously the unfiltered scores of all labels were stored).
                score_boxes_store = fill_in_res({"scores": score_boxes}, "scores", (1,), 5, box=1)

            feats["boxes"].append(person_boxes_store)
            feats["kpts"].append(kpts_store)
            print(len(feats["kpts"]))
            feats["score_kpts"].append(kpts_scores_store)
            feats["score_boxes"].append(score_boxes_store)
    finally:
        reader.release()  # always free the video handle, even if a frame fails

    with h5py.File(f"{path2mod}/Project1917_dummyViTPose_run0{irun}.h5", "w") as f:
        # One dataset per feature key.
        for key, value in feats.items():
            f.create_dataset(key, data=value)

18:36:48  - frame 1
{'scores': tensor([0.5962, 0.5746, 0.4045, 0.3717, 0.3420, 0.3384, 0.3337, 0.3311, 0.3063,
        0.3063]), 'labels': tensor([24, 20, 24, 20, 24, 20, 20, 20, 20, 20]), 'boxes': tensor([[ 9.0683e-01,  1.1853e+02,  6.2574e+02,  5.2034e+02],
        [ 1.7806e+01, -2.4284e-01,  8.0000e+02,  4.2720e+02],
        [ 7.7372e+02,  1.7308e+02,  1.2702e+03,  3.9557e+02],
        [ 4.0987e+02,  5.4247e+01,  7.9916e+02,  3.7140e+02],
        [ 6.4565e+02,  1.7210e+02,  1.2768e+03,  5.2075e+02],
        [ 4.7507e+02,  7.9370e+01,  7.9856e+02,  4.4439e+02],
        [ 1.8027e+01,  3.2638e-02,  7.9925e+02,  3.7070e+02],
        [ 2.2157e+01,  1.3992e-01,  4.9222e+02,  1.2460e+02],
        [ 2.2035e+01,  1.9193e-02,  2.8273e+02,  1.2323e+02],
        [ 4.0397e+02,  2.2636e+01,  7.9856e+02,  4.2984e+02]])}
tensor([], size=(0, 4))
true
1
18:36:49  - frame 2
{'scores': tensor([0.5910, 0.5569, 0.4947, 0.3728, 0.3626, 0.3396, 0.3366, 0.3051, 0.3010]), 'labels': tensor([ 0, 24, 24, 24, 24

In [26]:
# Number of per-frame keypoint vectors currently held in `feats` (the pipeline cell appends one per processed frame).
# NOTE(review): the recorded output (16) does not match the 2-frame loop and this cell's execution count
# precedes the pipeline cell's, so it presumably reflects stale kernel state — re-run after a kernel restart.
len(feats["kpts"])


16

In [29]:
# Keypoint name -> index mapping stored in the ViTPose model config (17 entries in the recorded output).
model.config.label2id 

{'L_Ankle': 15,
 'L_Ear': 3,
 'L_Elbow': 7,
 'L_Eye': 1,
 'L_Hip': 11,
 'L_Knee': 13,
 'L_Shoulder': 5,
 'L_Wrist': 9,
 'Nose': 0,
 'R_Ankle': 16,
 'R_Ear': 4,
 'R_Elbow': 8,
 'R_Eye': 2,
 'R_Hip': 12,
 'R_Knee': 14,
 'R_Shoulder': 6,
 'R_Wrist': 10}