<h3>Imports</h3>

In [29]:
from ultralytics import YOLO
import cv2
from tqdm import tqdm
import sys

<h3>Model paths and diverse variables</h3>

In [30]:
model_1 = YOLO("/home/theo/Documents/Unif/Master/ChimpRec/Code/Body_detection/YOLO_small/runs/detect/train9/weights/best.pt")
model_2 = YOLO("/home/theo/Documents/Unif/Master/ChimpRec/Code/Face_detection/runs/detect/train3/weights/best.pt")

video_path = "/home/theo/Documents/Unif/Master/Chimprec - Extra/videos/20241023 - 09h28.MP4"
output_path = "output.mp4"

cap = cv2.VideoCapture(video_path)
frame_width, frame_height = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

n = 3  # Process one frame every n frames
max_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))  # Limit frame processing for efficiency

<h2>First prediction in the pipeline</h2>

In [31]:
def predict_1(image, t_confidence=0.4):
    return tuple(
        (x1, y1, x2, y2, score)
        for x1, y1, x2, y2, score, _ in model_1.predict(image, verbose=False)[0].boxes.data.tolist()
        if score >= t_confidence
    )

<h2>Second prediction in the pipeline</h2>

In [32]:
def predict_2(image, t_confidence=0.6):
    results = model_2.predict(image, verbose=False)
    return max(
        ((int(x1), int(y1), int(x2), int(y2), score) for result in results for x1, y1, x2, y2, score, _ in result.boxes.data.tolist()),
        default=None, key=lambda x: x[-1] if x[-1] >= t_confidence else float('-inf')
    )

<h2>Util functions</h2>

In [33]:
def crop(image, bbox):
    x1, y1, x2, y2, _ = map(int, bbox)
    return image[max(y1, 0):y2, max(x1, 0):x2]

def face_to_src(body_bbox, face_bbox):
    bx1, by1, _, _, _ = body_bbox
    fx1, fy1, fx2, fy2, score = face_bbox
    return (bx1 + fx1, by1 + fy1, bx1 + fx2, by1 + fy2, score)

def predict_frame(image):
    body_bboxes = predict_1(image)
    face_bboxes = tuple(
        face_to_src(body_bbox, face_bbox)
        for body_bbox in body_bboxes
        if (face_bbox := predict_2(crop(image, body_bbox)))
    )
    return body_bboxes, face_bboxes

<h1>Main code</h1>

<h3>Bounding boxes extraction</h3>

In [34]:
bboxes = []
with tqdm(total=max_frames, desc="Processing frames") as pbar:
    for frame_idx in range(max_frames):
        ret, frame = cap.read()
        if not ret:
            break
        bboxes.append(predict_frame(frame) if frame_idx % n == 0 else bboxes[-1])
        pbar.update(1)

cap.release()
cv2.destroyAllWindows()

size_in_mb = sys.getsizeof(bboxes) / (1024 * 1024)
print(f"Memory usage of bboxes: {size_in_mb:.6f} MB")

Processing frames: 100%|██████████| 7032/7032 [16:18<00:00,  7.18it/s]

Memory usage of bboxes: 0.056969 MB





<h3>Displaying the bboxes onto the video</h3>

In [35]:
def draw_bbox(image, color, bbox, label):
    x1, y1, x2, y2, score = bbox
    x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2)
    factor = 0.65 if label == "Face" else 0.3
    font_scale = max(0.4, (x2 - x1 + y2 - y1) / 300) * factor

    cv2.rectangle(image, (x1, y1), (x2, y2), color, 4)
    label_text = f"{label}: {score:.2f}"
    (w, h), _ = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_COMPLEX, font_scale, 1)

    overlay = image.copy()
    cv2.rectangle(overlay, (x2 - w - 10, y2 - h - 10), (x2, y2), color, -1)
    cv2.addWeighted(overlay, 0.5, image, 0.5, 0, image)

    cv2.putText(image, label_text, (x2 - w - 5, y2 - 5), cv2.FONT_HERSHEY_COMPLEX, font_scale, (255,255,255), 1)
    return image

In [36]:
cap = cv2.VideoCapture(video_path)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS)

max_frames = len(bboxes)

out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (frame_width, frame_height))

with tqdm(total=max_frames, desc="Processing frames") as pbar:
    for frame_idx in range(max_frames):
        ret, frame = cap.read()
        if not ret:
            break

        body_bboxes, face_bboxes = bboxes[frame_idx]

        for bbox in body_bboxes:
            draw_bbox(frame, (254, 122, 51), bbox, "Body")

        for bbox in face_bboxes:
            draw_bbox(frame, (66, 66, 255), bbox, "Face")

        out.write(frame)
        pbar.update(1)

cap.release()
out.release()
cv2.destroyAllWindows()

Processing frames: 100%|██████████| 7032/7032 [02:57<00:00, 39.56it/s]
