In [1]:
pip install --upgrade opencv-python opencv-python-headless


Note: you may need to restart the kernel to use updated packages.




In [2]:
pip install --upgrade onnxruntime


Note: you may need to restart the kernel to use updated packages.




In [3]:
pip install opencv-python opencv-python-headless numpy


Note: you may need to restart the kernel to use updated packages.




In [4]:
!pip install ultralytics






In [None]:
import onnxruntime as ort
import numpy as np
import cv2
from collections import deque
from ultralytics import YOLO

# Parameters class
class Parameters:
    def __init__(self):
        self.CLASSES = open("model/action_recognition_kinetics.txt").read().strip().split("\n")
        self.ACTION_RESNET = 'model/resnet-34_kinetics.onnx'
        self.SAMPLE_DURATION = 16  
        self.SAMPLE_SIZE = 112  
        self.VIDEO_PATH = 0  
        self.DEVICE = "CPU" 


param = Parameters()


print("[INFO] Loading Human Activity Recognition model with ONNX Runtime...")
ort_session = ort.InferenceSession(param.ACTION_RESNET, providers=['CPUExecutionProvider'])


print("[INFO] Loading YOLOv8 model for person detection...")
yolo_model = YOLO("yolov8m.pt")  


print("[INFO] Accessing webcam...")
vs = cv2.VideoCapture(param.VIDEO_PATH)
if not vs.isOpened():
    print("Error: Could not open webcam.")
    exit()

print("Webcam opened successfully.")


person_captures = {}

while True:
   
    grabbed, frame = vs.read()
    if not grabbed:
        print("[INFO] No frame read from webcam - exiting")
        break

  
    frame = cv2.resize(frame, (640, 448))

    
    results = yolo_model(frame)

    for result in results:
        for bbox in result.boxes.xyxy:
            x1, y1, x2, y2 = map(int, bbox)

            
            person_id = f"{x1}-{y1}-{x2}-{y2}"

            
            person_crop = frame[y1:y2, x1:x2]

            
            person_crop = cv2.resize(person_crop, (param.SAMPLE_SIZE, param.SAMPLE_SIZE))

            
            if person_id not in person_captures:
                person_captures[person_id] = deque(maxlen=param.SAMPLE_DURATION)
            person_captures[person_id].append(person_crop)

           
            while len(person_captures[person_id]) < param.SAMPLE_DURATION:
                person_captures[person_id].append(person_captures[person_id][-1])  # Duplicate last frame

            
            if len(person_captures[person_id]) == param.SAMPLE_DURATION:
                
                imageBlob = cv2.dnn.blobFromImages(list(person_captures[person_id]), 1.0,
                                                   (param.SAMPLE_SIZE, param.SAMPLE_SIZE),
                                                   (114.7748, 107.7354, 99.4750),
                                                   swapRB=True, crop=True)

                imageBlob = np.transpose(imageBlob, (1, 0, 2, 3))
                imageBlob = np.expand_dims(imageBlob, axis=0)

                
                ort_inputs = {ort_session.get_inputs()[0].name: imageBlob}
                outputs = ort_session.run(None, ort_inputs)

                
                predicted_index = np.argmax(outputs)
                label = param.CLASSES[predicted_index]

                print(f"Person ID: {person_id}, Action: {label}")

              
                cv2.rectangle(frame, (x1, y1 - 30), (x2, y1), (255, 255, 255), -1)
                cv2.putText(frame, label, (x1 + 5, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX,
                            0.6, (0, 0, 0), 2)

    
    cv2.imshow("Multi-Person Activity Recognition", frame)

    
    key = cv2.waitKey(1) & 0xFF
    if key == ord("q"):
        break

vs.release()
cv2.destroyAllWindows()


[INFO] Loading ONNX model...
[INFO] Loading YOLOv8 model...
[INFO] Accessing webcam...
Webcam opened successfully.

0: 448x640 1 person, 736.9ms
Speed: 45.6ms preprocess, 736.9ms inference, 32.5ms postprocess per image at shape (1, 3, 448, 640)
[INFO] FPS: 1

0: 448x640 1 person, 625.0ms
Speed: 3.5ms preprocess, 625.0ms inference, 2.2ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 1 person, 890.0ms
Speed: 7.8ms preprocess, 890.0ms inference, 1.7ms postprocess per image at shape (1, 3, 448, 640)
[INFO] FPS: 2

0: 448x640 1 person, 941.0ms
Speed: 4.2ms preprocess, 941.0ms inference, 3.4ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 1 person, 1085.7ms
Speed: 10.9ms preprocess, 1085.7ms inference, 3.0ms postprocess per image at shape (1, 3, 448, 640)
[INFO] FPS: 2

0: 448x640 1 person, 792.4ms
Speed: 4.4ms preprocess, 792.4ms inference, 4.4ms postprocess per image at shape (1, 3, 448, 640)

0: 448x640 1 person, 735.8ms
Speed: 11.3ms preprocess, 735.8ms infere