In [3]:
import pickle

# Load the annotation pickle file
annotation_file = '/home/samer/Desktop/Projects/mmaction2/scripts/SBAR_YOLO_KPTS.pkl'

with open(annotation_file, 'rb') as f:
    data = pickle.load(f)

# Print the paths of videos in each split
print("Training videos:")
for video in data['split']['train']:
    print(video)

print("\nValidation videos:")
for video in data['split']['val']:
    print(video)

print("\nTesting videos:")
for video in data['split']['test']:
    print(video)

Training videos:
shoplifting/sequence_4
normal_shopping/sequence_16
shoplifting/sequence_15
shoplifting/sequence_21
shoplifting/sequence_5
normal_shopping/sequence_27
normal_shopping/sequence_24
normal_shopping/sequence_14
normal_shopping/sequence_15
normal_shopping/sequence_22
shoplifting/sequence_2
shoplifting/sequence_26
shoplifting/sequence_22
normal_shopping/sequence_10
normal_shopping/sequence_26
normal_shopping/sequence_1
shoplifting/sequence_27
shoplifting/sequence_8
normal_shopping/sequence_20
normal_shopping/sequence_17
shoplifting/sequence_12
shoplifting/sequence_11
shoplifting/sequence_14
normal_shopping/sequence_19
shoplifting/sequence_10
normal_shopping/sequence_13
shoplifting/sequence_6
normal_shopping/sequence_23
normal_shopping/sequence_21
shoplifting/sequence_7
normal_shopping/sequence_29
normal_shopping/sequence_6
normal_shopping/sequence_18
normal_shopping/sequence_0
shoplifting/sequence_17
normal_shopping/sequence_11

Validation videos:
normal_shopping/sequence_9
s

In [24]:
import cv2
import numpy as np
import torch
from ultralytics import YOLO
from mmaction.apis import init_recognizer, inference_recognizer
from mmengine.config import Config

# Paths
config_file = '/home/samer/Desktop/Projects/mmaction2/configs/skeleton/stgcnpp/stgcnpp_custom.py'
checkpoint_file = '/home/samer/Desktop/Projects/mmaction2/work_dirs/skeleton_models/stgcnpp_custom/best_acc_top1_epoch_8.pth'
video_path = '/home/samer/Desktop/Projects/mmaction2/data/Recorded_Videos/shoplifting/sequence_13/sequence_video.avi'
output_video_path = '/home/samer/Desktop/Projects/mmaction2/data/Skeletons_Visualization/inference_video_v4.mp4'

# Define action labels
action_labels = ['normal_shopping', 'shoplifting']

# Initialize YOLO model
yolo_model = YOLO('yolov8m-pose.pt')

# Initialize action recognition model
config = Config.fromfile(config_file)
action_model = init_recognizer(config, checkpoint_file, device='cuda:0')

# Open video file
cap = cv2.VideoCapture(video_path)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

# Setup video writer
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

frame_num = 0

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Extract keypoints using YOLO
    results = yolo_model(frame, stream=True)
    keypoints = []
    scores = []
    bboxes = []

    for result in results:
        if result.keypoints is not None:
            keypoints.append(result.keypoints.xyn[0].cpu().numpy())  # pixel coordinates
            scores.append(result.keypoints.conf[0].cpu().numpy())  # confidence scores
            bboxes.append(result.boxes.xyxy[0].cpu().numpy())  # bounding boxes

    if keypoints:
        keypoints = np.array(keypoints)
        scores = np.array(scores)

        # Prepare data in the required shape for the action recognition model
        keypoints = keypoints.reshape(1, keypoints.shape[0], keypoints.shape[1], 2)
        scores = scores.reshape(1, scores.shape[0], scores.shape[1])

        # Create the action recognition model input
        action_input = {
            'keypoint': torch.tensor(keypoints, dtype=torch.float32),
            'keypoint_score': torch.tensor(scores, dtype=torch.float32),
            'total_frames': keypoints.shape[1]
        }

        # Run inference
        prediction = inference_recognizer(action_model, action_input)

        # Extract prediction results
        predicted_action = prediction.pred_label.item()
        action_score = prediction.pred_score[predicted_action].item()

        # Overlay the prediction on the frame
        action_text = f'Action: {action_labels[predicted_action]}, Score: {action_score:.2f}'
        (w, h), _ = cv2.getTextSize(action_text, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)
        cv2.rectangle(frame, (10, 5), (10 + w, 30 + h), (0, 0, 0), -1)
        cv2.putText(frame, action_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

        # Draw keypoints on the frame
        for i, person in enumerate(keypoints[0]):
            bbox = bboxes[i]
            cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 0, 255), 2)
            for (x, y) in person:
                cv2.circle(frame, (int(x), int(y)), 3, (255, 0, 0), -1)

    # Write the annotated frame to the output video
    out.write(frame)
    frame_num += 1

cap.release()
out.release()

print(f"Annotated video with skeletons and action predictions saved to {output_video_path}")


Loads checkpoint by local backend from path: /home/samer/Desktop/Projects/mmaction2/work_dirs/skeleton_models/stgcnpp_custom/best_acc_top1_epoch_8.pth

0: 480x640 1 person, 10.8ms
Speed: 3.8ms preprocess, 10.8ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.9ms
Speed: 1.0ms preprocess, 6.9ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 9.2ms
Speed: 1.2ms preprocess, 9.2ms inference, 1.3ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 7.0ms
Speed: 1.1ms preprocess, 7.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 7.8ms
Speed: 3.2ms preprocess, 7.8ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 7.0ms
Speed: 1.1ms preprocess, 7.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 7.0ms
Speed: 1.1ms preprocess, 7.0ms inference, 1.0ms postprocess per image at sh

In [31]:
import cv2
import numpy as np
import torch
from ultralytics import YOLO
from mmaction.apis import init_recognizer, inference_recognizer
from mmengine.config import Config

# Paths
config_file = '/home/samer/Desktop/Projects/mmaction2/configs/skeleton/stgcnpp/stgcnpp_custom.py'
checkpoint_file = '/home/samer/Desktop/Projects/mmaction2/work_dirs/skeleton_models/stgcnpp_custom/best_acc_top1_epoch_8.pth'
video_path = '/home/samer/Desktop/Projects/mmaction2/data/Recorded_Videos/shoplifting/sequence_16/sequence_video.avi'
output_video_path = '/home/samer/Desktop/Projects/mmaction2/data/Skeletons_Visualization/inference_video_v7.mp4'

# Define action labels
action_labels = ['normal_shopping', 'shoplifting']

# Initialize YOLO model
yolo_model = YOLO('yolov8m-pose.pt')

# Initialize action recognition model
config = Config.fromfile(config_file)
action_model = init_recognizer(config, checkpoint_file, device='cuda:0')

# Open video file
cap = cv2.VideoCapture(video_path)
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))

# Setup video writer
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

frame_num = 0

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break

    # Extract keypoints using YOLO
    results = yolo_model(frame, stream=True)
    keypoints = []
    scores = []
    bboxes = []

    for result in results:
        if result.keypoints is not None:
            keypoints.append(result.keypoints.xyn[0].cpu().numpy())  # pixel coordinates
            scores.append(result.keypoints.conf[0].cpu().numpy())  # confidence scores
            bboxes.append(result.boxes.xyxy[0].cpu().numpy())  # bounding boxes

    if keypoints:
        keypoints = np.array(keypoints)
        scores = np.array(scores)

        # Prepare data in the required shape for the action recognition model
        keypoints = keypoints.reshape(1, keypoints.shape[0], keypoints.shape[1], 2)
        scores = scores.reshape(1, scores.shape[0], scores.shape[1])

        # Create the action recognition model input
        action_input = {
            'keypoint': torch.tensor(keypoints, dtype=torch.float32),
            'keypoint_score': torch.tensor(scores, dtype=torch.float32),
            'total_frames': keypoints.shape[1]
        }

        # Run inference
        prediction = inference_recognizer(action_model, action_input)

        # Extract prediction results
        predicted_action = prediction.pred_label.item()
        action_score = prediction.pred_score[predicted_action].item()

        # Overlay the prediction on the frame
        action_text = f'Action: {action_labels[predicted_action]}, Score: {action_score:.2f}'
        (w, h), _ = cv2.getTextSize(action_text, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)
        cv2.rectangle(frame, (10, 5), (10 + w, 30 + h), (0, 0, 0), -1)
        cv2.putText(frame, action_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)

        # Draw keypoints on the frame
        for person in keypoints[0]:
            for (x, y) in person:
                cv2.circle(frame, (int(x), int(y)), 3, (255, 0, 0), -1)

        # Draw bounding boxes on the frame
        for i, person in enumerate(keypoints[0]):
            bbox = bboxes[i]
            cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), (0, 0, 255), 2)
            # for (x, y) in person:
            #     cv2.circle(frame, (int(x), int(y)), 3, (0, 255, 0), -1)

    # Write the annotated frame to the output video
    out.write(frame)
    frame_num += 1

cap.release()
out.release()

print(f"Annotated video with skeletons and action predictions saved to {output_video_path}")


Loads checkpoint by local backend from path: /home/samer/Desktop/Projects/mmaction2/work_dirs/skeleton_models/stgcnpp_custom/best_acc_top1_epoch_8.pth

0: 480x640 1 person, 12.3ms
Speed: 4.2ms preprocess, 12.3ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.9ms
Speed: 1.0ms preprocess, 6.9ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 7.0ms
Speed: 1.0ms preprocess, 7.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.8ms
Speed: 0.9ms preprocess, 6.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 7.0ms
Speed: 2.4ms preprocess, 7.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.9ms
Speed: 0.9ms preprocess, 6.9ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.9ms
Speed: 0.9ms preprocess, 6.9ms inference, 1.0ms postprocess per image at sh

In [23]:
import os
import cv2
import numpy as np
import torch
from ultralytics import YOLO
from mmaction.apis import init_recognizer, inference_recognizer
from mmengine.config import Config

# Paths
config_file = '/home/samer/Desktop/Projects/mmaction2/configs/skeleton/stgcnpp/stgcnpp_custom.py'
checkpoint_file = '/home/samer/Desktop/Projects/mmaction2/work_dirs/skeleton_models/stgcnpp_custom/best_acc_top1_epoch_8.pth'
video_path = '/home/samer/Desktop/Projects/mmaction2/data/Recorded_Videos/normal_shopping/sequence_28/sequence_video.avi'

# Initialize YOLO model
yolo_model = YOLO('yolov8m-pose.pt')

# Initialize action recognition model
config = Config.fromfile(config_file)
action_model = init_recognizer(config, checkpoint_file, device='cuda:0')

# Create a dictionary to map label indices to action names
action_labels = {0: 'normal_shopping', 1: 'shoplifting'}

# Function to run inference on a single video
def run_inference_on_video(video_path):
    cap = cv2.VideoCapture(video_path)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    keypoints_list = []
    keypoint_scores_list = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Extract keypoints using YOLO
        results = yolo_model(frame, stream=True)

        for result in results:
            if result.keypoints is not None:
                keypoints = result.keypoints.xyn[0].cpu().numpy()  # normalized coordinates
                scores = result.keypoints.conf[0].cpu().numpy()  # confidence scores
                keypoints_list.append(keypoints)
                keypoint_scores_list.append(scores)

    cap.release()

    if keypoints_list:
        # Convert lists to numpy arrays
        keypoints = np.array(keypoints_list)
        keypoint_scores = np.array(keypoint_scores_list)

        # Prepare data in the required shape for the action recognition model
        keypoints = keypoints.reshape(1, keypoints.shape[0], keypoints.shape[1], 2)
        keypoint_scores = keypoint_scores.reshape(1, keypoint_scores.shape[0], keypoint_scores.shape[1])

        # Create the action recognition model input
        action_input = {
            'keypoint': torch.tensor(keypoints, dtype=torch.float32),
            'keypoint_score': torch.tensor(keypoint_scores, dtype=torch.float32),
            'total_frames': total_frames
        }

        # Run inference
        prediction = inference_recognizer(action_model, action_input)

        # Get the predicted action and score
        predicted_action_idx = prediction.pred_label.item()
        predicted_action = action_labels[predicted_action_idx]
        action_score = prediction.pred_score[predicted_action_idx].item()

        # Debugging: Print shapes
        print(f"Keypoints shape: {keypoints.shape}")
        print(f"Keypoint scores shape: {keypoint_scores.shape}")
        print(f"Total frames: {total_frames}")

        # Debug: print keypoints and scores
        print("Sample keypoints:", keypoints[0, :2])
        print("Sample keypoint scores:", keypoint_scores[0, :2])

        # Print the entire prediction object to debug
        print("Prediction:", prediction)

        # Print the predictions
        print(f'Predicted Action: {predicted_action}, Score: {action_score:.2f}')

        return predicted_action, action_score

# Run inference on the specified video and print results
predicted_action, action_score = run_inference_on_video(video_path)
print(f"Video: {os.path.basename(video_path)}, Action: {predicted_action}, Score: {action_score:.2f}\n")

Loads checkpoint by local backend from path: /home/samer/Desktop/Projects/mmaction2/work_dirs/skeleton_models/stgcnpp_custom/best_acc_top1_epoch_8.pth

0: 480x640 1 person, 11.0ms
Speed: 3.3ms preprocess, 11.0ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.8ms
Speed: 1.0ms preprocess, 6.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.8ms
Speed: 0.9ms preprocess, 6.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.6ms
Speed: 0.9ms preprocess, 6.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.6ms
Speed: 0.9ms preprocess, 6.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.6ms
Speed: 0.9ms preprocess, 6.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 7.2ms
Speed: 0.9ms preprocess, 7.2ms inference, 1.0ms postprocess per image at sh

In [32]:
import os
import cv2
import numpy as np
import torch
from ultralytics import YOLO
from mmaction.apis import init_recognizer, inference_recognizer
from mmengine.config import Config

# Paths
config_file = '/home/samer/Desktop/Projects/mmaction2/configs/skeleton/stgcnpp/stgcnpp_custom.py'
checkpoint_file = '/home/samer/Desktop/Projects/mmaction2/work_dirs/skeleton_models/stgcnpp_custom/best_acc_top1_epoch_10.pth'
video_path = '/home/samer/Desktop/Projects/mmaction2/data/Recorded_Videos/shoplifting/sequence_13/sequence_video.avi'
output_video_path = '/home/samer/Desktop/Projects/mmaction2/data/Skeletons_Visualization/inference_output_120.mp4'

# Initialize YOLO model
yolo_model = YOLO('yolov8m-pose.pt')

# Initialize action recognition model
config = Config.fromfile(config_file)
action_model = init_recognizer(config, checkpoint_file, device='cuda:0')

# Create a dictionary to map label indices to action names
action_labels = {0: 'normal_shopping', 1: 'shoplifting'}

# Function to run frame-by-frame inference on a video
def run_frame_by_frame_inference(video_path, output_video_path):
    cap = cv2.VideoCapture(video_path)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # Setup video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

    frame_num = 0
    keypoints_list = []
    keypoint_scores_list = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Extract keypoints using YOLO
        results = yolo_model(frame, stream=True)

        for result in results:
            if result.keypoints is not None:
                keypoints = result.keypoints.xyn[0].cpu().numpy()  # pixel coordinates
                scores = result.keypoints.conf[0].cpu().numpy()  # confidence scores
                keypoints_list.append(keypoints)
                keypoint_scores_list.append(scores)

        # Prepare data in the required shape for the action recognition model
        if keypoints_list:
            keypoints = np.array(keypoints_list)
            keypoint_scores = np.array(keypoint_scores_list)
            keypoints = keypoints.reshape(1, keypoints.shape[0], keypoints.shape[1], 2)  # (1, T, V, C)
            keypoint_scores = keypoint_scores.reshape(1, keypoint_scores.shape[0], keypoint_scores.shape[1])  # (1, T, V)

            # Create the action recognition model input
            action_input = {
                'keypoint': torch.tensor(keypoints, dtype=torch.float32),
                'keypoint_score': torch.tensor(keypoint_scores, dtype=torch.float32),
                'total_frames': keypoints.shape[1]
            }

            # Run inference
            prediction = inference_recognizer(action_model, action_input)

            # Get the predicted action and score
            predicted_action_idx = prediction.pred_label.item()
            predicted_action = action_labels[predicted_action_idx]
            action_score = prediction.pred_score[predicted_action_idx].item()

            # Overlay the prediction on the frame
            label_text = f'Action: {predicted_action}, Score: {action_score:.2f}'
            (label_width, label_height), _ = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)
            label_bg_rect = (10, 10, 10 + label_width, 30 + label_height)
            cv2.rectangle(frame, (label_bg_rect[0], label_bg_rect[1]), (label_bg_rect[2], label_bg_rect[3]), (0, 0, 0), cv2.FILLED)
            cv2.putText(frame, label_text, (10, 30 + label_height), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # Draw keypoints on the frame
            for person in keypoints_list:
                for (x, y) in person:
                    cv2.circle(frame, (int(x), int(y)), 3, (255, 0, 0), -1)

            # Draw bounding boxes
            for box in result.boxes.xyxy:
                x1, y1, x2, y2 = map(int, box)
                cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 0, 255), 2)

            # Clear keypoints lists for next frame
            keypoints_list.clear()
            keypoint_scores_list.clear()

        # Write the annotated frame to the output video
        out.write(frame)
        frame_num += 1
        if frame_num % 50 == 0:
            print(f"Processed {frame_num} frames")

    cap.release()
    out.release()
    print(f"Inference video saved to {output_video_path}")

# Run frame-by-frame inference on the specified video and save the output
run_frame_by_frame_inference(video_path, output_video_path)


Loads checkpoint by local backend from path: /home/samer/Desktop/Projects/mmaction2/work_dirs/skeleton_models/stgcnpp_custom/best_acc_top1_epoch_10.pth

0: 480x640 1 person, 7.2ms
Speed: 2.1ms preprocess, 7.2ms inference, 1.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 7.0ms
Speed: 1.0ms preprocess, 7.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.8ms
Speed: 1.0ms preprocess, 6.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.9ms
Speed: 0.9ms preprocess, 6.9ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.8ms
Speed: 0.9ms preprocess, 6.8ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 7.1ms
Speed: 1.1ms preprocess, 7.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 6.8ms
Speed: 1.0ms preprocess, 6.8ms inference, 1.0ms postprocess per image at sha

In [None]:
import os
import cv2
import numpy as np
import torch
from ultralytics import YOLO
from mmaction.apis import init_recognizer, inference_recognizer
from mmengine.config import Config

# Paths
config_file = '/home/samer/Desktop/Projects/mmaction2/configs/skeleton/stgcnpp/stgcnpp_custom.py'
checkpoint_file = '/home/samer/Desktop/Projects/mmaction2/work_dirs/skeleton_models/stgcnpp_custom/best_acc_top1_epoch_8.pth'
video_path = '/home/samer/Desktop/Projects/mmaction2/data/Recorded_Videos/normal_shopping/sequence_28/sequence_video.avi'
output_video_path = '/home/samer/Desktop/Projects/mmaction2/data/Skeletons_Visualization/inference_output_1.mp4'

# Initialize YOLO model
yolo_model = YOLO('yolov8m-pose.pt')

# Initialize action recognition model
config = Config.fromfile(config_file)
action_model = init_recognizer(config, checkpoint_file, device='cuda:0')

# Create a dictionary to map label indices to action names
action_labels = {0: 'normal_shopping', 1: 'shoplifting'}


# Function to run inference on a single video
def run_frame_by_frame_inference(video_path, output_video_path):
    cap = cv2.VideoCapture(video_path)
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = int(cap.get(cv2.CAP_PROP_FPS))

    # Setup video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (frame_width, frame_height))

    keypoints_list = []
    keypoint_scores_list = []
    frames = []

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frames.append(frame)

        # Extract keypoints using YOLO
        results = yolo_model(frame, stream=True)

        for result in results:
            if result.keypoints is not None:
                keypoints = result.keypoints.xyn[0].cpu().numpy()  # pixel coordinates
                scores = result.keypoints.conf[0].cpu().numpy()  # confidence scores
                keypoints_list.append(keypoints)
                keypoint_scores_list.append(scores)

    cap.release()

    if keypoints_list:
        # Convert lists to numpy arrays
        keypoints = np.array(keypoints_list)
        keypoint_scores = np.array(keypoint_scores_list)
        total_frames = len(keypoints_list)

        # Prepare data in the required shape for the action recognition model
        keypoints = keypoints.reshape(1, keypoints.shape[0], keypoints.shape[1], 2)  # (1, T, V, C)
        keypoint_scores = keypoint_scores.reshape(1, keypoint_scores.shape[0], keypoint_scores.shape[1])  # (1, T, V)

        # Create the action recognition model input
        action_input = {
            'keypoint': torch.tensor(keypoints, dtype=torch.float32),
            'keypoint_score': torch.tensor(keypoint_scores, dtype=torch.float32),
            'total_frames': total_frames
        }

        # Run inference
        prediction = inference_recognizer(action_model, action_input)

        # Get the predicted action and score
        predicted_action_idx = prediction.pred_label.item()
        predicted_action = action_labels[predicted_action_idx]
        action_score = prediction.pred_score[predicted_action_idx].item()

        # Debugging: Print shapes
        print(f"Keypoints shape: {keypoints.shape}")
        print(f"Keypoint scores shape: {keypoint_scores.shape}")
        print(f"Total frames: {total_frames}")

        # Debug: print keypoints and scores
        print("Sample keypoints:", keypoints[0, :2])
        print("Sample keypoint scores:", keypoint_scores[0, :2])

        # Print the entire prediction object to debug
        print("Prediction:", prediction)

        # Print the predictions
        print(f'Predicted Action: {predicted_action}, Score: {action_score:.2f}')

        # Overlay predictions on the frames
        for frame in frames:
            # Overlay the prediction on the frame
            label_text = f'Action: {predicted_action}, Score: {action_score:.2f}'
            (label_width, label_height), _ = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)
            label_bg_rect = (10, 10, 10 + label_width, 30 + label_height)
            cv2.rectangle(frame, (label_bg_rect[0], label_bg_rect[1]), (label_bg_rect[2], label_bg_rect[3]), (0, 0, 0), cv2.FILLED)
            cv2.putText(frame, label_text, (10, 30 + label_height), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2, cv2.LINE_AA)

            # Draw keypoints on the frame
            for person in keypoints_list:
                for (x, y) in person:
                    x = int(x * frame_width)  # Convert back to pixel coordinates for drawing
                    y = int(y * frame_height)  # Convert back to pixel coordinates for drawing
                    cv2.circle(frame, (x, y), 3, (0, 255, 0), -1)

            # Write the annotated frame to the output video
            out.write(frame)

    out.release()
    print(f"Inference video saved to {output_video_path}")

# Run frame-by-frame inference on the specified video and save the output
run_frame_by_frame_inference(video_path, output_video_path)
