In [1]:
video_path = "Video/Video_1.mp4"

In [2]:
import torch
import torchvision
import matplotlib.pyplot as plt
import numpy as np
from skimage import io, segmentation, color
import networkx as nx
import cv2

In [3]:
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant',
    'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A', 'handbag',
    'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat',
    'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass',
    'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
    'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed',
    'N/A', 'dining table', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard',
    'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book', 'clock',
    'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

In [4]:
def load_image(filepath):
    image_path = filepath
    image = io.imread(image_path)

    # If the image has 4 channels (RGBA), remove the alpha channel
    if image.shape[-1] == 4: image = image[..., :3]
        
    return image

In [5]:
def show_images(images):
    n = len(images)

    fig, axes = plt.subplots(1, n, figsize=(5 * n, 5))
    if n == 1: axes = [axes]

    for i, ax in enumerate(axes):
        if len(images[i].shape) == 3: ax.imshow(images[i])
        else: ax.imshow(images[i], cmap='gray')
        
        ax.axis('off')

    plt.show()

In [6]:
# Load a pretrained Mask R-CNN model from torchvision
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
model.eval()



MaskRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(in

In [7]:
# Load a pretrained Mask R-CNN model from torchvision
model = torchvision.models.detection.maskrcnn_resnet50_fpn(pretrained=True)
model.eval()

def process_image(image, threshold=0.75):
    """
    Processes the image using Mask R-CNN, and extracts bounding boxes for 'person' class 
    objects with confidence scores above the given threshold.
    """

    # Convert the image to float and normalize it to the range [0, 1] in a single step
    image_tensor = torchvision.transforms.functional.to_tensor(image).unsqueeze(0)  # Convert to tensor directly

    # Perform inference with Mask R-CNN (without unnecessary tensor-to-numpy conversions)
    with torch.no_grad():
        predictions = model(image_tensor)

    # Extract predictions
    scores = predictions[0]['scores']
    labels = predictions[0]['labels']
    boxes = predictions[0]['boxes']

    extracted_boxes = []

    # Iterate over the objects and apply their bounding boxes
    for idx in range(len(scores)):
        if scores[idx] > threshold and COCO_INSTANCE_CATEGORY_NAMES[labels[idx]] == "person":
            box = boxes[idx].int().tolist()  # Convert bounding box to integers and lists directly
            extracted_boxes.append(box)

    return extracted_boxes


In [8]:
def draw_boxes(image, boxes):
    """
    Draw the bounding boxes on the frame.
    :param frame: The image frame.
    :param boxes: A list of bounding boxes. Each bounding box can be either (x, y, w, h) or (x, y, size).
    :return: The frame with the boxes drawn on it.
    """
    boxed_image = image
    for box in boxes:


        # Ensure coordinates are integers
        x1, y1, x2, y2 = box

        # Draw the rectangle on the frame
        cv2.rectangle(boxed_image, (x1, y1), (x2, y2), (255, 0, 0), 3)

    return boxed_image

In [9]:
def array_to_video(images, output_path, fps, frame_size):
    # Initialize the video writer object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for MP4
    out = cv2.VideoWriter(output_path, fourcc, fps, frame_size)

    for img in images:
        out.write(img)

    out.release()  # Release the video writer

In [10]:
def process_video(video_path, frame_limit=10000):
    # Open the video file
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print(f"Error: Unable to open video file {video_path}")
        return

    # Get video properties like FPS (Frames Per Second) and frame size
    fps = int(cap.get(cv2.CAP_PROP_FPS))
    frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    frame_size = (frame_width, frame_height)

    new_images = []

    nframes = 0

    boxes = None

    while cap.isOpened():
        ret, frame = cap.read()

        if not ret or nframes > frame_limit:
            break  # Exit loop if there are no more frames

        if nframes % 5 == 0:
            # Process the current frame (only one image over 5)
            boxes = process_image(frame)

        new_image = draw_boxes(frame, boxes)
        new_images.append(new_image)

        nframes+=1
        print(nframes)

    cap.release()  # Release the video capture object

    # Create a new video from the processed frames
    output_path = 'out_2.mp4'
    array_to_video(new_images, output_path, fps, frame_size)

    print(f"Video saved as {output_path}")

In [11]:
process_video(video_path)

Error: Unable to open video file Video/Video_1.mp4
