In [2]:
import torchvision
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
%matplotlib inline 
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
import os
import cv2
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional
import torch

In [None]:
# --- Configuration: fill these in before running the notebook ---
# Directory that contains the input video. "" makes every path below relative
# to the current working directory.
root_dir = ""
# File name of the input video inside root_dir (for example "clip.mp4").
video_name = ""
video_path = os.path.join(root_dir, video_name)

# The working folders are named after the video *without* its extension:
# video_path is itself a file, so it cannot also serve as their parent directory.
_video_stem = os.path.splitext(video_name)[0]
# Folder that receives the raw frames; every later cell refers to it as `frames_folder`.
frames_folder = os.path.join(root_dir, _video_stem, "frames")
frame_folder = frames_folder  # earlier spelling of the same setting, kept as an alias
# Folder that receives the frames with the detection boxes drawn on them.
output_frames_folder = os.path.join(root_dir, _video_stem, "output_frames")
# Directory for the final compiled video.
final_video_output_path = root_dir

# Extract Frames From Video

In [4]:
def get_frames(input_path, output_folder=None):
    """Split a video into JPEG frames and return how many frames were written.

    Frames are saved as ``frame_<index>.jpg`` with indices starting at 0, so
    after the call the valid indices are ``0 .. return_value - 1``.

    Args:
        input_path: Path of the video, handed to ``cv2.VideoCapture``.
        output_folder: Directory that receives the frames. Defaults to the
            notebook-level ``frames_folder``. It is created when missing.

    Returns:
        int: Number of frames written. 0 when the video cannot be opened or
        yields no readable frame.

    Raises:
        OSError: If a frame cannot be written to disk.
    """
    if output_folder is None:
        output_folder = frames_folder
    if output_folder:  # "" means the current directory, which needs no creation
        os.makedirs(output_folder, exist_ok=True)

    video = cv2.VideoCapture(input_path)
    frame_count = 0
    try:
        while video.isOpened():
            success, frame = video.read()
            if not success:
                break
            frame_path = os.path.join(output_folder, f'frame_{frame_count}.jpg')
            # cv2.imwrite signals failure through its return value, so check it
            # instead of counting a frame that never reached the disk.
            if not cv2.imwrite(frame_path, frame):
                raise OSError(f"Could not write frame to {frame_path}")
            frame_count += 1
    finally:
        # Release the capture even when reading or writing fails part-way.
        video.release()
    return frame_count

In [5]:
# Create the folder for the extracted frames, then split the video into frames.
# frame_count holds the number of frames that get_frames wrote.
os.makedirs(frames_folder, exist_ok=True)
frame_count = get_frames(video_path)

# Get Pretrained Model

In [15]:
# Build torchvision's Faster R-CNN (ResNet-50 + FPN backbone) with pretrained=True
# and switch it to evaluation mode, since it is only used for inference below.
model = fasterrcnn_resnet50_fpn(pretrained=True)
model.eval()

# Label names for the fasterrcnn_resnet50_fpn outputs. The list is indexed by the
# integer label the model returns (see plot_boxes), so the order must not change.
# The 'N/A' entries are presumably category ids the model never emits -- TODO confirm
# against the torchvision documentation.
COCO_INSTANCE_CATEGORY_NAMES = ['__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table', 'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
# Only detections whose label name appears in this list are drawn by plot_boxes.
search_objects = ['car']



FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

# Detect Cars and Plot Boxes

In [19]:
# Use the model to detect labels and boxes on one frame
def detect_boxes(image, model):
    """Run the detection model on a single frame.

    Args:
        image: Frame accepted by ``torchvision.transforms.functional.to_tensor``
            (the notebook passes an RGB ``PIL.Image``).
        model: Detection model. It is called with a batch holding one image
            tensor and must return a sequence whose first element is a dict
            with ``'boxes'``, ``'labels'`` and ``'scores'`` tensors.

    Returns:
        tuple: ``(boxes, labels, scores)`` as NumPy arrays for that frame.
        Each box unpacks to ``x1, y1, x2, y2`` (see ``plot_boxes``).
    """
    tensor_image = functional.to_tensor(image).unsqueeze(0)

    # Send the input to whatever device the model lives on, so the function
    # keeps working after `model.to("cuda")`. Objects without parameters are
    # called with the CPU tensor, as before.
    parameters = getattr(model, "parameters", None)
    first_param = next(parameters(), None) if callable(parameters) else None
    if first_param is not None:
        tensor_image = tensor_image.to(first_param.device)

    with torch.no_grad():
        output = model(tensor_image)

    # .cpu() is a no-op for CPU tensors and is required before .numpy() otherwise.
    detections = output[0]
    boxes = detections['boxes'].cpu().numpy()
    labels = detections['labels'].cpu().numpy()
    scores = detections['scores'].cpu().numpy()
    return boxes, labels, scores

# Plot boxes on an individual frame
def plot_boxes(image, boxes, labels, scores, score_threshold=.5):
    """Draw the wanted detections on a copy of a frame.

    A detection is drawn when its score is strictly greater than
    ``score_threshold`` and its label name (looked up in
    ``COCO_INSTANCE_CATEGORY_NAMES``) is listed in ``search_objects``.

    Args:
        image: Frame convertible with ``np.array`` (PIL image or ndarray).
        boxes: Iterable of boxes, each unpacking to ``x1, y1, x2, y2``.
        labels: Iterable of integer indices into ``COCO_INSTANCE_CATEGORY_NAMES``.
        scores: Iterable of confidence scores, aligned with ``boxes``.
        score_threshold: Minimum score (exclusive) for a detection to be drawn.
            Defaults to 0.5.

    Returns:
        numpy.ndarray: Copy of the frame with a rectangle and the class name
        drawn for every accepted detection. The channel order of the input is
        kept, and the colour tuples are applied to the channels in that order.
    """
    np_image = np.array(image)
    for box, label, score in zip(boxes, labels, scores):
        if not score > score_threshold:
            continue
        class_name = COCO_INSTANCE_CATEGORY_NAMES[label]
        if class_name not in search_objects:
            continue
        # Box coordinates arrive as floats; OpenCV drawing needs integer pixels.
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(np_image, (x1, y1), (x2, y2), (0, 0, 255), 2)
        cv2.putText(np_image, class_name, (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
    return np_image

# Make sure the folder for the annotated frames exists before any frame is processed.
os.makedirs(output_frames_folder, exist_ok=True)

In [20]:
# Iterate through all extracted frames, run the detector and save each frame
# with its bounding boxes drawn.
# get_frames numbers the frames 0 .. frame_count - 1, so range(frame_count)
# visits exactly the files that exist.
output_frame_count = 0
for j in range(frame_count):
    # Open inside a context manager so the file handle is closed right away.
    with Image.open(os.path.join(frames_folder, f'frame_{j}.jpg')) as frame_file:
        image = frame_file.convert("RGB")
    boxes, labels, scores = detect_boxes(image, model)
    # The detector is fed RGB, but cv2.imwrite expects BGR. Draw on a BGR copy
    # so the saved frame keeps the true colours of the video.
    bgr_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    plotted_image = plot_boxes(bgr_image, boxes, labels, scores)
    cv2.imwrite(os.path.join(output_frames_folder, f'output_frame_{output_frame_count}.jpg'), plotted_image)
    output_frame_count += 1

# Compile Frames into MP4

In [22]:
# The first annotated frame determines the frame size of the output video.
first_frame_path = os.path.join(output_frames_folder, 'output_frame_0.jpg')
plotted_frame0 = cv2.imread(first_frame_path)
if plotted_frame0 is None:  # cv2.imread returns None instead of raising
    raise FileNotFoundError(f"Could not read {first_frame_path}; were any frames processed?")
height, width, layers = plotted_frame0.shape

# Reuse the frame rate of the source video so playback speed is preserved.
# Fall back to 30 fps when it cannot be determined (0 or NaN).
source_video = cv2.VideoCapture(video_path)
fps = source_video.get(cv2.CAP_PROP_FPS)
source_video.release()
if not fps > 0:
    fps = 30

# Video writer to create the .mp4 file inside final_video_output_path.
output_video_path = os.path.join(final_video_output_path, "cars_detected_video.mp4")
video = cv2.VideoWriter(output_video_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

try:
    # Annotated frames are numbered 0 .. output_frame_count - 1.
    for j in range(output_frame_count):
        frame_path = os.path.join(output_frames_folder, f'output_frame_{j}.jpg')
        frame = cv2.imread(frame_path)
        if frame is None:
            raise FileNotFoundError(f"Could not read {frame_path}")
        video.write(frame)
finally:
    # Always finalise the container, otherwise the file may be unplayable.
    video.release()