## Part 1

Using available pre-trained models for object detection, conduct inference on a short video (5-10 seconds) of a street scene, drawing bounding boxes around detected vehicles.

In [2]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.1-py3-none-any.whl.metadata (34 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.8-py3-none-any.whl.metadata (9.3 kB)
Downloading ultralytics-8.3.1-py3-none-any.whl (881 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m881.3/881.3 kB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.8-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.1 ultralytics-thop-2.0.8


In [3]:
# Library imports

import os
import cv2
import torch
from torchvision import models, transforms
from PIL import Image
from torchvision.utils import draw_bounding_boxes
import torchvision
import numpy as np
from ultralytics import YOLO

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


### Step 1
Collect a source video. It may be necessary to divide the video into discrete image frames.

In [1]:
# Setting up the file paths used by the rest of the notebook (all relative paths)

input_file_path = 'Source.mp4'            # source video that frames are extracted from
output_frames_dir = 'frames'              # folder receiving the raw extracted frames
boxed_frames_dir = 'frames_with_boxes'    # folder receiving frames with bounding boxes drawn
final_video_path = 'processed_video.mp4'  # video assembled from the annotated frames

In [15]:
# Tunable processing parameters
frame_name_padding = 7  # zero-padded width of the frame index in frame file names
confidence_threshold = 0.5  # detections scoring below this value are not drawn
# NOTE(review): video_fps is not read by any cell visible here;
# create_video_from_frames takes its FPS from the source video instead.
video_fps = 30

In [5]:
# Making sure the folders for extracted and annotated frames both exist

for frames_dir in (output_frames_dir, boxed_frames_dir):
    os.makedirs(frames_dir, exist_ok=True)

In [14]:
# Function to extract frames from the input video

def extract_frames(video_path, frames_folder, padding):
    """Split a video into sequentially numbered JPEG frames.

    Every frame is written to ``frames_folder`` as ``frame<index>.jpg``, where
    ``<index>`` is the zero-based frame number left-padded with zeros to
    ``padding`` digits. The number of frames written is printed at the end.

    Args:
        video_path: Path of the video to read.
        frames_folder: Directory receiving the frame images; it is not created here.
        padding: Minimum number of digits of the frame index in each file name.

    Raises:
        OSError: If the video cannot be opened or a frame image cannot be written.
    """
    video_capture = cv2.VideoCapture(video_path)
    try:
        # Fail loudly on a bad path instead of reporting "Extracted 0 frames."
        if not video_capture.isOpened():
            raise OSError(f"Could not open video file: {video_path}")

        frame_count = 0
        while True:
            success, frame_image = video_capture.read()
            if not success:  # end of stream (or decode failure)
                break
            frame_filename = f"frame{str(frame_count).zfill(padding)}.jpg"
            frame_path = os.path.join(frames_folder, frame_filename)
            # imwrite signals failure through its return value rather than an exception
            if not cv2.imwrite(frame_path, frame_image):
                raise OSError(f"Could not write frame image: {frame_path}")
            frame_count += 1
    finally:
        # Release the capture even when reading or writing fails part-way through
        video_capture.release()
    print(f"Extracted {frame_count} frames.")

In [16]:
# Splitting the source video into individual frame images
extract_frames(
    video_path=input_file_path,
    frames_folder=output_frames_dir,
    padding=frame_name_padding,
)

Extracted 252 frames.


### Step 2

Conduct inference on each frame of the video, drawing bounding boxes around detected vehicles.

In [17]:
# Loading the YOLO model for object detection.
# 'yolov8n.pt' names the pre-trained weights file; the recorded cell output shows it
# being downloaded to 'yolov8n.pt' in the working directory.

yolo_model = YOLO('yolov8n.pt')

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.25M/6.25M [00:00<00:00, 113MB/s]


In [18]:
# Transform pipeline that converts an image to a tensor.
# NOTE(review): nothing in the visible cells uses image_transform;
# detect_and_draw_boxes passes PIL images straight to the model.

image_transform = transforms.Compose([transforms.ToTensor()])

In [19]:
# Defining classes of vehicles to detect.
# The names are compared against model.names inside detect_and_draw_boxes.
# Buses and trucks are included so that the common road vehicles of a street
# scene are boxed, not only cars and motorcycles.

target_vehicle_classes = ['car', 'motorcycle', 'bus', 'truck']

In [20]:
# Function to detect vehicles in frames and draw bounding boxes

def detect_and_draw_boxes(frames_folder, output_folder, model, classes, threshold, padding):
    """Run object detection on every frame image and save annotated copies.

    Each ``*.jpg`` file in ``frames_folder`` is loaded and passed to ``model``.
    Detections whose class name is in ``classes`` and whose confidence is at
    least ``threshold`` are drawn as red rectangles captioned
    "<class> <confidence>". The annotated frame is written to ``output_folder``
    as ``frame<index>_boxed.jpg``, where ``<index>`` is the number formed by the
    digits of the input file name, zero-padded to ``padding`` digits.

    Args:
        frames_folder: Directory holding the extracted frame images.
        output_folder: Directory receiving the annotated frames; not created here.
        model: Detector called as ``model(image)``. It must expose a ``names``
            lookup from class id to class name and return results whose
            ``boxes`` items carry ``cls``, ``conf`` and ``xyxy``.
        classes: Class names to keep.
        threshold: Minimum confidence (inclusive) for a detection to be drawn.
        padding: Minimum number of digits of the frame index in output names.

    Raises:
        OSError: If an annotated frame cannot be written.
    """
    box_color = (0, 0, 255)  # red, in OpenCV's BGR channel order

    for frame_file in sorted(os.listdir(frames_folder)):
        if not frame_file.endswith(".jpg"):
            continue

        # The frame index is recovered from the digits of the file name. A stray
        # .jpg without any digits is skipped instead of crashing on int('').
        digits = ''.join(filter(str.isdigit, frame_file))
        if not digits:
            continue
        frame_number = int(digits)
        frame_path = os.path.join(frames_folder, frame_file)

        # Loading and converting the image
        frame_image = Image.open(frame_path).convert("RGB")

        # Performing object detection on the image
        detection_results = model(frame_image)

        # Converting image to OpenCV format for drawing
        image_for_drawing = cv2.cvtColor(np.array(frame_image), cv2.COLOR_RGB2BGR)

        # Filtering detections by class and confidence, drawing each one kept
        for result in detection_results:
            for box in result.boxes:
                detected_class = model.names[int(box.cls)]
                confidence_score = box.conf.item()
                if detected_class not in classes or confidence_score < threshold:
                    continue

                # Pull the four corner coordinates out of the box in one go
                x1, y1, x2, y2 = (int(value) for value in box.xyxy[0].tolist())
                label = f"{detected_class} {confidence_score:.2f}"

                cv2.rectangle(image_for_drawing, (x1, y1), (x2, y2), box_color, 2)
                # The caption normally sits 10 px above the box; for boxes close to
                # the top edge it is moved inside the box so it stays on the frame.
                label_y = y1 - 10 if y1 >= 20 else y1 + 15
                cv2.putText(image_for_drawing, label, (x1, label_y),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, box_color, 2)

        # Saving the processed image with bounding boxes
        processed_frame_name = f"frame{str(frame_number).zfill(padding)}_boxed.jpg"
        output_path = os.path.join(output_folder, processed_frame_name)
        # imwrite signals failure through its return value rather than an exception
        if not cv2.imwrite(output_path, image_for_drawing):
            raise OSError(f"Could not write annotated frame: {output_path}")

        # Logging every 50 frames processed
        if frame_number % 50 == 0:
            print(f"Processed frame {frame_number}")

In [22]:
# Running vehicle detection over the extracted frames and saving annotated copies
detect_and_draw_boxes(
    frames_folder=output_frames_dir,
    output_folder=boxed_frames_dir,
    model=yolo_model,
    classes=target_vehicle_classes,
    threshold=confidence_threshold,
    padding=frame_name_padding,
)



0: 640x384 2 cars, 377.5ms
Speed: 26.7ms preprocess, 377.5ms inference, 36.7ms postprocess per image at shape (1, 3, 640, 384)
Processed frame 0

0: 640x384 2 cars, 169.7ms
Speed: 6.5ms preprocess, 169.7ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)
Processed frame 0

0: 640x384 3 cars, 184.0ms
Speed: 5.6ms preprocess, 184.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 cars, 163.2ms
Speed: 3.0ms preprocess, 163.2ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 2 cars, 166.7ms
Speed: 4.2ms preprocess, 166.7ms inference, 1.4ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 car, 179.5ms
Speed: 3.3ms preprocess, 179.5ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 car, 242.8ms
Speed: 3.9ms preprocess, 242.8ms inference, 1.8ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 car, 250.8ms
Speed: 6.1ms preprocess, 250.8ms inference, 1.5ms postp

### Step 3

Format the results back into a video.

In [27]:
# Function to stitch processed frames back into a video

def create_video_from_frames(processed_folder, output_video, padding, input_video_path):
    """Stitch the annotated frames back into an MP4 video.

    Files in ``processed_folder`` ending in ``_boxed.jpg`` are ordered by the
    number formed from the digits of their names and written, in that order,
    to ``output_video`` using the 'mp4v' codec. The frame rate is read from
    ``input_video_path`` and the frame size from the first annotated frame.

    Args:
        processed_folder: Directory holding the ``*_boxed.jpg`` frames.
        output_video: Path of the video file to create.
        padding: Unused; kept so existing calls keep working.
        input_video_path: Original video, used only to look up its FPS.

    Raises:
        ValueError: If no annotated frames are found or the first one cannot be read.
        OSError: If the output video cannot be opened for writing.
    """
    fallback_fps = 30.0  # used only when the source video reports no usable FPS

    # Sorting and listing all processed frame images. This cheap check comes first
    # so that an empty folder is reported before any video file is touched.
    processed_images = sorted(
        (name for name in os.listdir(processed_folder)
         if name.endswith("_boxed.jpg") and any(ch.isdigit() for ch in name)),
        key=lambda name: int(''.join(filter(str.isdigit, name)))
    )

    if not processed_images:
        raise ValueError("No processed frames found in the specified folder.")

    # Getting the dimensions of the first frame for the video
    first_frame = cv2.imread(os.path.join(processed_folder, processed_images[0]))
    if first_frame is None:
        raise ValueError(f"First frame {processed_images[0]} could not be read.")
    height, width = first_frame.shape[:2]

    # Getting frames per second (FPS) from the original video
    video_capture = cv2.VideoCapture(input_video_path)
    try:
        fps = video_capture.get(cv2.CAP_PROP_FPS)
    finally:
        video_capture.release()
    # An unreadable source yields no positive FPS; the comparison is written so
    # that NaN is rejected as well.
    if not fps > 0:
        print(f"Warning: could not read FPS from {input_video_path}; using {fallback_fps}.")
        fps = fallback_fps

    # Setting up video writer for the final output video
    fourcc_code = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(output_video, fourcc_code, fps, (width, height))
    if not video_writer.isOpened():
        video_writer.release()
        raise OSError(f"Could not open video writer for {output_video}")

    # Writing each processed frame into the video
    try:
        for image_file in processed_images:
            frame = cv2.imread(os.path.join(processed_folder, image_file))
            if frame is None:
                print(f"Warning: {image_file} could not be read and will be skipped.")
                continue
            # The writer was opened for one fixed frame size; bring any frame of a
            # different size to that size before writing it.
            if frame.shape[:2] != (height, width):
                frame = cv2.resize(frame, (width, height))
            video_writer.write(frame)
    finally:
        # Releasing the video writer, also when writing fails part-way through
        video_writer.release()
    print(f"Processed video saved as {output_video}")

In [29]:
# Assembling the annotated frames into the final video
create_video_from_frames(
    processed_folder=boxed_frames_dir,
    output_video=final_video_path,
    padding=frame_name_padding,
    input_video_path=input_file_path,
)

'Processed video saved as processed_video.mp4'