# DATA 255 Deep Learning Technologies
## Homework 4: Transfer Learning, Bounding Boxes, and YOLOv8

### Submitted by - Ravjot Singh
### 017443361

### PART-1

Using available pre-trained models for object detection, conduct inference on a short video (5-10 seconds) of a street scene, drawing bounding boxes around the detected vehicles.

Step 1. Collect a source video. It may be necessary to divide the video into discrete image frames.

Step 2. Conduct inference on each frame of the video, drawing bounding boxes around detected vehicles.

Step 3. Format the results back into a video.

Use PyTorch.

In [1]:
# Importing the required libraries
import os
import cv2
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms import functional as F
from PIL import Image

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Step 1: Extract frames from video
def extract_frames(video_path, output_folder):
    """Split a video into numbered JPEG frames.

    Frames are written to ``output_folder`` as ``frame_0.jpg``,
    ``frame_1.jpg``, ... in the order they are read. The folder is created
    if it does not exist yet.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory that receives the frame images.

    Returns:
        The number of frames read from the video. This is 0 when the capture
        could not be opened, because the read loop is then never entered.

    Raises:
        OSError: If a frame could not be written to ``output_folder``.
    """
    os.makedirs(output_folder, exist_ok=True)
    video = cv2.VideoCapture(video_path)
    frame_count = 0

    # try/finally so the capture handle is released even when a write fails.
    try:
        while video.isOpened():
            success, frame = video.read()
            if not success:
                break
            frame_path = os.path.join(output_folder, f'frame_{frame_count}.jpg')
            # cv2.imwrite reports failure through its return value; surface it
            # here instead of failing later when the missing frame is opened.
            if not cv2.imwrite(frame_path, frame):
                raise OSError(f'Could not write frame to {frame_path}')
            frame_count += 1
    finally:
        video.release()

    return frame_count

In [3]:
# Where the extracted frames go, and which clip they are extracted from
output_folder = 'frames'
video_path = 'cars_clip.mp4'

In [4]:
# Dump the clip to individual frame images and remember how many were read
num_frames = extract_frames(video_path=video_path, output_folder=output_folder)

In [5]:
# COCO class names for object detection, indexed by the label id that the
# torchvision COCO detection models emit.
# The label space follows the original COCO category ids (1-90), which contain
# unused ids; those slots are filled with 'N/A' so that list index == label id.
# Dropping the placeholders shifts every name after 'fire hydrant' and makes
# label ids above 80 fall outside the list.
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A',
    'N/A', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard',
    'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard',
    'surfboard', 'tennis racket', 'bottle', 'N/A', 'wine glass', 'cup', 'fork',
    'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli',
    'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch',
    'potted plant', 'bed', 'N/A', 'dining table', 'N/A', 'N/A', 'toilet', 'N/A',
    'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave',
    'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book', 'clock', 'vase',
    'scissors', 'teddy bear', 'hair drier', 'toothbrush',
]

In [6]:
# Step 2: Load pre-trained Faster R-CNN model
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = fasterrcnn_resnet50_fpn(pretrained=True).to(device)
# Inference only: switch the model to evaluation mode.
model.eval()

FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [7]:
# Step 3: Perform inference on each frame
def detect_vehicles_on_frame(frame_path, model, vehicle_label_ids=(2, 3, 4, 6, 8)):
    """Run the detector on one frame and return its vehicle detections.

    Args:
        frame_path: Path of the frame image to analyse.
        model: Detection model. It is called with a batch holding one image
            tensor and must return a list whose first element is a dict with
            'boxes', 'labels' and 'scores' tensors.
        vehicle_label_ids: Label ids to keep. The default covers the COCO ids
            for bicycle (2), car (3), motorcycle (4), bus (6) and truck (8).
            Pass ``None`` to return every detection unfiltered.

    Returns:
        A ``(boxes, labels, scores)`` tuple of NumPy arrays that contains only
        the detections whose label is in ``vehicle_label_ids``.
    """
    # Context manager closes the underlying file; convert() returns a loaded copy.
    with Image.open(frame_path) as frame_file:
        image = frame_file.convert("RGB")

    # Send the input to wherever the model's weights live instead of relying
    # on a module-level `device` variable.
    model_device = next(model.parameters()).device
    image_tensor = F.to_tensor(image).unsqueeze(0).to(model_device)

    with torch.no_grad():
        outputs = model(image_tensor)

    detections = outputs[0]
    boxes = detections['boxes']
    labels = detections['labels']
    scores = detections['scores']

    if vehicle_label_ids is not None:
        # Boolean mask selecting the detections that belong to a vehicle class.
        keep = torch.zeros_like(labels, dtype=torch.bool)
        for class_id in vehicle_label_ids:
            keep |= labels == class_id
        boxes, labels, scores = boxes[keep], labels[keep], scores[keep]

    return boxes.cpu().numpy(), labels.cpu().numpy(), scores.cpu().numpy()

In [8]:
def draw_boxes(image, boxes, labels, scores, threshold=0.5):
    """Draw a labelled rectangle for every detection above the score threshold.

    Args:
        image: Image array to draw on; it is modified in place.
        boxes: Iterable of ``(x1, y1, x2, y2)`` box corners.
        labels: Iterable of class ids, used as indices into
            ``COCO_INSTANCE_CATEGORY_NAMES``.
        scores: Iterable of confidence scores, parallel to ``boxes``.
        threshold: Detections with ``score <= threshold`` are skipped.

    Returns:
        The same ``image`` object, with rectangles and captions drawn on it.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = 0.9
    thickness = 2

    for box, label, score in zip(boxes, labels, scores):
        if not score > threshold:
            continue

        x1, y1, x2, y2 = map(int, box)

        # Fall back to the numeric id rather than raising IndexError when the
        # label is outside the name table.
        label = int(label)
        if 0 <= label < len(COCO_INSTANCE_CATEGORY_NAMES):
            class_name = COCO_INSTANCE_CATEGORY_NAMES[label]
        else:
            class_name = f'class {label}'

        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # The caption normally sits just above the box. For boxes touching the
        # top edge that would place it outside the image, so draw it just
        # inside the box instead.
        (_, text_height), _ = cv2.getTextSize(class_name, font, font_scale, thickness)
        text_y = y1 - 10
        if text_y < text_height:
            text_y = y1 + text_height + 10
        cv2.putText(image, class_name, (x1, text_y), font, font_scale, (255, 255, 255), thickness)

    return image

In [9]:
# Step 4: Process each frame and draw bounding boxes
# Annotated frames are stored in their own folder, created here if missing.
output_frames_folder = 'output_frames'
os.makedirs(name=output_frames_folder, exist_ok=True)

In [10]:
# Run detection on every extracted frame and save an annotated copy of it.
for i in range(num_frames):
    frame_path = os.path.join(output_folder, f'frame_{i}.jpg')
    boxes, labels, scores = detect_vehicles_on_frame(frame_path, model)
    image = cv2.imread(frame_path)
    image_with_boxes = draw_boxes(image, boxes, labels, scores)
    annotated_path = os.path.join(output_frames_folder, f'output_frame_{i}.jpg')
    cv2.imwrite(annotated_path, image_with_boxes)

In [11]:
# Step 5: Recompile processed frames into a video
def _frame_sort_key(frame_path):
    """Return a sort key that orders frames by the number ending the file name."""
    stem = os.path.splitext(os.path.basename(frame_path))[0]
    suffix = stem.rpartition('_')[2]
    if suffix.isdecimal():
        return (0, int(suffix), frame_path)
    # Files without a numeric suffix go last, in alphabetical order.
    return (1, 0, frame_path)


def frames_to_video(input_folder, output_video_path, frame_rate=30):
    """Write the ``.jpg`` frames found in a folder to an mp4v-encoded video.

    Frames are ordered by the integer at the end of their file name
    (``..._2.jpg`` before ``..._10.jpg``). A plain string sort would place
    ``..._10.jpg`` before ``..._2.jpg`` and scramble the video.

    Args:
        input_folder: Directory containing the frame images.
        output_video_path: Path of the video file to create.
        frame_rate: Frames per second of the output video.

    Raises:
        ValueError: If ``input_folder`` contains no ``.jpg`` files.
    """
    frames = sorted(
        (os.path.join(input_folder, f) for f in os.listdir(input_folder) if f.endswith('.jpg')),
        key=_frame_sort_key,
    )
    if not frames:
        raise ValueError(f'No .jpg frames found in {input_folder!r}')

    # The first frame defines the size of the output video.
    frame_height, frame_width, _ = cv2.imread(frames[0]).shape
    out = cv2.VideoWriter(output_video_path, cv2.VideoWriter_fourcc(*'mp4v'), frame_rate, (frame_width, frame_height))

    # try/finally so the writer is released (and the file finalised) on error.
    try:
        for frame_path in frames:
            frame = cv2.imread(frame_path)
            out.write(frame)
    finally:
        out.release()

In [12]:
# Assemble the annotated frames into the final clip at this path
output_video_path = 'output_clip.mp4'
frames_to_video(input_folder=output_frames_folder, output_video_path=output_video_path)