## Draw sample picture

In [2]:
import os
import random

import cv2
import numpy as np

import torch
from torch.utils.data import DataLoader
from torchvision import models

from src.resnet_yolo import resnet50
from src.dataset import VocDetectorDataset
from src.eval_voc import evaluate
from src.predict import predict_image
from src.config import VOC_CLASSES, COLORS
from kaggle_submission import output_submission_csv

import matplotlib.pyplot as plt
import collections

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

load_network_path = "checkpoints/best_detector.pth" #'checkpoints/best_detector.pth' 
pretrained = True

# use to load a previously trained network
if load_network_path is not None:
    print('Loading saved network from {}'.format(load_network_path))
    net = resnet50().to(device)
    net.load_state_dict(torch.load(load_network_path, map_location=torch.device(device)))
else:
    print('Load pre-trained model')
    net = resnet50(pretrained=pretrained).to(device)

Loading saved network from checkpoints/best_detector.pth


In [11]:
# YOLO network hyperparameters
B = 2  # number of bounding box predictions per cell
S = 14  # width/height of network output grid (larger than 7x7 from paper since we use a different network)

file_root_train = 'data/VOCdevkit_2007/VOC2007/JPEGImages/'
annotation_file_train = 'data/voc2007.txt'

train_dataset = VocDetectorDataset(root_img_dir=file_root_train,dataset_file=annotation_file_train,train=True, S=S)
train_loader = DataLoader(train_dataset,batch_size=batch_size,shuffle=True,num_workers=2)
print('Loaded %d train images' % len(train_dataset))

file_root_test = 'data/VOCdevkit_2007/VOC2007test/JPEGImages/'
annotation_file_test = 'data/voc2007test.txt'

test_dataset = VocDetectorDataset(root_img_dir=file_root_test,dataset_file=annotation_file_test,train=False, S=S)
test_loader = DataLoader(test_dataset,batch_size=batch_size,shuffle=False,num_workers=2)
print('Loaded %d test images' % len(test_dataset))

data = train_dataset[0]

Initializing dataset
Loaded 5011 train images
Initializing dataset
Loaded 4950 test images


In [40]:
from tqdm import tqdm
from src.predict import predict_frame

def process_video(video_path, output_path, net, fps_reduction_factor=2):
    """
    Process a video to perform object detection on its frames.

    Parameters:
    - video_path: path to the input video file.
    - output_path: path to save the processed video file.
    - net: object detection network model.
    - fps_reduction_factor: factor to reduce FPS by, for speeding up processing.
    """
    # Load the video
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print("Error opening video file.")
        return

    # Get video properties
    orig_fps = int(cap.get(cv2.CAP_PROP_FPS))
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

    # Define the codec and create VideoWriter object
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out = cv2.VideoWriter(output_path, fourcc, orig_fps, (width, height))

    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    try:
        with tqdm(total=frame_count) as pbar:
            frame_id = 0
            while cap.isOpened():
                ret, frame = cap.read()
                if ret and frame_id < frame_count:
                    # Convert frame to RGB
                    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                    # Predict and annotate the frame
                    result = predict_frame(net, frame)
                    for left_up, right_bottom, class_name, prob in result:
                        color = COLORS[VOC_CLASSES.index(class_name)]
                        cv2.rectangle(frame, left_up, right_bottom, color, 2)
                        label = class_name + str(round(prob, 2))
                        text_size, baseline = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.4, 1)
                        p1 = (left_up[0], left_up[1] - text_size[1])
                        cv2.rectangle(frame, (p1[0] - 2 // 2, p1[1] - 2 - baseline), (p1[0] + text_size[0], p1[1] + text_size[1]), color, -1)
                        cv2.putText(frame, label, (p1[0], p1[1] + baseline), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (255, 255, 255), 1, 8)

                    # Convert back to BGR for opencv video write
                    frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
                    out.write(frame)

                    pbar.update(1)
                    frame_id += 1
                else:
                    break
    finally:
        # Release everything if job is finished
        cap.release()
        out.release()
        cv2.destroyAllWindows()
    print("Video processing completed.")


In [None]:
process_video('sample.mp4', 'sample_labeled.mp4', net)

 97%|████████████████████████████████████████████████████████████████████████████▎  | 976/1011 [04:06<00:08,  4.01it/s]

## 