In [1]:
import os
import cv2
import sys
import shutil
import pathlib
import functools

import numpy as np

import torch

In [2]:
# include submodule paths
# Put the Scaled-YOLOv4 checkout at the front of the module search path before
# any of the `detector.scaledyolov4.parent...` imports are executed.
# The path is relative, so it resolves against the kernel's working directory.
# NOTE(review): presumably required because the submodule's own files import
# their siblings as top-level packages (e.g. `utils`, `models`) - confirm.
sys.path.insert(0, './detector/scaledyolov4/parent')

In [3]:
from detector.scaledyolov4.parent.utils.general import non_max_suppression, scale_coords, xyxy2xywh, plot_one_box
from detector.scaledyolov4.parent.utils.torch_utils import select_device, time_synchronized

from detector.scaledyolov4.parent.models.models import Darknet, load_darknet_weights
from detector.scaledyolov4.parent.utils.datasets import letterbox
from detector.scaledyolov4.parent.detect import load_classes

In [4]:
class AttrDict(dict):
    """Dictionary whose items can also be read and written as attributes.

    ``d.key`` is equivalent to ``d['key']`` and ``d.key = value`` is
    equivalent to ``d['key'] = value``.  Because it is still a plain ``dict``
    it can be unpacked with ``**`` into keyword arguments.

    Raises:
        AttributeError: when reading an attribute that is not a key of the
            dictionary.
    """

    def __getattr__(self, name):
        # __getattr__ is only consulted after normal attribute lookup failed.
        # It has to signal a miss with AttributeError (not KeyError), otherwise
        # hasattr() and getattr(obj, name, default) raise instead of falling
        # back to their "missing" behaviour.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None

    # Attribute assignment stores the value as a dictionary item.
    __setattr__ = dict.__setitem__

In [5]:
class VideoLoader:
    """Sequential frame iterator over a video file read with OpenCV.

    Iterating yields one ``(post_process(frame), frame)`` tuple per frame that
    ``cv2.VideoCapture.read`` returns successfully.  The underlying capture is
    released when the frames run out; starting a new iteration rewinds an open
    capture to frame 0 or re-opens a released one.

    No support for parallel processing.
    """

    # Second argument handed to cv2.VideoCapture whenever the file is opened.
    CODEC = cv2.CAP_FFMPEG

    def __init__(self, path, post_process = lambda x: x):
        """
        Args:
            path: location of the video, passed to ``cv2.VideoCapture``.
            post_process: callable applied to every raw frame; defaults to
                the identity function.

        Raises:
            IOError: if OpenCV cannot open ``path``.
        """
        self._path = path
        self._post_process = post_process
        self._video_buffer = self._open()

    def _open(self):
        """Open a fresh capture on the video, failing loudly if that is impossible."""
        capture = cv2.VideoCapture(self._path, self.CODEC)
        if not capture.isOpened():
            # Without this check a bad path just produces an empty iteration.
            raise IOError('Could not open video: %s' % self._path)
        return capture

    def __len__(self):
        """Return the frame count reported by OpenCV for the video."""
        n_frames = self.get_param(cv2.CAP_PROP_FRAME_COUNT)
        assert int(n_frames) == n_frames, 'Frame count has to be an integer'
        return int(n_frames)

    def __iter__(self):
        """Restart from the first frame and return the loader itself."""
        if self._video_buffer.isOpened():
            self._video_buffer.set(cv2.CAP_PROP_POS_FRAMES, 0)
        else:
            self._video_buffer = self._open()
        return self

    def __next__(self):
        """Return ``(post_process(frame), frame)`` for the next frame.

        Raises:
            StopIteration: when no further frame can be read; the capture is
                released before raising.
        """
        success, raw_frame = self._video_buffer.read()
        if success:
            return self._post_process(raw_frame), raw_frame
        else:
            self._video_buffer.release()
            raise StopIteration

    def get_param(self, key):
        """Return the capture property ``key`` (a ``cv2.CAP_PROP_*`` constant).

        If the capture has already been released (iteration finished), the
        value is read from a temporary capture so that the iterator itself
        stays exhausted.
        """
        if self._video_buffer.isOpened():
            return self._video_buffer.get(key)
        probe = self._open()
        try:
            return probe.get(key)
        finally:
            probe.release()

In [6]:
def process_frame(frame, img_size):
    """
    Prepare a raw video frame for the detector.

    The frame is letterboxed to ``img_size``, its last axis is reversed
    (BGR to RGB), the channel axis is moved to the front, and the result is
    returned as a contiguous array.
    """
    # letterbox returns several values; only the first (the image) is needed
    resized, *_unused = letterbox(frame, img_size)
    rgb = resized[:, :, ::-1]             # BGR to RGB
    channels_first = rgb.transpose(2, 0, 1)
    return np.ascontiguousarray(channels_first)

In [7]:
def load_model(img_size=640,
               device='cpu',
               half=False,
               cfg='yolov4/models/yolov4-csp.cfg',
               weights='yolov4/weights/yolov4-csp.weights'):
    """
    Build a Darknet model from ``cfg`` and load ``weights`` into it.

    The model is moved to ``device`` and switched to eval mode.  When ``half``
    is truthy the result of ``model.half()`` is returned, otherwise the model
    itself.
    """
    net = Darknet(cfg, img_size)
    load_darknet_weights(net, weights)
    net.to(device)
    net.eval()
    if half:
        return net.half()
    return net

In [8]:
# Run configuration, grouped by consumer:
#   model_args  -> keyword arguments of load_model (also read by detect)
#   model_paths -> keyword arguments of load_model
#   det_params  -> thresholds/options read by detect for NMS
args = AttrDict(
    video_path=pathlib.Path('data/ai_city/counting_gt_sample/counting_example_cam_5_1min.mp4'),
    model_args=AttrDict(
        img_size=640,
        device='cuda' if torch.cuda.is_available() else 'cpu',
        half=False,
    ),
    model_paths=AttrDict(
        cfg='detector/scaledyolov4/parent/models/yolov4-csp.cfg',
        weights='detector/scaledyolov4/weights/yolov4-csp.weights',
    ),
    det_params=AttrDict(
        conf_thres=0.4,
        iou_thres=0.5,
        agnostic_nms=False,
    ),
)

In [9]:
# Every raw frame is run through process_frame (at the model's input size)
# before it is handed out by the loader.
frame_loader = VideoLoader(
    os.fspath(args.video_path),
    post_process=functools.partial(process_frame, img_size=args.model_args.img_size),
)

# Build the detector from the configured model options and file paths.
model = load_model(**args.model_args, **args.model_paths)

Model Summary: 342 layers, 5.29214e+07 parameters, 5.29214e+07 gradients


In [10]:
# Optional: set torch.backends.cudnn.benchmark = True to speed up inference when every input image has the same size.

In [11]:
def detect(model, frame, raw_frame, model_args, params):
    """
    Run the detector on one pre-processed frame.

    ``frame`` is converted to a tensor on ``model_args.device`` (float16 when
    ``model_args.half`` is set, float32 otherwise), divided by 255 and given a
    leading batch dimension if it is 3-dimensional.  The model output is
    filtered with non-max suppression using ``params.conf_thres``,
    ``params.iou_thres`` and ``params.agnostic_nms``.  The first four columns
    of every non-empty detection are rescaled from the processed frame's
    coordinates to those of ``raw_frame`` and rounded.

    Returns the list produced by ``non_max_suppression`` (one entry per image
    in the batch), with the boxes rescaled in place.
    """
    dtype = torch.float16 if model_args.half else torch.float32
    batch = torch.tensor(frame, device=model_args.device, dtype=dtype)
    batch /= 255.0

    # A single image gets a batch axis in front
    if batch.dim() == 3:
        batch.unsqueeze_(dim=0)

    # Inference
    raw_pred = model(batch, augment=False)[0]

    # Apply NMS
    pred = non_max_suppression(raw_pred,
                               params.conf_thres,
                               params.iou_thres,
                               classes=None,
                               agnostic=params.agnostic_nms)

    # Rescale boxes from processed frame coord to original frame coord
    for det in pred:
        if det is None or not len(det):
            continue
        det[:, :4] = scale_coords(batch.shape[2:], det[:, :4], raw_frame.shape).round()
    return pred

In [12]:
# Output switches used by the detection loop:
#   save_txt - append one line per detection to results.txt
#   save_vid - write the annotated frames to a video file
#   view_vid - show the annotated frames in an OpenCV window
save_txt, save_vid, view_vid = True, True, False
# Folder under which the per-video output directory is created
out_folder = 'outputs'

In [13]:
# Initialize: start from an empty output directory named after the video file
out_path = pathlib.Path(out_folder, pathlib.Path(args.video_path).stem)
if os.path.exists(out_path):
    shutil.rmtree(out_path)  # delete output folder
out_path.mkdir(parents=True)

# Get names and colors: one random colour triple per class name
names = load_classes('detector/scaledyolov4/parent/data/coco.names')
colors = [[np.random.randint(0, 255) for _channel in range(3)] for _name in names]

In [14]:
# Run the detector over every frame of the video and emit the requested
# outputs (results.txt, annotated video, live preview).
vid_writer = None  # created lazily once the first frame is available
results_file = open(out_path / 'results.txt', 'a') if save_txt else None  # opened once, not per detection
try:
    with torch.no_grad():
        # Process frames one-by-one sequentially
        for i, (inp_frame, raw_frame) in enumerate(frame_loader):
            # t1/t2 bracket the detection call for timing purposes
            t1 = time_synchronized()
            detections = detect(model, inp_frame, raw_frame,
                                model_args=args.model_args,
                                params=args.det_params)[0]
            t2 = time_synchronized()

            if ((save_txt or save_vid or view_vid) and
                    detections is not None and len(detections)):
                # (w, h, w, h) of the raw frame, used to normalise box coordinates
                gn = torch.tensor(raw_frame.shape)[[1, 0, 1, 0]]
                for *xyxy, conf, cls in detections:
                    # write results to file
                    if save_txt:
                        xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
                        results_file.write(('%g ' * 6 + '\n') % (i, cls, *xywh))  # label format
                    # Add bounding box to image
                    if save_vid or view_vid:
                        label = '%s %.2f' % (names[int(cls)], conf)
                        plot_one_box(xyxy, raw_frame, label=label,
                                     color=colors[int(cls)], line_thickness=3)

            # Streaming and saving happen for every frame, with or without
            # detections, so the output video keeps all frames and the writer
            # does not depend on frame 0 containing a detection.

            # Stream results
            if view_vid:
                cv2.imshow(str(args.video_path), raw_frame)
                if cv2.waitKey(1) == ord('q'):  # q to quit
                    # A plain `for` loop is left with break; raising
                    # StopIteration here would surface as an uncaught error.
                    break

            # Save results (image with detections)
            if save_vid:
                if vid_writer is None:
                    vid_path = out_path / pathlib.Path(args.video_path).name
                    fourcc = 'mp4v'  # output video codec
                    fps = frame_loader.get_param(cv2.CAP_PROP_FPS)
                    w = int(frame_loader.get_param(cv2.CAP_PROP_FRAME_WIDTH))
                    h = int(frame_loader.get_param(cv2.CAP_PROP_FRAME_HEIGHT))
                    vid_writer = cv2.VideoWriter(str(vid_path), cv2.VideoWriter_fourcc(*fourcc), fps, (w, h))
                vid_writer.write(raw_frame)
finally:
    # Release resources even if the loop was interrupted or raised
    if results_file is not None:
        results_file.close()
    if vid_writer is not None:
        vid_writer.release()
    if view_vid:
        cv2.destroyAllWindows()

if save_txt or save_vid:
    print('Results saved to %s' % out_path)

Results saved to outputs/counting_example_cam_5_1min
