This sample illustrates object detection with torch. \
To illustrate interop with cvcuda and nvimgcodec, they are used for:
- OSD operations like bbox and label drawing.
- JPEG compression.

For memory sharing both DLPack and CAI (CUDA Array Interface) are used.

In [None]:
class StopExecution(Exception):
    """
    Exception used to abort the current notebook cell quietly.

    IPython looks for the ``_render_traceback_`` hook on a raised exception;
    returning no lines from it suppresses the traceback output.
    """

    def _render_traceback_(self) -> list:
        # No traceback lines to show.
        return []

In [None]:
# Starting from Python 3.8 the DLL search policy has changed.
# We need to add the path to the CUDA DLLs explicitly.
import os

if os.name == "nt":
    # The CUDA DLLs are looked up in <CUDA_PATH>/bin.
    # .get() is used so that a missing variable is handled the same way as
    # an empty one instead of raising KeyError.
    cuda_path = os.environ.get("CUDA_PATH")
    if not cuda_path:
        print("CUDA_PATH environment variable is not set")
        raise StopExecution
    os.add_dll_directory(os.path.join(cuda_path, "bin"))

In [None]:
from PIL import Image
from IPython.display import display
from ipywidgets import interact
import ipywidgets as widgets

import python_vali as vali
import numpy as np

import torch
import torchvision

import cvcuda
from nvidia import nvimgcodec

In [None]:
# Input video handed to the decoder (relative path).
url = "../tests/data/test.mp4"

In [None]:
# Class names looked up by the integer label ids the model outputs
# (the list position is the label id, so the order must not change).
# "N/A" entries are presumably placeholders for unused category ids.
coco_names = [
    "__background__",
    "person",
    "bicycle",
    "car",
    "motorcycle",
    "airplane",
    "bus",
    "train",
    "truck",
    "boat",
    "traffic light",
    "fire hydrant",
    "N/A",
    "stop sign",
    "parking meter",
    "bench",
    "bird",
    "cat",
    "dog",
    "horse",
    "sheep",
    "cow",
    "elephant",
    "bear",
    "zebra",
    "giraffe",
    "N/A",
    "backpack",
    "umbrella",
    "N/A",
    "N/A",
    "handbag",
    "tie",
    "suitcase",
    "frisbee",
    "skis",
    "snowboard",
    "sports ball",
    "kite",
    "baseball bat",
    "baseball glove",
    "skateboard",
    "surfboard",
    "tennis racket",
    "bottle",
    "N/A",
    "wine glass",
    "cup",
    "fork",
    "knife",
    "spoon",
    "bowl",
    "banana",
    "apple",
    "sandwich",
    "orange",
    "broccoli",
    "carrot",
    "hot dog",
    "pizza",
    "donut",
    "cake",
    "chair",
    "couch",
    "potted plant",
    "bed",
    "N/A",
    "dining table",
    "N/A",
    "N/A",
    "toilet",
    "N/A",
    "tv",
    "laptop",
    "mouse",
    "remote",
    "keyboard",
    "cell phone",
    "microwave",
    "oven",
    "toaster",
    "sink",
    "refrigerator",
    "N/A",
    "book",
    "clock",
    "vase",
    "scissors",
    "teddy bear",
    "hair drier",
    "toothbrush",
]

In [None]:
# Prepare model: torchvision SSD300 detector (VGG16 backbone) with
# COCO-pretrained weights.
model = torchvision.models.detection.ssd300_vgg16(
    weights=torchvision.models.detection.SSD300_VGG16_Weights.COCO_V1)
# Switch the model to evaluation (inference) mode.
model.eval()
# Move the model parameters to the GPU.
model.to("cuda")

In [None]:
# GPU-accelerated decoder
pyDec = vali.PyDecoder(url, {}, gpu_id=0)

# GPU-accelerated converters, chained one after another:
# decoder format -> interleaved RGB -> planar RGB.
pyCvt = [
    vali.PySurfaceConverter(src_format, dst_format, gpu_id=0)
    for src_format, dst_format in (
        (pyDec.Format, vali.PixelFormat.RGB),
        (vali.PixelFormat.RGB, vali.PixelFormat.RGB_PLANAR),
    )
]

# The nvimgcodec JPEG encoder is used instead of vali.PyNvJpegEncoder.
# This is done purely for illustration, to show the CAI memory sharing.
encoder = nvimgcodec.Encoder()

In [None]:
# Allocate Surfaces: one per pipeline stage (decoder output, RGB, planar
# RGB), all with the resolution reported by the decoder.
surfaces = [
    vali.Surface.Make(
        format=pixel_format,
        width=pyDec.Width,
        height=pyDec.Height,
        gpu_id=0)
    for pixel_format in (
        pyDec.Format,
        vali.PixelFormat.RGB,
        vali.PixelFormat.RGB_PLANAR)
]

In [None]:
def decode_to_tensor(seek_frame: int) -> torch.Tensor:
    """
    Decode a single video frame and export it as a torch CUDA tensor.

    The frame is decoded into ``surfaces[0]``, then pushed through the
    color conversion chain ``pyCvt`` (converter ``i`` reads ``surfaces[i]``
    and writes ``surfaces[i + 1]``). The last Surface is imported into
    torch via DLPack.

    Args:
        seek_frame(int): number of frame to decode

    Returns:
        torch.Tensor: Planar RGB CUDA float tensor normalized to
        model liking.

    Raises:
        StopExecution: if decoding or a color conversion step fails
        (the failure details are printed first).
    """
    # Decode single Surface
    seek_ctx = vali.SeekContext(seek_frame)
    success, details = pyDec.DecodeSingleSurface(surfaces[0], seek_ctx)
    if not success:
        print(details)
        raise StopExecution

    # Go through color conversion chain
    for converter, src, dst in zip(pyCvt, surfaces, surfaces[1:]):
        success, details = converter.Run(src, dst)
        if not success:
            print(details)
            raise StopExecution

    # The DLPack import shares memory with the Surface, so take a private
    # copy before converting it to float.
    img_tensor = torch.from_dlpack(surfaces[2]).clone().detach()
    img_tensor = img_tensor.to(dtype=torch.float32)

    # Normalize tensor to meet the NN expectations.
    # NOTE(review): torchvision detection models document a 0-1 range input
    # and apply their own normalization internally - confirm that the extra
    # mean/std normalization below is intended.
    img_tensor = img_tensor / 255.0
    data_transforms = torchvision.transforms.Normalize(
        mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
    )

    return data_transforms(img_tensor)

In [None]:
def run_inference(
    surface_tensor: torch.Tensor,
    confidence: float = 0.74,
) -> tuple[list[list[np.int32]], list[str]]:
    """
    Runs detection inference on input tensor.

    Args:
        surface_tensor(torch.Tensor): input tensor for a single frame; a
        batch dimension is added before it is passed to the model
        confidence(float): minimal score a detection needs to be kept

    Returns:
        list[list[np.int32]]: Detection bboxes with score >= confidence,
        as an int32 numpy array
        list[str]: Labels of the very same detections, in the same order
    """
    input_batch = surface_tensor.unsqueeze(0).to("cuda")

    # Run inference.
    with torch.no_grad():
        outputs = model(input_batch)

    # Collect detection results for the only image in the batch.
    detections = outputs[0]
    pred_scores = detections["scores"].detach().cpu().numpy()
    pred_bboxes = detections["boxes"].detach().cpu().numpy()
    pred_labels = detections["labels"].detach().cpu().numpy()

    # Apply one and the same mask to boxes and labels, so that bboxes[i]
    # always belongs to labels[i] regardless of the output ordering.
    keep = pred_scores >= confidence
    bboxes = pred_bboxes[keep].astype(np.int32)
    labels = [coco_names[i] for i in pred_labels[keep]]

    return bboxes, labels

In [None]:
def draw_bboxes(surface_rgb: vali.Surface, bboxes: list[list[np.int32]],
                labels: list[str]) -> nvimgcodec.nvimgcodec_impl.Image:
    """
    Draws detection bboxes and labels over RGB Surface, in place.

    Args:
        surface_rgb(vali.Surface): interleaved RGB Surface which corresponds
        to tensor
        bboxes(list[list[np.int32]]): detection bboxes, each given by its
        corner coordinates [x1, y1, x2, y2] as returned by the detector
        labels(list[str]): detection labels, labels[i] belongs to bboxes[i]

    Returns:
        nvimgcodec.nvimgcodec_impl.Image: nvcv image with bboxes drawn

    Raises:
        StopExecution: if there are fewer labels than bboxes.
    """

    # Create tensor from RGB Surface for OSD operations.
    nvcv_tensor = cvcuda.as_tensor(surface_rgb, "HWC")

    if len(bboxes) > len(labels):
        print("Some detections don't have labels")
        raise StopExecution

    # Draw bounding boxes and labels. Skipped when there are no detections,
    # there's nothing to render in that case.
    if len(bboxes) > 0:
        bbox_list = []
        label_list = []

        for bbox, label in zip(bboxes, labels):
            x1, y1, x2, y2 = (int(coord) for coord in bbox)

            # cvcuda.BndBoxI takes (x, y, width, height), while the
            # detector yields corner coordinates.
            bbox_list.append(
                cvcuda.BndBoxI(
                    box=(x1, y1, x2 - x1, y2 - y1),
                    thickness=5,
                    borderColor=(0, 255, 0, 255),
                    fillColor=(0, 0, 255, 0)))

            label_list.append(
                cvcuda.Label(
                    utf8Text=label,
                    fontSize=12,
                    tlPos=(x1, y1),
                    fontColor=(0, 255, 0, 255),
                    bgColor=(0, 0, 255, 0)))

        batch_bounding_boxes = cvcuda.Elements(elements=[bbox_list])
        batch_labels = cvcuda.Elements(elements=[label_list])

        cvcuda.osd_into(nvcv_tensor, nvcv_tensor, batch_bounding_boxes)
        cvcuda.osd_into(nvcv_tensor, nvcv_tensor, batch_labels)

    # Both nvcv image and tensor are sharing actual vRAM memory with
    # RGB Surface. Hence we can draw bboxes over tensor but return image.
    return nvimgcodec.as_image(surface_rgb)

For simplicity, inference is run on one frame at a time, without batching. \
Hence detection results may not be the best. \
Please take that into account and don't judge sloppy bboxes too harshly.

In [None]:
@interact(seek_frame=widgets.IntSlider(
    value=0, min=0, max=pyDec.NumFrames - 1, step=1))
def show(seek_frame: int) -> None:
    """
    Decode the chosen frame, run detection on it and display the result.

    The annotated frame is encoded into "frame.jpg" and shown from there.

    Args:
        seek_frame(int): number of frame to show, picked with the slider
    """
    frame_tensor = decode_to_tensor(seek_frame)
    boxes, names = run_inference(frame_tensor)

    # surfaces[1] is the interleaved RGB Surface of the very same frame.
    annotated = draw_bboxes(surfaces[1], boxes, names)
    print(annotated)

    jpeg_path = "frame.jpg"
    encoder.write(jpeg_path, annotated)
    display(Image.open(jpeg_path), display_id="decoded_frame")

In [None]:
# Remove the temporary JPEG written while displaying frames. The file may
# be absent (e.g. this cell is run twice), which is not an error.
try:
    os.remove("frame.jpg")
except FileNotFoundError:
    pass