# Get Visual Output of Model Inference

`model-validation.ipynb` takes a video as input and runs inference on it. There are 2 possible ways of handling it:

1. With Post Processing: Detections are made in a first pass and saved to a `json` file (one record per frame), recording each detection's ID, coordinates, and confidence. A dictionary also counts how many times each ID is detected. In a second pass, detections whose ID count is less than `min_count` are left out of the final video.

2. Without Post Processing: Default Ultralytics YOLO functions handle everything from inference to final output video creation.

In [None]:
# Mount Google Drive at /content/drive; later cells move the output videos into /content/drive/MyDrive/
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install the packages this notebook relies on
# NOTE(review): the PyPI package named `ffmpeg` is likely not the FFmpeg binary itself - confirm it is needed
!pip install ultralytics opencv-python filterpy ffmpeg supervision tqdm
# NOTE(review): wandb is removed, presumably so training/inference does not try to log to Weights & Biases - confirm
!pip uninstall -y wandb
# NOTE(review): lap is presumably required by the ByteTrack tracker used below - confirm
!pip install -U lap

In [None]:
# Download the input video with gdown, addressed by its file ID
# Replace IDs with desired video inputs
!gdown 1qXLrFWWfO-s4ZUqpEoQwJA1tjU1kOIXw # Stationary

In [None]:
import os
import cv2
import sys
import json
import torch
import locale
import numpy as np

from ultralytics import YOLO
# Set the WANDB_DISABLED environment variable
# NOTE(review): presumably switches off Weights & Biases logging - confirm it is still honoured
os.environ["WANDB_DISABLED"] = "True"
# Make locale.getpreferredencoding() always report UTF-8
# NOTE(review): looks like the usual Colab workaround for shell commands failing with a non-UTF-8 locale - confirm it is still required
locale.getpreferredencoding = lambda: "UTF-8"

In [None]:
# Load the YOLO model shared by both sections below
# Replace weight file with custom version or use default weights
model = YOLO("18042025.pt")

In [None]:
# Path of the input video; read by the tracking cells and by cv2.VideoCapture
# Replace source path with custom file
source = "/content/Stationary.mp4"

# 1. Post Processing

In [None]:
# NOTE(review): total_detections is never read in the cells shown here; kept so
# any existing reference to it still resolves.
total_detections = {}
# Maps track ID -> number of detections recorded for that ID across the video
total_detections_count = {}

# Gathering phase - For each frame, save results & update separate object count.
# data.json receives one JSON object per line (one line per frame) so the
# rendering cell can replay it frame by frame without loading the whole file.
with open("data.json", "w", encoding="utf-8") as f:
    # `source` is the lowercase variable defined in the configuration cell
    tracked_frames = model.track(
        source=source,
        stream=True,
        tracker="bytetrack.yaml",
        conf=0.3,
    )
    for i, result in enumerate(tracked_frames):
        frame_info = {"index": i, "detections": []}
        for box in result.boxes:
            # Skip boxes without a usable track ID (None or -1)
            if box.id is None:
                continue
            track_id = int(box.id)
            if track_id == -1:
                continue

            frame_info["detections"].append(
                {
                    "id": track_id,
                    "conf": float(box.conf),
                    "xyxy": box.xyxy[0].tolist(),
                }
            )
            total_detections_count[track_id] = total_detections_count.get(track_id, 0) + 1

        json.dump(frame_info, f, ensure_ascii=False)
        f.write("\n")

In [None]:
# Adapted from https://docs.opencv.org/4.x/dd/d43/tutorial_py_video_display.html
#
# Rendering phase - replay the source video in step with data.json (one JSON
# record per frame) and draw only the detections whose track ID was seen at
# least `min_count` times during the gathering phase.

# Number of detections that passed the filter and were drawn on the output video
filtered_no = 0

# Filtering threshold. Objects that appear for less than 5 frames are not displayed in the final output video
min_count = 5

vid = cv2.VideoCapture(source)
out = None

try:
    # Fail early instead of writing an empty output file from an unreadable source
    if not vid.isOpened():
        raise OSError(f"Could not open video source: {source}")

    fps = vid.get(cv2.CAP_PROP_FPS)
    width = int(vid.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(vid.get(cv2.CAP_PROP_FRAME_HEIGHT))

    fourcc = cv2.VideoWriter_fourcc(*'XVID')
    out = cv2.VideoWriter('output-stationary.avi', fourcc, fps, (width, height))

    frame_index = 0
    with open("data.json", "r", encoding="utf-8") as f:
        for line in f:
            print(f"PROCESSING FRAME {frame_index}")
            try:
                frame_info = json.loads(line)
            except json.JSONDecodeError as e:
                print("JSON ERROR:", e)
                # Do not reuse the previous frame's detections; the frame is
                # still read and written below so video and records stay in step
                frame_info = None

            ret, frame = vid.read()

            if not ret:
                print("ERROR receiving frame (stream end?). Exiting ...")
                break

            # If checks pass, this section draws the detections for a frame on said frame
            # while including details such as ID and confidence
            if frame_info is not None and frame_info['index'] == frame_index:
                for detection in frame_info['detections']:
                    # IDs missing from the count table are treated as seen 0 times
                    if total_detections_count.get(detection['id'], 0) < min_count:
                        continue

                    filtered_no += 1
                    x1, y1, x2, y2 = map(int, detection['xyxy'])
                    label = f"ID: {detection['id']} Conf: {detection['conf']:.2f}"
                    cv2.rectangle(frame, (x1, y1), (x2, y2), (255, 127, 0), 2)

                    # Text might flow out of the frame; label drawing is best-effort
                    # so a failure here never drops the frame itself
                    try:
                        cv2.rectangle(frame, (x1, y1), (x1 + 150, y1 + 40), (255, 127, 0), cv2.FILLED) # Draw background for text
                        cv2.putText(frame, label, (x1 + 5, y1 + 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 255, 255), 2)
                    except Exception as e:
                        print("ERROR WITH LABEL GENERATION:", e)

            out.write(frame)
            frame_index += 1
finally:
    # Always release the capture and writer so the output file is finalized
    vid.release()
    if out is not None:
        out.release()

print(f"TOTAL NUMBER OF DETECTIONS: {sum(total_detections_count.values())}")
print(f"FILTERED NUMBER OF DETECTIONS: {filtered_no}")

In [None]:
# Move output file to a secure location so it does not disappear when the notebook runtime ends
# (output-stationary.avi is the file written by the rendering cell above)
!mv /content/output-stationary.avi /content/drive/MyDrive/

# 2. No Post Processing

In [None]:
# Ultralytics handles inference and writes the annotated video itself (save = True).
# Because stream = True, .track() only does its work as the results are iterated,
# so the results are walked through and discarded.
track_options = {
    "source": source,
    "stream": True,
    "save": True,
    "show": False,
    "tracker": "bytetrack.yaml",
    "conf": 0.3,
}
tracking_stream = model.track(**track_options)
for i, result in enumerate(tracking_stream):
    pass

In [None]:
# Move output file to a secure location so it does not disappear when the notebook runtime ends
# NOTE(review): the path assumes this run was saved under runs/detect/track; repeated runs may land in a different run folder - verify before moving
!mv /content/runs/detect/track/Stationary.avi /content/drive/MyDrive/