-Google Colab Code

In [None]:
# nvidia-smi is the NVIDIA System Management Interface.
# It's a CLI (Command Line Interface) tool that comes with the NVIDIA driver
# installation; running it here shows which GPU (if any) the runtime provides.
!nvidia-smi

Thu Dec 25 09:47:46 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   32C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [19]:
!pip install -q supervision "ultralytics<=8.3.40" #quiet mode â€” suppresses most of the installation output
#Installs the Ultralytics YOLO package

In [20]:
# Prevent Ultralytics from tracking your activity (telemetry) by turning off
# the "sync" setting. The command echoes the resulting settings file, where
# "sync": false confirms the change was applied.
!yolo settings sync=False

JSONDict("/root/.config/Ultralytics/settings.json"):
{
  "settings_version": "0.0.6",
  "datasets_dir": "/content/datasets",
  "weights_dir": "weights",
  "runs_dir": "runs",
  "uuid": "569f3ba64b326db489132663f79cd37279811de477381b83ac131e6cdd129cbb",
  "sync": false,
  "api_key": "",
  "openai_api_key": "",
  "clearml": true,
  "comet": true,
  "dvc": true,
  "hub": true,
  "mlflow": true,
  "neptune": true,
  "raytune": true,
  "tensorboard": true,
  "wandb": false,
  "vscode_msg": true
}
💡 Learn more about Ultralytics Settings at https://docs.ultralytics.com/quickstart/#ultralytics-settings


In [21]:
import cv2
import numpy as np
import supervision as sv
from tqdm import tqdm
from ultralytics import YOLO
from supervision.assets import VideoAssets, download_assets
from collections import defaultdict, deque

# Supervision assets (such as VideoAssets.VEHICLES) are free demo videos and
# images. This call downloads the vehicles clip into the working directory and
# returns its file name, 'vehicles.mp4'.
download_assets(VideoAssets.VEHICLES)


vehicles.mp4 asset download complete. 



'vehicles.mp4'

In [22]:
# Path of the clip to analyse and of the annotated video to write
INPUT_VIDEO = "vehicles.mp4"
OUTPUT_VIDEO = "vehicles_output.mp4"
# Weights file handed to YOLO(...)
MODEL = "yolov8m.pt"
# Passed to the model as conf=... and to ByteTrack as track_activation_threshold
CONFIDENCE_THRESHOLD = 0.3
# Passed to the model as iou=...
IOU_THRESHOLD = 0.5
# Passed to the model as imgsz=...
MODEL_RESOLUTION = 1280

In [23]:
# Perspective transformation setup
# Defines a mapping from image pixel coordinates to real-world dimensions

# Coordinates of the quadrilateral on the image that represents the road surface
# Points are ordered clockwise starting from the top-left
# (some corners lie outside the frame so the edges follow the road boundaries)
box_coordinates = np.array([
    [1252, 787],    # top-left corner of the road
    [2298, 803],    # top-right corner of the road
    [5039, 2159],   # bottom-right corner of the road
    [-550, 2159]    # bottom-left corner of the road
])

# Real-world dimensions of the selected road area (in metres)
box_width_metres = 25     # approximate road width
box_height_metres = 250   # approximate visible road length

# Target rectangle in real-world coordinates, in the same corner order as
# box_coordinates. This is the same road area flattened into a metric space.
# The corners span the full width/height: these are continuous distances in
# metres, not pixel indices, so no "- 1" is applied. Subtracting 1 would shrink
# the road to 24 m x 249 m and bias every measured distance (and speed) low.
box = np.array([
    [0, 0],
    [box_width_metres, 0],
    [box_width_metres, box_height_metres],
    [0, box_height_metres],
])


In [24]:
from google.colab.patches import cv2_imshow

# Open the input video for visual inspection of the polygon placement
cap = cv2.VideoCapture(INPUT_VIDEO)

# try/finally guarantees the capture handle is released even if drawing or
# decoding raises part-way through the video.
try:
    # Loop through each frame to preview the polygon placement
    while cap.isOpened():

        success, frame = cap.read()
        if not success:
            # End of stream (or a decode failure): stop previewing
            break

        # Draw the region-of-interest polygon on a copy so the decoded frame
        # itself is left untouched
        annotated_frame = sv.draw_polygon(
            scene=frame.copy(),
            polygon=box_coordinates,
            color=sv.Color.RED,
            thickness=4
        )

        # Display the frame in Google Colab to verify polygon alignment
        # Uncomment the line below when running in Colab
        # cv2_imshow(annotated_frame)

        # Allow early exit if running in an environment with keyboard input
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the video capture once previewing is complete (or has failed)
    cap.release()

# Window cleanup is not required in Google Colab
# cv2.destroyAllWindows()


In [25]:

import cv2
import numpy as np


# 3x3 perspective matrix taking the image-space road quadrilateral
# (box_coordinates) onto the real-world target rectangle (box).
# Both corner sets are converted to float32 before being handed to OpenCV.
homography = cv2.getPerspectiveTransform(
    np.asarray(box_coordinates, dtype=np.float32),
    np.asarray(box, dtype=np.float32),
)

def transform_points(points):
    """
    Map detection points from camera pixels into real-world coordinates.

    `points` is a NumPy array of (x, y) pairs. Each pair is pushed through the
    module-level `homography` and the result is returned as an (N, 2) array.
    An empty array is handed back untouched, without calling OpenCV.
    """
    if points.size:
        # cv2.perspectiveTransform is given the points as float32 in an
        # (N, 1, 2) layout
        column_of_points = points.reshape(-1, 1, 2).astype(np.float32)
        return cv2.perspectiveTransform(column_of_points, homography).reshape(-1, 2)

    return points



In [27]:
# --- Detector -----------------------------------------------------------
# YOLO model built from the configured weights file
model = YOLO(MODEL)

# --- Video source -------------------------------------------------------
# Metadata of the input clip (frame rate, resolution, number of frames)
video_info = sv.VideoInfo.from_video_path(INPUT_VIDEO)
# Generator yielding the clip's frames one after another
frame_generator = sv.get_video_frames_generator(INPUT_VIDEO)

# --- Tracking and region of interest ------------------------------------
# ByteTrack multi-object tracker, configured with the clip's frame rate and
# the same confidence threshold used for detection
byte_track = sv.ByteTrack(
    track_activation_threshold=CONFIDENCE_THRESHOLD,
    frame_rate=video_info.fps,
)
# Polygonal zone over the road; detections are filtered against it so only
# vehicles inside this area are tracked, counted, and analysed
polygon_zone = sv.PolygonZone(polygon=box_coordinates)

# --- Drawing parameters -------------------------------------------------
# Line thickness and label text scale derived from the video resolution so
# annotations stay readable across different video sizes
thickness = sv.calculate_optimal_line_thickness(resolution_wh=video_info.resolution_wh)
text_scale = sv.calculate_optimal_text_scale(resolution_wh=video_info.resolution_wh)

# --- Annotators ---------------------------------------------------------
# Bounding boxes around detected vehicles
bbox_annot = sv.BoxAnnotator(thickness=thickness)
# Movement traces behind each vehicle, roughly two seconds' worth of frames
trace_annot = sv.TraceAnnotator(
    position=sv.Position.BOTTOM_CENTER,
    trace_length=2 * video_info.fps,
    thickness=thickness,
)
# Text labels (tracking ID and speed) anchored at the bottom centre of each box
label_annot = sv.LabelAnnotator(
    text_position=sv.Position.BOTTOM_CENTER,
    text_thickness=thickness,
    text_scale=text_scale,
)

# --- Per-vehicle position history ---------------------------------------
# Maps a tracker ID to a bounded deque of its recent real-world positions;
# the deque holds at most `fps` entries, i.e. roughly one second of motion
coordinates = defaultdict(lambda: deque(maxlen=video_info.fps))


In [28]:
# Class IDs kept after detection; everything else is discarded.
# In order: car, motorcycle, bus, truck
VEHICLE_CLASSES = [2, 3, 5, 7]


In [29]:
# Define the video codec for MP4 output
# 'mp4v' is widely supported and works reliably in most environments
fourcc = cv2.VideoWriter_fourcc(*"mp4v")

# Create a VideoWriter to save the processed frames to an output file
# The output video will match the input video's frame rate and resolution
video_writer = cv2.VideoWriter(
    OUTPUT_VIDEO,
    fourcc,
    video_info.fps,
    video_info.resolution_wh
)

# cv2.VideoWriter does not raise when it cannot open the destination (bad
# path, unavailable codec); it just drops every frame written to it.
# Fail loudly here instead of discovering an empty file after the full run.
if not video_writer.isOpened():
    raise RuntimeError(
        f"Could not open {OUTPUT_VIDEO!r} for writing with the 'mp4v' codec"
    )



In [30]:
# Stores unique tracking IDs so each vehicle is counted only once;
# the number of counted vehicles is the size of this set
vehicle_count = set()

def calculate_speed(track_id, world_point):
    """
    Estimate the average speed of a tracked vehicle in km/h using
    real-world coordinates derived from the perspective transform.

    The current position is appended to the vehicle's bounded history in the
    module-level `coordinates` mapping, and the speed is the straight-line
    distance between the oldest and newest stored positions divided by the
    time separating them.

    Parameters:
        track_id: identifier of the tracked vehicle (key into `coordinates`).
        world_point: current position as a NumPy array, in metres.

    Returns:
        The estimated speed in km/h, or None while fewer than two positions
        have been recorded for this vehicle.
    """

    # Retrieve the stored trajectory (in metres) for this vehicle
    coords = coordinates[track_id]

    # Append the current real-world position
    coords.append(world_point)

    # Speed cannot be calculated until at least two positions are available
    if len(coords) < 2:
        return None

    # Calculate the distance travelled between the first and latest points
    distance_m = np.linalg.norm(coords[-1] - coords[0])

    # N stored samples are separated by N - 1 frame intervals, so the elapsed
    # time is (N - 1) / fps. Dividing by N instead would under-report the
    # speed (by half when only two samples exist).
    # NOTE(review): assumes one sample per consecutive frame; frames in which
    # the track was not observed are not accounted for.
    time_s = (len(coords) - 1) / video_info.fps

    # Convert metres per second to kilometres per hour
    speed_mps = distance_m / time_s
    speed_kmh = speed_mps * 3.6

    return speed_kmh



In [31]:
def _draw_vehicle_counter(image, count):
    """Draw a 'Vehicles counted: N' panel at the top centre of `image`, in place."""
    text = f"Vehicles counted: {count}"
    font = cv2.FONT_HERSHEY_SIMPLEX
    font_scale = video_info.resolution_wh[0] / 1200
    # Local to this helper so the module-level `thickness` used to build the
    # annotators is not overwritten
    counter_thickness = 2
    pad_x = 20
    pad_y = 15

    (text_width, text_height), _ = cv2.getTextSize(
        text, font, font_scale, counter_thickness
    )

    # Horizontally centred; baseline placed just below the top edge
    x = (video_info.resolution_wh[0] - text_width) // 2
    y = text_height + 20

    # Semi-transparent dark background: draw a filled rectangle on a copy and
    # blend it back over the image
    overlay = image.copy()
    cv2.rectangle(
        overlay,
        (x - pad_x, y - text_height - pad_y),
        (x + text_width + pad_x, y + pad_y),
        (40, 40, 40),
        -1
    )
    cv2.addWeighted(overlay, 0.6, image, 0.4, 0, image)

    cv2.putText(
        image,
        text,
        (x, y),
        font,
        font_scale,
        (255, 255, 255),
        counter_thickness,
        cv2.LINE_AA
    )
    return image


# Detect, track and annotate every frame, writing the result to the output
# video. `frame_generator` is consumed by this loop and the writer is released
# at the end, so both must be re-created before this cell is run again.
# try/finally ensures the output file is finalised even if a frame fails.
try:
    for frame in tqdm(frame_generator, total=video_info.total_frames):

        # Run YOLOv8 inference on the current frame
        results = model(
            frame,
            imgsz=MODEL_RESOLUTION,
            conf=CONFIDENCE_THRESHOLD,
            iou=IOU_THRESHOLD,
            verbose=False
        )[0]

        # Convert YOLO output into Supervision detections
        detections = sv.Detections.from_ultralytics(results)

        # Keep only vehicle-related classes (car, motorcycle, bus, truck)
        detections = detections[np.isin(detections.class_id, VEHICLE_CLASSES)]

        # Keep detections that fall inside the polygon region of interest
        detections = detections[polygon_zone.trigger(detections)]

        # Apply ByteTrack to assign consistent tracking IDs
        detections = byte_track.update_with_detections(detections)

        # Every output frame gets the monitored-area polygon and the counter,
        # including frames without vehicles, so the overlay does not flicker
        annotated = sv.draw_polygon(
            frame.copy(),
            box_coordinates,
            color=sv.Color.RED,
            thickness=2
        )

        if len(detections) > 0:
            # Extract bottom-centre anchor points for each detection
            points = detections.get_anchors_coordinates(
                anchor=sv.Position.BOTTOM_CENTER
            )

            # Transform image coordinates into real-world metres
            world_points = transform_points(points)

            # Prepare labels containing tracking ID and estimated speed
            labels = []
            for tracker_id, world_pt in zip(detections.tracker_id, world_points):

                speed = calculate_speed(tracker_id, world_pt)

                # Count each vehicle once using its unique tracking ID
                vehicle_count.add(tracker_id)

                if speed is None:
                    labels.append(f"ID {tracker_id}")
                else:
                    labels.append(f"ID {tracker_id} | {int(speed)} km/h")

            # Draw bounding boxes, tracking traces, and labels
            annotated = bbox_annot.annotate(annotated, detections)
            annotated = trace_annot.annotate(annotated, detections)
            annotated = label_annot.annotate(annotated, detections, labels)

        # Display the total vehicle count at the top centre with a background panel
        _draw_vehicle_counter(annotated, len(vehicle_count))

        # Write the fully annotated frame to the output video
        video_writer.write(annotated)
finally:
    # Finalise and close the output video file
    video_writer.release()



100%|██████████| 538/538 [01:25<00:00,  6.29it/s]
