#### 0. Setup

Pip install `ultralytics` and [dependencies](https://github.com/ultralytics/ultralytics/blob/main/pyproject.toml) and check software and hardware.


In [1]:
# import required libarries and check the environment
import ultralytics
import cv2
from ultralytics import solutions

# Environment checks
ultralytics.checks()

Ultralytics 8.3.95  Python-3.11.9 torch-2.6.0+cpu CPU (13th Gen Intel Core(TM) i7-1365U)
Setup complete  (12 CPUs, 15.3 GB RAM, 450.8/474.1 GB disk)
Setup complete  (12 CPUs, 15.3 GB RAM, 450.8/474.1 GB disk)


### 1. Counting specific classes of objects


##### 1.1 define input path, output path, model, classes to track


In [2]:
video_path = 'sample-video.mp4'  # Path to your input video file
output_video_path = 'output_specific_classes.avi' # Path to save the output video
model_path = 'yolo11s.pt' # Path to your YOLOv11 model file. 'yolo11n-obb.pt' for OBB detection
classes_to_count = [0, 2, 3]  # Classes to count (0: person, 1: bicycle, 3: car etc.)

# cocoClassNames = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus", "train", "truck", "boat",
#                   "traffic light", "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat",
#                   "dog", "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella",
#                   "handbag", "tie", "suitcase", "frisbee", "skis", "snowboard", "sports ball", "kite", "baseball bat",
#                   "baseball glove", "skateboard", "surfboard", "tennis racket", "bottle", "wine glass", "cup",
#                   "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli",
#                   "carrot", "hot dog", "pizza", "donut", "cake", "chair", "sofa", "pottedplant", "bed",
#                   "diningtable", "toilet", "tvmonitor", "laptop", "mouse", "remote", "keyboard", "cell phone",
#                   "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors",
#                   "teddy bear", "hair drier", "toothbrush"
#                   ]

#### 1.2 Read the Video File

1. You can either read the video file directly or stream the content from an RTSP (Real-Time Streaming Protocol) source, allowing for flexible video input depending on your needs.
2. We will also set up the video writer to handle the output video writing.


In [3]:
cap = cv2.VideoCapture(video_path)
assert cap.isOpened(), "Error reading video file"
w, h, fps = (int(cap.get(x)) for x in (cv2.CAP_PROP_FRAME_WIDTH, 
                                        cv2.CAP_PROP_FRAME_HEIGHT, 
                                        cv2.CAP_PROP_FPS))
video_writer = cv2.VideoWriter(output_video_path, 
                                cv2.VideoWriter_fourcc(*"mp4v"), 
                                fps, (w, h))

#### 1.3 Define Region Coordinates

Here, we set the coordinates for specific regions to ensure accurate object tracking and analysis within the video or stream. This helps monitor and track objects effectively in different areas.


In [4]:
# Define region points
region_points = [(10, 300), (1275, 700)]  # For line tracking
# region_points = [(20, 400), (1080, 400), (1080, 360), (20, 360)]  # For rectangle region tracking
# region_points = [(20, 400), (1080, 400), (1080, 360), (20, 360), (20, 400)]  # For polygon region tracking

#### 1.4 Initialize the Object counter Class

Next, let's initialize the object counter class to track objects in each frame of the video.


In [5]:
# Init ObjectCounter
counter = solutions.ObjectCounter(
    view_image=True, # Display the image during processing
    show=True,  # Display the output
    region=region_points,  # Region of interest points
    model=model_path,  # model="yolo11n-obb.pt" for object counting using YOLO11 OBB model.
    classes=classes_to_count,  # Class names from the YOLO model
    show_in=True,  # Display in counts
    show_out=False,  # Display out counts
    # show_count=True,  # Display count on the video
    # line_width=1,  # Adjust the line width for bounding boxes and text display
    # line_color=(0, 255, 0),  # Adjust the color of the bounding boxes and text display
    line_thickness=2, # Thickness of the lines drawn
    draw_tracks=True,  # Draw tracking lines for objects
)

Ultralytics Solutions:  {'region': [(10, 300), (1275, 700)], 'show_in': True, 'show_out': False, 'colormap': None, 'up_angle': 145.0, 'down_angle': 90, 'kpts': [6, 8, 10], 'analytics_type': 'line', 'json_file': None, 'records': 5, 'view_image': True, 'show': True, 'model': 'yolo11s.pt', 'classes': [0, 2, 3], 'line_thickness': 2, 'draw_tracks': True}


#### 1.5 Process Video Frames

In this step, we will process each frame of the video to detect and analyze objects. This allows for real-time tracking, based on the visual data in the frames.


In [None]:
while cap.isOpened():
    success, im0 = cap.read()
    if not success:
        print("Video frame is empty or processing is complete.")
        break
    results = counter(im0)
    video_writer.write(results.plot_im)

cap.release()
video_writer.release()
cv2.destroyAllWindows()

# Print counting results:
# print(f'In: {counter.in_counts}\nOut: {counter.out_counts}\nTotal: {counter.in_counts + counter.out_counts}')
# print(f'Saves output video to {output_video_path}')


0: 384x640 2 persons, 9 cars, 135.3ms
Speed: 2.0ms preprocess, 135.3ms inference, 1.8ms postprocess per image at shape (1, 3, 384, 640)
 Results: SolutionResults(classwise_count={'car': {'IN': 0, 'OUT': 0}, 'person': {'IN': 0, 'OUT': 0}}, total_tracks=11)

0: 384x640 1 person, 8 cars, 135.2ms
Speed: 3.2ms preprocess, 135.2ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)
 Results: SolutionResults(classwise_count={'car': {'IN': 0, 'OUT': 0}, 'person': {'IN': 0, 'OUT': 0}}, total_tracks=9)

0: 384x640 1 person, 8 cars, 142.9ms
Speed: 2.3ms preprocess, 142.9ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)
 Results: SolutionResults(classwise_count={'car': {'IN': 0, 'OUT': 0}, 'person': {'IN': 0, 'OUT': 0}}, total_tracks=9)

0: 384x640 1 person, 8 cars, 127.0ms
Speed: 1.3ms preprocess, 127.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
 Results: SolutionResults(classwise_count={'car': {'IN': 0, 'OUT': 0}, 'person': {'IN': 0, 'OUT':

### Live stream
