# Обучение модели

In [1]:
from ultralytics import YOLO
import torch
# Ask PyTorch to release any cached, currently unused CUDA memory before training.
torch.cuda.empty_cache()

# Build the model from the "yolo11s.pt" weights file.
model = YOLO("yolo11s.pt")

# Train on the dataset described by coco8.yaml: 100 epochs, image size 640.
results = model.train(
    data="coco8.yaml",
    epochs=100,
    imgsz=640,
)


Ultralytics 8.3.223 🚀 Python-3.9.23 torch-2.8.0+cu129 CUDA:0 (NVIDIA GeForce RTX 3050 6GB Laptop GPU, 5805MiB)
[34m[1mengine/trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=coco8.yaml, degrees=0.0, deterministic=True, device=None, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None, epochs=100, erasing=0.4, exist_ok=False, fliplr=0.5, flipud=0.0, format=torchscript, fraction=1.0, freeze=None, half=False, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4, imgsz=640, int8=False, iou=0.7, keras=False, kobj=1.0, line_width=None, lr0=0.01, lrf=0.01, mask_ratio=4, max_det=300, mixup=0.0, mode=train, model=yolo11s.pt, momentum=0.937, mosaic=1.0, multi_scale=False, name=train, nbs=64, nms=False, opset=None, optimize=False, optimizer=auto, overlap_mask=True, patience=100, perspective

# Инференс модели

## Стандартное предсказание: обнаружение людей

In [30]:
import cv2
import random


def process_video_with_tracking(model, input_video_path, show_video=True, save_video=False, output_video_path="detect_output_video.mp4"):
    """Run per-frame person detection on a video, optionally previewing and saving it.

    Despite its name, this function does not use a tracker: each frame is
    passed to ``model(frame)`` on its own, and only detections whose class
    index is 0 are drawn (labelled "Person" with their confidence).

    Args:
        model: Callable detector. ``model(frame)[0].boxes`` must expose
            ``xyxy``, ``cls`` and ``conf`` objects supporting
            ``.cpu().numpy()`` (presumably an Ultralytics ``YOLO`` model;
            confirm against the caller).
        input_video_path: Path (or other source) handed to ``cv2.VideoCapture``.
        show_video: If True, show each annotated frame (scaled to 75%) in a
            window named "frame"; pressing "q" stops processing early.
        save_video: If True, write the annotated full-size frames to
            ``output_video_path`` using the "mp4v" codec.
        output_video_path: Destination file used when ``save_video`` is True.

    Raises:
        Exception: If the input video cannot be opened, or if the output
            video cannot be opened for writing.
    """
    fallback_fps = 30.0
    box_color = (0, 0, 255)
    text_color = (0, 255, 255)

    cap = cv2.VideoCapture(input_video_path)
    out = None
    try:
        if not cap.isOpened():
            raise Exception("Error: Could not open video file.")

        if save_video:
            # Keep the frame rate as a float: truncating e.g. 29.97 to 29
            # makes the saved video play at the wrong speed.
            fps = cap.get(cv2.CAP_PROP_FPS)
            if not fps > 0:
                # The capture backend could not report a usable rate
                # (0 or NaN); a writer created with it would be broken.
                fps = fallback_fps
            frame_size = (
                int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
            )
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
            out = cv2.VideoWriter(output_video_path, fourcc, fps, frame_size)
            if not out.isOpened():
                # Otherwise every write() would silently do nothing.
                raise Exception("Error: Could not open output video file for writing.")

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            detections = model(frame)[0].boxes
            boxes = detections.xyxy.cpu().numpy().astype(int)
            classes = detections.cls.cpu().numpy().astype(int)
            confs = detections.conf.cpu().numpy().astype(float)

            for box, cl, conf in zip(boxes, classes, confs):
                if cl != 0:
                    continue
                x1, y1, x2, y2 = (int(v) for v in box)
                cv2.rectangle(frame, (x1, y1), (x2, y2), box_color, 2)
                cv2.putText(
                    frame,
                    f"Person: {conf:.2f}",
                    (x1, y1),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    text_color,
                    2,
                )

            if out is not None:
                out.write(frame)

            if show_video:
                # Only the preview is downscaled; the saved frame stays full size.
                preview = cv2.resize(frame, (0, 0), fx=0.75, fy=0.75)
                cv2.imshow("frame", preview)
                # Key polling only makes sense while a window is shown, so it
                # is skipped (together with its delay) when there is no preview.
                if cv2.waitKey(1) & 0xFF == ord("q"):
                    break
    finally:
        # Release resources even if reading, inference or drawing raised.
        cap.release()
        if out is not None:
            out.release()
        if show_video:
            # GUI calls are limited to preview mode so that the function can
            # also run where OpenCV has no window support.
            cv2.destroyAllWindows()

# Example usage: load the trained weights, call fuse() on the model, then
# run detection on "crowd.mp4" with both the preview and the saved output enabled.
model = YOLO("runs/detect/train/weights/best.pt")
model.fuse()
process_video_with_tracking(
    model,
    "crowd.mp4",
    show_video=True,
    save_video=True,
    output_video_path="detect_output_video.mp4",
)

YOLO11s summary (fused): 100 layers, 9,443,760 parameters, 0 gradients, 21.5 GFLOPs

0: 384x640 12 persons, 1 stop sign, 2 umbrellas, 2 handbags, 1 chair, 1 dining table, 10.0ms
Speed: 18.4ms preprocess, 10.0ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 stop sign, 2 umbrellas, 2 handbags, 1 chair, 1 dining table, 8.0ms
Speed: 1.7ms preprocess, 8.0ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 11 persons, 1 stop sign, 2 umbrellas, 1 handbag, 1 chair, 1 dining table, 8.9ms
Speed: 1.9ms preprocess, 8.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 1 stop sign, 2 umbrellas, 2 handbags, 1 chair, 1 dining table, 9.2ms
Speed: 1.5ms preprocess, 9.2ms inference, 1.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 12 persons, 1 stop sign, 2 umbrellas, 2 handbags, 1 chair, 1 dining table, 7.8ms
Speed: 1.8ms preprocess, 7.8ms inference, 1.7ms postprocess per i

## Трекинг людей, у каждого из которых свой цвет в соответствии с их id:

In [None]:
import cv2
import random


def _color_for_track_id(track_id):
    """Return a repeatable pseudo-random color (three ints in 0..255) for a track id."""
    # A private generator seeded with the id gives the same color on every
    # call without reseeding the module-level ``random`` state.
    rng = random.Random(int(track_id))
    return (rng.randint(0, 255), rng.randint(0, 255), rng.randint(0, 255))


def process_video_with_tracking(model, input_video_path, show_video=True, save_video=False, output_video_path="tracking_output_video.mp4"):
    """Track people through a video, drawing each track in its own color.

    Every frame is passed to ``model.track(...)`` with ``persist=True`` and
    the "botsort.yaml" tracker configuration. For detections whose class
    index is 0, a box colored by track id and a "Person<id>: <conf>" label
    are drawn. Frames for which the tracker reports no ids are left unannotated.

    Args:
        model: Object with a ``track`` method whose result ``[0].boxes``
            exposes ``xyxy``, ``id``, ``cls`` and ``conf`` (presumably an
            Ultralytics ``YOLO`` model; confirm against the caller).
        input_video_path: Path (or other source) handed to ``cv2.VideoCapture``.
        show_video: If True, show each annotated frame (scaled to 75%) in a
            window named "frame"; pressing "q" stops processing early.
        save_video: If True, write the annotated full-size frames to
            ``output_video_path`` using the "mp4v" codec.
        output_video_path: Destination file used when ``save_video`` is True.

    Raises:
        Exception: If the input video cannot be opened, or if the output
            video cannot be opened for writing.
    """
    fallback_fps = 30.0
    text_color = (0, 255, 255)
    track_colors = {}  # track id -> color, so each id is computed once per call

    cap = cv2.VideoCapture(input_video_path)
    out = None
    try:
        if not cap.isOpened():
            raise Exception("Error: Could not open video file.")

        if save_video:
            # Keep the frame rate as a float: truncating e.g. 29.97 to 29
            # makes the saved video play at the wrong speed.
            fps = cap.get(cv2.CAP_PROP_FPS)
            if not fps > 0:
                # The capture backend could not report a usable rate
                # (0 or NaN); a writer created with it would be broken.
                fps = fallback_fps
            frame_size = (
                int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
            )
            fourcc = cv2.VideoWriter_fourcc(*"mp4v")
            out = cv2.VideoWriter(output_video_path, fourcc, fps, frame_size)
            if not out.isOpened():
                # Otherwise every write() would silently do nothing.
                raise Exception("Error: Could not open output video file for writing.")

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            results = model.track(frame, iou=0.4, conf=0.5, persist=True, imgsz=640, verbose=False, tracker="botsort.yaml")
            detections = results[0].boxes

            # ``id`` is None when there are no tracks for this frame. Use an
            # identity check: ``!= None`` on an array-like may not yield a bool.
            if detections.id is not None:
                boxes = detections.xyxy.cpu().numpy().astype(int)
                ids = detections.id.cpu().numpy().astype(int)
                classes = detections.cls.cpu().numpy().astype(int)
                confs = detections.conf.cpu().numpy().astype(float)

                for box, track_id, cl, conf in zip(boxes, ids, classes, confs):
                    if cl != 0:
                        continue
                    track_id = int(track_id)
                    color = track_colors.get(track_id)
                    if color is None:
                        color = track_colors[track_id] = _color_for_track_id(track_id)

                    x1, y1, x2, y2 = (int(v) for v in box)
                    cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                    cv2.putText(
                        frame,
                        f"Person{track_id}: {conf:.2f}",
                        (x1, y1),
                        cv2.FONT_HERSHEY_SIMPLEX,
                        0.5,
                        text_color,
                        2,
                    )

            if out is not None:
                out.write(frame)

            if show_video:
                # Only the preview is downscaled; the saved frame stays full size.
                preview = cv2.resize(frame, (0, 0), fx=0.75, fy=0.75)
                cv2.imshow("frame", preview)
                # Key polling only makes sense while a window is shown, so it
                # is skipped (together with its delay) when there is no preview.
                if cv2.waitKey(1) & 0xFF == ord("q"):
                    break
    finally:
        # Release resources even if reading, tracking or drawing raised.
        cap.release()
        if out is not None:
            out.release()
        if show_video:
            # GUI calls are limited to preview mode so that the function can
            # also run where OpenCV has no window support.
            cv2.destroyAllWindows()

# Example usage: load the trained weights, call fuse() on the model, then
# run tracking on "crowd.mp4" with both the preview and the saved output enabled.
model = YOLO("runs/detect/train/weights/best.pt")
model.fuse()
process_video_with_tracking(
    model,
    "crowd.mp4",
    show_video=True,
    save_video=True,
    output_video_path="tracking_output_video.mp4",
)

YOLO11s summary (fused): 100 layers, 9,443,760 parameters, 0 gradients, 21.5 GFLOPs
