In [1]:
import torch
import cv2

In [2]:
from ultralytics import YOLO

In [3]:
model = YOLO('./yolov8n.pt')

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [5]:
model.to(device)

YOLO(
  (model): DetectionModel(
    (model): Sequential(
      (0): Conv(
        (conv): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (1): Conv(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (2): C2f(
        (cv1): Conv(
          (conv): Conv2d(32, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
          (act): SiLU(inplace=True)
        )
        (cv2): Conv(
          (conv): Conv2d(48, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_s

In [6]:
classes = model.names
classes

{0: 'person',
 1: 'bicycle',
 2: 'car',
 3: 'motorcycle',
 4: 'airplane',
 5: 'bus',
 6: 'train',
 7: 'truck',
 8: 'boat',
 9: 'traffic light',
 10: 'fire hydrant',
 11: 'stop sign',
 12: 'parking meter',
 13: 'bench',
 14: 'bird',
 15: 'cat',
 16: 'dog',
 17: 'horse',
 18: 'sheep',
 19: 'cow',
 20: 'elephant',
 21: 'bear',
 22: 'zebra',
 23: 'giraffe',
 24: 'backpack',
 25: 'umbrella',
 26: 'handbag',
 27: 'tie',
 28: 'suitcase',
 29: 'frisbee',
 30: 'skis',
 31: 'snowboard',
 32: 'sports ball',
 33: 'kite',
 34: 'baseball bat',
 35: 'baseball glove',
 36: 'skateboard',
 37: 'surfboard',
 38: 'tennis racket',
 39: 'bottle',
 40: 'wine glass',
 41: 'cup',
 42: 'fork',
 43: 'knife',
 44: 'spoon',
 45: 'bowl',
 46: 'banana',
 47: 'apple',
 48: 'sandwich',
 49: 'orange',
 50: 'broccoli',
 51: 'carrot',
 52: 'hot dog',
 53: 'pizza',
 54: 'donut',
 55: 'cake',
 56: 'chair',
 57: 'couch',
 58: 'potted plant',
 59: 'bed',
 60: 'dining table',
 61: 'toilet',
 62: 'tv',
 63: 'laptop',
 64: 'mou

In [13]:
cap = cv2.VideoCapture(0) # for opening video
#cap = cv2.VideoCapture('./3 minutes walk through the streets of Kaunas, Lithuania.mp4')

fourcc = cv2.VideoWriter_fourcc(*'MJPG')
out = cv2.VideoWriter('Video.mp4', fourcc, 24, (1280, 720))

while cap.isOpened():
    success, img = cap.read()
    if not success:
        break
    width = img.shape[1]
    height = img.shape[0]
    results = model(img)
    for result in results:
        objects = result.boxes.cls
        bboxes = result.boxes.xyxyn  # box for person shows coordination for object(calculation: coordinate/ pixels)
        confidences = result.boxes.conf
        for obj, conf, box in zip(objects, confidences, bboxes):
            conf = conf.tolist()
            if conf < 0.5:
                continue
            conf = int(conf * 100)
            x1, y1 = int(box[0] * width), int(box[1] * height)
            x2, y2 = int(box[2] * width), int(box[3] * height)
            label = classes[obj.tolist()]
            cv2.rectangle(img, (x1, y1), (x2, y2), (0, 0, 255), 2)   #bgr
            cv2.putText(img, f'{label}:({conf})%', (x1, y1+30), 
                       cv2.FONT_HERSHEY_COMPLEX, 1, (255, 240, 50), 2
                       )
    out.write(img)                       
    cv2.imshow('image', img)
    if cv2.waitKey(1) & 0xff == 27: #wait for user to give input and esc click for true
        cap.release()  # break to see frame by frame
        cv2.destroyAllWindows()  # shows the photo in new tab
        break
        
cap.release()  # break to see frame by frame
cv2.destroyAllWindows() 


0: 480x640 1 person, 122.6ms
Speed: 2.0ms preprocess, 122.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 bird, 119.7ms
Speed: 3.0ms preprocess, 119.7ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cat, 117.6ms
Speed: 2.0ms preprocess, 117.6ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 bird, 1 cat, 101.5ms
Speed: 1.0ms preprocess, 101.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cat, 104.5ms
Speed: 2.0ms preprocess, 104.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 cat, 112.6ms
Speed: 2.0ms preprocess, 112.6ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 cat, 117.6ms
Speed: 1.0ms preprocess, 117.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 cat, 110.5ms
Speed: 1.0m

Speed: 2.0ms preprocess, 134.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 130.6ms
Speed: 2.0ms preprocess, 130.6ms inference, 2.1ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 132.5ms
Speed: 2.0ms preprocess, 132.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 129.6ms
Speed: 2.0ms preprocess, 129.6ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 136.6ms
Speed: 2.0ms preprocess, 136.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 131.5ms
Speed: 3.0ms preprocess, 131.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 135.2ms
Speed: 4.0ms preprocess, 135.2ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 cat, 136.5ms
Speed: 2.0ms preprocess, 136.5ms inference, 2.0ms postprocess per image at shape (1, 3, 48

Speed: 2.0ms preprocess, 140.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 4 persons, 122.5ms
Speed: 2.0ms preprocess, 122.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 124.5ms
Speed: 2.5ms preprocess, 124.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 144.1ms
Speed: 2.0ms preprocess, 144.1ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 3 persons, 124.5ms
Speed: 1.0ms preprocess, 124.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 119.5ms
Speed: 2.0ms preprocess, 119.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 traffic light, 121.1ms
Speed: 2.0ms preprocess, 121.1ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 1 traffic light, 119.5ms
Speed: 2.0ms preprocess, 119.5ms inference, 2.0ms postprocess p

Speed: 2.0ms preprocess, 136.6ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 2 persons, 120.5ms
Speed: 2.0ms preprocess, 120.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 128.5ms
Speed: 2.0ms preprocess, 128.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 133.5ms
Speed: 2.0ms preprocess, 133.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 136.5ms
Speed: 1.0ms preprocess, 136.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 horse, 117.5ms
Speed: 2.0ms preprocess, 117.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 vase, 133.0ms
Speed: 3.0ms preprocess, 133.0ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 vase, 131.6ms
Speed: 2.0ms preprocess, 131.6ms inference, 2.0ms postprocess per image at shape (1, 3,


0: 480x640 1 cat, 103.5ms
Speed: 2.0ms preprocess, 103.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 115.0ms
Speed: 3.0ms preprocess, 115.0ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 133.5ms
Speed: 2.0ms preprocess, 133.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 bear, 138.1ms
Speed: 1.0ms preprocess, 138.1ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 cat, 131.6ms
Speed: 2.0ms preprocess, 131.6ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cat, 112.5ms
Speed: 1.0ms preprocess, 112.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 cat, 128.5ms
Speed: 1.0ms preprocess, 128.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 cat, 133.5ms
Speed: 2.0ms preprocess, 133.5ms inference, 2.0ms postprocess per image at shape 


0: 480x640 1 cat, 133.5ms
Speed: 3.0ms preprocess, 133.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 cat, 1 bear, 112.8ms
Speed: 2.0ms preprocess, 112.8ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 cat, 127.8ms
Speed: 2.0ms preprocess, 127.8ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 cat, 1 bear, 117.5ms
Speed: 2.0ms preprocess, 117.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 cat, 120.4ms
Speed: 1.0ms preprocess, 120.4ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 130.5ms
Speed: 2.0ms preprocess, 130.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 bear, 124.5ms
Speed: 2.0ms preprocess, 124.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cat, 128.6ms
Speed: 2.0ms preprocess, 128.6ms inference, 1.0ms postprocess per im


0: 480x640 (no detections), 133.0ms
Speed: 1.0ms preprocess, 133.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 114.5ms
Speed: 2.0ms preprocess, 114.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 124.1ms
Speed: 2.0ms preprocess, 124.1ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 115.5ms
Speed: 2.0ms preprocess, 115.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 132.5ms
Speed: 2.0ms preprocess, 132.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 121.5ms
Speed: 2.0ms preprocess, 121.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 128.6ms
Speed: 2.0ms preprocess, 128.6ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 139.2ms
Speed: 2.0ms prepr


0: 480x640 (no detections), 119.5ms
Speed: 2.0ms preprocess, 119.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 (no detections), 128.1ms
Speed: 3.0ms preprocess, 128.1ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 dog, 134.2ms
Speed: 2.0ms preprocess, 134.2ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 dog, 138.0ms
Speed: 3.0ms preprocess, 138.0ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 dog, 127.5ms
Speed: 2.0ms preprocess, 127.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 dog, 132.0ms
Speed: 2.0ms preprocess, 132.0ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 121.6ms
Speed: 2.0ms preprocess, 121.6ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 dog, 125.9ms
Speed: 3.1ms preprocess, 125.9ms inference, 1.0ms postprocess per


0: 480x640 1 person, 133.5ms
Speed: 2.0ms preprocess, 133.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 111.5ms
Speed: 2.0ms preprocess, 111.5ms inference, 1.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 123.5ms
Speed: 2.0ms preprocess, 123.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 125.5ms
Speed: 2.0ms preprocess, 125.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 130.0ms
Speed: 2.0ms preprocess, 130.0ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 136.6ms
Speed: 2.0ms preprocess, 136.6ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 1 cat, 136.5ms
Speed: 2.5ms preprocess, 136.5ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)

0: 480x640 1 person, 128.6ms
Speed: 1.0ms preprocess, 128.6ms inference, 2.0ms postprocess per i


0: 480x640 1 person, 133.0ms
Speed: 1.0ms preprocess, 133.0ms inference, 2.0ms postprocess per image at shape (1, 3, 480, 640)
