In [1]:
from ultralytics import YOLO
import random, os, tqdm

SRC_IMAGES_PATH = "./unlabeled_images"
DST_LABELS_PATH = "./labels"

if not os.path.exists(DST_LABELS_PATH):
    os.makedirs(DST_LABELS_PATH)

# Initialize a YOLO-World model
model = YOLO('yolov8x-worldv2.pt', verbose=False)  # or choose yolov8m/l-world.pt

## Test on custom Classes

In [9]:
classes = [
    "Rackets"
]
model.set_classes(classes)

In [11]:
IMAGE_PATHS = random.choices([os.path.join(SRC_IMAGES_PATH, im_name) for im_name in os.listdir(SRC_IMAGES_PATH) if os.path.isfile(os.path.join(SRC_IMAGES_PATH, im_name))], k = 1)
predictions = model.predict(IMAGE_PATHS, save = True, stream=False)

print(predictions)




0: 640x640 (no detections), 323.4ms
Speed: 3.0ms preprocess, 323.4ms inference, 0.5ms postprocess per image at shape (1, 3, 640, 640)
Results saved to [1mruns\detect\predict[0m
[ultralytics.engine.results.Results object with attributes:

boxes: ultralytics.engine.results.Boxes object
keypoints: None
masks: None
names: ['Rackets']
obb: None
orig_img: array([[[19, 20, 11],
        [19, 20, 11],
        [19, 20, 11],
        ...,
        [99, 64,  1],
        [99, 64,  1],
        [99, 64,  1]],

       [[19, 20, 11],
        [19, 20, 11],
        [19, 20, 11],
        ...,
        [99, 64,  1],
        [99, 64,  1],
        [99, 64,  1]],

       [[19, 20, 11],
        [19, 20, 11],
        [19, 20, 11],
        ...,
        [99, 64,  1],
        [99, 64,  1],
        [99, 64,  1]],

       ...,

       [[13,  5,  5],
        [13,  5,  5],
        [13,  5,  5],
        ...,
        [13,  3,  3],
        [12,  2,  2],
        [12,  2,  2]],

       [[13,  5,  5],
        [13,  5,  5],
 

## Train on COCO

In [6]:
classes = [
    "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light", 
    "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow", 
    "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee", 
    "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard", 
    "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", 
    "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch", 
    "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", 
    "cell phone", "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", 
    "scissors", "teddy bear", "hair drier", "toothbrush"
]


# Define custom classes
model.set_classes(classes)

In [3]:
batch_size = 20

# IMAGE_PATHS = random.choices([os.path.join(UNLABELED_IMAGES_PATH, im_name) for im_name in os.listdir(UNLABELED_IMAGES_PATH) if os.path.isfile(os.path.join(UNLABELED_IMAGES_PATH, im_name))], k = batch_size)
IMAGE_PATHS = [os.path.join(SRC_IMAGES_PATH, im_name) for im_name in os.listdir(SRC_IMAGES_PATH) if os.path.isfile(os.path.join(SRC_IMAGES_PATH, im_name))]
batches = [IMAGE_PATHS[i:i+batch_size] for i in range(0, len(IMAGE_PATHS), batch_size)]


for batch in tqdm.tqdm(batches, desc="Batch:"):
  # Execute prediction for specified categories on an image
  predictions = model.predict(batch, save = False, stream=True)

  for prediction in predictions:
    image_path = os.path.normpath(prediction.path)
    _, image_name = os.path.split(image_path)
    label_name = os.path.splitext(image_name)[0] + ".txt"
    
    classes = prediction.boxes.cls
    
    with open(os.path.join(DST_LABELS_PATH, label_name), '+w') as file:
      for idx, box in enumerate(prediction.boxes.xywhn):
        cls = int(classes[idx].item())
        file.write(f"{cls} {box[0].item()} {box[1].item()} {box[2].item()} {box[3].item()}\n")

Batch::   0%|          | 0/54 [00:00<?, ?it/s]


0: 640x640 1 person, 1: 640x640 4 bottles, 2: 640x640 (no detections), 3: 640x640 (no detections), 4: 640x640 1 bottle, 5: 640x640 1 bottle, 6: 640x640 1 bottle, 7: 640x640 1 bottle, 8: 640x640 1 bottle, 9: 640x640 5 bottles, 10: 640x640 1 book, 11: 640x640 3 bottles, 1 cup, 12: 640x640 1 book, 13: 640x640 1 bottle, 14: 640x640 1 bottle, 15: 640x640 1 bottle, 1 scissors, 16: 640x640 (no detections), 17: 640x640 1 bottle, 1 cup, 18: 640x640 2 bottles, 1 scissors, 19: 640x640 1 book, 533.3ms
Speed: 1.7ms preprocess, 26.7ms inference, 2.6ms postprocess per image at shape (1, 3, 640, 640)


Batch::   2%|▏         | 1/54 [00:01<01:36,  1.82s/it]


0: 640x640 1 bottle, 1: 640x640 1 motorcycle, 2: 640x640 1 bottle, 3: 640x640 1 bottle, 1 cup, 4: 640x640 1 book, 2 scissorss, 5: 640x640 1 bottle, 6: 640x640 2 bottles, 7: 640x640 1 parking meter, 8: 640x640 1 bottle, 1 cup, 9: 640x640 1 bottle, 1 cup, 10: 640x640 1 bottle, 11: 640x640 1 bottle, 1 cup, 1 vase, 12: 640x640 2 bottles, 13: 640x640 1 cup, 14: 640x640 1 fire hydrant, 2 bottles, 15: 640x640 1 bottle, 16: 640x640 1 bottle, 17: 640x640 (no detections), 18: 640x640 1 bottle, 19: 640x640 1 handbag, 479.1ms
Speed: 1.5ms preprocess, 24.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::   4%|▎         | 2/54 [00:02<00:59,  1.15s/it]


0: 640x640 1 bottle, 1: 640x640 1 bottle, 1 cell phone, 2: 640x640 (no detections), 3: 640x640 1 bottle, 4: 640x640 (no detections), 5: 640x640 1 bottle, 6: 640x640 3 suitcases, 2 scissorss, 7: 640x640 1 bottle, 8: 640x640 2 bottles, 1 scissors, 9: 640x640 2 bottles, 10: 640x640 5 bottles, 11: 640x640 1 bottle, 12: 640x640 (no detections), 13: 640x640 1 bottle, 14: 640x640 (no detections), 15: 640x640 2 bottles, 16: 640x640 1 bottle, 17: 640x640 1 suitcase, 18: 640x640 (no detections), 19: 640x640 1 bottle, 1 knife, 1 remote, 479.8ms
Speed: 1.6ms preprocess, 24.0ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)


Batch::   6%|▌         | 3/54 [00:03<00:47,  1.08it/s]


0: 640x640 3 bottles, 1: 640x640 1 bottle, 1 cup, 1 toilet, 2: 640x640 1 bottle, 1 scissors, 3: 640x640 2 bottles, 4: 640x640 1 bottle, 5: 640x640 1 scissors, 6: 640x640 1 bottle, 7: 640x640 1 bottle, 8: 640x640 1 scissors, 9: 640x640 2 bottles, 1 chair, 10: 640x640 1 bottle, 11: 640x640 1 chair, 12: 640x640 1 bottle, 13: 640x640 1 scissors, 14: 640x640 (no detections), 15: 640x640 1 bottle, 1 cell phone, 16: 640x640 1 bottle, 17: 640x640 3 bottles, 18: 640x640 1 suitcase, 2 scissorss, 19: 640x640 1 bottle, 480.1ms
Speed: 1.5ms preprocess, 24.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::   7%|▋         | 4/54 [00:03<00:41,  1.21it/s]


0: 640x640 1 bottle, 1: 640x640 1 cell phone, 2: 640x640 1 toilet, 3: 640x640 1 scissors, 4: 640x640 1 motorcycle, 1 fire hydrant, 5: 640x640 1 bottle, 1 scissors, 6: 640x640 (no detections), 7: 640x640 1 bottle, 8: 640x640 1 bicycle, 9: 640x640 3 bottles, 10: 640x640 1 book, 2 scissorss, 11: 640x640 2 bottles, 12: 640x640 2 bottles, 13: 640x640 1 bottle, 1 scissors, 14: 640x640 1 bottle, 15: 640x640 1 bottle, 16: 640x640 1 person, 1 tie, 4 bottles, 17: 640x640 1 bottle, 18: 640x640 2 suitcases, 1 scissors, 19: 640x640 2 bottles, 480.4ms
Speed: 1.5ms preprocess, 24.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::   9%|▉         | 5/54 [00:04<00:37,  1.30it/s]


0: 640x640 1 handbag, 1 scissors, 1: 640x640 1 bottle, 2: 640x640 1 scissors, 3: 640x640 1 bottle, 4: 640x640 1 scissors, 5: 640x640 1 bowl, 6: 640x640 1 car, 7: 640x640 1 toilet, 8: 640x640 1 bottle, 9: 640x640 1 bottle, 10: 640x640 1 bottle, 1 scissors, 11: 640x640 1 bottle, 1 cup, 12: 640x640 2 bottles, 1 scissors, 1 toothbrush, 13: 640x640 2 bottles, 14: 640x640 (no detections), 15: 640x640 1 bottle, 16: 640x640 1 bottle, 1 toothbrush, 17: 640x640 2 bottles, 1 knife, 18: 640x640 (no detections), 19: 640x640 3 bottles, 480.7ms
Speed: 1.5ms preprocess, 24.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  11%|█         | 6/54 [00:05<00:35,  1.36it/s]


0: 640x640 1 tv, 1: 640x640 1 bottle, 2: 640x640 (no detections), 3: 640x640 2 bottles, 1 scissors, 4: 640x640 1 bottle, 5: 640x640 (no detections), 6: 640x640 2 bottles, 1 cell phone, 7: 640x640 1 bottle, 8: 640x640 1 bottle, 1 cup, 9: 640x640 1 handbag, 10: 640x640 1 bottle, 11: 640x640 1 bottle, 12: 640x640 1 bottle, 13: 640x640 1 bottle, 1 scissors, 14: 640x640 1 book, 15: 640x640 1 bottle, 2 knifes, 16: 640x640 1 bottle, 17: 640x640 1 cup, 18: 640x640 (no detections), 19: 640x640 1 bottle, 480.0ms
Speed: 1.6ms preprocess, 24.0ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)


Batch::  13%|█▎        | 7/54 [00:05<00:33,  1.40it/s]


0: 640x640 2 bottles, 1: 640x640 2 bottles, 2: 640x640 (no detections), 3: 640x640 2 bottles, 4: 640x640 1 bottle, 5: 640x640 1 motorcycle, 1 bottle, 1 cup, 6: 640x640 1 cup, 7: 640x640 1 handbag, 1 scissors, 8: 640x640 1 bottle, 9: 640x640 1 bottle, 10: 640x640 1 bottle, 1 scissors, 11: 640x640 1 bottle, 12: 640x640 (no detections), 13: 640x640 1 frisbee, 1 cup, 14: 640x640 1 bottle, 1 book, 15: 640x640 1 bottle, 16: 640x640 1 bottle, 1 cup, 17: 640x640 1 cup, 18: 640x640 1 bottle, 19: 640x640 1 bottle, 480.0ms
Speed: 1.5ms preprocess, 24.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  15%|█▍        | 8/54 [00:06<00:32,  1.43it/s]


0: 640x640 1 handbag, 1 bottle, 1: 640x640 2 bottles, 1 cup, 2: 640x640 1 cup, 3: 640x640 1 bottle, 4: 640x640 1 bird, 5: 640x640 1 book, 6: 640x640 1 bottle, 7: 640x640 (no detections), 8: 640x640 3 bottles, 9: 640x640 1 bottle, 10: 640x640 (no detections), 11: 640x640 2 bottles, 1 scissors, 12: 640x640 1 bottle, 1 cup, 13: 640x640 (no detections), 14: 640x640 5 bottles, 15: 640x640 1 bottle, 1 scissors, 1 toothbrush, 16: 640x640 1 bottle, 1 cup, 17: 640x640 4 bottles, 18: 640x640 2 bottles, 19: 640x640 1 book, 480.1ms
Speed: 1.5ms preprocess, 24.0ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)


Batch::  17%|█▋        | 9/54 [00:07<00:31,  1.45it/s]


0: 640x640 1 scissors, 1: 640x640 1 bottle, 2: 640x640 1 bottle, 1 cup, 3: 640x640 2 bottles, 4: 640x640 1 bottle, 5: 640x640 1 bottle, 6: 640x640 2 bottles, 7: 640x640 2 bottles, 8: 640x640 1 bottle, 1 cup, 9: 640x640 (no detections), 10: 640x640 3 persons, 11: 640x640 1 bottle, 12: 640x640 2 bottles, 13: 640x640 (no detections), 14: 640x640 1 bottle, 15: 640x640 7 bottles, 2 books, 16: 640x640 1 bottle, 17: 640x640 (no detections), 18: 640x640 1 car, 19: 640x640 2 bottles, 479.0ms
Speed: 1.4ms preprocess, 23.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  19%|█▊        | 10/54 [00:07<00:30,  1.46it/s]


0: 640x640 1 bottle, 1: 640x640 1 toilet, 1 scissors, 2: 640x640 (no detections), 3: 640x640 1 bottle, 4: 640x640 1 bottle, 1 book, 5: 640x640 1 bottle, 6: 640x640 2 bottles, 1 scissors, 7: 640x640 1 car, 8: 640x640 1 bottle, 9: 640x640 2 bottles, 10: 640x640 (no detections), 11: 640x640 2 bottles, 12: 640x640 1 bottle, 13: 640x640 1 bottle, 14: 640x640 1 bottle, 15: 640x640 1 bottle, 16: 640x640 1 bottle, 17: 640x640 1 bottle, 18: 640x640 3 bottles, 19: 640x640 1 bottle, 481.1ms
Speed: 1.4ms preprocess, 24.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  20%|██        | 11/54 [00:08<00:29,  1.47it/s]


0: 640x640 1 fire hydrant, 1 bottle, 1 knife, 1 toothbrush, 1: 640x640 1 bottle, 1 cell phone, 2: 640x640 (no detections), 3: 640x640 (no detections), 4: 640x640 1 bottle, 1 cup, 5: 640x640 1 bottle, 1 cup, 6: 640x640 1 bottle, 1 chair, 7: 640x640 1 bottle, 8: 640x640 1 scissors, 9: 640x640 5 bottles, 10: 640x640 2 bottles, 11: 640x640 2 bottles, 12: 640x640 (no detections), 13: 640x640 1 bottle, 1 cup, 14: 640x640 2 bottles, 15: 640x640 2 bottles, 1 cup, 16: 640x640 1 bottle, 1 cup, 17: 640x640 1 bottle, 18: 640x640 3 bottles, 19: 640x640 1 bottle, 480.0ms
Speed: 1.5ms preprocess, 24.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  22%|██▏       | 12/54 [00:09<00:28,  1.48it/s]


0: 640x640 2 bottles, 1: 640x640 1 knife, 2: 640x640 (no detections), 3: 640x640 1 bottle, 4: 640x640 (no detections), 5: 640x640 (no detections), 6: 640x640 1 bottle, 7: 640x640 2 bottles, 1 cup, 8: 640x640 (no detections), 9: 640x640 (no detections), 10: 640x640 (no detections), 11: 640x640 1 book, 12: 640x640 (no detections), 13: 640x640 1 chair, 14: 640x640 2 bottles, 15: 640x640 (no detections), 16: 640x640 (no detections), 17: 640x640 (no detections), 18: 640x640 2 bottles, 19: 640x640 (no detections), 479.9ms
Speed: 1.5ms preprocess, 24.0ms inference, 0.7ms postprocess per image at shape (1, 3, 640, 640)


Batch::  24%|██▍       | 13/54 [00:09<00:27,  1.49it/s]


0: 640x640 1 bottle, 1: 640x640 1 person, 1 tv, 2: 640x640 2 bottles, 3: 640x640 1 bottle, 1 book, 4: 640x640 1 suitcase, 1 bottle, 5: 640x640 1 bottle, 6: 640x640 1 hair drier, 7: 640x640 1 suitcase, 8: 640x640 (no detections), 9: 640x640 1 fire hydrant, 1 bottle, 10: 640x640 1 bottle, 11: 640x640 2 persons, 1 bottle, 12: 640x640 1 scissors, 13: 640x640 1 bottle, 1 toothbrush, 14: 640x640 1 bottle, 15: 640x640 (no detections), 16: 640x640 (no detections), 17: 640x640 1 fire hydrant, 18: 640x640 (no detections), 19: 640x640 (no detections), 480.0ms
Speed: 1.5ms preprocess, 24.0ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)


Batch::  26%|██▌       | 14/54 [00:10<00:26,  1.49it/s]


0: 640x640 2 persons, 2 cups, 1: 640x640 1 bottle, 2: 640x640 1 bottle, 3: 640x640 1 bottle, 4: 640x640 1 bottle, 5: 640x640 (no detections), 6: 640x640 1 person, 2 bottles, 7: 640x640 (no detections), 8: 640x640 1 person, 9: 640x640 1 toothbrush, 10: 640x640 1 person, 2 refrigerators, 11: 640x640 2 bottles, 12: 640x640 1 bottle, 13: 640x640 1 cell phone, 1 scissors, 14: 640x640 1 bottle, 15: 640x640 1 scissors, 16: 640x640 2 bottles, 17: 640x640 (no detections), 18: 640x640 1 bottle, 19: 640x640 1 truck, 479.9ms
Speed: 1.5ms preprocess, 24.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  28%|██▊       | 15/54 [00:11<00:26,  1.49it/s]


0: 640x640 3 persons, 1 truck, 1 stop sign, 1: 640x640 1 frisbee, 1 cup, 2: 640x640 1 bottle, 3: 640x640 1 bottle, 4: 640x640 2 bottles, 5: 640x640 1 bottle, 6: 640x640 1 bird, 7: 640x640 2 bottles, 1 toothbrush, 8: 640x640 1 clock, 9: 640x640 3 bottles, 10: 640x640 3 cars, 1 bottle, 11: 640x640 (no detections), 12: 640x640 1 bottle, 13: 640x640 1 scissors, 14: 640x640 4 bottles, 15: 640x640 1 bottle, 16: 640x640 1 scissors, 17: 640x640 1 bicycle, 1 motorcycle, 2 bottles, 18: 640x640 1 motorcycle, 19: 640x640 (no detections), 481.1ms
Speed: 1.4ms preprocess, 24.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  30%|██▉       | 16/54 [00:11<00:25,  1.49it/s]


0: 640x640 2 toothbrushs, 1: 640x640 1 person, 2 bottles, 2: 640x640 1 motorcycle, 3: 640x640 2 bottles, 4: 640x640 1 bottle, 1 cup, 5: 640x640 (no detections), 6: 640x640 1 bottle, 7: 640x640 1 bottle, 8: 640x640 (no detections), 9: 640x640 8 bottles, 10: 640x640 2 bottles, 11: 640x640 8 persons, 1 umbrella, 1 handbag, 12: 640x640 18 persons, 1 umbrella, 1 handbag, 13: 640x640 (no detections), 14: 640x640 1 person, 1 bicycle, 1 bottle, 1 chair, 1 cell phone, 15: 640x640 1 person, 16: 640x640 1 person, 2 bottles, 17: 640x640 (no detections), 18: 640x640 1 person, 19: 640x640 1 person, 1 bottle, 481.6ms
Speed: 1.6ms preprocess, 24.1ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)


Batch::  31%|███▏      | 17/54 [00:12<00:24,  1.49it/s]


0: 640x640 1 person, 1 bottle, 1: 640x640 (no detections), 2: 640x640 1 person, 1 suitcase, 1 vase, 3: 640x640 2 bottles, 4: 640x640 (no detections), 5: 640x640 1 bicycle, 6: 640x640 1 bottle, 2 chairs, 1 refrigerator, 7: 640x640 2 bottles, 1 cup, 8: 640x640 3 bottles, 9: 640x640 3 bottles, 10: 640x640 2 bottles, 11: 640x640 1 train, 3 bottles, 12: 640x640 1 motorcycle, 1 train, 13: 640x640 4 bottles, 1 scissors, 1 toothbrush, 14: 640x640 1 bottle, 15: 640x640 1 bottle, 1 cup, 16: 640x640 1 bottle, 17: 640x640 2 bottles, 18: 640x640 1 bottle, 19: 640x640 1 handbag, 2 cups, 1 scissors, 482.0ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  33%|███▎      | 18/54 [00:13<00:24,  1.48it/s]


0: 640x640 2 bottles, 1: 640x640 3 bottles, 1 cup, 2: 640x640 1 person, 1 bowl, 1 scissors, 3: 640x640 1 motorcycle, 1 cup, 1 toilet, 4: 640x640 2 persons, 5: 640x640 1 bottle, 4 cups, 6: 640x640 1 bottle, 7: 640x640 1 person, 1 bench, 1 clock, 8: 640x640 1 bottle, 9: 640x640 (no detections), 10: 640x640 1 bottle, 11: 640x640 2 benchs, 1 bottle, 3 chairs, 12: 640x640 2 bottles, 13: 640x640 2 toothbrushs, 14: 640x640 1 bicycle, 1 motorcycle, 15: 640x640 1 handbag, 1 couch, 16: 640x640 3 bottles, 17: 640x640 4 bottles, 18: 640x640 1 bottle, 19: 640x640 4 bottles, 481.0ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  35%|███▌      | 19/54 [00:13<00:23,  1.48it/s]


0: 640x640 4 bottles, 1: 640x640 1 bottle, 2: 640x640 (no detections), 3: 640x640 2 bottles, 4: 640x640 5 bottles, 5: 640x640 1 bottle, 6: 640x640 1 bottle, 7: 640x640 1 bicycle, 1 bottle, 8: 640x640 (no detections), 9: 640x640 1 bicycle, 1 motorcycle, 1 bottle, 10: 640x640 3 bottles, 11: 640x640 3 bottles, 12: 640x640 1 apple, 13: 640x640 1 fire hydrant, 14: 640x640 1 bottle, 15: 640x640 (no detections), 16: 640x640 (no detections), 17: 640x640 1 bottle, 1 chair, 18: 640x640 (no detections), 19: 640x640 1 suitcase, 482.0ms
Speed: 1.5ms preprocess, 24.1ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)


Batch::  37%|███▋      | 20/54 [00:14<00:22,  1.48it/s]


0: 640x640 1 bicycle, 1: 640x640 3 bottles, 2: 640x640 2 bottles, 3: 640x640 2 bottles, 1 potted plant, 4: 640x640 (no detections), 5: 640x640 (no detections), 6: 640x640 1 toothbrush, 7: 640x640 (no detections), 8: 640x640 (no detections), 9: 640x640 1 person, 2 bottles, 10: 640x640 (no detections), 11: 640x640 (no detections), 12: 640x640 2 bottles, 13: 640x640 1 backpack, 1 bottle, 14: 640x640 4 persons, 1 bottle, 15: 640x640 (no detections), 16: 640x640 (no detections), 17: 640x640 1 person, 18: 640x640 1 motorcycle, 1 bottle, 19: 640x640 2 persons, 3 bottles, 481.0ms
Speed: 1.5ms preprocess, 24.0ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)


Batch::  39%|███▉      | 21/54 [00:15<00:22,  1.49it/s]


0: 640x640 2 persons, 1: 640x640 1 person, 1 tennis racket, 2: 640x640 3 persons, 1 handbag, 1 bottle, 3: 640x640 1 person, 4: 640x640 1 stop sign, 1 surfboard, 5: 640x640 1 person, 6: 640x640 (no detections), 7: 640x640 2 persons, 1 book, 8: 640x640 2 persons, 1 bottle, 9: 640x640 3 bottles, 10: 640x640 (no detections), 11: 640x640 (no detections), 12: 640x640 (no detections), 13: 640x640 (no detections), 14: 640x640 (no detections), 15: 640x640 (no detections), 16: 640x640 3 persons, 2 bottles, 1 book, 17: 640x640 1 person, 1 book, 18: 640x640 1 person, 3 bottles, 19: 640x640 1 person, 482.0ms
Speed: 1.5ms preprocess, 24.1ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)


Batch::  41%|████      | 22/54 [00:15<00:21,  1.49it/s]


0: 640x640 2 bottles, 1 cup, 1: 640x640 1 bicycle, 2: 640x640 1 bottle, 3: 640x640 1 bottle, 4: 640x640 1 person, 5: 640x640 2 persons, 1 handbag, 6: 640x640 6 persons, 7: 640x640 1 person, 1 keyboard, 1 book, 8: 640x640 1 bottle, 1 cup, 9: 640x640 1 person, 10: 640x640 3 bottles, 11: 640x640 (no detections), 12: 640x640 1 cup, 13: 640x640 1 cup, 14: 640x640 1 cup, 15: 640x640 1 bottle, 16: 640x640 3 bottles, 17: 640x640 1 bottle, 18: 640x640 (no detections), 19: 640x640 1 bottle, 482.0ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  43%|████▎     | 23/54 [00:16<00:20,  1.49it/s]


0: 640x640 1 person, 1 bottle, 1: 640x640 3 toothbrushs, 2: 640x640 1 dog, 3: 640x640 1 clock, 4: 640x640 1 bicycle, 5: 640x640 1 person, 1 bicycle, 1 potted plant, 6: 640x640 1 bottle, 7: 640x640 2 persons, 8: 640x640 2 bottles, 9: 640x640 (no detections), 10: 640x640 3 bottles, 1 orange, 1 scissors, 11: 640x640 2 persons, 12: 640x640 2 persons, 13: 640x640 1 person, 1 car, 1 chair, 14: 640x640 1 airplane, 15: 640x640 1 person, 1 bottle, 16: 640x640 1 person, 3 bottles, 17: 640x640 2 bottles, 18: 640x640 2 chairs, 1 potted plant, 19: 640x640 1 fire hydrant, 482.0ms
Speed: 1.6ms preprocess, 24.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  44%|████▍     | 24/54 [00:17<00:20,  1.48it/s]


0: 640x640 1 person, 1 bicycle, 1: 640x640 1 person, 2: 640x640 (no detections), 3: 640x640 1 bottle, 4: 640x640 (no detections), 5: 640x640 3 bottles, 6: 640x640 (no detections), 7: 640x640 9 persons, 1 handbag, 1 bottle, 1 tv, 1 cell phone, 1 book, 8: 640x640 (no detections), 9: 640x640 1 bicycle, 1 motorcycle, 10: 640x640 (no detections), 11: 640x640 7 bottles, 12: 640x640 (no detections), 13: 640x640 1 fire hydrant, 14: 640x640 1 cup, 1 vase, 15: 640x640 2 bottles, 16: 640x640 1 tv, 1 microwave, 17: 640x640 (no detections), 18: 640x640 2 bottles, 19: 640x640 1 fire hydrant, 482.0ms
Speed: 1.4ms preprocess, 24.1ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)


Batch::  46%|████▋     | 25/54 [00:17<00:19,  1.48it/s]


0: 640x640 3 bottles, 1: 640x640 1 bottle, 2: 640x640 1 person, 3: 640x640 (no detections), 4: 640x640 1 person, 5: 640x640 1 bottle, 1 vase, 6: 640x640 1 bottle, 7: 640x640 1 bottle, 8: 640x640 6 persons, 1 handbag, 9: 640x640 1 bottle, 10: 640x640 1 bottle, 11: 640x640 1 bottle, 1 cup, 12: 640x640 (no detections), 13: 640x640 1 handbag, 14: 640x640 4 persons, 2 cups, 1 dining table, 15: 640x640 1 bottle, 16: 640x640 1 person, 17: 640x640 1 boat, 1 umbrella, 18: 640x640 1 person, 19: 640x640 2 persons, 1 bench, 1 tie, 2 cups, 482.0ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  48%|████▊     | 26/54 [00:18<00:18,  1.48it/s]


0: 640x640 1 person, 1: 640x640 1 person, 2: 640x640 1 motorcycle, 1 airplane, 1 cup, 3: 640x640 1 person, 4: 640x640 1 bottle, 5: 640x640 1 person, 1 bottle, 6: 640x640 1 cat, 1 bottle, 7: 640x640 3 persons, 1 backpack, 1 bottle, 8: 640x640 1 person, 1 bottle, 9: 640x640 1 bottle, 10: 640x640 1 person, 11: 640x640 1 person, 12: 640x640 1 person, 1 bottle, 13: 640x640 1 person, 1 bottle, 14: 640x640 1 bottle, 1 book, 15: 640x640 1 person, 1 baseball bat, 16: 640x640 6 bottles, 17: 640x640 1 bottle, 1 cup, 18: 640x640 1 person, 1 cell phone, 19: 640x640 4 persons, 1 bottle, 481.3ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)


Batch::  50%|█████     | 27/54 [00:19<00:18,  1.48it/s]


0: 640x640 1 person, 1: 640x640 2 persons, 2 tennis rackets, 2: 640x640 1 person, 2 baseball bats, 3: 640x640 10 persons, 1 tennis racket, 4: 640x640 9 persons, 5: 640x640 5 persons, 1 handbag, 1 tennis racket, 6: 640x640 1 person, 1 frisbee, 1 sports ball, 7: 640x640 1 person, 8: 640x640 2 persons, 3 tennis rackets, 9: 640x640 1 mouse, 1 toothbrush, 10: 640x640 1 remote, 11: 640x640 (no detections), 12: 640x640 1 person, 1 sports ball, 13: 640x640 1 person, 1 tennis racket, 14: 640x640 1 person, 2 tennis rackets, 15: 640x640 1 person, 1 tennis racket, 16: 640x640 1 person, 2 tennis rackets, 17: 640x640 2 persons, 1 frisbee, 1 bottle, 1 spoon, 18: 640x640 1 sports ball, 1 skateboard, 19: 640x640 4 persons, 1 tennis racket, 482.0ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)


Batch::  52%|█████▏    | 28/54 [00:19<00:17,  1.47it/s]


0: 640x640 1 person, 1 frisbee, 1 tennis racket, 1: 640x640 5 persons, 2 tennis rackets, 2: 640x640 3 persons, 1 tennis racket, 3: 640x640 4 persons, 1 tennis racket, 1 bottle, 4: 640x640 3 persons, 1 tennis racket, 5: 640x640 6 persons, 1 handbag, 1 tennis racket, 6: 640x640 2 persons, 2 frisbees, 3 baseball bats, 6 tennis rackets, 1 bottle, 1 scissors, 7: 640x640 2 persons, 1 baseball glove, 2 tennis rackets, 8: 640x640 2 persons, 1 baseball glove, 1 tennis racket, 9: 640x640 2 persons, 2 frisbees, 2 baseball bats, 10 tennis rackets, 10: 640x640 2 persons, 1 frisbee, 3 baseball bats, 1 baseball glove, 7 tennis rackets, 1 bottle, 11: 640x640 2 persons, 1 handbag, 1 frisbee, 1 skis, 1 snowboard, 3 baseball bats, 1 skateboard, 1 tennis racket, 12: 640x640 3 persons, 3 frisbees, 4 baseball bats, 4 tennis rackets, 2 apples, 4 scissorss, 13: 640x640 2 persons, 1 cup, 2 apples, 14: 640x640 1 scissors, 15: 640x640 1 apple, 1 scissors, 16: 640x640 1 apple, 1 clock, 17: 640x640 2 persons, 1 f

Batch::  54%|█████▎    | 29/54 [00:20<00:17,  1.45it/s]


0: 640x640 1 person, 1: 640x640 2 persons, 1 tennis racket, 2: 640x640 1 person, 1 tennis racket, 3: 640x640 1 person, 1 sports ball, 4: 640x640 2 persons, 1 handbag, 1 bottle, 1 tv, 5: 640x640 2 persons, 1 baseball bat, 1 tennis racket, 6: 640x640 2 persons, 1 tennis racket, 7: 640x640 2 persons, 1 handbag, 1 frisbee, 1 skis, 1 skateboard, 1 bottle, 8: 640x640 3 persons, 2 tennis rackets, 1 wine glass, 1 apple, 9: 640x640 1 person, 1 tennis racket, 10: 640x640 1 person, 2 tennis rackets, 11: 640x640 2 persons, 2 tennis rackets, 12: 640x640 2 persons, 1 tennis racket, 13: 640x640 2 persons, 2 tennis rackets, 14: 640x640 2 persons, 2 tennis rackets, 15: 640x640 2 persons, 3 tennis rackets, 16: 640x640 2 persons, 3 tennis rackets, 17: 640x640 2 persons, 2 tennis rackets, 18: 640x640 2 persons, 2 tennis rackets, 19: 640x640 2 persons, 1 tennis racket, 480.9ms
Speed: 1.5ms preprocess, 24.0ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)


Batch::  56%|█████▌    | 30/54 [00:21<00:16,  1.45it/s]


0: 640x640 2 persons, 1 sports ball, 1 baseball glove, 1: 640x640 3 persons, 1 frisbee, 2: 640x640 2 persons, 2 tennis rackets, 3: 640x640 2 persons, 3 tennis rackets, 4: 640x640 2 persons, 1 tennis racket, 5: 640x640 1 person, 1 tennis racket, 6: 640x640 1 person, 1 tennis racket, 7: 640x640 1 person, 1 tennis racket, 8: 640x640 1 person, 1 tennis racket, 9: 640x640 2 persons, 1 handbag, 5 baseball bats, 4 tennis rackets, 1 wine glass, 1 knife, 10: 640x640 2 persons, 1 frisbee, 2 apples, 6 scissorss, 11: 640x640 2 persons, 3 frisbees, 2 tennis rackets, 2 bottles, 3 scissorss, 12: 640x640 2 persons, 2 handbags, 1 frisbee, 1 skis, 1 tennis racket, 13: 640x640 1 person, 1 frisbee, 14: 640x640 1 sports ball, 15: 640x640 1 person, 1 tennis racket, 16: 640x640 1 person, 1 motorcycle, 1 frisbee, 17: 640x640 2 persons, 1 bottle, 1 toothbrush, 18: 640x640 1 person, 1 baseball bat, 19: 640x640 1 person, 1 tennis racket, 482.0ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.1ms postprocess per i

Batch::  57%|█████▋    | 31/54 [00:22<00:15,  1.45it/s]


0: 640x640 1 person, 1 sports ball, 1 tennis racket, 1: 640x640 1 person, 2: 640x640 1 person, 1 tennis racket, 3: 640x640 1 person, 1 tennis racket, 4: 640x640 1 person, 1 tennis racket, 5: 640x640 1 person, 1 tennis racket, 6: 640x640 1 person, 1 tennis racket, 7: 640x640 1 person, 1 tennis racket, 8: 640x640 1 person, 1 tennis racket, 9: 640x640 1 person, 1 baseball bat, 10: 640x640 1 person, 1 tennis racket, 11: 640x640 1 person, 1 tennis racket, 12: 640x640 1 person, 1 baseball bat, 1 tennis racket, 13: 640x640 1 person, 1 tennis racket, 14: 640x640 1 person, 1 tennis racket, 15: 640x640 1 person, 1 tennis racket, 16: 640x640 1 person, 1 tennis racket, 17: 640x640 1 person, 1 tennis racket, 18: 640x640 1 person, 1 tennis racket, 19: 640x640 1 person, 1 tennis racket, 481.0ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  59%|█████▉    | 32/54 [00:22<00:15,  1.46it/s]


0: 640x640 1 person, 1 tennis racket, 1: 640x640 1 person, 1 tennis racket, 2: 640x640 1 person, 4 sports balls, 1 tennis racket, 3: 640x640 1 person, 1 sports ball, 1 tennis racket, 4: 640x640 1 person, 1 handbag, 1 sports ball, 5: 640x640 1 person, 1 tennis racket, 6: 640x640 1 person, 7: 640x640 1 person, 8: 640x640 1 person, 1 handbag, 9: 640x640 1 person, 10: 640x640 1 person, 1 tennis racket, 11: 640x640 1 person, 1 tennis racket, 12: 640x640 1 person, 13: 640x640 1 person, 1 baseball bat, 14: 640x640 1 person, 1 tennis racket, 15: 640x640 1 person, 1 tennis racket, 16: 640x640 1 person, 1 tennis racket, 17: 640x640 1 person, 1 sports ball, 1 tennis racket, 18: 640x640 1 person, 1 tennis racket, 19: 640x640 1 person, 1 tennis racket, 481.0ms
Speed: 1.5ms preprocess, 24.0ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)


Batch::  61%|██████    | 33/54 [00:23<00:14,  1.46it/s]


0: 640x640 1 person, 1 baseball bat, 1 tennis racket, 1: 640x640 1 person, 1 tennis racket, 2: 640x640 1 person, 1 tennis racket, 3: 640x640 1 person, 1 tennis racket, 4: 640x640 1 person, 1 tennis racket, 5: 640x640 1 person, 6: 640x640 1 person, 7: 640x640 1 person, 1 tennis racket, 8: 640x640 1 person, 1 tennis racket, 9: 640x640 1 person, 1 frisbee, 1 tennis racket, 10: 640x640 1 person, 2 tennis rackets, 11: 640x640 1 person, 1 sports ball, 1 tennis racket, 12: 640x640 1 person, 1 tennis racket, 13: 640x640 1 person, 1 tennis racket, 14: 640x640 1 person, 1 tennis racket, 15: 640x640 1 person, 2 tennis rackets, 16: 640x640 1 person, 1 sports ball, 1 tennis racket, 17: 640x640 1 person, 1 tennis racket, 18: 640x640 1 person, 1 sports ball, 1 tennis racket, 19: 640x640 1 person, 1 tennis racket, 482.0ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)


Batch::  63%|██████▎   | 34/54 [00:24<00:13,  1.46it/s]


0: 640x640 1 person, 1 tennis racket, 1: 640x640 1 person, 1 clock, 2: 640x640 1 person, 1 sports ball, 2 tennis rackets, 3: 640x640 1 person, 1 tennis racket, 4: 640x640 1 person, 1 baseball bat, 2 tennis rackets, 5: 640x640 1 person, 1 tennis racket, 6: 640x640 1 person, 1 tennis racket, 7: 640x640 1 person, 1 sports ball, 2 tennis rackets, 8: 640x640 1 person, 1 tennis racket, 9: 640x640 1 person, 2 tennis rackets, 10: 640x640 1 person, 1 tie, 11: 640x640 1 person, 1 sports ball, 1 tennis racket, 12: 640x640 1 person, 1 tennis racket, 13: 640x640 1 person, 1 tennis racket, 14: 640x640 1 person, 1 tennis racket, 15: 640x640 1 person, 1 sports ball, 1 tennis racket, 16: 640x640 1 person, 1 tennis racket, 17: 640x640 1 person, 18: 640x640 1 person, 1 tennis racket, 19: 640x640 1 person, 1 tennis racket, 482.0ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)


Batch::  65%|██████▍   | 35/54 [00:24<00:13,  1.46it/s]


0: 640x640 1 person, 1: 640x640 1 person, 1 tennis racket, 2: 640x640 1 person, 1 sports ball, 1 tennis racket, 3: 640x640 1 person, 1 tennis racket, 4: 640x640 1 person, 2 tennis rackets, 5: 640x640 2 persons, 2 tennis rackets, 6: 640x640 1 person, 1 frisbee, 2 tennis rackets, 7: 640x640 1 person, 1 tennis racket, 8: 640x640 1 person, 1 sports ball, 1 tennis racket, 9: 640x640 1 person, 1 tennis racket, 10: 640x640 1 person, 1 tennis racket, 11: 640x640 1 person, 1 tennis racket, 12: 640x640 1 person, 1 sports ball, 1 tennis racket, 13: 640x640 1 person, 1 tennis racket, 14: 640x640 1 person, 2 tennis rackets, 15: 640x640 1 person, 1 tennis racket, 16: 640x640 1 person, 1 tennis racket, 17: 640x640 1 person, 18: 640x640 1 person, 1 tennis racket, 19: 640x640 1 person, 1 sports ball, 1 tennis racket, 481.0ms
Speed: 1.5ms preprocess, 24.0ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)


Batch::  67%|██████▋   | 36/54 [00:25<00:12,  1.46it/s]


0: 640x640 1 person, 1 tennis racket, 1: 640x640 1 person, 1 tennis racket, 2: 640x640 1 person, 1 tennis racket, 3: 640x640 1 person, 4: 640x640 1 person, 1 tennis racket, 5: 640x640 1 person, 1 tennis racket, 6: 640x640 1 person, 1 tennis racket, 7: 640x640 1 person, 1 tennis racket, 8: 640x640 1 person, 1 tennis racket, 9: 640x640 1 person, 1 tennis racket, 10: 640x640 2 persons, 1 tennis racket, 11: 640x640 3 persons, 1 tennis racket, 12: 640x640 1 person, 13: 640x640 1 person, 1 tennis racket, 14: 640x640 2 persons, 1 handbag, 1 frisbee, 1 tennis racket, 15: 640x640 3 persons, 1 tennis racket, 16: 640x640 1 person, 17: 640x640 3 persons, 1 tennis racket, 18: 640x640 3 persons, 1 tennis racket, 19: 640x640 2 persons, 1 frisbee, 1 sports ball, 1 tennis racket, 481.3ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)


Batch::  69%|██████▊   | 37/54 [00:26<00:11,  1.46it/s]


0: 640x640 3 persons, 2 tennis rackets, 1: 640x640 4 persons, 1 tennis racket, 2: 640x640 2 persons, 1 sports ball, 1 baseball bat, 3: 640x640 1 person, 1 frisbee, 1 tennis racket, 4: 640x640 1 person, 1 tennis racket, 5: 640x640 1 person, 1 tennis racket, 6: 640x640 2 persons, 2 tennis rackets, 1 apple, 7: 640x640 1 person, 1 frisbee, 2 skateboards, 8: 640x640 1 person, 1 tennis racket, 9: 640x640 3 persons, 1 frisbee, 1 tennis racket, 10: 640x640 1 person, 1 baseball bat, 1 tennis racket, 11: 640x640 1 person, 1 chair, 12: 640x640 1 person, 3 sports balls, 1 tennis racket, 13: 640x640 4 persons, 1 sports ball, 1 tennis racket, 14: 640x640 2 persons, 1 frisbee, 15: 640x640 3 persons, 1 baseball glove, 2 tennis rackets, 16: 640x640 2 persons, 2 tennis rackets, 17: 640x640 1 person, 1 truck, 1 tennis racket, 18: 640x640 2 persons, 1 tennis racket, 19: 640x640 2 persons, 1 frisbee, 1 tennis racket, 482.0ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.0ms postprocess per image at shape (

Batch::  70%|███████   | 38/54 [00:26<00:11,  1.45it/s]


0: 640x640 1 person, 1 sports ball, 1 tennis racket, 1: 640x640 2 persons, 3 sports balls, 1 tennis racket, 1 apple, 2: 640x640 2 persons, 2 tennis rackets, 3: 640x640 1 person, 2 tennis rackets, 4: 640x640 1 person, 1 tennis racket, 5: 640x640 1 person, 1 tennis racket, 6: 640x640 1 person, 1 tennis racket, 7: 640x640 1 person, 1 sports ball, 1 tennis racket, 8: 640x640 1 person, 1 tennis racket, 9: 640x640 1 person, 1 tennis racket, 10: 640x640 1 person, 11: 640x640 3 persons, 1 bench, 1 umbrella, 1 tennis racket, 12: 640x640 1 person, 1 tennis racket, 13: 640x640 2 persons, 1 frisbee, 14: 640x640 2 books, 15: 640x640 (no detections), 16: 640x640 3 persons, 1 apple, 17: 640x640 2 persons, 1 tennis racket, 18: 640x640 3 persons, 2 tennis rackets, 19: 640x640 2 persons, 1 scissors, 482.9ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  72%|███████▏  | 39/54 [00:27<00:10,  1.46it/s]


0: 640x640 5 persons, 2 tennis rackets, 1: 640x640 2 persons, 1 sports ball, 1 tennis racket, 2: 640x640 4 persons, 1 tennis racket, 3: 640x640 3 persons, 1 tennis racket, 4: 640x640 4 persons, 1 sports ball, 1 tennis racket, 5: 640x640 2 persons, 1 frisbee, 6: 640x640 1 person, 1 tennis racket, 7: 640x640 3 persons, 1 handbag, 1 tennis racket, 8: 640x640 3 persons, 2 tennis rackets, 9: 640x640 2 persons, 2 tennis rackets, 10: 640x640 2 persons, 2 bottles, 11: 640x640 1 person, 1 tennis racket, 1 apple, 12: 640x640 5 persons, 2 tennis rackets, 1 bottle, 13: 640x640 1 person, 1 car, 1 tennis racket, 14: 640x640 6 persons, 1 bottle, 15: 640x640 2 persons, 1 apple, 16: 640x640 4 persons, 1 handbag, 1 chair, 17: 640x640 3 persons, 3 tennis rackets, 18: 640x640 2 persons, 1 tennis racket, 1 cup, 1 chair, 19: 640x640 4 persons, 2 tennis rackets, 482.8ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)


Batch::  74%|███████▍  | 40/54 [00:28<00:09,  1.45it/s]


0: 640x640 1 person, 1 sports ball, 1 baseball bat, 1: 640x640 1 person, 1 tennis racket, 1 apple, 2: 640x640 2 persons, 3: 640x640 1 person, 1 umbrella, 1 tennis racket, 4: 640x640 2 persons, 1 baseball glove, 1 tennis racket, 5: 640x640 2 persons, 1 car, 3 handbags, 1 bottle, 1 cell phone, 6: 640x640 1 person, 2 tennis rackets, 7: 640x640 2 persons, 1 tennis racket, 8: 640x640 1 person, 1 baseball glove, 1 tennis racket, 9: 640x640 1 person, 1 baseball glove, 1 tennis racket, 10: 640x640 1 person, 1 tennis racket, 11: 640x640 1 person, 1 tennis racket, 12: 640x640 3 persons, 1 tennis racket, 1 bottle, 13: 640x640 1 person, 1 sports ball, 1 tennis racket, 14: 640x640 3 persons, 1 tennis racket, 15: 640x640 1 person, 1 tennis racket, 16: 640x640 2 persons, 1 frisbee, 1 tennis racket, 1 apple, 17: 640x640 1 person, 1 baseball bat, 18: 640x640 1 person, 1 sports ball, 1 baseball bat, 19: 640x640 1 person, 1 sports ball, 1 toothbrush, 483.3ms
Speed: 1.4ms preprocess, 24.2ms inference, 1.

Batch::  76%|███████▌  | 41/54 [00:28<00:08,  1.45it/s]


0: 640x640 4 persons, 1 handbag, 1 cell phone, 1: 640x640 6 persons, 3 tennis rackets, 2 bottles, 2: 640x640 1 person, 1 sports ball, 3: 640x640 1 person, 1 handbag, 4: 640x640 1 handbag, 5: 640x640 1 person, 1 tennis racket, 6: 640x640 1 person, 1 sports ball, 7: 640x640 1 person, 1 frisbee, 1 tennis racket, 8: 640x640 2 persons, 1 tennis racket, 9: 640x640 1 person, 1 tennis racket, 10: 640x640 (no detections), 11: 640x640 1 tennis racket, 1 scissors, 12: 640x640 1 person, 1 tennis racket, 13: 640x640 2 persons, 1 tennis racket, 14: 640x640 2 sports balls, 1 tennis racket, 15: 640x640 2 sports balls, 16: 640x640 (no detections), 17: 640x640 2 persons, 1 sports ball, 18: 640x640 2 sports balls, 1 skateboard, 1 tennis racket, 19: 640x640 1 person, 483.0ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  78%|███████▊  | 42/54 [00:29<00:08,  1.45it/s]


0: 640x640 1 bowl, 1 apple, 1: 640x640 2 sports balls, 1 tennis racket, 1 potted plant, 2: 640x640 1 sports ball, 1 tennis racket, 3: 640x640 1 person, 1 sports ball, 1 tennis racket, 4: 640x640 1 baseball bat, 1 tennis racket, 5: 640x640 (no detections), 6: 640x640 (no detections), 7: 640x640 1 frisbee, 2 sports balls, 8: 640x640 1 person, 9: 640x640 (no detections), 10: 640x640 1 apple, 11: 640x640 1 cup, 1 apple, 1 scissors, 12: 640x640 2 persons, 2 tennis rackets, 13: 640x640 1 knife, 1 apple, 1 scissors, 1 toothbrush, 14: 640x640 1 scissors, 1 toothbrush, 15: 640x640 1 person, 1 tennis racket, 16: 640x640 1 person, 1 tennis racket, 17: 640x640 1 person, 1 tennis racket, 18: 640x640 1 person, 1 sports ball, 1 tennis racket, 19: 640x640 1 person, 1 sports ball, 1 tennis racket, 481.8ms
Speed: 1.5ms preprocess, 24.1ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  80%|███████▉  | 43/54 [00:30<00:07,  1.44it/s]


0: 640x640 1 person, 1 tennis racket, 1: 640x640 1 person, 1 baseball bat, 2: 640x640 1 person, 1 tennis racket, 3: 640x640 1 person, 1 tennis racket, 4: 640x640 1 person, 1 tennis racket, 5: 640x640 1 person, 1 car, 1 sports ball, 1 baseball glove, 6: 640x640 1 person, 1 tennis racket, 7: 640x640 1 person, 1 tennis racket, 8: 640x640 1 person, 1 baseball bat, 9: 640x640 1 person, 1 sports ball, 1 baseball bat, 1 tennis racket, 10: 640x640 1 person, 1 tennis racket, 11: 640x640 1 person, 1 sports ball, 2 tennis rackets, 12: 640x640 1 person, 1 tennis racket, 13: 640x640 1 person, 1 tennis racket, 14: 640x640 1 person, 3 tennis rackets, 15: 640x640 1 person, 1 tennis racket, 16: 640x640 1 person, 1 tennis racket, 17: 640x640 1 person, 1 tennis racket, 18: 640x640 1 person, 1 tennis racket, 19: 640x640 1 person, 1 tennis racket, 482.5ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  81%|████████▏ | 44/54 [00:31<00:06,  1.45it/s]


0: 640x640 1 person, 1 tennis racket, 1: 640x640 1 person, 1 tennis racket, 2: 640x640 1 person, 1 frisbee, 1 tennis racket, 3: 640x640 1 person, 1 baseball bat, 4: 640x640 1 person, 1 tennis racket, 5: 640x640 1 person, 1 surfboard, 6: 640x640 1 person, 1 tennis racket, 7: 640x640 1 person, 8: 640x640 1 person, 1 sports ball, 1 tennis racket, 9: 640x640 1 person, 1 tennis racket, 10: 640x640 1 person, 2 tennis rackets, 11: 640x640 1 person, 1 sports ball, 1 tennis racket, 12: 640x640 1 person, 13: 640x640 1 person, 1 tennis racket, 14: 640x640 1 person, 1 tennis racket, 15: 640x640 1 sports ball, 1 tennis racket, 16: 640x640 1 scissors, 17: 640x640 1 tennis racket, 18: 640x640 3 sports balls, 19: 640x640 1 airplane, 482.9ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)


Batch::  83%|████████▎ | 45/54 [00:31<00:06,  1.46it/s]


0: 640x640 1 person, 1 frisbee, 1: 640x640 1 tennis racket, 1 scissors, 2: 640x640 1 frisbee, 1 scissors, 3: 640x640 1 sports ball, 4: 640x640 1 frisbee, 1 sports ball, 5: 640x640 1 sports ball, 1 tennis racket, 6: 640x640 1 handbag, 1 sports ball, 7: 640x640 1 apple, 8: 640x640 1 frisbee, 1 sports ball, 9: 640x640 1 bowl, 2 apples, 10: 640x640 1 skateboard, 11: 640x640 1 tennis racket, 12: 640x640 1 sports ball, 13: 640x640 (no detections), 14: 640x640 4 persons, 3 tennis rackets, 15: 640x640 3 persons, 1 chair, 16: 640x640 1 person, 1 tennis racket, 17: 640x640 1 person, 1 skateboard, 1 knife, 1 apple, 18: 640x640 2 persons, 19: 640x640 1 person, 1 car, 483.0ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  85%|████████▌ | 46/54 [00:32<00:05,  1.46it/s]


0: 640x640 4 persons, 1 apple, 1: 640x640 3 persons, 1 frisbee, 1 bowl, 1 apple, 2: 640x640 4 persons, 3: 640x640 1 person, 2 frisbees, 1 surfboard, 4: 640x640 1 person, 5: 640x640 (no detections), 6: 640x640 (no detections), 7: 640x640 (no detections), 8: 640x640 2 sports balls, 9: 640x640 (no detections), 10: 640x640 4 sports balls, 1 scissors, 11: 640x640 (no detections), 12: 640x640 1 scissors, 13: 640x640 (no detections), 14: 640x640 1 frisbee, 1 sports ball, 1 baseball bat, 15: 640x640 1 frisbee, 2 apples, 16: 640x640 (no detections), 17: 640x640 1 tennis racket, 18: 640x640 2 persons, 1 tennis racket, 19: 640x640 1 traffic light, 1 clock, 482.0ms
Speed: 1.5ms preprocess, 24.1ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)


Batch::  87%|████████▋ | 47/54 [00:33<00:04,  1.47it/s]


0: 640x640 1 frisbee, 1: 640x640 1 frisbee, 1 sports ball, 2: 640x640 (no detections), 3: 640x640 1 person, 1 sports ball, 1 tennis racket, 4: 640x640 1 sports ball, 5: 640x640 1 person, 6: 640x640 1 scissors, 7: 640x640 1 apple, 8: 640x640 (no detections), 9: 640x640 1 cup, 2 apples, 1 clock, 1 scissors, 10: 640x640 1 person, 1 scissors, 11: 640x640 1 potted plant, 12: 640x640 (no detections), 13: 640x640 1 scissors, 14: 640x640 1 scissors, 15: 640x640 1 toothbrush, 16: 640x640 1 bottle, 1 toothbrush, 17: 640x640 1 apple, 18: 640x640 (no detections), 19: 640x640 (no detections), 483.0ms
Speed: 1.5ms preprocess, 24.2ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 640)


Batch::  89%|████████▉ | 48/54 [00:33<00:04,  1.47it/s]


0: 640x640 1 scissors, 1: 640x640 1 apple, 1 scissors, 2: 640x640 1 scissors, 3: 640x640 (no detections), 4: 640x640 1 umbrella, 5: 640x640 1 baseball bat, 6: 640x640 2 tennis rackets, 1 scissors, 7: 640x640 1 sports ball, 1 baseball bat, 1 scissors, 8: 640x640 1 frisbee, 2 sports balls, 9: 640x640 1 toilet, 1 scissors, 10: 640x640 1 sports ball, 1 tennis racket, 11: 640x640 1 tennis racket, 12: 640x640 1 frisbee, 1 sports ball, 1 baseball bat, 1 knife, 1 toothbrush, 13: 640x640 2 sports balls, 1 tennis racket, 14: 640x640 (no detections), 15: 640x640 1 scissors, 16: 640x640 1 sports ball, 17: 640x640 (no detections), 18: 640x640 1 sports ball, 1 tennis racket, 19: 640x640 1 apple, 482.9ms
Speed: 1.5ms preprocess, 24.1ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 640)


Batch::  91%|█████████ | 49/54 [00:34<00:03,  1.48it/s]


0: 640x640 (no detections), 1: 640x640 2 apples, 1 cell phone, 2: 640x640 1 remote, 1 scissors, 3: 640x640 1 person, 1 baseball bat, 4: 640x640 1 person, 1 tennis racket, 5: 640x640 3 persons, 1 frisbee, 1 baseball bat, 1 tennis racket, 1 apple, 6: 640x640 5 persons, 3 tennis rackets, 7: 640x640 2 persons, 1 baseball glove, 1 tennis racket, 8: 640x640 1 person, 1 tennis racket, 9: 640x640 1 person, 10: 640x640 1 person, 11: 640x640 1 person, 1 truck, 1 sports ball, 1 skateboard, 12: 640x640 1 person, 13: 640x640 1 person, 14: 640x640 2 persons, 1 tennis racket, 15: 640x640 3 persons, 2 frisbees, 1 tennis racket, 16: 640x640 2 persons, 2 tennis rackets, 17: 640x640 4 persons, 1 tennis racket, 18: 640x640 3 persons, 2 frisbees, 1 tennis racket, 19: 640x640 1 person, 1 tennis racket, 482.9ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)


Batch::  93%|█████████▎| 50/54 [00:35<00:02,  1.47it/s]


0: 640x640 1 person, 1 tennis racket, 1: 640x640 1 person, 1 baseball bat, 2: 640x640 1 person, 1 baseball bat, 3: 640x640 1 person, 1 tennis racket, 4: 640x640 2 persons, 1 tennis racket, 5: 640x640 2 persons, 1 tennis racket, 6: 640x640 3 persons, 1 tennis racket, 7: 640x640 1 person, 1 scissors, 8: 640x640 2 persons, 1 tennis racket, 9: 640x640 1 person, 1 chair, 10: 640x640 1 person, 1 baseball bat, 1 tennis racket, 11: 640x640 1 person, 1 tennis racket, 12: 640x640 3 persons, 4 tennis rackets, 13: 640x640 2 persons, 2 tennis rackets, 14: 640x640 1 person, 1 apple, 1 scissors, 15: 640x640 1 person, 1 frisbee, 1 apple, 16: 640x640 1 person, 1 tennis racket, 17: 640x640 1 person, 1 tennis racket, 18: 640x640 1 person, 1 tennis racket, 19: 640x640 1 person, 1 tennis racket, 482.7ms
Speed: 1.4ms preprocess, 24.1ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)


Batch::  94%|█████████▍| 51/54 [00:35<00:02,  1.47it/s]


0: 640x640 4 persons, 1 umbrella, 1 tennis racket, 1: 640x640 3 persons, 1 sports ball, 2 tennis rackets, 2: 640x640 1 person, 1 baseball bat, 1 tennis racket, 3: 640x640 3 persons, 3 tennis rackets, 4: 640x640 3 persons, 2 tennis rackets, 5: 640x640 3 persons, 3 tennis rackets, 6: 640x640 1 person, 1 frisbee, 1 apple, 7: 640x640 1 person, 2 sports balls, 1 tennis racket, 8: 640x640 1 sports ball, 1 tennis racket, 9: 640x640 1 person, 1 tennis racket, 10: 640x640 1 person, 3 sports balls, 1 tennis racket, 11: 640x640 1 person, 12: 640x640 1 person, 13: 640x640 2 persons, 1 tennis racket, 14: 640x640 1 person, 1 tennis racket, 15: 640x640 2 persons, 2 tennis rackets, 16: 640x640 (no detections), 17: 640x640 1 person, 1 baseball bat, 18: 640x640 1 person, 1 sports ball, 1 tennis racket, 19: 640x640 1 person, 1 tennis racket, 482.8ms
Speed: 1.4ms preprocess, 24.1ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 640)


Batch::  96%|█████████▋| 52/54 [00:36<00:01,  1.47it/s]


0: 640x640 1 person, 1 tennis racket, 1: 640x640 2 persons, 1 apple, 1 hair drier, 2: 640x640 1 person, 1 baseball bat, 2 tennis rackets, 3: 640x640 1 person, 1 sports ball, 4: 640x640 2 persons, 1 frisbee, 1 sports ball, 1 baseball glove, 1 tennis racket, 5: 640x640 2 persons, 1 handbag, 6: 640x640 1 scissors, 7: 640x640 1 person, 3 sports balls, 1 baseball glove, 8: 640x640 1 scissors, 9: 640x640 1 scissors, 10: 640x640 2 sports balls, 1 baseball bat, 1 tennis racket, 11: 640x640 1 sports ball, 1 surfboard, 12: 640x640 3 sports balls, 13: 640x640 1 bench, 1 baseball bat, 14: 640x640 1 person, 1 sports ball, 15: 640x640 1 person, 1 tennis racket, 16: 640x640 1 person, 1 frisbee, 1 skateboard, 17: 640x640 1 person, 1 frisbee, 1 tennis racket, 18: 640x640 2 persons, 3 tennis rackets, 19: 640x640 1 person, 1 tennis racket, 1 bottle, 483.8ms
Speed: 1.5ms preprocess, 24.2ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)


Batch::  98%|█████████▊| 53/54 [00:37<00:00,  1.46it/s]


0: 640x640 1 person, 1 tennis racket, 1: 640x640 1 person, 1 baseball bat, 2: 640x640 1 person, 1 tennis racket, 3: 640x640 2 persons, 1 handbag, 1 frisbee, 2 tennis rackets, 1 apple, 4: 640x640 1 person, 5: 640x640 1 person, 6: 640x640 1 person, 2 tennis rackets, 7: 640x640 1 person, 1 tennis racket, 8: 640x640 1 person, 2 tennis rackets, 9: 640x640 1 person, 1 frisbee, 1 sports ball, 2 tennis rackets, 10: 640x640 (no detections), 11: 640x640 1 sports ball, 1 scissors, 12: 640x640 1 scissors, 13: 640x640 2 persons, 2 tennis rackets, 14: 640x640 4 persons, 2 tennis rackets, 15: 640x640 2 persons, 1 tennis racket, 16: 640x640 2 sports balls, 1 tennis racket, 17: 640x640 1 tennis racket, 1 scissors, 18: 640x640 1 bottle, 19: 640x640 1 tennis racket, 483.0ms
Speed: 1.5ms preprocess, 24.1ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 640)


Batch:: 100%|██████████| 54/54 [00:37<00:00,  1.43it/s]
