## Object Detection Model training using YOLO
References:
- [Documentation](https://docs.ultralytics.com/yolov5/tutorials/train_custom_data/#13-prepare-dataset-for-yolov5)
- [Testing IoU](https://stackoverflow.com/questions/77565416/how-to-test-iou-score-after-training-a-yolo-model)
- [IoU calculation](https://stackoverflow.com/questions/25349178/calculating-percentage-of-bounding-box-overlap-for-image-detector-evaluation)
- [Hungarian Algorithm to match Bounding Boxes](https://gist.github.com/AruniRC/c629c2df0e68e23aff7dcaeef87c72d4)

In [1]:
# Import necessary libraries
import numpy as np
import os, sys 
import matplotlib.pyplot as plt
from ultralytics import YOLO
import fiftyone as fo 
import fiftyone.zoo as foz
from fiftyone import ViewField as F
import json, shutil 
from collections import defaultdict
from itertools import product 
from functools import reduce
from scipy.optimize import linear_sum_assignment

# Remember the directory the notebook is running from; the dataset download
# location and the YOLO input/output folders below are built from it
curr_dir = os.getcwd()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Store FiftyOne zoo downloads inside the notebook's working directory
fo.config.dataset_zoo_dir = curr_dir

# Options for the COCO-2017 download: a seeded, shuffled sample of at most
# 5000 'train' images, requesting detection labels for the 'person' class
zoo_options = dict(
    splits=['train'],
    shuffle=True,
    seed=0,
    max_samples=5000,
    label_types=['detections'],
    only_matching=True,
    classes=['person'],
)

# Download the split if necessary and load it as a FiftyOne dataset
dataset = foz.load_zoo_dataset("coco-2017", **zoo_options)

Downloading split 'train' to '/Users/sudhansh/Desktop/Projects/Machine Learning Practice/Object Detection/coco-2017/train' if necessary
Found annotations at '/Users/sudhansh/Desktop/Projects/Machine Learning Practice/Object Detection/coco-2017/raw/instances_train2017.json'
Sufficient images already downloaded
Existing download of split 'train' is sufficient
Loading existing dataset 'coco-2017-train-5000'. To reload from disk, either delete the existing dataset or provide a custom `dataset_name` to use


In [3]:
# Build a FiftyOne dataset from the COCO-format files already on disk:
# the train images folder plus its labels.json annotation file
train_images_path = 'coco-2017/train/data'
train_labels_path = 'coco-2017/train/labels.json'

coco_dataset = fo.Dataset.from_dir(
    dataset_type=fo.types.COCODetectionDataset,
    data_path=train_images_path,
    labels_path=train_labels_path,
    max_samples=5000,
    include_id=True,
)

 100% |███████████████| 5000/5000 [1.3m elapsed, 0s remaining, 57.6 samples/s]      


In [4]:
# The dataset loaded above contains all COCO classes.
# Re-export only the annotations, in COCO format, keeping just 'person'
person_labels_path = "coco-2017/labels.json"

coco_dataset.export(
    labels_path=person_labels_path,
    dataset_type=fo.types.COCODetectionDataset,
    classes=['person'],
)

Found multiple fields ['detections', 'segmentations'] with compatible type (<class 'fiftyone.core.labels.Detections'>, <class 'fiftyone.core.labels.Polylines'>, <class 'fiftyone.core.labels.Keypoints'>); exporting 'detections'
   0% |/--------------|   17/5000 [121.4ms elapsed, 35.6s remaining, 140.0 samples/s] 



   1% |\--------------|   51/5000 [325.6ms elapsed, 31.6s remaining, 156.6 samples/s] 



   2% |/--------------|   90/5000 [533.5ms elapsed, 29.1s remaining, 168.7 samples/s] 



   3% ||--------------|  134/5000 [1.1s elapsed, 40.0s remaining, 121.7 samples/s]    



   4% |\--------------|  181/5000 [1.4s elapsed, 37.7s remaining, 124.1 samples/s]    



   4% |/--------------|  217/5000 [1.6s elapsed, 36.0s remaining, 119.7 samples/s]    



   6% ||--------------|  291/5000 [1.9s elapsed, 30.9s remaining, 187.8 samples/s]    



   7% |█|-------------|  361/5000 [2.5s elapsed, 32.0s remaining, 164.2 samples/s]    



  10% |█--------------|  477/5000 [3.1s elapsed, 29.3s remaining, 182.5 samples/s]    



  15% |██-------------|  758/5000 [4.8s elapsed, 26.8s remaining, 163.6 samples/s]    



  16% |██|------------|  796/5000 [5.0s elapsed, 26.3s remaining, 164.8 samples/s]    



 100% |███████████████| 5000/5000 [27.0s elapsed, 0s remaining, 191.8 samples/s]      


In [5]:
# Folder layout for the COCO -> YOLO conversion:
#   input_dir      COCO data written by the previous cells
#   images_folder  source images of the COCO train split
#   output_dir     root of the YOLO-formatted dataset
input_dir = curr_dir + "/coco-2017/"
output_dir = curr_dir + "/yolo/"
images_folder = input_dir + "train/data/"

if not os.path.exists(output_dir):
    os.mkdir(output_dir)

# Start every run from empty train/ and test/ folders, each holding an
# images/ and a labels/ sub-folder
for split in ['train', 'test']:
    split_dir = output_dir + split
    shutil.rmtree(split_dir, ignore_errors=True)
    os.mkdir(split_dir)
    for sub_folder in ('/images', '/labels'):
        os.mkdir(split_dir + sub_folder)

In [6]:

def _coco_bbox_to_yolo(bbox, image_width, image_height):
    """Convert a COCO ``[x, y, w, h]`` box into YOLO's normalised centre format.

    Args:
        bbox: sequence ``[x, y, w, h]`` where ``(x, y)`` is the box corner
            with the smallest coordinates and ``w``/``h`` its extent.
        image_width: width of the image the box belongs to.
        image_height: height of the image the box belongs to.

    Returns:
        Tuple ``(x_centre, y_centre, w, h)`` with horizontal values divided
        by ``image_width`` and vertical values divided by ``image_height``.
    """
    x, y, w, h = bbox
    x_centre = (x + w/2) / image_width
    y_centre = (y + h/2) / image_height
    return x_centre, y_centre, w / image_width, h / image_height


# Normalised ground-truth boxes keyed by image file name; filled for every
# image (train and test) so that predictions can be scored against them later
ground_truths = defaultdict(list)

# Read the annotations
with open(input_dir + 'labels.json', 'r') as f:
    data = json.load(f)

# Count number of images
num_data = len(data['images'])
print(f"Total number of images are {num_data}")

# Choose 80-20 split. The test size is derived from the train size so the two
# always add up to num_data (flooring both fractions can lose an image).
num_train = int(np.floor(0.8*num_data))
num_test = num_data - num_train
print(num_train, num_test)

# Group the annotations by the image they belong to
id_ann = defaultdict(list)
for ann in data['annotations']:
    id_ann[ann['image_id']].append(ann)

# Write one YOLO label file per image and copy the image next to it.
# count is the 1-based position of the image; the first num_train images go
# to 'train', the remainder to 'test'.
count = 0
for count, image in enumerate(data['images'], start=1):
    width = image['width']
    height = image['height']
    file_name = image['file_name']
    # splitext only strips the extension, so names containing extra dots
    # are kept intact
    image_stem = os.path.splitext(file_name)[0]
    image_id = image['id']

    split = 'train' if count <= num_train else 'test'

    # The context manager closes the label file even if a write fails
    with open(f'{output_dir}{split}/labels/{image_stem}.txt', 'w') as label_file:
        for annotation in id_ann[image_id]:
            # NOTE(review): the COCO category_id is written unchanged as the
            # YOLO class index — confirm it matches the class indices
            # declared in config.yaml
            current_category = annotation['category_id']
            x_centre, y_centre, w, h = _coco_bbox_to_yolo(annotation['bbox'], width, height)

            # Keep the unrounded values for the IoU evaluation
            ground_truths[file_name].append([x_centre, y_centre, w, h])

            # Label files store the values with six decimal places
            label_file.write(
                f"{current_category} {x_centre:.6f} {y_centre:.6f} {w:.6f} {h:.6f}\n"
            )

    # Keep the original file name (including its extension) so that the file
    # on disk matches the ground_truths key
    shutil.copy(images_folder + file_name, f'{output_dir}{split}/images/{file_name}')
				
	


Total number of images are 5000
4000.0 1000.0


In [11]:
# Create the YOLO model from the 'yolov5nu.pt' weights file; this is the
# model object that gets trained in the next cell
pretrained_weights = 'yolov5nu.pt'
model = YOLO(pretrained_weights)

In [12]:
# Training options: dataset description from config.yaml, batch size 16,
# 10 epochs, with plots enabled.
# device='mps' is for Apple Silicon
training_options = dict(
    data='config.yaml',
    batch=16,
    epochs=10,
    plots=True,
    device='mps',
)

# Train the model
train_results = model.train(**training_options)

New https://pypi.org/project/ultralytics/8.1.45 available 😃 Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.1.34 🚀 Python-3.8.16 torch-1.13.1 CPU (Apple M1 Pro)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov5nu.pt, data=config.yaml, epochs=10, time=None, patience=100, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=mps, workers=8, project=None, name=train25, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, sav

[34m[1mtrain: [0mScanning /Users/sudhansh/Desktop/Projects/Machine Learning Practice/Object Detection/yolo/train/labels.cache... 3995 images, 0 backgrounds, 5 corrupt: 100%|██████████| 4000/4000 [00:00<?, ?it/s]




[34m[1mval: [0mScanning /Users/sudhansh/Desktop/Projects/Machine Learning Practice/Object Detection/yolo/test/labels.cache... 1000 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1000/1000 [00:00<?, ?it/s]

Plotting labels to /opt/homebrew/runs/detect/train25/labels.jpg... 





[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.002, momentum=0.9) with parameter groups 69 weight(decay=0.0), 76 weight(decay=0.0005), 75 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1m/opt/homebrew/runs/detect/train25[0m
Starting training for 10 epochs...
Closing dataloader mosaic

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/10         0G       1.35      1.814      1.295         42        640: 100%|██████████| 250/250 [3:02:35<00:00, 43.82s/it]     
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 32/32 [03:05<00:00,  5.78s/it]

                   all       1000       4065      0.593      0.439      0.477      0.265






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       2/10         0G      1.621      1.721      1.491         38        640: 100%|██████████| 250/250 [28:35<00:00,  6.86s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 32/32 [03:08<00:00,  5.90s/it]

                   all       1000       4065      0.536       0.45      0.462      0.243






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       3/10         0G      1.672        1.7      1.538         61        640: 100%|██████████| 250/250 [29:52<00:00,  7.17s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 32/32 [03:07<00:00,  5.85s/it]

                   all       1000       4065      0.573      0.406      0.448      0.229






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       4/10         0G      1.628      1.604      1.514         44        640: 100%|██████████| 250/250 [28:48<00:00,  6.91s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 32/32 [03:04<00:00,  5.76s/it]

                   all       1000       4065      0.617      0.441      0.491       0.26






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       5/10         0G      1.591      1.546      1.476         49        640: 100%|██████████| 250/250 [28:33<00:00,  6.85s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 32/32 [03:04<00:00,  5.77s/it]

                   all       1000       4065      0.595      0.442      0.485      0.257






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       6/10         0G       1.55      1.483      1.446         27        640: 100%|██████████| 250/250 [29:31<00:00,  7.08s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 32/32 [03:10<00:00,  5.95s/it]

                   all       1000       4065      0.637      0.471      0.536      0.303






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       7/10         0G      1.494      1.416      1.414         27        640: 100%|██████████| 250/250 [29:33<00:00,  7.09s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 32/32 [03:13<00:00,  6.05s/it]

                   all       1000       4065       0.68      0.503      0.574      0.336






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       8/10         0G      1.446      1.346      1.373         43        640: 100%|██████████| 250/250 [28:40<00:00,  6.88s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 32/32 [03:08<00:00,  5.90s/it]

                   all       1000       4065      0.719      0.514      0.599      0.347






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       9/10         0G      1.373      1.261       1.33         39        640: 100%|██████████| 250/250 [6:25:48<00:00, 92.60s/it]     
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 32/32 [1:40:31<00:00, 188.48s/it]  

                   all       1000       4065      0.746      0.523      0.626      0.382






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


      10/10         0G      1.327      1.206        1.3         42        640: 100%|██████████| 250/250 [5:58:25<00:00, 86.02s/it]    
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 32/32 [03:06<00:00,  5.83s/it]

                   all       1000       4065      0.709       0.55      0.632      0.389






10 epochs completed in 20.986 hours.
Optimizer stripped from /opt/homebrew/runs/detect/train25/weights/last.pt, 5.2MB
Optimizer stripped from /opt/homebrew/runs/detect/train25/weights/best.pt, 5.2MB

Validating /opt/homebrew/runs/detect/train25/weights/best.pt...
Ultralytics YOLOv8.1.34 🚀 Python-3.8.16 torch-1.13.1 CPU (Apple M1 Pro)
YOLOv5n summary (fused): 193 layers, 2503139 parameters, 0 gradients, 7.1 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 32/32 [02:56<00:00,  5.51s/it]


                   all       1000       4065      0.711       0.55      0.633      0.389
Speed: 0.8ms preprocess, 170.0ms inference, 0.0ms loss, 0.5ms postprocess per image
Results saved to [1m/opt/homebrew/runs/detect/train25[0m


In [13]:
# Write the trained model to 'trained.pt'
trained_weights_path = 'trained.pt'
model.save(filename=trained_weights_path)

In [14]:
def _xywh_to_corners(box):
    """Return ``(x_min, y_min, x_max, y_max)`` for a centre-format ``[x, y, w, h]`` box."""
    half_w = box[2]/2
    half_h = box[3]/2
    return box[0] - half_w, box[1] - half_h, box[0] + half_w, box[1] + half_h


def calc_iou(bb1, bb2):
    """Intersection over union of two axis-aligned boxes.

    Args:
        bb1: first box as ``[x_centre, y_centre, width, height]``.
        bb2: second box in the same format and the same units as ``bb1``.

    Returns:
        The IoU as a float in ``[0.0, 1.0]``. Boxes that do not overlap
        (including boxes that only touch along an edge) give ``0.0``; so do
        degenerate inputs whose union has no area.
    """
    b1_x1, b1_y1, b1_x2, b1_y2 = _xywh_to_corners(bb1)
    b2_x1, b2_y1, b2_x2, b2_y2 = _xywh_to_corners(bb2)

    # Extent of the intersection rectangle along each axis; a non-positive
    # extent means the boxes do not overlap on that axis
    inter_w = min(b1_x2, b2_x2) - max(b1_x1, b2_x1)
    inter_h = min(b1_y2, b2_y2) - max(b1_y1, b2_y1)
    if inter_w <= 0 or inter_h <= 0:
        return 0.0

    # The intersection of two axis-aligned boxes is itself an axis-aligned box
    intersection_area = inter_w * inter_h

    bb1_area = bb1[2]*bb1[3]
    bb2_area = bb2[2]*bb2[3]
    union_area = bb1_area + bb2_area - intersection_area

    # Zero-area boxes would otherwise cause a division by zero
    if union_area <= 0:
        return 0.0

    iou = float(intersection_area) / float(union_area)

    # Rounding in the corner arithmetic can push the ratio marginally outside
    # [0, 1] for (near-)identical boxes, so clamp instead of asserting
    return min(1.0, max(0.0, iou))

In [44]:
# Test the data with IOU score
test_images_folder = output_dir + 'test/images/'
test_labels_folder = output_dir + 'test/labels'
test_files = os.listdir(test_images_folder)
sum_iou = 0
# Per-image results: file name -> (mean IoU, sorted IoUs of the matched pairs)
ious = defaultdict(float)

# Annotated prediction images are written here
results_folder = output_dir + 'results/'
os.makedirs(results_folder, exist_ok=True)

for test_file in test_files:
    res = model.predict(test_images_folder + test_file)
    res[0].save(results_folder + test_file)

    gt = ground_truths[test_file]
    # Bring the boxes to host memory first: .numpy() fails for tensors that
    # live on an accelerator device such as MPS
    preds = res[0].boxes.xywhn.cpu().numpy()

    # NOTE: images without ground truth or without any detection add nothing
    # to sum_iou but are still counted in the final denominator
    if len(gt) == 0 or len(preds) == 0:
        continue

    # Pairwise IoU, rows = ground-truth boxes, columns = predicted boxes
    iou_matrix = np.array(
        [[calc_iou(gt_box, pred_box) for pred_box in preds] for gt_box in gt]
    )

    # Hungarian matching: minimising (1 - IoU) maximises the total IoU of the
    # one-to-one assignment. It yields min(len(gt), len(preds)) pairs, so
    # boxes left unmatched on either side do not enter the mean.
    gt_idx, pred_idx = linear_sum_assignment(1 - iou_matrix)
    assigned_ious = np.sort(iou_matrix[gt_idx, pred_idx])
    mean_iou = np.mean(assigned_ious)

    assert mean_iou <= 1.0

    sum_iou += mean_iou
    ious[test_file] = (mean_iou, assigned_ious)

# Average of the per-image mean IoUs over all test images
num_test = len(test_files)
if num_test:
    print(sum_iou/num_test)
else:
    print(f"No test images found in {test_images_folder}")


image 1/1 /Users/sudhansh/Desktop/Projects/Machine Learning Practice/Object Detection/yolo/test/images/000000520047.jpg: 448x640 1 person, 63.0ms
Speed: 1.9ms preprocess, 63.0ms inference, 0.8ms postprocess per image at shape (1, 3, 448, 640)
1

image 1/1 /Users/sudhansh/Desktop/Projects/Machine Learning Practice/Object Detection/yolo/test/images/000000536619.jpg: 480x640 5 persons, 58.6ms
Speed: 1.1ms preprocess, 58.6ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)
5

image 1/1 /Users/sudhansh/Desktop/Projects/Machine Learning Practice/Object Detection/yolo/test/images/000000566984.jpg: 480x640 9 persons, 54.9ms
Speed: 0.7ms preprocess, 54.9ms inference, 0.5ms postprocess per image at shape (1, 3, 480, 640)
9

image 1/1 /Users/sudhansh/Desktop/Projects/Machine Learning Practice/Object Detection/yolo/test/images/000000499763.jpg: 640x448 1 person, 51.3ms
Speed: 0.9ms preprocess, 51.3ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 448)
1

image 1/1 /Us

In [45]:
# Sum of the per-image mean IoUs accumulated by the evaluation loop
# (i.e. the value before it is divided by the number of test images)
print(sum_iou)

678.031050278821
