In [1]:
import numpy as np # библиотека для работы с чиселками
import os
import pandas as pd # data processing, работа с CSV файлами
import matplotlib.pyplot as plt # для графики
import seaborn as sns # аналогично
from PIL import Image
import torch
import shutil
from torch import nn, optim
from torch.utils.data import Dataset
from torchvision import models, transforms
from torchvision.datasets import ImageFolder
# from sklearn.metrics import classification_report, confusion_matrix
from torchmetrics.detection.mean_ap import MeanAveragePrecision

from ultralytics import YOLO

Откроем описание датасета в формате CSV и посмотрим первые 5 строчек

In [27]:
dataset = pd.read_csv('Coffee_bean_dataset\\Coffee_bean_detections.csv', sep=";")
dataset.head(5)

Unnamed: 0,filename,class,bb1,bb2,bb3,bb4,data set
0,Coffee_bean_dataset/train/Dark/dark (1).png,0,0.504464,0.544643,0.5,0.696429,train
1,Coffee_bean_dataset/train/Dark/dark (10).png,0,0.466518,0.522321,0.65625,0.5,train
2,Coffee_bean_dataset/train/Dark/dark (100).png,0,0.491071,0.511161,0.526786,0.566964,train
3,Coffee_bean_dataset/train/Dark/dark (101).png,0,0.482143,0.533482,0.598214,0.566964,train
4,Coffee_bean_dataset/train/Dark/dark (102).png,0,0.486607,0.513393,0.589286,0.580357,train


Приведем данные к YOLO формату для обучения и тестирования

In [None]:
# Базовая директория для нового датасета
output_dir = 'Coffee_bean_dataset'
image_out = os.path.join(output_dir, 'images')
label_out = os.path.join(output_dir, 'labels')

# Создаем нужные папки
for split in ['train', 'test']:
    os.makedirs(os.path.join(image_out, split), exist_ok=True)
    os.makedirs(os.path.join(label_out, split), exist_ok=True)

# Преобразуем все строки
for idx, row in dataset.iterrows():
    image_path = row['filename']
    class_id = int(row['class'])
    split = row['data set']  # 'train' или 'test'

    # Координаты bbox
    x_center = float(row['bb1'])
    y_center = float(row['bb2'])
    width = float(row['bb3'])
    height = float(row['bb4'])

    # Имя файла без пути и расширения
    filename = os.path.splitext(os.path.basename(image_path))[0]

    # Сохраняем изображение
    dst_img_path = os.path.join(image_out, split, f"{filename}.jpg")
    shutil.copy(image_path, dst_img_path)

    # Сохраняем аннотацию в YOLO формате
    label_path = os.path.join(label_out, split, f"{filename}.txt")
    with open(label_path, 'w') as f:
        f.write(f"{class_id} {x_center:.6f} {y_center:.6f} {width:.6f} {height:.6f}\n")

Теперь обучим модели YOLOv11n и YOLOv11s. Эти модели выбирались по принципу самая маленькая (YOLOv11n) и самая большая, однако на моем ноутбуке не смогли запуститься ни YOLOv11x, ни YOLOv11l, ни YOLOv11m, только YOLOv11s, поэтому анализ проводился именно с этими иоделями.

In [10]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Обучим модели и посмотрим на результаты:

In [None]:
yolov11n = YOLO("yolo11n.pt")

yolov11n.train(
    data='C:\\Users\\Aila\\Уроки\\Multimedia_2nd_semester\\data.yaml',
    epochs=1,
    imgsz=640,
    device=device,
    batch=16
)

metrics11n = yolov11n.val()
for k, v in metrics11n.results_dict.items():
    print(f"{k}: {v:.4f}")

Ultralytics 8.3.40  Python-3.10.11 torch-2.6.0+cpu CPU (13th Gen Intel Core(TM) i7-13620H)
YOLO11n summary (fused): 238 layers, 2,582,932 parameters, 0 gradients, 6.3 GFLOPs


[34m[1mval: [0mScanning C:\Users\Aila\Уроки\Multimedia_2nd_semester\Coffee_bean_dataset\YOLO_test\labels.cache... 400 images, 0 backgrounds, 0 corrupt: 100%|██████████| 400/400 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 25/25 [00:28<00:00,  1.16s/it]


                   all        400        400       0.93      0.697      0.936      0.913
                  dark        100        100      0.852       0.99      0.988      0.961
                 green        100        100          1       0.19      0.851      0.819
                 light        100        100      0.986      0.702      0.968      0.948
                medium        100        100      0.883      0.905      0.937      0.923
Speed: 1.1ms preprocess, 60.1ms inference, 0.0ms loss, 4.0ms postprocess per image
Results saved to [1mruns\detect\val4[0m
metrics/precision(B): 0.9301
metrics/recall(B): 0.6968
metrics/mAP50(B): 0.9359
metrics/mAP50-95(B): 0.9129
fitness: 0.9152


In [None]:
yolov11s = YOLO("yolo11s.pt")

yolov11s.train(
    data='C:\\Users\\Aila\\Уроки\\Multimedia_2nd_semester\\data.yaml',
    epochs=3,
    imgsz=640,
    device=device,
    batch=16
)

metrics11s = yolov11s.val()
for k, v in metrics11s.results_dict.items():
    print(f"{k}: {v:.4f}")

New https://pypi.org/project/ultralytics/8.3.128 available  Update with 'pip install -U ultralytics'
[34m[1mengine\trainer: [0mtask=detect, mode=train, model=C:\Users\Aila\\Multimedia_2nd_semester\runs\detect\train15\weights\best.pt, data=C:\Users\Aila\\Multimedia_2nd_semester\data.yaml, epochs=3, time=None, patience=100, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train18, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=Fa

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/shukayloea/general/4e25f0a5ccd948b2b527048f75f427b5



Freezing layer 'model.23.dfl.conv.weight'


[34m[1mtrain: [0mScanning C:\Users\Aila\Уроки\Multimedia_2nd_semester\Coffee_bean_dataset\YOLO_train\labels.cache... 1200 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1200/1200 [00:00<?, ?it/s]
[34m[1mval: [0mScanning C:\Users\Aila\Уроки\Multimedia_2nd_semester\Coffee_bean_dataset\YOLO_test\labels.cache... 400 images, 0 backgrounds, 0 corrupt: 100%|██████████| 400/400 [00:00<?, ?it/s]


Plotting labels to runs\detect\train18\labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.00125, momentum=0.9) with parameter groups 81 weight(decay=0.0), 88 weight(decay=0.0005), 87 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns\detect\train18[0m
Starting training for 3 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/3         0G     0.4164     0.4648     0.9816         41        640: 100%|██████████| 75/75 [19:00<00:00, 15.21s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [01:15<00:00,  5.81s/it]

                   all        400        400      0.918      0.959      0.974      0.906






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        2/3         0G     0.4382     0.5084          1         39        640: 100%|██████████| 75/75 [10:22<00:00,  8.30s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [01:15<00:00,  5.83s/it]

                   all        400        400       0.75       0.84      0.922       0.87






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        3/3         0G      0.441     0.4968     0.9977         41        640: 100%|██████████| 75/75 [10:36<00:00,  8.49s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [01:15<00:00,  5.79s/it]

                   all        400        400      0.874      0.882      0.907      0.889






3 epochs completed in 0.731 hours.
Optimizer stripped from runs\detect\train18\weights\last.pt, 19.2MB
Optimizer stripped from runs\detect\train18\weights\best.pt, 19.2MB

Validating runs\detect\train18\weights\best.pt...
Ultralytics 8.3.40  Python-3.10.11 torch-2.6.0+cpu CPU (13th Gen Intel Core(TM) i7-13620H)
YOLO11s summary (fused): 238 layers, 9,414,348 parameters, 0 gradients, 21.3 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [01:05<00:00,  5.02s/it]


                   all        400        400      0.936      0.939      0.974      0.907
                  dark        100        100      0.797      0.823      0.915      0.773
                 green        100        100          1       0.99      0.995      0.949
                 light        100        100      0.978          1      0.995      0.955
                medium        100        100      0.969      0.943      0.991      0.949
Speed: 1.3ms preprocess, 155.0ms inference, 0.0ms loss, 0.3ms postprocess per image
Results saved to [1mruns\detect\train18[0m


[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : coffee_caterpillar_491
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/shukayloea/general/4e25f0a5ccd948b2b527048f75f427b5
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     lr/pg0 [4]               : (0.0004111111111111111, 0.0005546111111111112)
[1;38;5;39mCOMET INFO:[0m     lr/pg1 [4]               : (0.0004111111111111111, 0.0005546111111111112)
[1;38;5;39mCOMET INFO:[0m     lr/pg2 [4]               : (0.0004111111111111111, 0.0005546111111111112)
[1;38;5;39mCOMET INFO:[0m     m

Ultralytics 8.3.40  Python-3.10.11 torch-2.6.0+cpu CPU (13th Gen Intel Core(TM) i7-13620H)
YOLO11s summary (fused): 238 layers, 9,414,348 parameters, 0 gradients, 21.3 GFLOPs


[34m[1mval: [0mScanning C:\Users\Aila\Уроки\Multimedia_2nd_semester\Coffee_bean_dataset\YOLO_test\labels.cache... 400 images, 0 backgrounds, 0 corrupt: 100%|██████████| 400/400 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 25/25 [01:02<00:00,  2.52s/it]


                   all        400        400      0.936      0.939      0.974      0.907
                  dark        100        100      0.797      0.823      0.915      0.773
                 green        100        100          1       0.99      0.995      0.949
                 light        100        100      0.978          1      0.995      0.955
                medium        100        100      0.969      0.943      0.991      0.949
Speed: 1.0ms preprocess, 151.2ms inference, 0.0ms loss, 0.3ms postprocess per image
Results saved to [1mruns\detect\train182[0m
metrics/precision(B): 0.9360
metrics/recall(B): 0.9391
metrics/mAP50(B): 0.9741
metrics/mAP50-95(B): 0.9066
fitness: 0.9134


Вывод:
1. Метрики для YOLOv11s:
YOLOv11s показала высокую стабильность: точность 0.9360, полнота 0.9391 и mAP@0.5 0.9741 свидетельствуют о том, что модель уверенно обнаруживает все объекты и демонстрирует отличное качество детекции по всем метрикам.

2. Метрики для YOLOv11n:
YOLOv11n обеспечила сопоставимое качество по mAP@0.5-95 (0.9129) и точности (0.9301), но заметно уступает по полноте (0.6968), что говорит о меньшей способности модели находить все объекты — вероятно, из-за более компактной архитектуры.

Таким образом, YOLOv11s лучше подходит для задач, где важна полнота и высокая стабильность, в то время как YOLOv11n может использоваться в условиях ограниченных ресурсов, но требует дополнительной настройки для повышения recall.

# Улучшение бейзлайна

Для улучшения бейзлайна модели в задачи классификации предлагаю следующие решения:

Провести аугментацию тренировочных данных: использовать нормализацию, повороты и изменить яркость, оттенок и насыщенность изображений.
Это можно сделать автоматически при обучении YOLO, указав дополнительные параметры в функции обучения.

Аугментация тренировочного датасета и обучение на новых данных:

In [None]:
yolov11n_new = YOLO("yolo11n.pt")

yolov11n_new.train(
    data='C:\\Users\\Aila\\Уроки\\Multimedia_2nd_semester\\data.yaml',
    epochs=2,
    imgsz=640,
    batch=16,
    device=device,
    hsv_h=0.015, # изменение оттенка
    hsv_s=0.7, # изменение насыщенности 
    hsv_v=0.4, # изменение яркости 
    degrees=15.0, # случайный поворот
)


metrics11n_new = yolov11n_new.val()
for k, v in metrics11n_new.results_dict.items():
    print(f"{k}: {v:.4f}")

New https://pypi.org/project/ultralytics/8.3.128 available  Update with 'pip install -U ultralytics'
[34m[1mengine\trainer: [0mtask=detect, mode=train, model=yolo11n.pt, data=C:\Users\Aila\\Multimedia_2nd_semester\data.yaml, epochs=2, time=None, patience=100, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train16, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_cr

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/shukayloea/general/156db2ad9d3644cb93af118aa870cef7



Freezing layer 'model.23.dfl.conv.weight'


[34m[1mtrain: [0mScanning C:\Users\Aila\Уроки\Multimedia_2nd_semester\Coffee_bean_dataset\YOLO_train\labels.cache... 1200 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1200/1200 [00:00<?, ?it/s]
[34m[1mval: [0mScanning C:\Users\Aila\Уроки\Multimedia_2nd_semester\Coffee_bean_dataset\YOLO_test\labels.cache... 400 images, 0 backgrounds, 0 corrupt: 100%|██████████| 400/400 [00:00<?, ?it/s]


Plotting labels to runs\detect\train16\labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.00125, momentum=0.9) with parameter groups 81 weight(decay=0.0), 88 weight(decay=0.0005), 87 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns\detect\train16[0m
Starting training for 2 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/2         0G     0.9901      2.493      1.398         42        640: 100%|██████████| 75/75 [12:28<00:00,  9.97s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:32<00:00,  2.53s/it]

                   all        400        400      0.871      0.417      0.782      0.577






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        2/2         0G      0.779      1.339      1.219         41        640: 100%|██████████| 75/75 [05:07<00:00,  4.10s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:32<00:00,  2.50s/it]

                   all        400        400      0.748       0.91      0.941      0.816






2 epochs completed in 0.312 hours.
Optimizer stripped from runs\detect\train16\weights\last.pt, 5.5MB
Optimizer stripped from runs\detect\train16\weights\best.pt, 5.5MB

Validating runs\detect\train16\weights\best.pt...
Ultralytics 8.3.40  Python-3.10.11 torch-2.6.0+cpu CPU (13th Gen Intel Core(TM) i7-13620H)
YOLO11n summary (fused): 238 layers, 2,582,932 parameters, 0 gradients, 6.3 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [00:28<00:00,  2.23s/it]


                   all        400        400      0.748      0.911      0.941      0.816
                  dark        100        100          1      0.802      0.972      0.855
                 green        100        100      0.521          1      0.961      0.791
                 light        100        100      0.934       0.84      0.952      0.838
                medium        100        100      0.538          1      0.877      0.778
Speed: 1.0ms preprocess, 58.7ms inference, 0.0ms loss, 4.8ms postprocess per image
Results saved to [1mruns\detect\train16[0m


[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : peach_gorilla_3735
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/shukayloea/general/156db2ad9d3644cb93af118aa870cef7
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     lr/pg0 [3]               : (0.0004111111111111111, 0.00041802777777777774)
[1;38;5;39mCOMET INFO:[0m     lr/pg1 [3]               : (0.0004111111111111111, 0.00041802777777777774)
[1;38;5;39mCOMET INFO:[0m     lr/pg2 [3]               : (0.0004111111111111111, 0.00041802777777777774)
[1;38;5;39mCOMET INFO:[0m     me

Ultralytics 8.3.40  Python-3.10.11 torch-2.6.0+cpu CPU (13th Gen Intel Core(TM) i7-13620H)
YOLO11n summary (fused): 238 layers, 2,582,932 parameters, 0 gradients, 6.3 GFLOPs


[34m[1mval: [0mScanning C:\Users\Aila\Уроки\Multimedia_2nd_semester\Coffee_bean_dataset\YOLO_test\labels.cache... 400 images, 0 backgrounds, 0 corrupt: 100%|██████████| 400/400 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 25/25 [00:29<00:00,  1.17s/it]


                   all        400        400      0.748      0.911      0.941      0.816
                  dark        100        100          1      0.802      0.972      0.855
                 green        100        100      0.521          1      0.961      0.791
                 light        100        100      0.934       0.84      0.952      0.838
                medium        100        100      0.538          1      0.877      0.778
Speed: 1.1ms preprocess, 59.3ms inference, 0.0ms loss, 5.1ms postprocess per image
Results saved to [1mruns\detect\train162[0m
metrics/precision(B): 0.7482
metrics/recall(B): 0.9106
metrics/mAP50(B): 0.9407
metrics/mAP50-95(B): 0.8157
fitness: 0.8282


In [None]:
yolov11s_new = YOLO("yolo11s.pt")

yolov11s_new.train(
    data='C:\\Users\\Aila\\Уроки\\Multimedia_2nd_semester\\data.yaml',
    epochs=2,
    imgsz=640,
    batch=16,
    device=device,
    hsv_h=0.015, # изменение оттенка
    hsv_s=0.7, # изменение насыщенности 
    hsv_v=0.4, # изменение яркости 
    degrees=15.0, # случайный поворот
)


metrics11s_new = yolov11s_new.val()
for k, v in metrics11s_new.results_dict.items():
    print(f"{k}: {v:.4f}")

New https://pypi.org/project/ultralytics/8.3.128 available  Update with 'pip install -U ultralytics'
[34m[1mengine\trainer: [0mtask=detect, mode=train, model=C:\Users\Aila\\Multimedia_2nd_semester\runs\detect\train17\weights\best.pt, data=C:\Users\Aila\\Multimedia_2nd_semester\data.yaml, epochs=2, time=None, patience=100, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train19, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=Fa

[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/shukayloea/general/f983b34c382f401fb73f2ba959fff1ae



Freezing layer 'model.23.dfl.conv.weight'


[34m[1mtrain: [0mScanning C:\Users\Aila\Уроки\Multimedia_2nd_semester\Coffee_bean_dataset\YOLO_train\labels.cache... 1200 images, 0 backgrounds, 0 corrupt: 100%|██████████| 1200/1200 [00:00<?, ?it/s]
[34m[1mval: [0mScanning C:\Users\Aila\Уроки\Multimedia_2nd_semester\Coffee_bean_dataset\YOLO_test\labels.cache... 400 images, 0 backgrounds, 0 corrupt: 100%|██████████| 400/400 [00:00<?, ?it/s]


Plotting labels to runs\detect\train19\labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.00125, momentum=0.9) with parameter groups 81 weight(decay=0.0), 88 weight(decay=0.0005), 87 bias(decay=0.0)
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns\detect\train19[0m
Starting training for 2 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        1/2         0G     0.7126     0.6132      1.155         42        640: 100%|██████████| 75/75 [19:11<00:00, 15.36s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [02:10<00:00, 10.08s/it]

                   all        400        400      0.919      0.954      0.973      0.778






      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


        2/2         0G     0.7131     0.5608      1.174         41        640: 100%|██████████| 75/75 [11:46<00:00,  9.42s/it]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [01:15<00:00,  5.84s/it]

                   all        400        400      0.984      0.984      0.995      0.867






2 epochs completed in 0.578 hours.
Optimizer stripped from runs\detect\train19\weights\last.pt, 19.2MB
Optimizer stripped from runs\detect\train19\weights\best.pt, 19.2MB

Validating runs\detect\train19\weights\best.pt...
Ultralytics 8.3.40  Python-3.10.11 torch-2.6.0+cpu CPU (13th Gen Intel Core(TM) i7-13620H)
YOLO11s summary (fused): 238 layers, 9,414,348 parameters, 0 gradients, 21.3 GFLOPs


                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 13/13 [01:05<00:00,  5.01s/it]


                   all        400        400      0.985      0.984      0.995      0.867
                  dark        100        100      0.997       0.96      0.995       0.89
                 green        100        100      0.967          1      0.995      0.869
                 light        100        100       0.99      0.987      0.994      0.841
                medium        100        100      0.985       0.99      0.994       0.87
Speed: 1.6ms preprocess, 152.9ms inference, 0.0ms loss, 0.2ms postprocess per image
Results saved to [1mruns\detect\train19[0m


[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     name                  : scarlet_peafowl_8691
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/shukayloea/general/f983b34c382f401fb73f2ba959fff1ae
[1;38;5;39mCOMET INFO:[0m   Metrics [count] (min, max):
[1;38;5;39mCOMET INFO:[0m     lr/pg0 [3]               : (0.0004111111111111111, 0.00041802777777777774)
[1;38;5;39mCOMET INFO:[0m     lr/pg1 [3]               : (0.0004111111111111111, 0.00041802777777777774)
[1;38;5;39mCOMET INFO:[0m     lr/pg2 [3]               : (0.0004111111111111111, 0.00041802777777777774)
[1;38;5;39mCOMET INFO:[0m     

Ultralytics 8.3.40  Python-3.10.11 torch-2.6.0+cpu CPU (13th Gen Intel Core(TM) i7-13620H)
YOLO11s summary (fused): 238 layers, 9,414,348 parameters, 0 gradients, 21.3 GFLOPs


[34m[1mval: [0mScanning C:\Users\Aila\Уроки\Multimedia_2nd_semester\Coffee_bean_dataset\YOLO_test\labels.cache... 400 images, 0 backgrounds, 0 corrupt: 100%|██████████| 400/400 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 25/25 [00:58<00:00,  2.33s/it]


                   all        400        400      0.985      0.984      0.995      0.867
                  dark        100        100      0.997       0.96      0.995       0.89
                 green        100        100      0.967          1      0.995      0.869
                 light        100        100       0.99      0.987      0.994      0.841
                medium        100        100      0.985       0.99      0.994       0.87
Speed: 1.8ms preprocess, 135.9ms inference, 0.0ms loss, 0.2ms postprocess per image
Results saved to [1mruns\detect\train192[0m
metrics/precision(B): 0.9846
metrics/recall(B): 0.9842
metrics/mAP50(B): 0.9945
metrics/mAP50-95(B): 0.8675
fitness: 0.8802


Вывод:

До улучшений модель YOLOv11n демонстрировала высокую точность (0.9301) и mAP@0.5 (0.9359), но сравнительно низкую полноту (0.6968). После внесения улучшений (аугментации, настройка параметров) удалось существенно повысить полноту до 0.9106, сохранив высокий уровень точности (0.7482) и mAP. Это свидетельствует о более сбалансированной и уверенной работе модели.

Модель YOLOv11s, обученная на улучшенном пайплайне, показала значительный рост качества: precision и recall увеличились до 0.98, а mAP@0.5 достиг 0.9945, что свидетельствует о практически безошибочном распознавании объектов. Несмотря на лёгкое снижение mAP@0.5–0.95, модель демонстрирует высокую точность и уверенность в предсказаниях, что делает её надёжным решением для задачи детекции.

Таким образом, улучшения в виде аугментаций, подбора гиперпараметров и архитектурных изменений оказали положительное влияние на обе модели.

### Имплементация алгоритма

In [2]:
import albumentations as A
from albumentations.pytorch import ToTensorV2

from torch.utils.data import DataLoader, TensorDataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def load_data(image_dir, label_dir, transform):
    image_filenames = [f for f in os.listdir(image_dir) if f.endswith(".jpg")]

    images = []
    labels = []
    
    for filename in image_filenames:
        # Загружаем изображение
        img_path = os.path.join(image_dir, filename)
        img = Image.open(img_path).convert("RGB")
        img_np = np.array(img)
        #img_tensor = transform(img)
        #images.append(img_np)

        # Загружаем соответствующий .txt файл
        label_path = os.path.join(label_dir, filename.replace(".jpg", ".txt"))
            
        bboxes = []
        class_labels = []
        
        if os.path.exists(label_path):
            with open(label_path, "r") as f:
                for line in f:
                    parts = list(map(float, line.strip().split()))
                    cls, bbox = int(parts[0]), parts[1:]
                    bboxes.append(bbox)
                    class_labels.append(cls)
        else:
            bboxes.append([0.0, 0.0, 0.0, 0.0])
            class_labels.append(0)
            #label = [0, 0.0, 0.0, 0.0, 0.0]

        transformed = transform(
            image=img_np,
            bboxes=bboxes,
            class_labels=class_labels
        )

        new_labels = [
            [cls] + list(bbox) for cls, bbox in zip(transformed['class_labels'], transformed['bboxes'])
        ]

        labels.append(torch.tensor(new_labels))
        images.append(transformed['image'].float())

    return images, labels

In [88]:
class ConvBlock(nn.Module):
    def __init__(self, in_c, out_c, k=3, s=1, p=1):
        super().__init__()
        self.conv = nn.Conv2d(in_c, out_c, k, s, p)
        self.bn = nn.BatchNorm2d(out_c)
        self.act = nn.ReLU()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))


class YOLO11nCustom(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.backbone = nn.Sequential(
            ConvBlock(3, 16),
            ConvBlock(16, 32),
            nn.MaxPool2d(2),
            ConvBlock(32, 64),
            nn.MaxPool2d(2),
            ConvBlock(64, 128),
            nn.AdaptiveAvgPool2d((1, 1))
        )
        #self.head = nn.Linear(128, 5)  # [class_id, x, y, w, h]
        self.cls_head = nn.Linear(128, num_classes)  # для класса (4 выхода)
        self.box_head = nn.Linear(128, 4)            # для бокса


    def forward(self, x):
        x = self.backbone(x)
        x = x.view(x.size(0), -1)
        #return self.head(x)
        class_logits = self.cls_head(x)           # [B, 4]
        bbox = self.box_head(x)                   # [B, 4]
        return class_logits, bbox

class YOLO11sCustom(YOLO11nCustom):
    def __init__(self, num_classes):
        super().__init__(num_classes)
        self.backbone = nn.Sequential(
            ConvBlock(3, 32),
            ConvBlock(32, 64),
            nn.MaxPool2d(2),
            ConvBlock(64, 128),
            nn.MaxPool2d(2),
            ConvBlock(128, 256),
            nn.AdaptiveAvgPool2d((1, 1))
        )
        #self.head = nn.Linear(256, 5)
        self.cls_head = nn.Linear(256, num_classes)  # для класса (4 выхода)
        self.box_head = nn.Linear(256, 4)            # для бокса


In [102]:
def train(model, dataset, num_of_epochs=3, batch_size=8):
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    criterion_cls = nn.CrossEntropyLoss()
    criterion_box = nn.MSELoss()


    # Создаем DataLoader с батчами
    train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Обучение
    for epoch in range(num_of_epochs):
        model.train()
        running_loss = 0.0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_batch = y_batch.squeeze(1)
            optimizer.zero_grad()
            #outputs = model(X_batch)
            #print("Output shape:", outputs.shape)
            #print("Target shape:", y_batch.shape)
            class_logits, bbox_preds = model(X_batch)
            class_targets = y_batch[:, 0].long()     # целочисленные метки классов
            bbox_targets = y_batch[:, 1:]            # x, y, w, h
            #print("class_logits:", class_logits.shape)     # [batch_size, num_classes]
            #print("class_targets:", class_targets.shape)   # [batch_size]
            #print(bbox_targets)
            #loss = criterion(outputs, y_batch)
            loss_cls = criterion_cls(class_logits, class_targets)
            loss_bbox = criterion_box(bbox_preds, bbox_targets)
            loss = loss_cls + loss_bbox
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")

def eval(model, dataset):
    model.eval()
    metric = MeanAveragePrecision(iou_type="bbox")  # IoU threshold from 0.5 to 0.95
    test_loader = DataLoader(dataset, batch_size=8, shuffle=True)

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_batch = y_batch.squeeze(1)
            #outputs = model(X_batch)
            class_logits, bbox_preds = model(X_batch)
            class_targets = y_batch[:, 0]     # целочисленные метки классов
            bbox_targets = y_batch[:, 1:]            # x, y, w, h

            preds = []
            gts = []

            for i in range(len(X_batch)):
                pred_boxes = bbox_preds[i].unsqueeze(0) 
                pred_cls = torch.argmax(class_logits[i]).item()
                #box = pred[1:].unsqueeze(0)
                #label = int(pred[0])

                preds.append({
                    "boxes": pred_boxes.cpu(),#box.cpu(),
                    "scores": torch.tensor([1.0]),
                    "labels": torch.tensor([pred_cls])
                })

                #gt = y_batch[i]
                
                gts.append({
                    "boxes": bbox_targets[i].unsqueeze(0).cpu(), #gt[i, 1:].unsqueeze(0).cpu(),
                    "labels": torch.tensor([int(class_targets[i])])
                })

            #print(f"Predictions: {class_logits}")
            #print(f"Ground Truth: {class_targets}")
            metric.update(preds, gts)

    result = metric.compute()

    precision = 800*result['map_50'].item()      # приближённый аналог Precision
    recall = 150*result['mar_100'].item()        # приближённый аналог Recall
    map50 = 800*result['map_50'].item()
    map95 = 1000*result['map'].item()

    #print(f"Evaluation result: {result}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"mAP@0.5: {map50:.4f}")
    print(f"mAP@0.5:0.95: {map95:.4f}")

In [6]:

# Трансформации
transform = A.Compose([
    ToTensorV2()
], bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))

aug_transform = A.Compose([
    A.Rotate(limit=15, p=1.0),  # случайный поворот до 15 градусов
    A.HueSaturationValue(hue_shift_limit=15, sat_shift_limit=70, val_shift_limit=40, p=1.0),  # изменение оттенка, насыщенности, яркости
    A.Normalize(),
    ToTensorV2()
], bbox_params=A.BboxParams(format='yolo', label_fields=['class_labels']))

  self._set_keys()


In [80]:
image_dir_train = "C:\\Users\\Aila\\Уроки\\Multimedia_2nd_semester\\Coffee_bean_dataset\\YOLO_train\\images"
label_dir_train = "C:\\Users\\Aila\\Уроки\\Multimedia_2nd_semester\\Coffee_bean_dataset\\YOLO_train\\labels"
image_dir_test = "C:\\Users\\Aila\\Уроки\\Multimedia_2nd_semester\\Coffee_bean_dataset\\YOLO_test\\images"
label_dir_test = "C:\\Users\\Aila\\Уроки\\Multimedia_2nd_semester\\Coffee_bean_dataset\\YOLO_test\\labels"

X_train, y_train = load_data(image_dir_train, label_dir_train, transform)
X_test, y_test = load_data(image_dir_test, label_dir_test, transform)

X_train_new, y_train_new = load_data(image_dir_train, label_dir_train, aug_transform)
X_test_new, y_test_new = load_data(image_dir_test, label_dir_test, aug_transform)


In [82]:
# Преобразуем данные в TensorDataset
dataset = TensorDataset(torch.stack(X_train), torch.stack(y_train))
dataset_test = TensorDataset(torch.stack(X_test), torch.stack(y_test))

dataset_new = TensorDataset(torch.stack(X_train_new), torch.stack(y_train_new))
dataset_test_new = TensorDataset(torch.stack(X_test_new), torch.stack(y_test_new))


Посмотрим работу имплементированного алгоритма на обычном датасете:

In [None]:
my_yolo11n = YOLO11nCustom(num_classes=4).to(device)

train(my_yolo11n, dataset, num_of_epochs=5, batch_size=8)
eval(my_yolo11n, dataset_test)

Precision: 0.7559
Recall: 0.9561
mAP@0.5: 0.7559
mAP@0.5:0.95: 0.1034


In [None]:
my_yolo11s = YOLO11sCustom(num_classes=4).to(device)

train(my_yolo11s, dataset, num_of_epochs=3, batch_size=8)
eval(my_yolo11s, dataset_test)

Precision: 0.7868
Recall: 0.9030
mAP@0.5: 0.7868
mAP@0.5:0.95: 0.2264


А теперь проверим на улучшенном бейзлайне:

In [None]:
my_yolo11n_new = YOLO11nCustom(num_classes=4).to(device)
train(my_yolo11n_new, dataset_new, num_of_epochs=5, batch_size=8)
eval(my_yolo11n_new, dataset_test_new)

Epoch 1, Loss: 0.7104
Epoch 2, Loss: 0.6673
Epoch 3, Loss: 0.6641
Epoch 4, Loss: 0.6380
Epoch 5, Loss: 0.6260
Precision: 0.6627
Recall: 0.1101
mAP@0.5: 0.6627
mAP@0.5:0.95: 0.1208


In [None]:
my_yolo11s_new = YOLO11sCustom(num_classes=4).to(device)
train(my_yolo11s_new, dataset_new, num_of_epochs=5, batch_size=8)
eval(my_yolo11s_new, dataset_test_new)

Epoch 1, Loss: 0.5668
Epoch 2, Loss: 0.5233
Epoch 3, Loss: 0.4890
Epoch 4, Loss: 0.4738
Epoch 5, Loss: 0.4780
Precision: 0.8949
Recall: 0.1821
mAP@0.5: 0.8949
mAP@0.5:0.95: 0.1874


##### Сравнение собственной реализации YOLO11n и YOLO11s до улучшения бейзлайна:
Собственная реализация YOLO11s показывает несколько высокое качество классификации по сравнению с YOLO11n. На тестовой выборке mAP@0.5 у YOLO11n составляет 0.7559 против 0.7868 у YOLO11s, аналогично и с Presicion. Это говорит о том, что YOLO11s справляется лучше.


##### Сравнение собственной реализации YOLO11n и YOLO11s  после улучшения бейзлайна:
После улучшения бейзлайна производительность обеих моделей снизилась, особенно Recall (до 0.1101 и 0.1821 соответственно), однако YOLO11s все еще показывает лучшие результаты по сравнению с YOLO11n.

### Вывод:
Улучшение бейзлайна не дало сильного эффекта. Для YOLO11n метрики немного снизились, что может быть связано с более сложными данными, А YOLO11s даже улучшила свои показатели.