# Импорт библиотек

In [1]:
import copy
import io
import math
import os
import warnings
from contextlib import redirect_stdout
from getpass import getpass
from time import time, localtime, strftime

import cv2
import numpy as np
import pandas as pd
import torch
import torchvision
from clearml import Task, Logger
from torch.utils.data import Dataset, DataLoader
from torchvision import tv_tensors
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import v2 as T

from engine import train_one_epoch, evaluate
import utils

warnings.filterwarnings('ignore')

# Подключение ClearML

In [2]:
%env CLEARML_WEB_HOST=https://app.clear.ml
%env CLEARML_API_HOST=https://api.clear.ml
%env CLEARML_FILES_HOST=https://files.clear.ml
%env CLEARML_API_ACCESS_KEY=ZSRLXZMP0J1YF71WUXFW
%env CLEARML_API_SECRET_KEY=rKFqd1bjD1v23kEUyzPD7wK3v6qbXlsoX1kuBKECEk7jFDCOWQ

env: CLEARML_WEB_HOST=https://app.clear.ml
env: CLEARML_API_HOST=https://api.clear.ml
env: CLEARML_FILES_HOST=https://files.clear.ml
env: CLEARML_API_ACCESS_KEY=<redacted>
env: CLEARML_API_SECRET_KEY=<redacted>


In [3]:
# Register this notebook run as a ClearML task; `task` is used at the end of
# the notebook to upload artifacts and to close the run.
task = Task.init(project_name='Faster RCNN', task_name='Training',
                 tags=['Faster RCNN', 'PyTorch', 'Training', 'Big dataset'])

ClearML Task: created new task id=8e154479ee3d4dd8aab9d0b467eec72e
2024-04-05 01:29:01,883 - clearml.Task - INFO - Storing jupyter notebook directly as code
ClearML results page: https://app.clear.ml/projects/ccb749dab7464bfaa292381a3f21b2a9/experiments/8e154479ee3d4dd8aab9d0b467eec72e/output/log


# Подготовка данных для обучения и тестирования

In [4]:
# Annotation tables: the training split and the split used for validation
# (the latter is read from 'test_df.csv').
train_data, valid_data = (
    pd.read_csv(csv_path) for csv_path in ('train_df.csv', 'test_df.csv')
)

In [4]:
class WildLifeDataset(Dataset):
    """Object-detection dataset built from an annotation table and an image folder.

    Every item is one image, resized to ``width`` x ``height`` and returned as a
    float32 RGB tensor in CHW layout scaled to [0, 1], together with a target
    dict holding ``boxes`` (XYXY, rescaled to the resized image), ``area``,
    ``labels``, ``iscrowd`` and ``image_id``.

    Args:
        dataframe: annotation table with one row per box and the columns
            ``img_filename``, ``ann_bbox_xmin``, ``ann_bbox_ymin``,
            ``ann_bbox_xmax``, ``ann_bbox_ymax`` and ``cat_id``.
        img_dir: directory containing the files named in ``img_filename``.
        width: width in pixels every image is resized to.
        height: height in pixels every image is resized to.
        transforms: optional callable ``(image, target) -> (image, target)``.

    Raises:
        FileNotFoundError: from ``__getitem__`` when an image cannot be read.
    """

    _BOX_COLUMNS = ['ann_bbox_xmin', 'ann_bbox_ymin', 'ann_bbox_xmax', 'ann_bbox_ymax']

    def __init__(self, dataframe, img_dir, width, height, transforms=None):
        super().__init__()

        self.df = dataframe
        self.img_dir = img_dir
        self.images = list(dataframe['img_filename'].unique())

        self.width = width
        self.height = height

        self.transforms = transforms

        # Row positions of every image's annotations, computed once. Filtering
        # the whole frame inside __getitem__ costs O(rows) per item, i.e.
        # O(rows * images) per epoch.
        self._rows_by_image = dataframe.groupby('img_filename', sort=False).indices

    def __len__(self):
        return len(self.images)

    def _build_target(self, img, wt, ht, image_id):
        """Build the target dict for image ``img`` whose original size is ``wt`` x ``ht``."""
        rows = self.df.iloc[self._rows_by_image[img]]

        # Rescale the boxes from the original image size to the resized one.
        coords = rows[self._BOX_COLUMNS].to_numpy(dtype=np.float64)
        source_size = np.array([wt, ht, wt, ht], dtype=np.float64)
        target_size = np.array(
            [self.width, self.height, self.width, self.height], dtype=np.float64
        )
        coords = coords / source_size * target_size

        # reshape keeps the (N, 4) layout even when N == 0.
        boxes = torch.as_tensor(coords, dtype=torch.float32).reshape(-1, 4)
        area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])

        # Shift ids by one - presumably cat_id is 0-based and label 0 is left
        # for the detector's background class (TODO confirm against the data).
        # The explicit dtype keeps labels int64 even for an empty list.
        labels = torch.as_tensor(rows['cat_id'].to_numpy() + 1, dtype=torch.int64)

        iscrowd = torch.zeros((boxes.shape[0],), dtype=torch.int64)

        return {
            "boxes": boxes,
            "area": area,
            "labels": labels,
            "iscrowd": iscrowd,
            "image_id": image_id
        }

    def __getitem__(self, idx):
        img = self.images[idx]
        path = f'{self.img_dir}/{img}'
        image = cv2.imread(path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise FileNotFoundError(f'Cannot read image: {path}')

        ht, wt = image.shape[:2]

        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32)
        # The interpolation flag has to be passed by keyword: the third
        # positional parameter of cv2.resize is `dst`, not `interpolation`.
        image = cv2.resize(image, (self.width, self.height), interpolation=cv2.INTER_AREA)
        image /= 255.0

        # HWC -> CHW
        image = torch.from_numpy(image).permute(2, 0, 1)

        target = self._build_target(img, wt, ht, idx)

        if self.transforms is not None:
            # torchvision.transforms.v2 only applies geometric transforms
            # (e.g. a horizontal flip) to boxes that are wrapped as
            # BoundingBoxes; plain tensors in the target are passed through
            # untouched, which would leave the boxes un-flipped.
            image = tv_tensors.Image(image)
            target["boxes"] = tv_tensors.BoundingBoxes(
                target["boxes"],
                format="XYXY",
                canvas_size=(self.height, self.width),
            )
            image, target = self.transforms(image, target)

        return image, target

In [2]:
def get_transform(train):
    """Build the torchvision v2 transform pipeline for a dataset split.

    Args:
        train: when truthy, random augmentations (horizontal flip, colour
            jitter, Gaussian blur) are prepended to the pipeline.

    Returns:
        A ``T.Compose`` that ends with float conversion and ``ToPureTensor``.

    NOTE: ``RandomHorizontalFlip`` only moves boxes that are passed in as
    ``tv_tensors.BoundingBoxes``; boxes given as plain tensors are returned
    unchanged.
    """
    transforms = []
    if train:
        transforms.append(T.RandomHorizontalFlip(p=0.5))
        # ColorJitter and GaussianBlur take no `p` argument (and GaussianBlur
        # requires `kernel_size`), so the probabilities are applied through
        # RandomApply. The jitter strengths and kernel size are moderate
        # starting values - tune them for the dataset.
        transforms.append(T.RandomApply(
            [T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.05)],
            p=0.3,
        ))
        transforms.append(T.RandomApply([T.GaussianBlur(kernel_size=5)], p=0.1))

    transforms.append(T.ToDtype(torch.float, scale=True))
    transforms.append(T.ToPureTensor())
    return T.Compose(transforms)

In [24]:
# Both splits are resized to 640x640; only the training split is augmented.
train_dataset = WildLifeDataset(
    dataframe=train_data,
    img_dir='../../dataset/images/train',
    width=640,
    height=640,
    transforms=get_transform(train=True),
)
valid_dataset = WildLifeDataset(
    dataframe=valid_data,
    img_dir='../../dataset/images/test',
    width=640,
    height=640,
    transforms=get_transform(train=False),
)

# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=4,
    shuffle=True,
    num_workers=0,
    collate_fn=utils.collate_fn,
)

# Validation keeps the dataset order.
valid_dataloader = DataLoader(
    dataset=valid_dataset,
    batch_size=4,
    shuffle=False,
    num_workers=0,
    collate_fn=utils.collate_fn,
)

# Создание модели и обучение

**Функция для создания модели**

In [8]:
def get_object_detection_model(num_classes):
    """Return a Faster R-CNN (ResNet-50 FPN) with a box head sized for ``num_classes``.

    The detector is created with the "DEFAULT" weights and its box predictor is
    replaced by a new ``FastRCNNPredictor`` that outputs ``num_classes`` classes.
    """
    detector = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")

    # The new head must accept the same feature size as the one it replaces.
    head_inputs = detector.roi_heads.box_predictor.cls_score.in_features
    detector.roi_heads.box_predictor = FastRCNNPredictor(head_inputs, num_classes)

    return detector

**Определение устройства для обучения модели**

In [9]:
# Train on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

**Определение параметров обучения**

In [18]:
# Number of outputs of the box classifier. The dataset shifts every cat_id by
# one, so this presumably counts the background class as well - TODO confirm.
num_classes = 14

# nn.Module.to() returns the module itself, so creation and transfer can be chained.
model = get_object_detection_model(num_classes).to(device)

# Optimise only the parameters that require gradients.
params = list(filter(lambda parameter: parameter.requires_grad, model.parameters()))
optimizer = torch.optim.AdamW(params, lr=0.0001, weight_decay=0.0005)

# Cosine schedule with warm restarts; the first (and every following) cycle
# is 110 scheduler steps long.
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer, T_0=110, T_mult=1
)

**Обучение модели**

In [11]:
class Predictor:
    """Record of the best model seen so far: its weights and the scores it reached."""

    # Weights of the best model; None until a best model has been recorded.
    weights = None
    # Best scores so far, both starting at zero (float32 scalars are immutable,
    # so sharing one initial object is safe).
    ap95 = ar = np.float32(0)

In [12]:
def clear_string(string_out):
    """Empty a seekable text buffer and leave its position at the start."""
    string_out.seek(0)
    # With no argument, truncate() cuts at the current position, i.e. at 0.
    string_out.truncate()

In [13]:
num_epochs = 100
# Early stopping: `epoch_progress` is the patience, `epoch_remained` counts the
# epochs that are left before training stops without a new best model.
epoch_progress = epoch_remained = 5

best_predictor = Predictor()

# Number of batches per epoch, used as the print frequency of train_one_epoch.
frequency = math.ceil(len(train_dataloader.dataset) / train_dataloader.batch_size)
# Buffer that captures everything train_one_epoch / evaluate print, so the
# metrics can be parsed out of the text and forwarded to ClearML.
fake_out = io.StringIO()
log = Logger.current_logger()

# torch.save does not create missing directories.
os.makedirs('training', exist_ok=True)

for epoch in range(1, num_epochs + 1):
    print("-" * 100)
    print(f'Epoch {epoch}    {strftime("%d-%m-%Y %H:%M:%S", localtime())}')
    start_time = time()
    with redirect_stdout(fake_out):
        train_one_epoch(model, optimizer, train_dataloader, device, epoch, print_freq=frequency)
        lr_scheduler.step()
        evaluation = evaluate(model, valid_dataloader, device=device)

    output = fake_out.getvalue()
    clear_string(fake_out)

    # Reset per epoch. None (rather than a truthiness test) marks "no loss
    # parsed yet", so the names are always bound and a loss of exactly 0.0 is
    # still reported.
    loss_classifier = loss_box_reg = None
    ap95 = np.float32(0.0)
    ar = np.float32(0.0)

    for line in output.split('\n'):
        line_splitted = line.split()
        if line.startswith(f'Epoch: [{epoch}]  ['):
            # Training progress line: the value follows the metric name token.
            # Later progress lines overwrite earlier ones.
            loss_classifier = np.float32(line_splitted[line_splitted.index('loss_classifier:') + 1])
            loss_box_reg = np.float32(line_splitted[line_splitted.index('loss_box_reg:') + 1])
            continue

        # First line after the progress lines: report the last parsed losses once.
        if loss_classifier is not None and loss_box_reg is not None:
            log.report_scalar("Training", "Loss classifier", iteration=epoch, value=loss_classifier)
            log.report_scalar("Training", "Loss box reg", iteration=epoch, value=loss_box_reg)
            loss_classifier = loss_box_reg = None

        # Evaluation summary lines: the metric value is the last token.
        if line.startswith(' Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ]'):
            ap95 = np.float32(line_splitted[-1])

            log.report_scalar("Training", "AP IoU=0.50:0.95", iteration=epoch, value=ap95)

        elif line.startswith(' Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ]'):
            ap = np.float32(line_splitted[-1])

            log.report_scalar("Training", "AP IoU=0.50", iteration=epoch, value=ap)

        elif line.startswith(' Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ]'):
            ar = np.float32(line_splitted[-1])

            log.report_scalar("Training", "AR IoU=0.50:0.95", iteration=epoch, value=ar)

    # Always keep the latest weights, plus a numbered snapshot every 5 epochs.
    torch.save(model.state_dict(), 'training/faster_rcnn_weights_last.pt')
    if epoch % 5 == 0:
        torch.save(model.state_dict(), f'training/faster_rcnn_weights_{epoch}.pt')

    # A new best model must be at least as good on both AP@[.50:.95] and AR.
    if best_predictor.ap95 <= ap95 and best_predictor.ar <= ar:
        # state_dict() returns references to the live parameter tensors, which
        # the optimizer keeps updating in place; without a copy the "best"
        # weights would silently turn into the weights of the last epoch.
        best_predictor.weights = copy.deepcopy(model.state_dict())
        best_predictor.ap95 = ap95
        best_predictor.ar = ar
        epoch_remained = epoch_progress
        print(f"New best classifier epoch {epoch}")

    else:
        epoch_remained -= 1

    evaluation_time = time() - start_time
    print(f"Epoch evaluation time {evaluation_time}")
    log.report_scalar("Time", "Evaluation time", iteration=epoch, value=evaluation_time)

    # Patience exhausted: stop training early.
    if epoch_remained == 0:
        break

----------------------------------------------------------------------------------------------------
Epoch 1    05-04-2024 01:29:16
ClearML Monitor: Could not detect iteration reporting, falling back to iterations as seconds-from-start
2024-04-05 02:34:29,758 - clearml.frameworks - INFO - Found existing registered model id=c9d80a500d0b4cf8b292362876754c1f [C:\Users\User\Jupyter\DL\faster_rcnn\training\faster_rcnn_weights_last.pt] reusing it.
New best classifier epoch 1
Epoch evaluation time 3926.8371658325195
----------------------------------------------------------------------------------------------------
Epoch 2    05-04-2024 02:34:43
New best classifier epoch 2
Epoch evaluation time 3996.658895969391
----------------------------------------------------------------------------------------------------
Epoch 3    05-04-2024 03:41:19
Epoch evaluation time 4000.5950000286102
----------------------------------------------------------------------------------------------------
Epoch 4    

Retrying (Retry(total=2, connect=2, read=5, redirect=5, status=None)) after connection broken by 'NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000029DB9EABB10>: Failed to establish a new connection: [WinError 10051] Сделана попытка выполнить операцию на сокете при отключенной сети')': /


New best classifier epoch 8
Epoch evaluation time 3993.32422041893
----------------------------------------------------------------------------------------------------
Epoch 9    05-04-2024 10:21:09
New best classifier epoch 9
Epoch evaluation time 4001.8101937770844
----------------------------------------------------------------------------------------------------
Epoch 10    05-04-2024 11:27:51


Certificate did not match expected hostname: files.clear.ml. Certificate: {'subject': ((('commonName', 'web-search-results-api-public.qa.bws.esa.com'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', 'Amazon'),), (('commonName', 'Amazon RSA 2048 M02'),)), 'version': 3, 'serialNumber': '0EA57E037BE121416BCDD3B11AF87FB4', 'notBefore': 'Sep 20 00:00:00 2023 GMT', 'notAfter': 'Oct 18 23:59:59 2024 GMT', 'subjectAltName': (('DNS', 'web-search-results-api-public.qa.bws.esa.com'),), 'OCSP': ('http://ocsp.r2m02.amazontrust.com',), 'caIssuers': ('http://crt.r2m02.amazontrust.com/r2m02.cer',), 'crlDistributionPoints': ('http://crl.r2m02.amazontrust.com/r2m02.crl',)}
Certificate did not match expected hostname: files.clear.ml. Certificate: {'subject': ((('commonName', 'web-search-results-api-public.qa.bws.esa.com'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', 'Amazon'),), (('commonName', 'Amazon RSA 2048 M02'),)), 'version': 3, 'serialNumber': '0EA57E037BE121416B

New best classifier epoch 10
Epoch evaluation time 4003.511302947998
----------------------------------------------------------------------------------------------------
Epoch 11    05-04-2024 12:34:35


Certificate did not match expected hostname: files.clear.ml. Certificate: {'subject': ((('commonName', 'web-search-results-api-public.qa.bws.esa.com'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', 'Amazon'),), (('commonName', 'Amazon RSA 2048 M02'),)), 'version': 3, 'serialNumber': '0EA57E037BE121416BCDD3B11AF87FB4', 'notBefore': 'Sep 20 00:00:00 2023 GMT', 'notAfter': 'Oct 18 23:59:59 2024 GMT', 'subjectAltName': (('DNS', 'web-search-results-api-public.qa.bws.esa.com'),), 'OCSP': ('http://ocsp.r2m02.amazontrust.com',), 'caIssuers': ('http://crt.r2m02.amazontrust.com/r2m02.cer',), 'crlDistributionPoints': ('http://crl.r2m02.amazontrust.com/r2m02.crl',)}
Certificate did not match expected hostname: files.clear.ml. Certificate: {'subject': ((('commonName', 'web-search-results-api-public.qa.bws.esa.com'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', 'Amazon'),), (('commonName', 'Amazon RSA 2048 M02'),)), 'version': 3, 'serialNumber': '0EA57E037BE121416B

Epoch evaluation time 4019.1907646656036
----------------------------------------------------------------------------------------------------
Epoch 12    05-04-2024 13:41:34
Epoch evaluation time 4078.4684250354767
----------------------------------------------------------------------------------------------------
Epoch 13    05-04-2024 14:49:32


KeyboardInterrupt: 

**Сохранение весов модели**

In [14]:
# Write the best weights to disk ...
torch.save(
    best_predictor.weights,
    'training/faster_rcnn_weights_best.pt',
)
# ... and attach the same object to the ClearML task as an artifact.
task.upload_artifact(
    name='faster_rcnn_weights_best.pt',
    artifact_object=best_predictor.weights,
)

2024-04-05 14:52:55,176 - clearml.frameworks - INFO - Found existing registered model id=2c8ccb5bcd534815999ea392232f68a0 [C:\Users\User\Jupyter\DL\faster_rcnn\training\faster_rcnn_weights_best.pt] reusing it.


True

In [15]:
# Full training checkpoint: last epoch number plus the state of the model,
# the optimizer and the LR scheduler.
torch.save(
    dict(
        epoch=epoch,
        model_state_dict=model.state_dict(),
        optimizer_state_dict=optimizer.state_dict(),
        scheduler_state_dict=lr_scheduler.state_dict(),
    ),
    'training/checkpoint.pth',
)

2024-04-05 14:53:45,168 - clearml.frameworks - INFO - Found existing registered model id=50d5c396b974464c983f25398d2db95c [C:\Users\User\Jupyter\DL\faster_rcnn\checkpoint.pth] reusing it.


In [17]:
# Close the ClearML task that was opened with Task.init at the top of the notebook.
task.close()