#Pythonで学ぶ画像認識　第5章 物体検出
##第5.4節 Transformer による手法～DETR を実装してみよう

###モジュールのインポートとGoogleドライブのマウント

In [None]:
from tqdm import tqdm
from collections import deque
from typing import Callable, Sequence, List, Tuple, Union
import json
from pycocotools.cocoeval import COCOeval
import numpy as np
from PIL import Image
from pathlib import Path
import math
from scipy.optimize import linear_sum_assignment

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

from torchvision.utils import draw_bounding_boxes

# Mount Google Drive (Colab only) so files stored there are reachable
# under /content/drive
from google.colab import drive
drive.mount('/content/drive')

import sys
# Add this section's directory on Drive to the module search path;
# presumably this is where the util / dataset / transform / model modules
# imported below live -- TODO confirm
sys.path.append('drive/MyDrive/python_image_recognition/5_object_detection/5_4_detr')

import util
import dataset
import transform as T
from model import ResNet18

Mounted at /content/drive


###位置エンコーディングを生成するクラス

In [None]:
class PositionalEncoding:
    '''
    Generator of 2D sinusoidal positional encodings
    eps        : small constant added to the normalizer to avoid
                 division by zero
    temperature: base T of the frequency term T^(2i / d)
    '''
    def __init__(self, eps: float=1e-6, temperature: int=10000):
        self.eps = eps
        self.temperature = temperature

    @torch.no_grad()
    def generate(self, x: torch.Tensor, mask: torch.Tensor):
        '''
        Build the positional encoding for a feature map
        x   : feature map, [batch size, channels, height, width]
        mask: padding mask, [batch size, height, width];
              True entries are excluded from the coordinate count
        Returns a tensor of shape
        [batch size, 2 * (channels // 2), height, width]; the first half
        of the channels encodes the y coordinate, the second half the
        x coordinate.
        '''
        # Each axis gets half of the input channels so that the
        # concatenation of both matches the input channel count
        half_channels = x.shape[1] // 2

        # Exponents 2i / d, duplicated so that a sin channel and the
        # following cos channel share one frequency:
        # [0, 2, ...] -> [0, 0, 2, 2, ...]
        freq = torch.arange(0, half_channels, 2,
                            dtype=x.dtype, device=x.device)
        freq = freq.repeat_interleave(2)
        freq.div_(half_channels)
        # Denominator T^(2i / d), shaped to broadcast along channels
        freq = (self.temperature ** freq).view(half_channels, 1, 1)

        # Running count of unmasked cells gives each cell its coordinate
        visible = ~mask

        per_axis = []
        # Axis 1 is height (y), axis 2 is width (x); y comes first
        for axis in (1, 2):
            coords = visible.cumsum(axis, dtype=torch.float32)

            # Normalize to 0-1 within each sample and scale by 2*pi
            largest = coords.max(dim=axis, keepdim=True)[0]
            coords = 2 * math.pi * coords / (largest + self.eps)

            # Add a channel axis and divide by the per-channel frequency
            angles = coords.unsqueeze(1) / freq
            # Even channels hold sin, odd channels hold cos
            angles[:, ::2] = angles[:, ::2].sin()
            angles[:, 1::2] = angles[:, 1::2].cos()
            per_axis.append(angles)

        return torch.cat(per_axis, dim=1)

###Transformerエンコーダ層

In [None]:
class TransformerEncoderLayer(nn.Module):
    '''
    Transformer encoder layer (self-attention followed by a feed-forward
    network, each with dropout, a residual connection and LayerNorm)
    dim_hidden     : feature dimension
    num_heads      : number of heads of the multi-head attention
    dim_feedforward: hidden dimension of the feed-forward network
    dropout        : dropout rate
    '''
    def __init__(self, dim_hidden: int=256, num_heads: int=8,
                 dim_feedforward: int=2048, dropout: float=0.1):
        super().__init__()

        # Self-attention sub-block
        self.self_attention = nn.MultiheadAttention(
            dim_hidden, num_heads, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(dim_hidden)

        # Feed-forward sub-block
        feedforward_layers = [
            nn.Linear(dim_hidden, dim_feedforward),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(dim_feedforward, dim_hidden),
        ]
        self.fnn = nn.Sequential(*feedforward_layers)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(dim_hidden)

    def forward(self, x: torch.Tensor, pos_encoding: torch.Tensor,
                mask: torch.Tensor):
        '''
        Forward pass
        x           : feature-map tokens,
                      [number of tokens, batch size, feature dimension]
        pos_encoding: positional encoding,
                      [number of tokens, batch size, feature dimension]
        mask        : padding mask, [batch size, number of tokens];
                      keys where it is True are ignored by attention
        '''
        # Only queries and keys carry the positional encoding, so position
        # influences the attention weights but not the gathered values
        query = key = x + pos_encoding

        # MultiheadAttention returns (output, attention weights);
        # only the output is needed here
        attended, _ = self.self_attention(
            query, key, x, key_padding_mask=mask)
        x = self.norm1(x + self.dropout1(attended))

        # Feed-forward network with its own residual connection
        x = self.norm2(x + self.dropout2(self.fnn(x)))

        return x

###Transformerデコーダ層

In [None]:
class TransformerDecoderLayer(nn.Module):
    '''
    Transformer decoder layer (self-attention among object features,
    cross-attention to the feature-map tokens, then a feed-forward
    network; each with dropout, a residual connection and LayerNorm)
    dim_hidden     : feature dimension
    num_heads      : number of heads of the multi-head attention
    dim_feedforward: hidden dimension of the feed-forward network
                     (an integer feature count passed to nn.Linear)
    dropout        : dropout rate
    '''
    def __init__(self, dim_hidden: int=256, num_heads: int=8,
                 dim_feedforward: int=2048, dropout: float=0.1):
        super().__init__()

        # Self-attention among the object features
        self.self_attention = nn.MultiheadAttention(
            dim_hidden, num_heads, dropout=dropout)
        self.dropout1 = nn.Dropout(dropout)
        self.norm1 = nn.LayerNorm(dim_hidden)

        # Cross-attention between object features and feature-map tokens
        self.cross_attention = nn.MultiheadAttention(
            dim_hidden, num_heads, dropout=dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.norm2 = nn.LayerNorm(dim_hidden)

        # Feed-forward sub-block
        self.fnn = nn.Sequential(
            nn.Linear(dim_hidden, dim_feedforward),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(dim_feedforward, dim_hidden)
        )
        self.dropout3 = nn.Dropout(dropout)
        self.norm3 = nn.LayerNorm(dim_hidden)

    def forward(self, h: torch.Tensor, query_embed: torch.Tensor,
                x: torch.Tensor, pos_encoding: torch.Tensor,
                mask: torch.Tensor):
        '''
        Forward pass
        h           : object features,
                      [number of queries, batch size, feature dimension]
        query_embed : object query embedding,
                      [number of queries, batch size, feature dimension]
        x           : feature-map tokens,
                      [number of tokens, batch size, feature dimension]
        pos_encoding: positional encoding,
                      [number of tokens, batch size, feature dimension]
        mask        : padding mask, [batch size, number of tokens];
                      keys where it is True are ignored by cross-attention
        '''
        # Self-attention: the query embedding is added to queries and keys
        # only, the values are the plain object features
        query = key = h + query_embed
        attended = self.self_attention(query, key, h)[0]
        h = self.norm1(h + self.dropout1(attended))

        # Cross-attention: object features (plus query embedding) gather
        # information from feature-map tokens (plus positional encoding)
        attended = self.cross_attention(
            h + query_embed, x + pos_encoding, x,
            key_padding_mask=mask)[0]
        h = self.norm2(h + self.dropout2(attended))

        # Feed-forward network with its own residual connection
        h = self.norm3(h + self.dropout3(self.fnn(h)))

        return h

###Transformer

In [None]:
class Transformer(nn.Module):
    '''
    Transformer that chains the encoder layers and the decoder layers
    dim_hidden        : feature dimension
    num_heads         : number of heads of the multi-head attention
    num_encoder_layers: number of encoder layers
    num_decoder_layers: number of decoder layers
    dim_feedforward   : hidden dimension of the feed-forward networks
    dropout           : dropout rate
    '''
    def __init__(self, dim_hidden: int=256, num_heads: int=8,
                 num_encoder_layers: int=3, num_decoder_layers: int=3,
                 dim_feedforward: int=2048, dropout: float=0.1):
        super().__init__()

        # Every layer is built from the same hyperparameters
        layer_args = (dim_hidden, num_heads, dim_feedforward, dropout)

        self.encoder_layers = nn.ModuleList(
            TransformerEncoderLayer(*layer_args)
            for _ in range(num_encoder_layers))
        self.decoder_layers = nn.ModuleList(
            TransformerDecoderLayer(*layer_args)
            for _ in range(num_decoder_layers))

        self._reset_parameters()

    def _reset_parameters(self):
        '''
        Re-initialize every parameter with more than one dimension
        using Xavier uniform initialization
        '''
        for weight in self.parameters():
            if weight.dim() <= 1:
                # Biases and other 1-D parameters keep their default init
                continue
            nn.init.xavier_uniform_(weight)

    def forward(self, x: torch.Tensor, pos_encoding: torch.Tensor,
                mask: torch.Tensor, query_embed: torch.Tensor):
        '''
        Forward pass
        x           : feature map, [batch size, channels, height, width]
        pos_encoding: positional encoding,
                      [batch size, channels, height, width]
        mask        : padding mask, [batch size, height, width]
        query_embed : object query embedding,
                      [number of queries, feature dimension]
        Returns the output of every decoder layer stacked as
        [decoder layers, batch size, number of queries, channels].
        '''
        batch_size = x.shape[0]

        # --- Reshape the inputs into the sequence-first layout ---
        # [batch, channels, height, width]
        # -> [height * width, batch, channels]
        tokens = x.flatten(2).permute(2, 0, 1)
        pos_tokens = pos_encoding.flatten(2).permute(2, 0, 1)
        # [batch, height, width] -> [batch, height * width]
        padding = mask.flatten(1)
        # [queries, channels] -> [queries, batch, channels]
        queries = query_embed.unsqueeze(1).expand(-1, batch_size, -1)

        # --- Encoder: layers applied one after another ---
        for encoder in self.encoder_layers:
            tokens = encoder(tokens, pos_tokens, padding)

        # --- Decoder: start from zeros and keep every layer's output ---
        state = torch.zeros_like(queries)
        outputs = []
        for decoder in self.decoder_layers:
            state = decoder(state, queries, tokens, pos_tokens, padding)
            outputs.append(state)

        # Stack along a new leading axis, then swap the query and batch
        # axes: [layers, queries, batch, channels]
        # -> [layers, batch, queries, channels]
        return torch.stack(outputs).permute(0, 2, 1, 3)

###DETR

In [None]:
class DETR(nn.Module):
    '''
    DETR model with a ResNet18 backbone
    num_queries       : number of object query embeddings
    dim_hidden        : feature dimension used inside the Transformer
    num_heads         : number of heads of the multi-head attention
    num_encoder_layers: number of Transformer encoder layers
    num_decoder_layers: number of Transformer decoder layers
    dim_feedforward   : hidden dimension of the Transformer
                        feed-forward networks
    dropout           : dropout rate inside the Transformer
    num_classes       : number of object classes (background excluded)
    '''
    def __init__(self, num_queries: int, dim_hidden: int,
                 num_heads: int, num_encoder_layers: int,
                 num_decoder_layers: int, dim_feedforward: int,
                 dropout: float, num_classes: int):
        super().__init__()

        self.backbone = ResNet18()

        # 1x1 convolution that maps the 512-channel backbone feature map
        # to the Transformer feature dimension
        self.proj = nn.Conv2d(512, dim_hidden, kernel_size=1)

        self.transformer = Transformer(
            dim_hidden, num_heads, num_encoder_layers,
            num_decoder_layers, dim_feedforward, dropout)

        # Classification head; one extra output for the background class
        self.class_head = nn.Linear(dim_hidden, num_classes + 1)

        # Box head: 3-layer MLP producing 4 box values per query
        box_layers = [
            nn.Linear(dim_hidden, dim_hidden),
            nn.ReLU(inplace=True),
            nn.Linear(dim_hidden, dim_hidden),
            nn.ReLU(inplace=True),
            nn.Linear(dim_hidden, 4),
        ]
        self.box_head = nn.Sequential(*box_layers)

        self.positional_encoding = PositionalEncoding()

        # Learned object query embeddings
        self.query_embed = nn.Embedding(num_queries, dim_hidden)

    def forward(self, x: torch.Tensor, mask: torch.Tensor):
        '''
        Forward pass
        x   : input images, [batch size, channels, height, width]
        mask: padding mask at image resolution,
              [batch size, height, width]
        Returns (preds_class, preds_box); both keep the leading
        [decoder layers, batch size, number of queries] axes of the
        Transformer output. preds_class ends in num_classes + 1 logits,
        preds_box in 4 sigmoid-activated box values.
        '''
        # Last feature map returned by the backbone, projected down to
        # the Transformer feature dimension
        features = self.proj(self.backbone(x)[-1])

        # Resize the image-sized mask to the feature-map size.
        # F.interpolate does not accept bool tensors, so the mask takes
        # a round trip through the feature dtype
        resized = F.interpolate(
            mask.to(features.dtype).unsqueeze(1),
            size=features.shape[2:])
        mask = resized[:, 0].to(torch.bool)

        pos_encoding = self.positional_encoding.generate(features, mask)

        hs = self.transformer(
            features, pos_encoding, mask, self.query_embed.weight)

        return self.class_head(hs), self.box_head(hs).sigmoid()

    def get_device(self):
        '''
        Return the device that holds the model parameters
        (read from the first convolution of the backbone)
        '''
        return self.backbone.conv1.weight.device

###後処理を行う関数

In [None]:
@torch.no_grad()
def post_process(preds_class: torch.Tensor, preds_box: torch.Tensor,
                 targets: dict, include_bg: bool=False):
    '''
    Turn raw predictions into scores, labels and image-scale boxes
    preds_class: class logits, [batch size, number of queries, classes];
                 the last class index is treated as background
    preds_box  : predicted boxes,
                 [batch size, number of queries, 4 (x, y, w, h)]
    targets    : labels; iterated per sample, each entry must provide
                 'orig_size'
    include_bg : whether background may be chosen as the predicted class
    Returns (scores, labels, boxes).
    '''
    probs = preds_class.softmax(dim=2)

    # Unless background is allowed, pick the most probable class among
    # all but the last (background) column
    candidates = probs if include_bg else probs[:, :, :-1]
    scores, labels = candidates.max(dim=2)

    boxes = util.convert_to_xyxy(preds_box)

    # Scale the boxes to each sample's original image size:
    # orig_size[0] scales the x coordinates (columns 0 and 2) and
    # orig_size[1] the y coordinates (columns 1 and 3)
    # -- presumably orig_size is (width, height); confirm in dataset
    img_sizes = torch.stack([target['orig_size']
                             for target in targets])
    for offset in (0, 1):
        boxes[:, :, offset::2] *= img_sizes[:, offset].view(-1, 1, 1)

    return scores, labels, boxes

###GIoUを計算する関数

In [None]:
def calc_giou(boxes1: torch.Tensor, boxes2: torch.Tensor):
    '''
    Compute the generalized IoU (GIoU) between two sets of boxes
    boxes1: boxes, [number of boxes, 4 (xmin, ymin, xmax, ymax)]
    boxes2: boxes, [number of boxes, 4 (xmin, ymin, xmax, ymax)]
    Every box of boxes1 is paired with every box of boxes2 (the result
    is assumed to match the pairwise shape of util.calc_iou's output).
    '''
    ious, union = util.calc_iou(boxes1, boxes2)

    # Add a pairing axis to boxes1 so it broadcasts against boxes2
    expanded = boxes1.unsqueeze(1)

    # Corners of the smallest box enclosing each pair
    enclose_min = torch.minimum(expanded[:, :, :2], boxes2[:, :2])
    enclose_max = torch.maximum(expanded[:, :, 2:], boxes2[:, 2:])

    # Area of that enclosing box
    enclose_area = (enclose_max - enclose_min).clamp(min=0).prod(dim=2)

    # IoU minus the fraction of the enclosing box not covered by the union
    return ious - (enclose_area - union) / enclose_area

###検出矩形と正解矩形の割り当てを行うハンガリアンアルゴリズム関数

In [None]:
@torch.no_grad()
def _hungarian_match(preds_class: torch.Tensor,
                     preds_box: torch.Tensor, targets: dict,
                     loss_weight_class: float=1.0,
                     loss_weight_box_l1: float=5.0,
                     loss_weight_box_giou: float=2.0):
    '''
    Match predictions to ground-truth boxes, one sample at a time
    preds_class         : class logits,
                          [batch size, number of queries, classes]
    preds_box           : predicted boxes,
                          [batch size, number of queries, 4 (x, y, w, h)]
    targets             : labels; iterated per sample, each entry must
                          provide 'classes', 'boxes' and 'size'
    loss_weight_class   : weight of the classification cost
    loss_weight_box_l1  : weight of the box L1 cost
    loss_weight_box_giou: weight of the box GIoU cost
    Returns one (query indices, target indices) pair of int64 tensors
    per sample.
    '''
    batch_size, num_queries = preds_class.shape[:2]

    # Merge batch and query axes so that the cost of every prediction
    # against every ground truth is computed in one shot
    probs = preds_class.flatten(0, 1).softmax(dim=1)
    pred_boxes = preds_box.flatten(0, 1)

    # Ground truths of all samples lined up along axis 0; boxes are
    # divided by the image size to get normalized coordinates
    gt_classes = torch.cat([target['classes'] for target in targets])
    gt_boxes = torch.cat(
        [target['boxes'] / target['size'].repeat(2)
         for target in targets])

    # Classification cost: negated probability of the true class
    # (the more confident the prediction, the lower the cost)
    cost_class = -probs[:, gt_classes]

    # First box cost: L1 distance in (x, y, w, h)
    cost_box_l1 = torch.cdist(
        pred_boxes, util.convert_to_xywh(gt_boxes), p=1)

    # Second box cost: negated GIoU in (xmin, ymin, xmax, ymax)
    cost_box_giou = -calc_giou(
        util.convert_to_xyxy(pred_boxes), gt_boxes)

    cost = (loss_weight_class * cost_class
            + loss_weight_box_l1 * cost_box_l1
            + loss_weight_box_giou * cost_box_giou)

    # Restore the batch axis:
    # [batch size, number of queries, ground truths of all samples],
    # then move to the CPU because SciPy's solver runs there
    cost = cost.view(batch_size, num_queries, -1).to('cpu')

    # Number of ground-truth boxes in each sample
    gt_counts = [len(target['classes']) for target in targets]

    indices = []
    # Split the last axis per sample; the sample's own row block of its
    # own chunk is that sample's cost matrix
    for sample_no, chunk in enumerate(cost.split(gt_counts, dim=2)):
        query_ids, target_ids = linear_sum_assignment(chunk[sample_no])
        indices.append((
            torch.tensor(query_ids, dtype=torch.int64),
            torch.tensor(target_ids, dtype=torch.int64)))

    return indices

###並べ替えのためのインデックスを生成する関数

In [None]:
'''
indices: ハンガリアンアルゴリズムにより得られたインデックス
'''
def _get_pred_permutation_index(
    indices: List[Tuple[torch.Tensor]]):
    # マッチした予測結果のバッチインデックスを1つの軸に並べる
    batch_indices = torch.cat(
        [torch.full_like(pred_indices, i)
         for i, (pred_indices, _) in enumerate(indices)])
    # マッチした予測結果のクエリインデックスを1つの軸に並べる
    pred_indices = torch.cat([pred_indices
                              for (pred_indices, _) in indices])

    return batch_indices, pred_indices

###分類損失を計算する関数

In [None]:
def _class_loss_func(preds: torch.Tensor, targets: dict,
                     indices: List[Tuple[torch.Tensor]],
                     background_weight: float):
    '''
    Classification loss (weighted cross entropy over all queries)
    preds            : class logits,
                       [batch size, number of queries, classes]
    targets          : labels; iterated per sample, each entry must
                       provide 'classes'
    indices          : per-sample index pairs from the Hungarian matching
    background_weight: cross-entropy weight of the background class
    '''
    matched_positions = _get_pred_permutation_index(indices)

    # The last index of the class axis is the background class
    background_id = preds.shape[2] - 1

    # Class IDs of the matched ground truths, in matching order
    matched_classes = torch.cat(
        [target['classes'][target_indices]
         for target, (_, target_indices) in zip(targets, indices)])

    # Every query starts as background; matched queries then receive
    # the class of their ground truth
    targets_class = preds.new_full(
        preds.shape[:2], background_id, dtype=torch.int64)
    targets_class[matched_positions] = matched_classes

    # Background dominates the targets, so it is down-weighted to
    # counter the class imbalance
    weights = preds.new_ones(preds.shape[2])
    weights[background_id] = background_weight

    # cross_entropy expects the class axis second
    return F.cross_entropy(preds.transpose(1, 2),
                           targets_class, weights)

###矩形の回帰損失を計算する関数

In [None]:
def _box_loss_func(preds: torch.Tensor, targets: dict,
                   indices: List[Tuple[torch.Tensor]]):
    '''
    Box regression losses over the matched predictions
    preds  : predicted boxes,
             [batch size, number of queries, 4 (x, y, w, h)]
    targets: labels; iterated per sample, each entry must provide
             'boxes' and 'size'
    indices: per-sample index pairs from the Hungarian matching
    Returns (L1 loss, GIoU loss), each divided by the number of
    matched boxes.
    '''
    # Predictions that were matched to a ground truth
    matched_preds = preds[_get_pred_permutation_index(indices)]

    # Their ground-truth boxes, normalized by the image size
    matched_targets = torch.cat([
        target['boxes'][target_indices] / target['size'].repeat(2)
        for target, (_, target_indices) in zip(targets, indices)])

    # At least 1 so that a batch without any box does not divide by zero
    normalizer = max(1, matched_targets.shape[0])

    # L1 loss in (x, y, w, h)
    loss_l1 = F.l1_loss(
        matched_preds, util.convert_to_xywh(matched_targets),
        reduction='sum') / normalizer

    # GIoU loss: calc_giou is pairwise, so the diagonal holds the value
    # of each prediction against its own ground truth
    pairwise_giou = calc_giou(
        util.convert_to_xyxy(matched_preds), matched_targets)
    loss_giou = (1 - pairwise_giou.diag()).sum() / normalizer

    return loss_l1, loss_giou

###損失を計算する関数

In [None]:
def loss_func(preds_class: torch.Tensor, preds_box: torch.Tensor,
              targets: dict, loss_weight_class: float=1.0,
              loss_weight_box_l1: float=5.0,
              loss_weight_box_giou: float=2.0,
              background_weight: float=0.1):
    '''
    Compute the three weighted DETR losses for one decoder output
    preds_class         : class logits,
                          [batch size, number of queries, classes]
    preds_box           : predicted boxes,
                          [batch size, number of queries, 4 (x, y, w, h)]
    targets             : labels
    loss_weight_class   : weight of the classification cost / loss
    loss_weight_box_l1  : weight of the box L1 cost / loss
    loss_weight_box_giou: weight of the box GIoU cost / loss
    background_weight   : cross-entropy weight of the background class
    Returns (classification loss, L1 loss, GIoU loss), each already
    multiplied by its weight.
    '''
    # The same weights drive both the matching cost and the final losses
    matches = _hungarian_match(preds_class, preds_box, targets,
                               loss_weight_class, loss_weight_box_l1,
                               loss_weight_box_giou)

    raw_class = _class_loss_func(
        preds_class, targets, matches, background_weight)
    raw_box_l1, raw_box_giou = _box_loss_func(
        preds_box, targets, matches)

    return (loss_weight_class * raw_class,
            loss_weight_box_l1 * raw_box_l1,
            loss_weight_box_giou * raw_box_giou)

###学習・評価におけるハイパーパラメータやオプションの設定

In [None]:
class ConfigTrainEval:
    '''
    Hyperparameters and options for training and evaluation
    '''
    def __init__(self):
        # --- Data and output locations ---
        # Directory that holds the images
        self.img_directory = 'val2014'
        # Path of the annotation file
        self.anno_file = (
            'drive/MyDrive/python_image_recognition/data/coco2014/'
            'instances_val2014_small.json')
        # Path the model parameters are saved to
        self.save_file = (
            'drive/MyDrive/python_image_recognition/5_object_detection/model/'
            'detr.pth')
        # Fraction of the data held out for validation
        self.val_ratio = 0.2

        # --- Schedule ---
        # Number of training epochs
        self.num_epochs = 100
        # Epoch at which the learning rate is decayed
        self.lr_drop = 90
        # Validation is run every this many epochs
        self.val_interval = 5

        # --- Optimization ---
        # Learning rate
        self.lr = 1e-4
        # Learning rate of the backbone network
        self.lr_backbone = 1e-5
        # Weight decay
        self.weight_decay = 1e-4
        # Upper bound used for gradient clipping
        self.clip = 0.1

        # --- Model ---
        # Number of object query vectors
        self.num_queries = 100
        # Feature dimension inside the Transformer
        self.dim_hidden = 256
        # Number of heads of the multi-head attention
        self.num_heads = 8
        # Number of Transformer encoder layers
        self.num_encoder_layers = 6
        # Number of Transformer decoder layers
        self.num_decoder_layers = 6
        # Hidden dimension of the Transformer feed-forward networks
        self.dim_feedforward = 2048
        # Dropout rate inside the Transformer
        self.dropout = 0.1

        # --- Loss ---
        # Weight of the classification loss
        self.loss_weight_class = 1
        # Weight of the box L1 loss
        self.loss_weight_box_l1 = 5
        # Weight of the box GIoU loss
        self.loss_weight_box_giou = 2
        # Weight of the background class
        self.background_weight = 0.1

        # --- Runtime ---
        # Number of recent values averaged for the progress display
        self.moving_avg = 100
        # Batch size
        self.batch_size = 8
        # Number of worker processes of the data loaders
        self.num_workers = 2
        # Device used for training
        self.device = 'cuda'

###学習・評価を行う関数

In [None]:
def train_eval():
    '''
    Train DETR and periodically evaluate it

    All settings come from ConfigTrainEval. The function builds the
    training / validation data loaders, creates the model, loads
    pretrained ResNet18 weights into the backbone, and trains with AdamW
    (separate learning rate for the backbone) under a MultiStepLR
    schedule. Every decoder layer contributes to the loss: the last
    layer through the main losses, the earlier ones through the
    auxiliary loss. Parameters are saved after every epoch and
    evaluate() is called every config.val_interval epochs.
    '''
    config = ConfigTrainEval()

    # Data augmentation / formatting pipelines
    min_sizes = (480, 512, 544, 576, 608)
    train_transforms = T.Compose((
        T.RandomHorizontalFlip(),
        T.RandomSelect(
            T.RandomResize(min_sizes, max_size=1024),
            T.Compose((
                T.RandomSizeCrop(scale=(0.8, 1.0),
                                 ratio=(0.75, 1.333)),
                T.RandomResize(min_sizes, max_size=1024),
            ))
        ),
        T.ToTensor(),
        # Mean and standard deviation of the ImageNet dataset
        T.Normalize(mean=(0.485, 0.456, 0.406),
                    std=(0.229, 0.224, 0.225)),
    ))
    test_transforms = T.Compose((
        # Evaluation always uses the largest short-side size
        T.RandomResize((min_sizes[-1],), max_size=1333),
        T.ToTensor(),
        # Mean and standard deviation of the ImageNet dataset
        T.Normalize(mean=(0.485, 0.456, 0.406),
                    std=(0.229, 0.224, 0.225)),
    ))

    # The same images and annotations back both datasets;
    # only the transforms differ
    train_dataset = dataset.CocoDetection(
        img_directory=config.img_directory,
        anno_file=config.anno_file,
        transform=train_transforms)
    val_dataset = dataset.CocoDetection(
        img_directory=config.img_directory,
        anno_file=config.anno_file,
        transform=test_transforms)

    # Split the sample indices into validation and training subsets
    val_set, train_set = util.generate_subset(
        train_dataset, config.val_ratio)

    print(f'学習セットのサンプル数: {len(train_set)}')
    print(f'検証セットのサンプル数: {len(val_set)}')

    # Sampler that draws the training subset in random order
    train_sampler = SubsetRandomSampler(train_set)

    # Data loaders
    train_loader = DataLoader(
        train_dataset, batch_size=config.batch_size,
        num_workers=config.num_workers, sampler=train_sampler,
        collate_fn=collate_func)
    val_loader = DataLoader(
        val_dataset, batch_size=config.batch_size,
        num_workers=config.num_workers, sampler=val_set,
        collate_fn=collate_func)

    # DETR (ResNet18 backbone); strict=False tolerates checkpoint keys
    # that do not line up with the backbone's own parameters
    model = DETR(config.num_queries, config.dim_hidden,
                 config.num_heads, config.num_encoder_layers,
                 config.num_decoder_layers, config.dim_feedforward,
                 config.dropout, len(train_dataset.classes))
    model.backbone.load_state_dict(torch.hub.load_state_dict_from_url(
        'https://download.pytorch.org/models/resnet18-5c106cde.pth'),
                                   strict=False)

    # Move the model to the configured device
    model.to(config.device)
    device = model.get_device()

    # Optimizer with a separate learning rate for the backbone
    params_backbone = []
    params_others = []
    for name, parameter in model.named_parameters():
        if parameter.requires_grad:
            if 'backbone' in name:
                params_backbone.append(parameter)
            else:
                params_others.append(parameter)
    param_groups = [
        {'params': params_backbone, 'lr': config.lr_backbone},
        {'params': params_others, 'lr': config.lr}]
    optimizer = optim.AdamW(param_groups,
                            weight_decay=config.weight_decay)

    # Scheduler that divides the learning rate by 10 at the given epoch
    scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=[config.lr_drop], gamma=0.1)

    # Objective with the hyperparameters already bound
    def criterion(preds_class, preds_box, targets):
        return loss_func(preds_class, preds_box, targets,
                         config.loss_weight_class,
                         config.loss_weight_box_l1,
                         config.loss_weight_box_giou,
                         config.background_weight)

    for epoch in range(config.num_epochs):
        model.train()

        with tqdm(train_loader) as pbar:
            pbar.set_description(f'[エポック {epoch + 1}]')

            # Windows of the most recent values for the moving averages;
            # maxlen discards the oldest entry automatically
            history = {
                name: deque(maxlen=config.moving_avg)
                for name in ('loss', 'loss_class', 'loss_box_l1',
                             'loss_box_giou', 'loss_aux')}
            for imgs, masks, targets in pbar:
                imgs = imgs.to(device)
                masks = masks.to(device)
                targets = [{
                    k: v.to(device)
                    for k, v in target.items()} for target in targets]

                optimizer.zero_grad()

                preds_class, preds_box = model(imgs, masks)

                # Auxiliary loss: the same objective applied to every
                # decoder layer except the last
                loss_aux = 0
                for layer_index in range(
                    config.num_decoder_layers - 1):
                    loss_aux += sum(criterion(
                        preds_class[layer_index],
                        preds_box[layer_index], targets))

                # Main losses on the output of the last decoder layer
                loss_class, loss_box_l1, loss_box_giou = \
                    criterion(preds_class[-1], preds_box[-1], targets)
                loss = loss_aux + loss_class + \
                    loss_box_l1 + loss_box_giou

                loss.backward()

                # Clip when the L2 norm of all gradients exceeds the limit
                torch.nn.utils.clip_grad_norm_(
                    model.parameters(), config.clip)

                optimizer.step()

                history['loss'].append(loss.item())
                history['loss_class'].append(loss_class.item())
                history['loss_box_l1'].append(loss_box_l1.item())
                history['loss_box_giou'].append(loss_box_giou.item())
                # loss_aux stays the plain int 0 when there is only one
                # decoder layer, so float() is used instead of .item()
                history['loss_aux'].append(float(loss_aux))

                pbar.set_postfix(
                    {name: torch.Tensor(window).mean().item()
                     for name, window in history.items()})

        # Advance the scheduler by one epoch
        scheduler.step()

        # Save the parameters
        torch.save(model.state_dict(), config.save_file)

        # Validation
        if (epoch + 1) % config.val_interval == 0:
            evaluate(val_loader, model, criterion)


###サンプルからミニバッチを生成するcollate関数

In [None]:
def collate_func(batch: Sequence[Tuple[Union[torch.Tensor, dict]]]):
    '''
    Collate samples of different sizes into one padded mini-batch
    batch: (image, label) pairs sampled from the dataset; every image is
           [channels, height, width]
    Returns (imgs, masks, targets): the zero-padded images
    [batch size, 3, max height, max width], a bool mask
    [batch size, max height, max width] that is False on image pixels
    and True on padding, and the list of labels in batch order.
    '''
    # Spatial size of every image in the batch
    spatial_sizes = []
    for img, _ in batch:
        height, width = img.shape[1:]
        spatial_sizes.append((height, width))

    # The padded canvas is as large as the largest image
    max_height = max((h for h, _ in spatial_sizes), default=0)
    max_width = max((w for _, w in spatial_sizes), default=0)

    # new_zeros / new_ones inherit the device of the first image
    first_img = batch[0][0]
    imgs = first_img.new_zeros(
        (len(batch), 3, max_height, max_width))
    # The mask starts all True (padding everywhere)
    masks = first_img.new_ones(
        (len(batch), max_height, max_width), dtype=torch.bool)

    for slot, (height, width) in enumerate(spatial_sizes):
        # Images are anchored at the top-left corner
        imgs[slot, :, :height, :width] = batch[slot][0]
        # The region covered by the image is marked False
        masks[slot, :height, :width] = False

    targets = [target for _, target in batch]

    return imgs, masks, targets

###評価関数

In [None]:
def evaluate(data_loader, model, loss_func):
    '''
    Run the model over a validation loader, print the mean losses and
    report COCO bbox metrics through pycocotools.

    data_loader: loader yielding (imgs, masks, targets) mini-batches;
                 its dataset must provide `coco` and `to_coco_label`
    model      : model to evaluate; called as model(imgs, masks) and
                 asked for its device through get_device()
    loss_func  : callable taking (preds_class, preds_box, targets) and
                 returning (class loss, box L1 loss, box GIoU loss)

    Returns None. Detections are written to 'tmp.json' in the current
    directory as an intermediate file for pycocotools.
    '''
    model.eval()
    # The device does not change during evaluation, so look it up once
    device = model.get_device()
    dataset = data_loader.dataset

    losses_class = []
    losses_box_l1 = []
    losses_box_giou = []
    losses_aux = []
    losses = []
    preds = []
    img_ids = []
    for imgs, masks, targets in tqdm(
            data_loader, desc='[Validation]'):
        with torch.no_grad():
            imgs = imgs.to(device)
            masks = masks.to(device)
            targets = [{k: v.to(device) for k, v in target.items()}
                       for target in targets]

            preds_class, preds_box = model(imgs, masks)

            num_decoder_layers = preds_class.shape[0]

            # Auxiliary losses from every decoder layer but the last
            aux_terms = [
                sum(loss_func(preds_class[layer_index],
                              preds_box[layer_index],
                              targets))
                for layer_index in range(num_decoder_layers - 1)]

            loss_class, loss_box_l1, loss_box_giou = loss_func(
                preds_class[-1], preds_box[-1], targets)
            # Start the sum from a zero tensor so that loss_aux is a
            # tensor even with a single decoder layer; a plain int 0
            # would make torch.stack below fail
            loss_aux = sum(aux_terms, torch.zeros_like(loss_class))
            loss = loss_class + loss_box_l1 + loss_box_giou + loss_aux

            losses_class.append(loss_class)
            losses_box_l1.append(loss_box_l1)
            losses_box_giou.append(loss_box_giou)
            losses_aux.append(loss_aux)
            losses.append(loss)

            # Post-processing yields the final detections per image
            scores, labels, boxes = post_process(
                preds_class[-1], preds_box[-1], targets)

            for img_scores, img_labels, img_boxes, img_targets in zip(
                    scores, labels, boxes, targets):
                image_id = img_targets['image_id'].item()
                img_ids.append(image_id)

                # Convert to COCO's native box representation
                # (xmin, ymin, width, height) for evaluation
                img_boxes[:, 2:] -= img_boxes[:, :2]

                # Convert to Python values once per image instead of
                # once per box to avoid repeated device transfers
                for score, label, box in zip(
                        img_scores.tolist(), img_labels.tolist(),
                        img_boxes.tolist()):
                    # Record in the COCO results format
                    preds.append({
                        'image_id': image_id,
                        'category_id': dataset.to_coco_label(label),
                        'score': score,
                        'bbox': box
                    })

    if not losses:
        # torch.stack cannot handle an empty list
        print('No validation batches, skip evaluation.')

        return

    loss_class = torch.stack(losses_class).mean().item()
    loss_box_l1 = torch.stack(losses_box_l1).mean().item()
    loss_box_giou = torch.stack(losses_box_giou).mean().item()
    loss_aux = torch.stack(losses_aux).mean().item()
    loss = torch.stack(losses).mean().item()
    print(f'Validation loss = {loss:.3f}, '
          f'class loss = {loss_class:.3f}, '
          f'box l1 loss = {loss_box_l1:.3f}, '
          f'box giou loss = {loss_box_giou:.3f}, '
          f'aux loss = {loss_aux:.3f}')

    if len(preds) == 0:
        print('Nothing detected, skip evaluation.')

        return

    # Dump the detections to a json file, which is then handed to
    # pycocotools below
    with open('tmp.json', 'w') as f:
        json.dump(preds, f)

    # Load the dumped detections through pycocotools
    coco_results = dataset.coco.loadRes('tmp.json')

    # Evaluate only on the images that were actually processed
    coco_eval = COCOeval(dataset.coco, coco_results, 'bbox')
    coco_eval.params.imgIds = img_ids
    coco_eval.evaluate()
    coco_eval.accumulate()
    coco_eval.summarize()


###学習・評価データの解凍

In [None]:
!unzip drive/MyDrive/python_image_recognition/data/coco2014/val2014.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 extracting: val2014/COCO_val2014_000000535608.jpg  
 extracting: val2014/COCO_val2014_000000279145.jpg  
 extracting: val2014/COCO_val2014_000000243495.jpg  
 extracting: val2014/COCO_val2014_000000225791.jpg  
 extracting: val2014/COCO_val2014_000000428067.jpg  
 extracting: val2014/COCO_val2014_000000059383.jpg  
 extracting: val2014/COCO_val2014_000000558498.jpg  
 extracting: val2014/COCO_val2014_000000376123.jpg  
 extracting: val2014/COCO_val2014_000000191304.jpg  
 extracting: val2014/COCO_val2014_000000171062.jpg  
 extracting: val2014/COCO_val2014_000000014549.jpg  
 extracting: val2014/COCO_val2014_000000121152.jpg  
 extracting: val2014/COCO_val2014_000000276596.jpg  
 extracting: val2014/COCO_val2014_000000029431.jpg  
 extracting: val2014/COCO_val2014_000000036349.jpg  
 extracting: val2014/COCO_val2014_000000502055.jpg  
 extracting: val2014/COCO_val2014_000000438848.jpg  
 extracting: val2014/COCO_val2014_

###学習・評価の実行

In [None]:
# Run training. Inside train_eval the weights are saved to
# config.save_file after every epoch and evaluate() is called every
# config.val_interval epochs.
train_eval()

loading annotations into memory...
Done (t=1.36s)
creating index...
index created!
loading annotations into memory...
Done (t=1.02s)
creating index...
index created!
学習セットのサンプル数: 8000
検証セットのサンプル数: 2000


Downloading: "https://download.pytorch.org/models/resnet18-5c106cde.pth" to /root/.cache/torch/hub/checkpoints/resnet18-5c106cde.pth


  0%|          | 0.00/44.7M [00:00<?, ?B/s]

[エポック 1]: 100%|██████████| 1000/1000 [03:46<00:00,  4.42it/s, loss=20.4, loss_class=0.645, loss_box_l1=1.17, loss_box_giou=1.58, loss_aux=17]
[エポック 2]: 100%|██████████| 1000/1000 [03:36<00:00,  4.61it/s, loss=18.9, loss_class=0.635, loss_box_l1=1.03, loss_box_giou=1.48, loss_aux=15.8]
[エポック 3]: 100%|██████████| 1000/1000 [03:36<00:00,  4.61it/s, loss=18, loss_class=0.651, loss_box_l1=0.931, loss_box_giou=1.42, loss_aux=15]
[エポック 4]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=17.9, loss_class=0.636, loss_box_l1=0.889, loss_box_giou=1.42, loss_aux=14.9]
[エポック 5]: 100%|██████████| 1000/1000 [03:37<00:00,  4.60it/s, loss=17.4, loss_class=0.597, loss_box_l1=0.886, loss_box_giou=1.4, loss_aux=14.5]
[Validation]: 100%|██████████| 250/250 [00:47<00:00,  5.30it/s]


Validation loss = 28.802, class loss = 0.625, box l1 loss = 1.283, box giou loss = 1.730, aux loss = 25.163
Loading and preparing results...
DONE (t=1.66s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=23.68s).
Accumulating evaluation results...
DONE (t=2.52s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.002
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.009
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.005
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.001
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.002
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.010
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.026
 Average Recall     (AR) @[ IoU=0.50:0.

[エポック 6]: 100%|██████████| 1000/1000 [03:37<00:00,  4.59it/s, loss=16.5, loss_class=0.581, loss_box_l1=0.835, loss_box_giou=1.31, loss_aux=13.8]
[エポック 7]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=16.4, loss_class=0.597, loss_box_l1=0.777, loss_box_giou=1.33, loss_aux=13.7]
[エポック 8]: 100%|██████████| 1000/1000 [03:39<00:00,  4.55it/s, loss=16, loss_class=0.572, loss_box_l1=0.768, loss_box_giou=1.32, loss_aux=13.4]
[エポック 9]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=15.8, loss_class=0.57, loss_box_l1=0.762, loss_box_giou=1.28, loss_aux=13.2]
[エポック 10]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=15.3, loss_class=0.549, loss_box_l1=0.699, loss_box_giou=1.26, loss_aux=12.8]
[Validation]: 100%|██████████| 250/250 [00:47<00:00,  5.24it/s]


Validation loss = 18.676, class loss = 0.614, box l1 loss = 0.771, box giou loss = 1.351, aux loss = 15.939
Loading and preparing results...
DONE (t=2.04s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=23.23s).
Accumulating evaluation results...
DONE (t=2.48s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.007
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.024
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.001
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.005
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.008
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.009
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.012
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.037
 Average Recall     (AR) @[ IoU=0.50:0.

[エポック 11]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=14.8, loss_class=0.557, loss_box_l1=0.663, loss_box_giou=1.21, loss_aux=12.4]
[エポック 12]: 100%|██████████| 1000/1000 [03:39<00:00,  4.57it/s, loss=15, loss_class=0.564, loss_box_l1=0.647, loss_box_giou=1.23, loss_aux=12.5]
[エポック 13]: 100%|██████████| 1000/1000 [03:38<00:00,  4.57it/s, loss=14.7, loss_class=0.567, loss_box_l1=0.614, loss_box_giou=1.2, loss_aux=12.3]
[エポック 14]: 100%|██████████| 1000/1000 [03:38<00:00,  4.57it/s, loss=14.9, loss_class=0.553, loss_box_l1=0.637, loss_box_giou=1.22, loss_aux=12.5]
[エポック 15]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=14.6, loss_class=0.524, loss_box_l1=0.632, loss_box_giou=1.2, loss_aux=12.2]
[Validation]: 100%|██████████| 250/250 [00:47<00:00,  5.25it/s]


Validation loss = 15.611, class loss = 0.564, box l1 loss = 0.638, box giou loss = 1.229, aux loss = 13.180
Loading and preparing results...
DONE (t=1.69s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.81s).
Accumulating evaluation results...
DONE (t=2.45s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.009
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.036
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.002
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.003
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.011
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.024
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.022
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.070
 Average Recall     (AR) @[ IoU=0.50:0.

[エポック 16]: 100%|██████████| 1000/1000 [03:37<00:00,  4.60it/s, loss=14.3, loss_class=0.554, loss_box_l1=0.578, loss_box_giou=1.2, loss_aux=12]
[エポック 17]: 100%|██████████| 1000/1000 [03:38<00:00,  4.57it/s, loss=13.9, loss_class=0.516, loss_box_l1=0.613, loss_box_giou=1.13, loss_aux=11.7]
[エポック 18]: 100%|██████████| 1000/1000 [03:40<00:00,  4.53it/s, loss=13.9, loss_class=0.535, loss_box_l1=0.573, loss_box_giou=1.15, loss_aux=11.6]
[エポック 19]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=14.2, loss_class=0.55, loss_box_l1=0.575, loss_box_giou=1.15, loss_aux=11.9]
[エポック 20]: 100%|██████████| 1000/1000 [03:40<00:00,  4.53it/s, loss=13.6, loss_class=0.544, loss_box_l1=0.545, loss_box_giou=1.13, loss_aux=11.4]
[Validation]: 100%|██████████| 250/250 [00:48<00:00,  5.15it/s]


Validation loss = 13.815, class loss = 0.555, box l1 loss = 0.560, box giou loss = 1.163, aux loss = 11.537
Loading and preparing results...
DONE (t=1.70s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=23.13s).
Accumulating evaluation results...
DONE (t=2.56s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.011
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.044
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.004
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.005
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.013
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.028
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.026
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.080
 Average Recall     (AR) @[ IoU=0.50:0.

[エポック 21]: 100%|██████████| 1000/1000 [03:39<00:00,  4.55it/s, loss=13.1, loss_class=0.518, loss_box_l1=0.551, loss_box_giou=1.07, loss_aux=11]
[エポック 22]: 100%|██████████| 1000/1000 [03:39<00:00,  4.55it/s, loss=13.5, loss_class=0.549, loss_box_l1=0.513, loss_box_giou=1.12, loss_aux=11.3]
[エポック 23]: 100%|██████████| 1000/1000 [03:40<00:00,  4.54it/s, loss=13.2, loss_class=0.534, loss_box_l1=0.533, loss_box_giou=1.1, loss_aux=11.1]
[エポック 24]: 100%|██████████| 1000/1000 [03:40<00:00,  4.54it/s, loss=13, loss_class=0.515, loss_box_l1=0.517, loss_box_giou=1.06, loss_aux=10.9]
[エポック 25]: 100%|██████████| 1000/1000 [03:37<00:00,  4.59it/s, loss=13, loss_class=0.531, loss_box_l1=0.508, loss_box_giou=1.09, loss_aux=10.8]
[Validation]: 100%|██████████| 250/250 [00:48<00:00,  5.20it/s]


Validation loss = 13.812, class loss = 0.538, box l1 loss = 0.574, box giou loss = 1.124, aux loss = 11.575
Loading and preparing results...
DONE (t=1.65s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.96s).
Accumulating evaluation results...
DONE (t=2.46s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.011
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.041
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.003
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.004
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.015
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.027
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.026
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.081
 Average Recall     (AR) @[ IoU=0.50:0.

[エポック 26]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=12.9, loss_class=0.506, loss_box_l1=0.504, loss_box_giou=1.08, loss_aux=10.8]
[エポック 27]: 100%|██████████| 1000/1000 [03:38<00:00,  4.57it/s, loss=12.4, loss_class=0.511, loss_box_l1=0.481, loss_box_giou=1, loss_aux=10.4]
[エポック 28]: 100%|██████████| 1000/1000 [03:37<00:00,  4.59it/s, loss=12.7, loss_class=0.513, loss_box_l1=0.494, loss_box_giou=1.06, loss_aux=10.6]
[エポック 29]: 100%|██████████| 1000/1000 [03:39<00:00,  4.55it/s, loss=12.5, loss_class=0.502, loss_box_l1=0.495, loss_box_giou=1.05, loss_aux=10.5]
[エポック 30]: 100%|██████████| 1000/1000 [03:39<00:00,  4.57it/s, loss=12.5, loss_class=0.512, loss_box_l1=0.481, loss_box_giou=1.04, loss_aux=10.4]
[Validation]: 100%|██████████| 250/250 [00:47<00:00,  5.25it/s]


Validation loss = 13.318, class loss = 0.528, box l1 loss = 0.531, box giou loss = 1.103, aux loss = 11.156
Loading and preparing results...
DONE (t=2.05s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.94s).
Accumulating evaluation results...
DONE (t=2.47s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.013
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.051
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.003
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.006
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.017
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.032
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.031
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.095
 Average Recall     (AR) @[ IoU=0.50:0.

[エポック 31]: 100%|██████████| 1000/1000 [03:40<00:00,  4.54it/s, loss=12.4, loss_class=0.509, loss_box_l1=0.467, loss_box_giou=1.04, loss_aux=10.4]
[エポック 32]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=12.3, loss_class=0.506, loss_box_l1=0.465, loss_box_giou=1.04, loss_aux=10.3]
[エポック 33]: 100%|██████████| 1000/1000 [03:39<00:00,  4.55it/s, loss=12.2, loss_class=0.494, loss_box_l1=0.47, loss_box_giou=1.01, loss_aux=10.2]
[エポック 34]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=12.2, loss_class=0.503, loss_box_l1=0.457, loss_box_giou=1.04, loss_aux=10.2]
[エポック 35]: 100%|██████████| 1000/1000 [03:40<00:00,  4.54it/s, loss=11.7, loss_class=0.477, loss_box_l1=0.456, loss_box_giou=0.987, loss_aux=9.77]
[Validation]: 100%|██████████| 250/250 [00:48<00:00,  5.17it/s]


Validation loss = 12.842, class loss = 0.535, box l1 loss = 0.520, box giou loss = 1.086, aux loss = 10.701
Loading and preparing results...
DONE (t=1.70s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.73s).
Accumulating evaluation results...
DONE (t=2.52s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.019
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.066
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.006
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.004
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.021
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.051
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.033
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.103
 Average Recall     (AR) @[ IoU=0.50:0.

[エポック 36]: 100%|██████████| 1000/1000 [03:40<00:00,  4.54it/s, loss=12, loss_class=0.499, loss_box_l1=0.454, loss_box_giou=0.998, loss_aux=10]
[エポック 37]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=12.1, loss_class=0.512, loss_box_l1=0.452, loss_box_giou=1.02, loss_aux=10.1]
[エポック 38]: 100%|██████████| 1000/1000 [03:39<00:00,  4.55it/s, loss=11.8, loss_class=0.497, loss_box_l1=0.453, loss_box_giou=0.995, loss_aux=9.85]
[エポック 39]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=12.3, loss_class=0.5, loss_box_l1=0.449, loss_box_giou=1.04, loss_aux=10.3]
[エポック 40]: 100%|██████████| 1000/1000 [03:37<00:00,  4.59it/s, loss=11.6, loss_class=0.465, loss_box_l1=0.442, loss_box_giou=0.973, loss_aux=9.71]
[Validation]: 100%|██████████| 250/250 [00:48<00:00,  5.13it/s]


Validation loss = 12.480, class loss = 0.499, box l1 loss = 0.482, box giou loss = 1.044, aux loss = 10.454
Loading and preparing results...
DONE (t=1.69s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.62s).
Accumulating evaluation results...
DONE (t=2.46s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.024
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.084
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.007
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.006
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.038
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.064
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.043
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.122
 Average Recall     (AR) @[ IoU=0.50:0.

[エポック 41]: 100%|██████████| 1000/1000 [03:38<00:00,  4.57it/s, loss=11.5, loss_class=0.49, loss_box_l1=0.421, loss_box_giou=0.969, loss_aux=9.6]
[エポック 42]: 100%|██████████| 1000/1000 [03:38<00:00,  4.57it/s, loss=11.7, loss_class=0.505, loss_box_l1=0.427, loss_box_giou=0.993, loss_aux=9.82]
[エポック 43]: 100%|██████████| 1000/1000 [03:39<00:00,  4.55it/s, loss=11.5, loss_class=0.469, loss_box_l1=0.421, loss_box_giou=0.98, loss_aux=9.61]
[エポック 44]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=11.5, loss_class=0.483, loss_box_l1=0.434, loss_box_giou=0.963, loss_aux=9.63]
[エポック 45]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=11.3, loss_class=0.47, loss_box_l1=0.42, loss_box_giou=0.959, loss_aux=9.48]
[Validation]: 100%|██████████| 250/250 [00:47<00:00,  5.25it/s]


Validation loss = 12.548, class loss = 0.516, box l1 loss = 0.453, box giou loss = 1.028, aux loss = 10.551
Loading and preparing results...
DONE (t=1.65s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.51s).
Accumulating evaluation results...
DONE (t=2.42s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.028
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.093
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.009
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.004
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.037
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.083
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.043
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.127
 Average Recall     (AR) @[ IoU=0.50:0.

[エポック 46]: 100%|██████████| 1000/1000 [03:38<00:00,  4.59it/s, loss=11.1, loss_class=0.46, loss_box_l1=0.42, loss_box_giou=0.934, loss_aux=9.32]
[エポック 47]: 100%|██████████| 1000/1000 [03:40<00:00,  4.54it/s, loss=11.4, loss_class=0.467, loss_box_l1=0.416, loss_box_giou=0.988, loss_aux=9.57]
[エポック 48]: 100%|██████████| 1000/1000 [03:38<00:00,  4.57it/s, loss=11, loss_class=0.455, loss_box_l1=0.423, loss_box_giou=0.924, loss_aux=9.18]
[エポック 49]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=11, loss_class=0.45, loss_box_l1=0.409, loss_box_giou=0.931, loss_aux=9.22]
[エポック 50]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=11.3, loss_class=0.476, loss_box_l1=0.401, loss_box_giou=0.967, loss_aux=9.47]
[Validation]: 100%|██████████| 250/250 [00:48<00:00,  5.21it/s]


Validation loss = 12.524, class loss = 0.535, box l1 loss = 0.484, box giou loss = 1.004, aux loss = 10.500
Loading and preparing results...
DONE (t=2.07s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.75s).
Accumulating evaluation results...
DONE (t=2.40s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.031
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.102
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.013
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.006
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.036
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.087
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.044
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.125
 Average Recall     (AR) @[ IoU=0.50:0.

[エポック 51]: 100%|██████████| 1000/1000 [03:39<00:00,  4.55it/s, loss=11.2, loss_class=0.46, loss_box_l1=0.409, loss_box_giou=0.95, loss_aux=9.38]
[エポック 52]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=11, loss_class=0.438, loss_box_l1=0.419, loss_box_giou=0.953, loss_aux=9.21]
[エポック 53]: 100%|██████████| 1000/1000 [03:39<00:00,  4.55it/s, loss=10.8, loss_class=0.44, loss_box_l1=0.406, loss_box_giou=0.91, loss_aux=9.01]
[エポック 54]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=11.1, loss_class=0.476, loss_box_l1=0.383, loss_box_giou=0.952, loss_aux=9.32]
[エポック 55]: 100%|██████████| 1000/1000 [03:39<00:00,  4.57it/s, loss=11, loss_class=0.458, loss_box_l1=0.389, loss_box_giou=0.948, loss_aux=9.2]
[Validation]: 100%|██████████| 250/250 [00:48<00:00,  5.18it/s]


Validation loss = 11.952, class loss = 0.517, box l1 loss = 0.464, box giou loss = 0.983, aux loss = 9.988
Loading and preparing results...
DONE (t=1.67s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.56s).
Accumulating evaluation results...
DONE (t=2.44s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.041
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.121
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.019
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.009
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.050
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.107
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.052
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.139
 Average Recall     (AR) @[ IoU=0.50:0.9

[エポック 56]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=11.1, loss_class=0.457, loss_box_l1=0.397, loss_box_giou=0.958, loss_aux=9.33]
[エポック 57]: 100%|██████████| 1000/1000 [03:39<00:00,  4.55it/s, loss=10.7, loss_class=0.416, loss_box_l1=0.386, loss_box_giou=0.938, loss_aux=8.91]
[エポック 58]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=11.1, loss_class=0.446, loss_box_l1=0.389, loss_box_giou=0.971, loss_aux=9.3]
[エポック 59]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=10.8, loss_class=0.427, loss_box_l1=0.4, loss_box_giou=0.947, loss_aux=9]
[エポック 60]: 100%|██████████| 1000/1000 [03:38<00:00,  4.57it/s, loss=10.9, loss_class=0.441, loss_box_l1=0.392, loss_box_giou=0.947, loss_aux=9.13]
[Validation]: 100%|██████████| 250/250 [00:47<00:00,  5.28it/s]


Validation loss = 12.021, class loss = 0.494, box l1 loss = 0.458, box giou loss = 0.990, aux loss = 10.078
Loading and preparing results...
DONE (t=2.04s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.39s).
Accumulating evaluation results...
DONE (t=2.38s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.045
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.131
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.021
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.013
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.051
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.106
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.052
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.144
 Average Recall     (AR) @[ IoU=0.50:0.

[エポック 61]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=10.7, loss_class=0.43, loss_box_l1=0.383, loss_box_giou=0.91, loss_aux=8.93]
[エポック 62]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=10.7, loss_class=0.435, loss_box_l1=0.402, loss_box_giou=0.911, loss_aux=8.97]
[エポック 63]: 100%|██████████| 1000/1000 [03:37<00:00,  4.59it/s, loss=10.6, loss_class=0.419, loss_box_l1=0.39, loss_box_giou=0.912, loss_aux=8.86]
[エポック 64]: 100%|██████████| 1000/1000 [03:38<00:00,  4.59it/s, loss=10.5, loss_class=0.425, loss_box_l1=0.385, loss_box_giou=0.895, loss_aux=8.8]
[エポック 65]: 100%|██████████| 1000/1000 [03:37<00:00,  4.59it/s, loss=10.5, loss_class=0.416, loss_box_l1=0.401, loss_box_giou=0.903, loss_aux=8.75]
[Validation]: 100%|██████████| 250/250 [00:48<00:00,  5.20it/s]


Validation loss = 11.907, class loss = 0.497, box l1 loss = 0.462, box giou loss = 0.983, aux loss = 9.966
Loading and preparing results...
DONE (t=1.68s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.46s).
Accumulating evaluation results...
DONE (t=2.34s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.054
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.134
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.033
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.009
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.062
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.133
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.058
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.147
 Average Recall     (AR) @[ IoU=0.50:0.9

[エポック 66]: 100%|██████████| 1000/1000 [03:40<00:00,  4.54it/s, loss=10.5, loss_class=0.409, loss_box_l1=0.401, loss_box_giou=0.89, loss_aux=8.77]
[エポック 67]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=10.2, loss_class=0.413, loss_box_l1=0.377, loss_box_giou=0.862, loss_aux=8.54]
[エポック 68]: 100%|██████████| 1000/1000 [03:39<00:00,  4.55it/s, loss=10.7, loss_class=0.418, loss_box_l1=0.373, loss_box_giou=0.977, loss_aux=8.95]
[エポック 69]: 100%|██████████| 1000/1000 [03:39<00:00,  4.57it/s, loss=10.6, loss_class=0.401, loss_box_l1=0.4, loss_box_giou=0.923, loss_aux=8.84]
[エポック 70]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=10.5, loss_class=0.416, loss_box_l1=0.382, loss_box_giou=0.921, loss_aux=8.81]
[Validation]: 100%|██████████| 250/250 [00:48<00:00,  5.20it/s]


Validation loss = 11.730, class loss = 0.472, box l1 loss = 0.460, box giou loss = 1.008, aux loss = 9.790
Loading and preparing results...
DONE (t=1.68s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.18s).
Accumulating evaluation results...
DONE (t=2.34s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.061
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.155
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.038
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.008
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.077
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.141
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.062
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.154
 Average Recall     (AR) @[ IoU=0.50:0.9

[エポック 71]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=10.5, loss_class=0.397, loss_box_l1=0.377, loss_box_giou=0.924, loss_aux=8.8]
[エポック 72]: 100%|██████████| 1000/1000 [03:39<00:00,  4.55it/s, loss=10.4, loss_class=0.394, loss_box_l1=0.373, loss_box_giou=0.919, loss_aux=8.71]
[エポック 73]: 100%|██████████| 1000/1000 [03:39<00:00,  4.57it/s, loss=10.5, loss_class=0.413, loss_box_l1=0.373, loss_box_giou=0.917, loss_aux=8.77]
[エポック 74]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=9.81, loss_class=0.392, loss_box_l1=0.36, loss_box_giou=0.855, loss_aux=8.2]
[エポック 75]: 100%|██████████| 1000/1000 [03:37<00:00,  4.59it/s, loss=10.2, loss_class=0.396, loss_box_l1=0.371, loss_box_giou=0.921, loss_aux=8.56]
[Validation]: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]


Validation loss = 11.738, class loss = 0.462, box l1 loss = 0.463, box giou loss = 0.982, aux loss = 9.831
Loading and preparing results...
DONE (t=1.67s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.45s).
Accumulating evaluation results...
DONE (t=2.39s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.065
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.172
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.037
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.010
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.074
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.172
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.058
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.152
 Average Recall     (AR) @[ IoU=0.50:0.9

[エポック 76]: 100%|██████████| 1000/1000 [03:37<00:00,  4.61it/s, loss=10.2, loss_class=0.395, loss_box_l1=0.37, loss_box_giou=0.897, loss_aux=8.54]
[エポック 77]: 100%|██████████| 1000/1000 [03:38<00:00,  4.57it/s, loss=10.1, loss_class=0.369, loss_box_l1=0.374, loss_box_giou=0.9, loss_aux=8.43]
[エポック 78]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=9.84, loss_class=0.368, loss_box_l1=0.363, loss_box_giou=0.88, loss_aux=8.23]
[エポック 79]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=10.2, loss_class=0.402, loss_box_l1=0.362, loss_box_giou=0.894, loss_aux=8.54]
[エポック 80]: 100%|██████████| 1000/1000 [03:38<00:00,  4.57it/s, loss=10.3, loss_class=0.389, loss_box_l1=0.371, loss_box_giou=0.919, loss_aux=8.58]
[Validation]: 100%|██████████| 250/250 [00:48<00:00,  5.20it/s]


Validation loss = 11.248, class loss = 0.459, box l1 loss = 0.441, box giou loss = 0.940, aux loss = 9.408
Loading and preparing results...
DONE (t=2.07s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.16s).
Accumulating evaluation results...
DONE (t=2.35s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.079
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.198
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.050
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.013
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.076
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.202
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.067
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.171
 Average Recall     (AR) @[ IoU=0.50:0.9

[エポック 81]: 100%|██████████| 1000/1000 [03:38<00:00,  4.57it/s, loss=9.95, loss_class=0.391, loss_box_l1=0.357, loss_box_giou=0.878, loss_aux=8.33]
[エポック 82]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=10.2, loss_class=0.409, loss_box_l1=0.368, loss_box_giou=0.887, loss_aux=8.58]
[エポック 83]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=9.79, loss_class=0.386, loss_box_l1=0.359, loss_box_giou=0.849, loss_aux=8.2]
[エポック 84]: 100%|██████████| 1000/1000 [03:40<00:00,  4.54it/s, loss=9.87, loss_class=0.387, loss_box_l1=0.351, loss_box_giou=0.859, loss_aux=8.27]
[エポック 85]: 100%|██████████| 1000/1000 [03:39<00:00,  4.55it/s, loss=9.84, loss_class=0.381, loss_box_l1=0.354, loss_box_giou=0.862, loss_aux=8.24]
[Validation]: 100%|██████████| 250/250 [00:48<00:00,  5.20it/s]


Validation loss = 12.486, class loss = 0.552, box l1 loss = 0.467, box giou loss = 1.001, aux loss = 10.466
Loading and preparing results...
DONE (t=2.03s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.39s).
Accumulating evaluation results...
DONE (t=2.35s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.069
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.173
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.042
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.009
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.085
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.158
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.062
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.153
 Average Recall     (AR) @[ IoU=0.50:0.

[エポック 86]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=9.88, loss_class=0.375, loss_box_l1=0.365, loss_box_giou=0.871, loss_aux=8.27]
[エポック 87]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=9.69, loss_class=0.377, loss_box_l1=0.352, loss_box_giou=0.846, loss_aux=8.11]
[エポック 88]: 100%|██████████| 1000/1000 [03:38<00:00,  4.57it/s, loss=9.66, loss_class=0.366, loss_box_l1=0.361, loss_box_giou=0.84, loss_aux=8.09]
[エポック 89]: 100%|██████████| 1000/1000 [03:38<00:00,  4.58it/s, loss=9.93, loss_class=0.387, loss_box_l1=0.347, loss_box_giou=0.863, loss_aux=8.34]
[エポック 90]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=9.76, loss_class=0.392, loss_box_l1=0.34, loss_box_giou=0.853, loss_aux=8.17]
[Validation]: 100%|██████████| 250/250 [00:48<00:00,  5.17it/s]


Validation loss = 11.593, class loss = 0.449, box l1 loss = 0.448, box giou loss = 0.970, aux loss = 9.726
Loading and preparing results...
DONE (t=2.08s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.39s).
Accumulating evaluation results...
DONE (t=2.30s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.086
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.208
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.057
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.010
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.094
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.233
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.073
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.171
 Average Recall     (AR) @[ IoU=0.50:0.9

[エポック 91]: 100%|██████████| 1000/1000 [03:40<00:00,  4.52it/s, loss=8.98, loss_class=0.362, loss_box_l1=0.294, loss_box_giou=0.803, loss_aux=7.52]
[エポック 92]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=9.1, loss_class=0.354, loss_box_l1=0.314, loss_box_giou=0.808, loss_aux=7.62]
[エポック 93]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=8.94, loss_class=0.355, loss_box_l1=0.304, loss_box_giou=0.795, loss_aux=7.49]
[エポック 94]: 100%|██████████| 1000/1000 [03:38<00:00,  4.57it/s, loss=8.9, loss_class=0.349, loss_box_l1=0.306, loss_box_giou=0.786, loss_aux=7.46]
[エポック 95]: 100%|██████████| 1000/1000 [03:39<00:00,  4.56it/s, loss=8.73, loss_class=0.345, loss_box_l1=0.299, loss_box_giou=0.769, loss_aux=7.32]
[Validation]: 100%|██████████| 250/250 [00:47<00:00,  5.22it/s]


Validation loss = 10.677, class loss = 0.440, box l1 loss = 0.416, box giou loss = 0.892, aux loss = 8.929
Loading and preparing results...
DONE (t=2.04s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.40s).
Accumulating evaluation results...
DONE (t=2.30s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.111
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.256
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.083
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.023
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.136
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.260
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.085
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.201
 Average Recall     (AR) @[ IoU=0.50:0.9

[エポック 96]: 100%|██████████| 1000/1000 [03:40<00:00,  4.54it/s, loss=8.75, loss_class=0.35, loss_box_l1=0.286, loss_box_giou=0.788, loss_aux=7.33]
[エポック 97]: 100%|██████████| 1000/1000 [03:41<00:00,  4.51it/s, loss=8.66, loss_class=0.344, loss_box_l1=0.299, loss_box_giou=0.756, loss_aux=7.26]
[エポック 98]: 100%|██████████| 1000/1000 [03:40<00:00,  4.53it/s, loss=8.55, loss_class=0.334, loss_box_l1=0.288, loss_box_giou=0.76, loss_aux=7.16]
[エポック 99]: 100%|██████████| 1000/1000 [03:39<00:00,  4.55it/s, loss=8.61, loss_class=0.334, loss_box_l1=0.291, loss_box_giou=0.779, loss_aux=7.21]
[エポック 100]: 100%|██████████| 1000/1000 [03:40<00:00,  4.53it/s, loss=8.57, loss_class=0.332, loss_box_l1=0.304, loss_box_giou=0.76, loss_aux=7.18]
[Validation]: 100%|██████████| 250/250 [00:48<00:00,  5.16it/s]


Validation loss = 10.774, class loss = 0.443, box l1 loss = 0.420, box giou loss = 0.895, aux loss = 9.017
Loading and preparing results...
DONE (t=1.68s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=22.71s).
Accumulating evaluation results...
DONE (t=2.35s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.116
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.266
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.085
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.025
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.137
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.272
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.087
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.202
 Average Recall     (AR) @[ IoU=0.50:0.9

###デモにおけるハイパーパラメータやオプションの設定

In [None]:
class ConfigDemo:
    '''
    Hyperparameters and options used by the demo
    '''
    def __init__(self):
        # Directory that holds the images
        self.img_directory = (
            'drive/MyDrive/python_image_recognition/'
            'data/object_detection')
        # Path to the trained model parameters
        self.load_file = (
            'drive/MyDrive/python_image_recognition/'
            '5_object_detection/model/detr.pth')
        # Object classes to detect
        self.classes = ['person', 'car']
        # Number of query vectors in the object query embedding
        self.num_queries = 100
        # Feature dimension inside the Transformer
        self.dim_hidden = 256
        # Number of heads in multi-head attention
        self.num_heads = 8
        # Number of Transformer encoder layers
        self.num_encoder_layers = 6
        # Number of Transformer decoder layers
        self.num_decoder_layers = 6
        # Intermediate feature dimension of the feed-forward network
        # inside the Transformer
        self.dim_feedforward = 2048
        # Dropout rate inside the Transformer
        self.dropout = 0.1
        # Device used for the demo
        self.device = 'cuda'

###デモを行う関数

In [None]:
def demo():
    '''
    Run the trained DETR model on every image in the demo directory
    and display each image with the detected boxes drawn on it.

    All settings (image directory, checkpoint path, class names,
    model hyperparameters, device) are read from ConfigDemo.
    Takes no arguments and returns nothing; results are shown
    with display().
    '''
    config = ConfigDemo()

    # Preprocessing applied to each image before inference
    transforms = T.Compose((
        T.RandomResize((608,), max_size=1024),
        T.ToTensor(),
        T.Normalize(mean=(0.485, 0.456, 0.406),
                    std=(0.229, 0.224, 0.225)),
    ))

    # Build the model and load the trained parameters.
    # map_location makes a checkpoint saved on a GPU loadable
    # even when config.device is 'cpu'.
    model = DETR(config.num_queries, config.dim_hidden,
                 config.num_heads, config.num_encoder_layers,
                 config.num_decoder_layers, config.dim_feedforward,
                 config.dropout, len(config.classes))
    model.load_state_dict(
        torch.load(config.load_file, map_location=config.device))
    model.to(config.device)
    model.eval()

    device = model.get_device()
    # Label ID compared against to drop background predictions
    background_id = len(config.classes)

    for img_path in Path(config.img_directory).iterdir():
        # Close the file handle promptly and force 3-channel RGB so
        # that grayscale / RGBA files work with the normalization
        # above and with the (H, W, C) -> (C, H, W) permute below
        with Image.open(img_path) as img_file:
            img_orig = img_file.convert('RGB')
        width, height = img_orig.size

        # Dummy label so the transforms, which take (image, target),
        # can be applied
        target = {
            'classes': torch.zeros((0,), dtype=torch.int64),
            'boxes': torch.zeros((0, 4), dtype=torch.float32),
            'size': torch.tensor((width, height), dtype=torch.int64),
            'orig_size': torch.tensor(
                (width, height), dtype=torch.int64),
        }

        # Preprocess and build a batch of one sample
        img, target = transforms(img_orig, target)
        imgs, masks, targets = collate_func([(img, target)])

        with torch.no_grad():
            imgs = imgs.to(device)
            masks = masks.to(device)
            targets = [{k: v.to(device) for k, v in target.items()}
                       for target in targets]

            preds_class, preds_box = model(imgs, masks)

            # Only the last element of the predictions is used
            # (presumably the final decoder layer's output - TODO confirm)
            scores, labels, boxes = post_process(
                preds_class[-1], preds_box[-1], targets,
                include_bg=True)

            # Drop boxes classified as background; the batch holds
            # a single image, hence index 0
            keep = labels[0] != background_id
            boxes = boxes[0, keep]
            labels = labels[0, keep]

        # Image tensor used for drawing, (H, W, C) -> (C, H, W)
        img = torch.tensor(np.asarray(img_orig))
        img = img.permute(2, 0, 1)

        # Convert class IDs to class names
        labels = [config.classes[label] for label in labels]

        # Draw the boxes and convert back to a PIL image for display
        img = draw_bounding_boxes(
            img, boxes, labels, colors='red',
            font='LiberationSans-Regular.ttf',
            font_size=42, width=4)
        img = img.permute(1, 2, 0)
        img = img.to('cpu').numpy()
        img = Image.fromarray(img)
        display(img)

###デモの実行

In [None]:
# Run the detection demo on the images under ConfigDemo.img_directory
demo()

Output hidden; open in https://colab.research.google.com to view.