In [1]:
from math import sqrt
import matplotlib.pyplot as plt
from itertools import product

import torch
from torch import nn,optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import VOCDetection
from torchvision.ops import box_iou # IoU計算
from tqdm import tqdm

### vggモジュール

In [3]:
def make_vgg():
    cfg = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C',
           512, 512, 512, 'M', 512, 512, 512]
    layers = []
    in_chanels = 3
    for v in cfg:
        if v == 'M':
            layers += [nn.MaxPool2d(kernel_size=2)] #出力サイズを切り捨て(デフォルトでeil_mode=False)
        elif v == 'C':
            layers += [nn.MaxPool2d(kernel_size=2, ceil_mode=True)] # 出力サイズを切り上げる
        else:
            conv2d = nn.Conv2d(in_chanels, v, kernel_size=3, padding=1)
            layers += [conv2d, nn.ReLU()]
            in_chanels = v
    pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
    conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
    conv7 = nn.Conv2d(1024, 1024, kernel_size=1)
    layers += [pool5, conv6, nn.ReLU(), conv7, nn.ReLU()]

    return nn.ModuleList(layers)

In [4]:
a = []
b = [3]
c = [4]
a += b
a +=c
a

[3, 4]

In [5]:
[3]+[4]

[3, 4]

In [6]:
in_chanels = 3
v = 512
layers = []
layers += [nn.MaxPool2d(kernel_size=2)]
pool1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
conv1 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6)
conv2 = nn.Conv2d(1024, 1024, kernel_size=1)
layers += [pool1, conv1, nn.ReLU(), conv2, nn.ReLU()]
conv2d = nn.Conv2d(in_chanels, v, kernel_size=3, padding=1)
layers += [conv2d, nn.ReLU()]

In [7]:
layers

[MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
 MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=False),
 Conv2d(512, 1024, kernel_size=(3, 3), stride=(1, 1), padding=(6, 6), dilation=(6, 6)),
 ReLU(),
 Conv2d(1024, 1024, kernel_size=(1, 1), stride=(1, 1)),
 ReLU(),
 Conv2d(3, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
 ReLU()]

In [8]:
module = nn.ModuleList(layers)
for name, params in module.named_parameters():
    if 'weight' in name:
        print(name)

2.weight
4.weight
6.weight


In [9]:
nn.ModuleList(layers)[1]

MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=False)

### extrasモジュール

In [11]:
def make_extras():
    layers = [
        # out3
        nn.Conv2d(1024, 256, kernel_size=1),
        nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1),
        # out3からout4
        nn.Conv2d(512, 128, kernel_size=1),
        nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1),
        # out4からout5
        nn.Conv2d(256, 128, kernel_size=1),
        nn.Conv2d(128, 256, kernel_size=3),
        # out6からout6
        nn.Conv2d(256, 128, kernel_size=1),
        nn.Conv2d(128, 256, kernel_size=3),
    ]
    return nn.ModuleList(layers)


### Locモジュール

In [13]:
def make_loc(num_classes=21):
    """
    オフセットの予測を出力する
    nn.Conv2d第2引数は出力ベクトルのchanel方向の次元数.作成されるDBox(4 or 6個)ごとにオフセットを出力するので
    作成されるオフセットの数*4となる
    """
    layers = [
        # out1に対する処理
        nn.Conv2d(512, 4*4, kernel_size=3, padding=1),

        # out2に対する処理
        nn.Conv2d(1024, 6*4, kernel_size=3, padding=1),

        # out3に対する処理
        nn.Conv2d(512, 6*4, kernel_size=3, padding=1),

        # out4に対する処理
        nn.Conv2d(256, 6*4, kernel_size=3, padding=1),

        # out5に対する処理
        nn.Conv2d(256, 4*4, kernel_size=3, padding=1),

        # out1に対する処理
        nn.Conv2d(256, 4*4, kernel_size=3, padding=1)
    ]
    return nn.ModuleList(layers)

### confモジュール

In [15]:
def make_coef(num_classes=21):
    """
    クラスの予測を出力する
    nn.Conv2d第2引数は出力ベクトルの次元数.作成されるDBox(4 or 6個)ごとに各クラスの信頼度を出力するので
    作成されるオフセットの数*num_classesとなる
    """
    layers = [
        # out1に対する処理
        nn.Conv2d(512, 4*num_classes, kernel_size=3, padding=1), # [b, 16, 38, 38]

        # out2に対する処理
        nn.Conv2d(1024, 6*num_classes, kernel_size=3, padding=1),

        # out3に対する処理
        nn.Conv2d(512, 6*num_classes, kernel_size=3, padding=1),

        # out4に対する処理
        nn.Conv2d(256, 6*num_classes, kernel_size=3, padding=1),

        # out5に対する処理
        nn.Conv2d(256, 4*num_classes, kernel_size=3, padding=1),

        # out1に対する処理
        nn.Conv2d(256, 4*num_classes, kernel_size=3, padding=1)
    ]
    return nn.ModuleList(layers)

### L2Normの実装
### Layer Normalizationの実装

In [17]:
torch.Tensor(20)

tensor([4.9592e-04, 1.4391e-42, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00])

In [18]:
torch.Tensor([20])

tensor([20.])

###### nn.Parameterは、PyTorchのモデル内で学習可能なパラメータを定義するために使用されるクラスです。主に、ニューラルネットワークの層で使用され、バックプロパゲー ションを通じて最適化される重みやバイアスを定義します。

###### nn.init.constant_は、PyTorchの初期化モジュールで提供される関数の一つで、テンソルのすべての要素を指定した定数で初期化するために使用されます。この関数は、通常、モデルのパラメータの初期値を設定する際に利用されます。

#### L2Norm : チャネル方向のL2Normの合計を1にする

In [21]:
class L2Norm(nn.Module):
    def __init__(self, n_channels=512, scale=20):
        super().__init__()
        self.n_channels = n_channels
        self.gamma = scale # 正規化後に掛けるパラメータ,channel分だけある.(これはbackwaradで最適化される)
        self.eps = 1e-10 # 0で割ることを防ぐためのε
        self.weight = nn.Parameter(torch.Tensor(self.n_channels))
        self.reset_parameters()
    def reset_parameters(self):
        nn.init.constant_(self.weight, self.gamma) # self.gamma(デフォルトで20)でweightを初期化

    def forward(self, X):
        """
        X : [b * c * h * w]を想定
        """
        norm = X.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps # norm : [b, 1, h, w]

        # 入力をnormで割る
        X = torch.div(X, norm) # X : [b, c, h, w]

        # スケーリングの重みを掛ける
        out = self.weight.reshape(1, self.n_channels, 1, 1) * X # self.weight.reshape(1, self.n_channels, 1, 1) : [1, c, 1, 1]
                                                                # out : [b, c, h, w]

        return out


In [22]:
tensor = nn.Parameter(torch.Tensor(3))
tensor

Parameter containing:
tensor([0., 0., 0.], requires_grad=True)

In [23]:
weight = nn.Parameter(torch.Tensor(3))
eps = 1e-10
X = torch.randn((8, 3, 8, 8))
norm = X.pow(2).sum(dim=1, keepdim=True).sqrt() + eps

X = torch.div(X, norm) # これで各ピクセルの和は1


In [24]:
norm.shape # データ1つの正規化(channel間での正規化)

torch.Size([8, 1, 8, 8])

In [25]:
X.shape

torch.Size([8, 3, 8, 8])

In [26]:
out = weight.unsqueeze(0).unsqueeze(2).unsqueeze(3).expand_as(X) * X

In [27]:
out.shape

torch.Size([8, 3, 8, 8])

### DBoxの実装

###### self.min_sizes = [30, 60, 111, 162, 213, 264]
###### self.max_sizes = [60, 111, 162, 213, 264, 315]
###### この値から、解像度が38*38の特徴マップでは画像の縦、横10% ~ 20%の大きさの画像の検出が得意ということ

In [30]:
class PriorBox:
    def __init__(self):
        self.image_size = 300 # 入力画像のサイズを300 × 300と想定
        # 解像度が38 : 1つの特徴量マップでは300/38で7ピクセル分の情報を表現
        self.feature_maps = [38, 19, 10, 5, 3, 1]
        self.steps = [8, 16, 32, 64, 100, 300] # 特徴量マップの1セルが何ピクセルを(ピクセル/セル)表現するかをリストに格納.32, 64は計算の効率性のため少しずらした値(2の累乗)を設定
                                               # 例えば300/38 ≒ 8, 300/19 ≒ 16としている
        self.min_sizes = [30, 60, 111, 162, 213, 264] # 30 ... 画像の10%程度の大きさの物体の検出に適している

        self.max_sizes = [60, 111, 162, 213, 264, 315] # 60 ...画像の20%程度の大きさの物体の検出に適している
        self.aspect_rations = [[2], [2, 3], [2, 3], [2, 3], [2], [2]]

    def forward(self):
        mean = []
        for k, f in enumerate(self.feature_maps): # [38, 19, 10, 5, 3, 1]
            for i, j in product(range(f), repeat=2): # 各特徴量マップのセルごとにDBox作成
                #self.steps は計算効率のために調整されたセルのピクセル数を格納しているが、
                # 実際のDBoxの配置で精度を保つために、f_k = self.image_size / self.steps[k] で再度スケールを計算
                f_k = self.image_size / self.steps[k] # 特徴量マップf_k個で1になる
                cx = (j + 0.5) / f_k # 比の計算 これを座標としている
                cy = (i + 0.5) / f_k
                s_k = self.min_sizes[k] / self.image_size # これは最小サイズの正方形のサイズ. 特徴量マップの解像度で固定
                mean += [cx, cy, s_k, s_k] # 最小サイズの正方形のDBox作成
                s_k_prime = sqrt(s_k * (self.max_sizes[k]/self.image_size)) # これは最大サイズの正方形のサイズ. 特徴量マップの解像度で固定
                mean += [cx, cy, s_k_prime, s_k_prime] # 最大サイズの正方形のDBox作成
                for ar in self.aspect_rations[k]:
                    mean += [cx, cy, s_k*sqrt(ar), s_k/sqrt(ar)]
                    mean += [cx, cy, s_k/sqrt(ar), s_k*sqrt(ar)]
        output = torch.Tensor(mean).view(-1, 4)
        # イメージ : [1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4]
        # ===>
        # tensor([[1., 2., 3., 4.],
        # [5., 6., 7., 8.],
        # [1., 2., 3., 4.]])

        output.clamp_(max=1, min=0)
        return output

In [31]:
300/8

37.5

In [32]:
# from itertools import product
for i, j in product(range(3), range(4)): # 直積を計算
    print(i, j)

0 0
0 1
0 2
0 3
1 0
1 1
1 2
1 3
2 0
2 1
2 2
2 3


In [33]:
for i, j in product(range(3), repeat=2):
    print(i, j)

0 0
0 1
0 2
1 0
1 1
1 2
2 0
2 1
2 2


In [34]:
mean = []
mean += [1,2,3,4]
mean += [5,6,7,8]
mean += [1,2,3,4]
# エラーになる
# mean += [1,2,3]

mean

[1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4]

In [35]:
torch.Tensor(mean).view(-1, 4)

tensor([[1., 2., 3., 4.],
        [5., 6., 7., 8.],
        [1., 2., 3., 4.]])

### SSDのクラス

In [37]:
class SSD(nn.Module):
    def __init__(self, phase='train', num_classes=21):
        super().__init__()
        self.phase = phase
        self.num_classes = num_classes
        self.vgg = make_vgg()
        self.extras = make_extras()
        self.L2Norm = L2Norm()
        self.loc = make_loc()
        self.conf = make_coef()
        dbox = PriorBox()
        self.priors = dbox.forward() # self.priorsには各解像度の各セルに対してのDBoxが4or6個格納されている

        if phase == 'test':
            self.detect = Dtect()

    def forward(self, X):
        """
        X : [b, c, h=300, w=300]
        """
        bs = X.shape[0]
        # lout = []  各セルのオフセットの予測が格納
        # cout = []  各セルのクラス分類の結果が格納
        # out = []  各解像度の特徴マップが出力
        out, lout, cout = [], [], []
        for i in range(23): # 23はvggの定義でL2Normが適用されるまでに通過する層数(Conv2d, ReLU, Maxpool2d)
            X = self.vgg[i](X)
        X1 = X
        out.append(self.L2Norm(X1)) # out1を得る

        for i in range(23, len(self.vgg)):
            X = self.vgg[i](X)

        out.append(X) # out2を得る

        # out3,4,5,6
        for i in range(0, 8, 2):
            X = F.relu(self.extras[i](X))
            X = F.relu(self.extras[i+1](X))
            out.append(X)

        # オフセットとクラス毎の信頼度を求める
        for i in range(6): # out1~out6に対する出力処理
            # 各セルのオフセットの予測
            lx = self.loc[i](out[i]).permute(0,2,3,1).reshape(bs, -1, 4)
            # self.loc[i](out[i]).permute(0,2,3,1) : [bs, 38, 38, 16] reshape後 : [bs, 38*38*4, 4]になるのでは...
            # 書籍では[bs, 38*38, 4] と各セルに対してオフセットが得られるとあるが...多分誤植

            # cout = []  各セルのクラス分類を予測
            # self.conf[i](out[i]) : [b, 4(or6)*num_classes, h, w]
            # .permute(0,2,3,1) : [b, h, w, 4(or6)*num_classes]
            # .reshape() : [b, h*w*4(or6), num_classes]
            cx = self.conf[i](out[i]).permute(0,2,3,1).reshape(bs, -1, self.num_classes)
            lout.append(lx)
            cout.append(cx)
        #import pdb; pdb.set_trace()
        lout = torch.cat(lout, 1) # [bs, 38*38*4+19*19*6+10*10*6+5*5*6+3*3*4+1*1*4, 4] # 1枚の画像の38*38の各セルに対して4つのDBoxがあり,4次元のオフセットがある
        cout = torch.cat(cout, 1) # [bs, 38*38*4+19*19*6+10*10*6+5*5*6+3*3*4+1*1*4, self.num_classes] # 1枚の画像の38*38の各セルに対して4つのDBoxがあり, 21クラスのクラス分類を行う
        outputs = (lout, cout, self.priors)
        if self.phase == 'test':
            return self.detect.apply(output, self.num_classes)
        else:
            return outputs

In [38]:
38*38*4+19*19*6+10*10*6+5*5*6+3*3*4+1*1*4

8732

In [39]:
test_tensor = torch.randn(8, 3, 300, 300)
test_model = SSD()
lout, cout, priors = test_model(test_tensor)

In [40]:
# 各セルにおけるオフセット
lout.shape

torch.Size([8, 8732, 4])

In [41]:
# 各セルにおけるクラス分類
cout.shape

torch.Size([8, 8732, 21])

In [42]:
cout.view(cout.size()).shape

torch.Size([8, 8732, 21])

In [43]:
# 各セルにおけるDBox
priors.shape

torch.Size([8732, 4])

In [44]:
priors.size(0)

8732

## 損失関数の実装


In [46]:
VOC_CLASSES = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 
               'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']

In [47]:
VOC_CLASSES.index('bicycle')

1

In [211]:
class SSDLoss(nn.Module):
    def __init__(self, num_classes, priors, neg_pos_ratio=3, alpha=1.0, iou_threshold=0.5, device='cpu'):
        super(SSDLoss, self).__init__()
        self.device = device
        self.num_classes = num_classes
        self.priors = priors  # 事前定義されたデフォルトボックス
        self.neg_pos_ratio = neg_pos_ratio  # Negative DBoxのサンプル数はPositive DBoxのサンプル数のneg_pos_ratio倍
        self.alpha = alpha  # ロケーション(loc)損失の重み
        self.iou_threshold = iou_threshold  # IoUの閾値

    def forward(self, loc_preds, cls_preds, annotations):
        """
        cls_preds: クラス予測値 (B, N, num_classes)
        loc_preds: オフセット予測値 (B, N, 4)
        annotations: 各画像のPASCAL VOC形式のアノテーションリスト
        """
        # ターゲットを生成
        cls_targets, loc_targets = self.create_targets(annotations)
        cls_targets = cls_targets.to(self.device)
        loc_targets = loc_targets.to(self.device)
        #import pdb; pdb.set_trace()
        # クラス分類損失 (Cross-entropy)
        cls_loss = self.compute_cls_loss(cls_preds, cls_targets)

        # ボックス回帰損失 (Smooth L1 Loss)
        loc_loss = self.compute_loc_loss(loc_preds, loc_targets, cls_targets)

        # 合計損失
        total_loss = cls_loss + self.alpha * loc_loss
        
        return total_loss, cls_loss, loc_loss

    def create_targets(self, annotations):
        """
        アノテーションからターゲットを作成
        """
        batch_size = len(annotations)
        cls_targets = torch.zeros(batch_size, len(self.priors), dtype=torch.long)
        loc_targets = torch.zeros(batch_size, len(self.priors), 4)

        for i, annotation in enumerate(annotations):
            # 全てのバッチで共通にデフォルトボックスであるので self.priors[i]の必要はない
            cls_targets[i], loc_targets[i] = self.generate_single_target(annotation, self.priors)

        return cls_targets, loc_targets

    def generate_single_target(self, annotation, default_boxes):
        """
        1つの画像に対してターゲットを生成する
        """
        img_width = int(annotation['annotation']['size']['width'])
        img_height = int(annotation['annotation']['size']['height'])
        cls_targets = torch.zeros(len(default_boxes), dtype=torch.long)
        loc_targets = torch.zeros((len(default_boxes), 4))

        for obj in annotation['annotation']['object']:
            class_name = obj['name']
            class_id = VOC_CLASSES.index(class_name)

            # バウンディングボックスの正規化
            xmin = float(obj['bndbox']['xmin']) / img_width
            ymin = float(obj['bndbox']['ymin']) / img_height
            xmax = float(obj['bndbox']['xmax']) / img_width
            ymax = float(obj['bndbox']['ymax']) / img_height
            gt_box = torch.tensor([[xmin, ymin, xmax, ymax]]) # box_iouは2次元で渡す必要がある

            # IoUの計算
            ious = box_iou(gt_box, default_boxes)[0]
            pos_idx = ious > self.iou_threshold
            cls_targets[pos_idx] = class_id
            loc_targets[pos_idx] = self.encode_offsets(default_boxes[pos_idx], gt_box)

        return cls_targets, loc_targets


    def encode_offsets(self, default_boxes, gt_box):
        """
        バウンディングボックスのオフセットを計算する
        """
        cx = (gt_box[:, 0] + gt_box[:, 2]) / 2 # (x_min + x_max) / 2 : バウンディングボックスの中心x座標
        cy = (gt_box[:, 1] + gt_box[:, 3]) / 2  # バウンディングボックスの中心y座標
        cx_d = (default_boxes[:, 0] + default_boxes[:, 2]) / 2 # (d_xmin + d_xmax) / 2 : デフォルトボックスのxの中心座標
        cy_d = (default_boxes[:, 1] + default_boxes[:, 3]) / 2 # (d_xmin + d_xmax) / 2 : デフォルトボックスのyの中心座標
        w_d = default_boxes[:, 2] - default_boxes[:, 0] # d_xmax - d_xmin : デフォルトボックスの幅
        h_d = default_boxes[:, 3] - default_boxes[:, 1] # d_ymax - d_ymin : デフォルトボックスの高さ
        w = gt_box[:, 2] - gt_box[:, 0] #　バウンディングボックスの幅
        h = gt_box[:, 3] - gt_box[:, 1] #　バウンディングボックスの高さ

        # 論文に従う 
        d_cx = (cx - cx_d) / (0.1 * w_d)
        d_cy = (cy - cy_d) / (0.1 * h_d)
        d_w = torch.log(w / w_d) / 0.2
        d_h = torch.log(h / h_d) / 0.2


        offsets = torch.stack([d_cx, d_cy, d_w, d_h], dim=1)
        return offsets

    def compute_cls_loss(self, cls_preds, cls_targets):
        """
        クラス分類損失を計算する
        cls_preds : [b, N, num_classes]
        cls_targets : [b, N]
        """
        #import pdb; pdb.set_trace
        pos_mask = cls_targets > 0 # 背景以外
        num_pos = pos_mask.sum() # 背景以外の数(バッチ全体)
        import pdb; pdb.set_trace()
        
        cls_loss = F.cross_entropy(cls_preds.view(-1, self.num_classes), cls_targets.view(-1), reduction='none')
        cls_loss = cls_loss.view(cls_targets.size()) # 各デフォルトボックス毎のロスの形にする

        neg_mask = ~pos_mask
        num_neg = min(self.neg_pos_ratio * num_pos, neg_mask.sum())
        neg_loss = cls_loss[neg_mask].topk(num_neg, largest=False)[0].sum()

        cls_loss = cls_loss[pos_mask].sum() + neg_loss
        return cls_loss

    def compute_loc_loss(self, loc_preds, loc_targets, cls_targets):
        """
        オフセット損失 (Smooth L1 Loss) を計算
        """
        pos_mask = cls_targets > 0 
        num_pos = pos_mask.sum()

        if num_pos == 0:
            return loc_preds.sum() * 0

        loc_loss = F.smooth_l1_loss(loc_preds[pos_mask], loc_targets[pos_mask], reduction='sum')
        loc_loss = loc_loss / num_pos
        return loc_loss


In [49]:
a = torch.tensor([[1,2,3],[4,5,6]])
b = a.view(-1)
print(b)
b.view(a.size())

tensor([1, 2, 3, 4, 5, 6])


tensor([[1, 2, 3],
        [4, 5, 6]])

In [50]:
# 基本的な使用方法
x = torch.randn(2, 3)
x_list = [x, x, x]
result = torch.stack(x_list, dim=0)
result

tensor([[[ 1.6439, -0.4512, -0.4485],
         [-0.4982,  0.2407,  0.4295]],

        [[ 1.6439, -0.4512, -0.4485],
         [-0.4982,  0.2407,  0.4295]],

        [[ 1.6439, -0.4512, -0.4485],
         [-0.4982,  0.2407,  0.4295]]])

In [51]:
[torch.tensor([i]) for i in range(4)]

[tensor([0]), tensor([1]), tensor([2]), tensor([3])]

In [52]:
torch.stack([torch.tensor([i]) for i in range(4)], dim=1)

tensor([[0, 1, 2, 3]])

In [53]:
a = torch.tensor([2])
b = torch.tensor([1])
torch.log(a / b)

tensor([0.6931])

### データ準備

In [89]:
train_transform = transforms.Compose([
    transforms.Resize((300, 300)),
    transforms.RandomHorizontalFlip(p=0.5), # 左右反転
    transforms.RandomCrop(300, padding=8), # データの切り抜き
    transforms.RandomRotation(10), # 回転する角度の範囲を指定. ここで10とすると、-10度から+10度までの範囲でランダムに回転
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) # 0~1 => -1 ~ 1
])

val_transform = transforms.Compose([
    transforms.Resize((300, 300)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) # 0~1 => -1 ~ 1
])

show_transform = transforms.Compose([
    transforms.Resize((300, 300)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) # 0~1 => -1 ~ 1
])

def collate_fn(batch):
    images = []
    targets = []
    for item in batch:
        images.append(item[0])   # 画像データをリストに追加
        targets.append(item[1])  # アノテーション（辞書）をリストに追加
    return torch.stack(images, 0), targets  # 画像のみテンソル化し、アノテーションはリストのまま返す



batch_size = 8
train_dataset = VOCDetection(root='./dataset/voc_detection', year='2012', image_set='train', \
                       download=True, transform=train_transform)
val_dataset = VOCDetection(root='./dataset/voc_detection', year='2012', image_set='val', \
                       download=True, transform=val_transform)
show_dataset = VOCDetection(root='./dataset/voc_detection', year='2012', image_set='val', \
                       download=True, transform=show_transform)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
show_loader = DataLoader(show_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

Using downloaded and verified file: ./dataset/voc_detection\VOCtrainval_11-May-2012.tar
Extracting ./dataset/voc_detection\VOCtrainval_11-May-2012.tar to ./dataset/voc_detection
Using downloaded and verified file: ./dataset/voc_detection\VOCtrainval_11-May-2012.tar
Extracting ./dataset/voc_detection\VOCtrainval_11-May-2012.tar to ./dataset/voc_detection
Using downloaded and verified file: ./dataset/voc_detection\VOCtrainval_11-May-2012.tar
Extracting ./dataset/voc_detection\VOCtrainval_11-May-2012.tar to ./dataset/voc_detection


### まずはどのようなデータであるのか可視化

In [91]:
# data augmentationしていないval_dataを用いる
imgs, targets = next(iter(show_loader))

In [93]:
len(targets)

8

In [95]:
imgs[0]

tensor([[[ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
         [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
         [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
         ...,
         [-1.0000, -0.9843, -0.9922,  ..., -0.9843, -0.9843, -0.9843],
         [-0.9922, -0.9843, -0.9922,  ..., -0.9843, -0.9843, -0.9843],
         [-0.9922, -0.9843, -0.9922,  ..., -0.9765, -0.9922, -0.9922]],

        [[ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
         [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
         [ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
         ...,
         [-1.0000, -0.9843, -0.9922,  ..., -0.9843, -0.9843, -0.9843],
         [-0.9922, -0.9843, -0.9922,  ..., -0.9843, -0.9843, -0.9843],
         [-0.9922, -0.9843, -0.9922,  ..., -0.9765, -0.9922, -0.9922]],

        [[ 1.0000,  1.0000,  1.0000,  ...,  1.0000,  1.0000,  1.0000],
         [ 1.0000,  1.0000,  1.0000,  ...,  1

In [97]:
targets[5]

{'annotation': {'folder': 'VOC2012',
  'filename': '2008_000021.jpg',
  'source': {'database': 'The VOC2008 Database',
   'annotation': 'PASCAL VOC2008',
   'image': 'flickr'},
  'size': {'width': '500', 'height': '375', 'depth': '3'},
  'segmented': '0',
  'object': [{'name': 'aeroplane',
    'pose': 'Frontal',
    'truncated': '0',
    'occluded': '0',
    'bndbox': {'xmin': '14', 'ymin': '148', 'xmax': '475', 'ymax': '288'},
    'difficult': '0'}]}}

In [None]:
img_width = int(targets[1]['annotation']['size']['width'])
img_height = int(targets[1]['annotation']['size']['height'])
# cls_targets = torch.zeros(len(default_boxes), dtype=torch.long)
# loc_targets = torch.zeros((len(default_boxes), 4))
for obj in  targets[1]['annotation']['object']:
            class_name = obj['name']
            class_id = 500

            # バウンディングボックスの正規化
            xmin = float(obj['bndbox']['xmin']) / img_width
            ymin = float(obj['bndbox']['ymin']) / img_height
            xmax = float(obj['bndbox']['xmax']) / img_width
            ymax = float(obj['bndbox']['ymax']) / img_height
            gt_box = torch.tensor([[xmin, ymin, xmax, ymax]]) 

gt_box 

In [None]:
# 可視化
# まずは正規化を元に戻す
img = imgs[0]
tgt = targets[0]

img = (img * 0.5) + 0.5 # -1 ~ 1 => 0 ~ 1
img.shape

In [None]:
# plt.imshow(img.permute(1,2,0))

### 学習準備

In [117]:
ssd = SSD()
opt = optim.Adam(ssd.parameters(), lr=0.03, weight_decay=1e-4)
num_epochs = 10
num_classes = 21

### 学習ループ作成

In [187]:
def learn(model, num_epochs, optimizer, train_loader, val_loader, num_classes=21, save_path=None, early_stop=False, device='cpu'):

    model.to(device)
    criterion = SSDLoss(num_classes, model.priors, device)
    # early stop
    best_total_val_loss = float('inf')
    no_update = 0
    for epoch in range(num_epochs):
        train_total_losses = []
        val_total_losses = []

        train_loc_losses = []
        val_loc_losses = []

        train_cls_losses = []
        val_cls_losses = []

        running_train_total_losses = 0.0
        running_val_total_losses = 0.0

        running_train_loc_losses = 0.0
        running_val_loc_losses = 0.0

        running_train_cls_losses = 0.0
        running_val_cls_losses = 0.0

        model.train()
        for imgs, anotations in tqdm(train_loader, desc='now training', total=len(train_loader), leave=False):
            
            imgs = imgs.to(device)
            anotations = anotations

            optimizer.zero_grad()
            lout, cout, priors = model.forward(imgs)
            import pdb; pdb.set_trace()
            total_loss, cls_loss, loc_loss = criterion(lout, cout, anotations)

            total_loss.backward()
            optimizer.step()

            running_train_total_losses += total_loss.item()
            running_train_loc_losses += cls_loss.item()
            running_train_cls_losses += loc_loss.item()

        model.eval()
        for val_imgs, val_anotations in tqdm(val_loader, desc='now validation', total=len(val_loader), leave=False):

            val_imgs = val_imgs.to(device)
            val_anotations = val_anotations

            val_lout, val_cout, val_priors = model.forward(val_imgs)
            val_total_loss, val_cls_loss, val_loc_loss = criterion(val_lout, val_cout, val_anotations)

            running_val_total_losses += val_total_loss.item()
            running_val_loc_losses += val_cls_loss.item()
            running_val_cls_losses += val_loc_loss.item()

            



            

        train_total_losses.append(running_train_total_losses / len(train_loader))
        val_total_losses.append(running_val_total_losses / len(val_loader))
        train_loc_losses.append(running_train_loc_losses / len(train_loader))
        val_loc_losses.append(running_val_loc_losses / len(val_loader))
        train_cls_losses.append(running_train_cls_losses / len(train_loader))
        val_cls_losses.append(running_val_cls_losses / len(val_loader))

        if val_total_losses[-1] < best_total_val_loss:
                best_total_val_loss = val_total_losses[-1]
                no_update = 0
                if save_path is not None:
                    torch.save(model.state_dict(), save_path)

        else:
            no_update +=1
            if early_stop  and early_stop <= no_update:
                break
        print(f"epoch {epoch+1}: train total loss {train_total_losses[-1]:.4f}, val total loss {val_total_losses[-1]:.4f}")


    return train_total_losses, val_total_losses, train_loc_losses, val_loc_losses, train_cls_losses, val_cls_losses

In [67]:
train_total_losses, val_total_losses, train_loc_losses, val_loc_losses, train_cls_losses, val_cls_losses = \
learn(ssd, num_epochs, opt, train_loader, val_loader, None, False)

                                                    

KeyboardInterrupt: 

In [189]:
X = torch.randn(8, 3, 300, 300)
lout, cout, priors = ssd.forward(X)

In [191]:
lout.shape

torch.Size([8, 8732, 4])

In [193]:
priors.shape

torch.Size([8732, 4])

In [213]:
craiterion = SSDLoss(21, ssd.priors)

In [215]:
len(targets)

8

In [217]:
craiterion(lout, cout, )

> [1;32mc:\users\syouta\appdata\local\temp\ipykernel_14060\1292697379.py[0m(110)[0;36mcompute_cls_loss[1;34m()[0m

ipdb>  cls_preds.shape
torch.Size([8, 8732, 21])
ipdb>  cls_targets.shape
torch.Size([8, 8732])
ipdb>  cls_targets.view(-1).shape
torch.Size([69856])
ipdb>  cls_preds.view(-1,21)
tensor([[-0.1450, -0.0406,  0.4179,  ..., -0.0324, -0.0599, -0.0723],
        [ 0.0368,  0.3389, -0.6194,  ..., -0.2567, -0.3531, -0.0032],
        [-0.0882, -0.0830,  0.0299,  ...,  0.4308, -0.2818,  0.2299],
        ...,
        [ 0.0068,  0.0024, -0.0216,  ...,  0.0155, -0.0022, -0.0179],
        [ 0.0157,  0.0140, -0.0171,  ..., -0.0070, -0.0133, -0.0096],
        [ 0.0109,  0.0177,  0.0088,  ..., -0.0084, -0.0200, -0.0132]],
       grad_fn=<ViewBackward0>)
ipdb>  cls_preds.view(-1,21).shape
torch.Size([69856, 21])
ipdb>  F.cross_entropy(cls_preds.view(-1, 21), cls_targets.view(-1), reduction='none')
tensor([3.1950, 2.9539, 3.2010,  ..., 3.0392, 3.0257, 3.0336],
       grad_fn=<NllLossBac

In [9]:
a = torch.randn(8, 8732, 21)
a = a.transpose(2, 1)
a.shape

torch.Size([8, 21, 8732])

In [37]:
a = 100
b = 10
c = 3
d = 2

In [5]:
import numpy as np

In [41]:
b / a

0.1

In [43]:
d / c

0.6666666666666666

In [45]:
np.log(b / a) 

-2.3025850929940455

In [49]:
np.log(d / c)

-0.40546510810816444

In [9]:
a = torch.randn(8732, 4)
b = torch.randn(8732, 4)
c = a[:, :2] + b[:, :2] * 0.1 * b[:, 2:]
d = a[:, 2:] + b[:, 2:]
print(c.shape, d.shape)

torch.Size([8732, 2]) torch.Size([8732, 2])


In [10]:
e = torch.cat((a, b), dim=0)
e.shape

torch.Size([17464, 4])