<a href="https://colab.research.google.com/github/Siu0901/AI_study/blob/main/Yolo_v1_%EA%B5%AC%ED%98%84(Resnet_%EC%9D%BC%EB%B6%80_%EC%9D%B4%EC%8B%9D).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
import torchvision.transforms as transforms
from torchvision.datasets import VOCDetection
import numpy as np
import cv2

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

LEARNING_RATE = 1e-4
BATCH_SIZE = 8
WEIGHT_DECAY = 0
EPOCHS = 10
NUM_WORKERS = 2
PIN_MEMORY = True
LOAD_MODEL = False

In [21]:
VOC_CLASSES = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat",
    "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person",
    "pottedplant", "sheep", "sofa", "train", "tvmonitor"
]

In [4]:
# 매우매우 힘들었던 데이터 가져오기
# 제미나이랑 입씨름 끝에 캐글에서 가져옴
import kagglehub
import shutil
import os

# 1. data 폴더 초기화
if os.path.exists('./data'):
    shutil.rmtree('./data')
os.makedirs('./data/VOCdevkit', exist_ok=True)

# 2. 데이터 경로 확인 (다운로드)
print("경로 확인 중...")
path = kagglehub.dataset_download("huanghanchina/pascal-voc-2012")
print(f"원본 경로: {path}")

# 3. 소스 경로 찾기 (VOC2012 폴더 위치 찾기)
# 캐글 데이터셋 안에 VOC2012가 어디 박혀있는지 확인
src_path = os.path.join(path, 'VOC2012')
if not os.path.exists(src_path):
    # 만약 VOCdevkit 안에 들어있다면
    if os.path.exists(os.path.join(path, 'VOCdevkit', 'VOC2012')):
        src_path = os.path.join(path, 'VOCdevkit', 'VOC2012')
    else:
        # 그것도 아니면 그냥 path 자체가 데이터임
        src_path = path

print(f"찾은 소스 경로: {src_path}")

# 4. 복사 시작 (shutil.copytree 사용)
# Move(이동)가 아니라 Copy(복사)를 해야 에러가 안 납니다!
dst_path = './data/VOCdevkit/VOC2012'

print("데이터 복사 중... (잠시만 기다리세요)")
shutil.copytree(src_path, dst_path)

print("\n[성공] 데이터 준비 완료!")
print(f"데이터 위치: {dst_path}")

경로 확인 중...
Using Colab cache for faster access to the 'pascal-voc-2012' dataset.
원본 경로: /kaggle/input/pascal-voc-2012
찾은 소스 경로: /kaggle/input/pascal-voc-2012/VOC2012
데이터 복사 중... (잠시만 기다리세요)

[성공] 데이터 준비 완료!
데이터 위치: ./data/VOCdevkit/VOC2012


In [22]:
# 데이터 정제
class VOCDataset(VOCDetection):
    def __init__(self, root, year, image_set, download=False, S=7, B=2, C=20, transform=None):
        super().__init__(root, year=year, image_set=image_set, download=download)
        self.S = S
        self.B = B
        self.C = C
        self.transform = transform

    def __getitem__(self, index):
        img, target = super().__getitem__(index)

        # 이미지 크기 변경 전 정보 저장
        original_width, original_height = img.size

        # 이미지 리사이즈 및 텐서 변환
        if self.transform:
            img = self.transform(img)

        # 정답 텐서 7x7x30 만들기
        label_matrix = torch.zeros((self.S, self.S, self.C + 5 * self.B))

        # XML에서 객체 정보 가져오기
        objects = target['annotation']['object']
        if not isinstance(objects, list):
            objects = [objects]

        for obj in objects:
            class_name = obj['name']
            if class_name not in VOC_CLASSES:
                continue

            class_idx = VOC_CLASSES.index(class_name)
            bndbox = obj['bndbox']

            # 원본 크기 기준으로 좌표 가져오기
            xmin = float(bndbox['xmin'])
            ymin = float(bndbox['ymin'])
            xmax = float(bndbox['xmax'])
            ymax = float(bndbox['ymax'])

            # 448x448 크기 맞춰 좌표 스케일링
            scale_x = 448 / original_width
            scale_y = 448 / original_height

            x = (xmin * scale_x + xmax * scale_x) / 2
            y = (ymin * scale_y + ymax * scale_y) / 2
            w = (xmax * scale_x - xmin * scale_x)
            h = (ymax * scale_y - ymin * scale_y)

            # 0~1 사이 값으로 정규화
            x /= 448
            y /= 448
            w /= 448
            h /= 448

            # 그리드 셀 인덱스 계산
            i, j = int(self.S * y), int(self.S * x)
            x_cell, y_cell = self.S * x - j, self.S * y - i

            # 해당 셀에 아직 객체가 없으면 할당
            if label_matrix[i, j, 20] == 0:
                label_matrix[i, j, 20] = 1 # Confidence
                label_matrix[i, j, 21:25] = torch.tensor([x_cell, y_cell, w, h])
                label_matrix[i, j, class_idx] = 1

        return img, label_matrix

In [23]:
transform = transforms.Compose([
    transforms.Resize((448, 448)),
    transforms.ToTensor(),
])

dataset = VOCDataset(
    root='./data',
    year='2012',
    image_set='trainval',
    download=False,
    transform=transform
)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

print(f"전체 데이터: {len(dataset)}장")
print(f"-> 학습용(Train): {len(train_dataset)}장")
print(f"-> 테스트용(Test): {len(test_dataset)}장")

train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    shuffle=True,
    drop_last=True,
    collate_fn=None
)

test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=PIN_MEMORY,
    drop_last=False
)

전체 데이터: 11540장
-> 학습용(Train): 9232장
-> 테스트용(Test): 2308장


욜로 다크넷 구현 했는데 이거 이미지넷으로 선행 학습 돌려야 된다해서

저 부분을 resnet50 가져와서 바꿔 끼고 마지막 벡본 부분만 조금 수정해서 코드 갈아끼는 걸로 바꿈

In [24]:
# class DarkNet(nn.Module):
#     def __init__(self):
#       super(DarkNet, self).__init__()

#       self.feature = nn.Sequential(
#           nn.Conv2d(3,64, kernel_size=7, stride=2, padding=3, bias=False),
#           nn.BatchNorm2d(64),
#           nn.LeakyReLU(0.1),
#           nn.MaxPool2d(kernel_size=2, stride=2),

#           nn.Conv2d(64,192, kernel_size=3, stride=1, padding=1, bias=False),
#           nn.BatchNorm2d(192),
#           nn.LeakyReLU(0.1),
#           nn.MaxPool2d(kernel_size=2, stride=2),

#           nn.Conv2d(192,128, kernel_size=1, stride=1, padding=0, bias=False),
#           nn.BatchNorm2d(128),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(128,256, kernel_size=3, stride=1, padding=1, bias=False),
#           nn.BatchNorm2d(256),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(256,256, kernel_size=1, stride=1, padding=0, bias=False),
#           nn.BatchNorm2d(256),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(256,512, kernel_size=3, stride=1, padding=1, bias=False),
#           nn.BatchNorm2d(512),
#           nn.LeakyReLU(0.1),
#           nn.MaxPool2d(kernel_size=2, stride=2),

#           nn.Conv2d(512,256, kernel_size=1, stride=1, padding=0, bias=False),
#           nn.BatchNorm2d(256),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(256,512, kernel_size=3, stride=1, padding=1, bias=False),
#           nn.BatchNorm2d(512),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(512,256, kernel_size=1, stride=1, padding=0, bias=False),
#           nn.BatchNorm2d(256),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(256,512, kernel_size=3, stride=1, padding=1, bias=False),
#           nn.BatchNorm2d(512),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(512,256, kernel_size=1, stride=1, padding=0, bias=False),
#           nn.BatchNorm2d(256),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(256,512, kernel_size=3, stride=1, padding=1, bias=False),
#           nn.BatchNorm2d(512),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(512,256, kernel_size=1, stride=1, padding=0, bias=False),
#           nn.BatchNorm2d(256),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(256,512, kernel_size=3, stride=1, padding=1, bias=False),
#           nn.BatchNorm2d(512),
#           nn.LeakyReLU(0.1),

#           nn.Conv2d(512,512, kernel_size=1, stride=1, padding=0, bias=False),
#           nn.BatchNorm2d(512),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(512,1024, kernel_size=3, stride=1, padding=1, bias=False),
#           nn.BatchNorm2d(1024),
#           nn.LeakyReLU(0.1),
#           nn.MaxPool2d(kernel_size=2, stride=2),

#           nn.Conv2d(1024,512, kernel_size=1, stride=1, padding=0, bias=False),
#           nn.BatchNorm2d(512),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(512,1024, kernel_size=3, stride=1, padding=1, bias=False),
#           nn.BatchNorm2d(1024),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(1024,512, kernel_size=1, stride=1, padding=0, bias=False),
#           nn.BatchNorm2d(512),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(512,1024, kernel_size=3, stride=1, padding=1, bias=False),
#           nn.BatchNorm2d(1024),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(1024,1024, kernel_size=3, stride=1, padding=1, bias=False),
#           nn.BatchNorm2d(1024),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(1024,1024, kernel_size=3, stride=2, padding=1, bias=False),
#           nn.BatchNorm2d(1024),
#           nn.LeakyReLU(0.1),

#           nn.Conv2d(1024,1024, kernel_size=3, stride=1, padding=1, bias=False),
#           nn.BatchNorm2d(1024),
#           nn.LeakyReLU(0.1),
#           nn.Conv2d(1024,1024, kernel_size=3, stride=1, padding=1, bias=False),
#           nn.BatchNorm2d(1024),
#           nn.LeakyReLU(0.1),
#       )
#       self.full_con = nn.Sequential(
#           nn.Flatten(),
#           nn.Linear(7*7*1024,4096),
#           nn.LeakyReLU(0.1),
          # nn.Linear(4096,7*7*30),
          # nn.Unflatten(1,(7,7,30))
#       )

#     def forward(self, x):
#       x = self.feature(x)
#       x = self.full_con(x)
#       return x
import torchvision.models as models

class Yolov1_resnet50(nn.Module):
    def __init__(self):
        super(Yolov1_resnet50, self).__init__()

        self.backbone = models.resnet50(weights='DEFAULT')
        # resnet 뒷부분 fc 자르기 (2048, 14, 14) 크기로 나옴
        self.backbone = nn.Sequential(*list(self.backbone.children())[:-2])

        self.feature = nn.Sequential(
            nn.Conv2d(2048, 1024, kernel_size=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1),

            # 크기 14->7로 줄이기 (stride 2로 설정했음)
            nn.Conv2d(1024, 1024, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1),

            nn.Conv2d(1024, 1024, kernel_size=3, padding=1,bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1),

            nn.Conv2d(1024, 1024, kernel_size=3, padding=1,bias=False),
            nn.BatchNorm2d(1024),
            nn.LeakyReLU(0.1),
        )

        self.full_con = nn.Sequential(
            nn.Flatten(),
            nn.Linear(7*7*1024,4096),
            nn.LeakyReLU(0.1),
            nn.Linear(4096,7*7*30),
            nn.Unflatten(1,(7,7,30))
        )

    def forward(self, x):
        x = self.backbone(x)
        x = self.feature(x)
        x = self.full_con(x)
        return x

In [25]:
class YoloLoss(nn.Module):
    def __init__(self, S=7, B=2, C=20):
        super(YoloLoss, self).__init__()
        self.mse = nn.MSELoss(reduction="sum")
        self.S, self.B, self.C = S, B, C
        self.lambda_noobj = 0.5
        self.lambda_coord = 5

    def compute_iou(self, box1, box2):
        b1_x1, b1_y1 = box1[..., 0] - box1[..., 2] / 2, box1[..., 1] - box1[..., 3] / 2
        b1_x2, b1_y2 = box1[..., 0] + box1[..., 2] / 2, box1[..., 1] + box1[..., 3] / 2
        b2_x1, b2_y1 = box2[..., 0] - box2[..., 2] / 2, box2[..., 1] - box2[..., 3] / 2
        b2_x2, b2_y2 = box2[..., 0] + box2[..., 2] / 2, box2[..., 1] + box2[..., 3] / 2

        inter_x1 = torch.max(b1_x1, b2_x1)
        inter_y1 = torch.max(b1_y1, b2_y1)
        inter_x2 = torch.min(b1_x2, b2_x2)
        inter_y2 = torch.min(b1_y2, b2_y2)

        inter_area = (inter_x2 - inter_x1).clamp(0) * (inter_y2 - inter_y1).clamp(0)
        box1_area = abs((b1_x2 - b1_x1) * (b1_y2 - b1_y1))
        box2_area = abs((b2_x2 - b2_x1) * (b2_y2 - b2_y1))

        return inter_area / (box1_area + box2_area - inter_area + 1e-6)

    def forward(self, predictions, target):
        # 두 박스 중 누가 책임질지 결정
        iou_b1 = self.compute_iou(predictions[..., 21:25], target[..., 21:25])
        iou_b2 = self.compute_iou(predictions[..., 26:30], target[..., 21:25])

        # IoU가 더 큰 박스를 선택
        # best_box 값이 0이면 Box1, 1이면 Box2를 따름
        ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)
        best_box = torch.argmax(ious, dim=0) # (N, 7, 7)

        exists_box = target[..., 20].unsqueeze(3) # (N, 7, 7, 1)

        # best_box에 따라 예측값 선택
        box_predictions = exists_box * (
            (best_box.unsqueeze(3) == 0) * predictions[..., 21:25] +
            (best_box.unsqueeze(3) == 1) * predictions[..., 26:30]
        )
        box_targets = exists_box * target[..., 21:25]

        # Width, Height 루트 처리
        box_predictions[..., 2:4] = torch.sign(box_predictions[..., 2:4]) * torch.sqrt(torch.abs(box_predictions[..., 2:4] + 1e-6))
        box_targets[..., 2:4] = torch.sqrt(box_targets[..., 2:4])

        box_loss = self.mse(torch.flatten(box_predictions, end_dim=-2),
                            torch.flatten(box_targets, end_dim=-2))

        pred_box = (
            (best_box.unsqueeze(3) == 0) * predictions[..., 20:21] +
            (best_box.unsqueeze(3) == 1) * predictions[..., 25:26]
        )
        object_loss = self.mse(torch.flatten(exists_box * pred_box),
                               torch.flatten(exists_box * target[..., 20:21]))

        # 물체 없는 곳에서는 Box1, Box2 둘 다 Loss 계산
        no_object_loss = self.mse(
            torch.flatten((1 - exists_box) * predictions[..., 20:21], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., 20:21], start_dim=1)
        )
        no_object_loss += self.mse(
            torch.flatten((1 - exists_box) * predictions[..., 25:26], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., 20:21], start_dim=1)
        )

        # 클래스 손실률
        class_loss = self.mse(
            torch.flatten(exists_box * predictions[..., :20], end_dim=-2),
            torch.flatten(exists_box * target[..., :20], end_dim=-2)
        )

        loss = (
            self.lambda_coord * box_loss
            + object_loss
            + self.lambda_noobj * no_object_loss
            + class_loss
        )
        return loss

In [27]:
# model = DarkNet().to(device)
model = Yolov1_resnet50().to(device)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
loss_fn = YoloLoss()

print("학습 시작한다")
model.train()

for epoch in range(EPOCHS):
  mean_loss = []

  for batch_idx, (x,y) in enumerate(train_loader):
    x,y = x.to(device), y.to(device)

    out = model(x)
    loss = loss_fn(out, y)
    mean_loss.append(loss.item())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if batch_idx % 100 == 0:
            print(f"Epoch [{epoch+1}/{EPOCHS}] Batch {batch_idx}/{len(train_loader)} Loss: {loss.item():.4f}")

  print(f"===> Epoch {epoch+1} 평균 Loss: {sum(mean_loss)/len(mean_loss):.4f}")

학습 시작한다
Epoch [1/10] Batch 0/1154 Loss: 247.0426
Epoch [1/10] Batch 100/1154 Loss: 102.1081
Epoch [1/10] Batch 200/1154 Loss: 54.7044
Epoch [1/10] Batch 300/1154 Loss: 55.0788
Epoch [1/10] Batch 400/1154 Loss: 23.9167
Epoch [1/10] Batch 500/1154 Loss: 54.4183
Epoch [1/10] Batch 600/1154 Loss: 60.2700
Epoch [1/10] Batch 700/1154 Loss: 53.6127
Epoch [1/10] Batch 800/1154 Loss: 72.7721
Epoch [1/10] Batch 900/1154 Loss: 57.6377
Epoch [1/10] Batch 1000/1154 Loss: 140.0523
Epoch [1/10] Batch 1100/1154 Loss: 67.4052
===> Epoch 1 평균 Loss: 79.7244
Epoch [2/10] Batch 0/1154 Loss: 993.0522
Epoch [2/10] Batch 100/1154 Loss: 79.2010
Epoch [2/10] Batch 200/1154 Loss: 150.7109
Epoch [2/10] Batch 300/1154 Loss: 77.4356
Epoch [2/10] Batch 400/1154 Loss: 126.7852
Epoch [2/10] Batch 500/1154 Loss: 78.3755
Epoch [2/10] Batch 600/1154 Loss: 95.9337
Epoch [2/10] Batch 700/1154 Loss: 40.9933
Epoch [2/10] Batch 800/1154 Loss: 100.9630
Epoch [2/10] Batch 900/1154 Loss: 53.9005
Epoch [2/10] Batch 1000/1154 Loss

In [28]:
# 모델 저장
torch.save(model.state_dict(), "yolo_resnet50_voc.pth")
print("모델 저장 완료")

모델 저장 완료


In [29]:
import torch
from collections import Counter

# 1. IoU 계산 (Intersection Over Union)
def intersection_over_union(boxes_preds, boxes_labels, box_format="midpoint"):
    # boxes_preds shape: (N, 4) -> (x, y, w, h)
    # boxes_labels shape: (N, 4)

    if box_format == "midpoint":
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2

        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2

    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    return intersection / (box1_area + box2_area - intersection + 1e-6)

# 2. NMS (Non-Maximum Suppression) - 중복 박스 제거
def non_max_suppression(bboxes, iou_threshold, threshold, box_format="midpoint"):
    # bboxes: [[class, score, x, y, w, h], ...]
    assert type(bboxes) == list

    bboxes = [box for box in bboxes if box[1] > threshold]
    bboxes = sorted(bboxes, key=lambda x: x[1], reverse=True)
    bboxes_after_nms = []

    while bboxes:
        chosen_box = bboxes.pop(0)

        bboxes = [
            box for box in bboxes
            if box[0] != chosen_box[0]
            or intersection_over_union(
                torch.tensor(chosen_box[2:]),
                torch.tensor(box[2:]),
                box_format=box_format
            ) < iou_threshold
        ]

        bboxes_after_nms.append(chosen_box)

    return bboxes_after_nms

# 3. mAP 계산 (Mean Average Precision)
def mean_average_precision(pred_boxes, true_boxes, iou_threshold=0.5, box_format="midpoint", num_classes=20):
    # pred_boxes: [[train_idx, class_pred, prob_score, x1, y1, x2, y2], ...]
    average_precisions = []
    epsilon = 1e-6

    for c in range(num_classes):
        detections = []
        ground_truths = []

        for detection in pred_boxes:
            if detection[1] == c:
                detections.append(detection)

        for true_box in true_boxes:
            if true_box[1] == c:
                ground_truths.append(true_box)

        # 이미지별 정답 개수 카운트 (예: img 0에 강아지 2마리, img 1에 1마리...)
        amount_bboxes = Counter([gt[0] for gt in ground_truths])
        for key, val in amount_bboxes.items():
            amount_bboxes[key] = torch.zeros(val)

        detections.sort(key=lambda x: x[2], reverse=True)
        TP = torch.zeros(len(detections))
        FP = torch.zeros(len(detections))
        total_true_bboxes = len(ground_truths)

        if total_true_bboxes == 0:
            continue

        for detection_idx, detection in enumerate(detections):
            ground_truth_img = [bbox for bbox in ground_truths if bbox[0] == detection[0]]

            num_gts = len(ground_truth_img)
            best_iou = 0
            best_gt_idx = -1

            for idx, gt in enumerate(ground_truth_img):
                iou = intersection_over_union(
                    torch.tensor(detection[3:]),
                    torch.tensor(gt[3:]),
                    box_format=box_format,
                )

                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            if best_iou > iou_threshold:
                if amount_bboxes[detection[0]][best_gt_idx] == 0:
                    TP[detection_idx] = 1
                    amount_bboxes[detection[0]][best_gt_idx] = 1
                else:
                    FP[detection_idx] = 1
            else:
                FP[detection_idx] = 1

        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = TP_cumsum / (TP_cumsum + FP_cumsum + epsilon)

        precisions = torch.cat((torch.tensor([1]), precisions))
        recalls = torch.cat((torch.tensor([0]), recalls))

        # 적분 (Area under curve)
        average_precisions.append(torch.trapz(precisions, recalls))

    return sum(average_precisions) / len(average_precisions)

# 4. 모델 출력을 박스 리스트로 변환 (get_bboxes)
def get_bboxes(loader, model, iou_threshold, threshold, S=7, device="cuda"):
    all_pred_boxes = []
    all_true_boxes = []

    model.eval()
    train_idx = 0

    for batch_idx, (x, labels) in enumerate(loader):
        x = x.to(device)
        labels = labels.to(device)

        with torch.no_grad():
            predictions = model(x)

        batch_size = x.shape[0]
        # (N, 7, 7, 30) -> 리스트 형태로 변환

        # --- Decode Predictions ---
        bboxes = []
        # Box 1
        box1_conf = predictions[..., 20:21]
        box1_coord = predictions[..., 21:25]
        # Box 2
        box2_conf = predictions[..., 25:26]
        box2_coord = predictions[..., 26:30]
        # Class
        class_probs = predictions[..., :20]
        best_class = class_probs.argmax(-1).unsqueeze(-1)

        # Cell indices
        cell_indices = torch.arange(S).repeat(batch_size, S, 1).unsqueeze(-1).to(device)

        # 좌표 복원 (Relative to Image)
        for box_conf, box_coord in [(box1_conf, box1_coord), (box2_conf, box2_coord)]:
            x_c = (1/S) * (box_coord[..., 0:1] + cell_indices)
            y_c = (1/S) * (box_coord[..., 1:2] + cell_indices.permute(0, 2, 1, 3))
            w_h = box_coord[..., 2:4]
            converted_box = torch.cat((best_class, box_conf, x_c, y_c, w_h), dim=-1) # (N, 7, 7, 6)

            # 배치별로 쪼개기
            for idx in range(batch_size):
                # 7x7=49개 박스 중 threshold 넘는 것만 NMS 수행
                boxes_in_img = converted_box[idx].reshape(-1, 6).tolist()
                nms_boxes = non_max_suppression(boxes_in_img, iou_threshold, threshold)
                for box in nms_boxes:
                    all_pred_boxes.append([train_idx + idx] + box)

        # --- Decode Targets ---
        # 정답 데이터도 리스트로 변환
        for idx in range(batch_size):
            label = labels[idx] # (7, 7, 30)
            # 물체 있는 곳만 가져오기
            mask = label[..., 20] == 1
            true_locs = label[mask] # (num_obj, 30)

            for true_box in true_locs:
                cls = true_box[:20].argmax().item()
                x_c = (true_box[21] + mask.nonzero()[0][1]) / S # j + x
                y_c = (true_box[22] + mask.nonzero()[0][0]) / S # i + y
                w = true_box[23]
                h = true_box[24]
                all_true_boxes.append([train_idx + idx, cls, 1, x_c.item(), y_c.item(), w.item(), h.item()])

        train_idx += batch_size

    model.train()
    return all_pred_boxes, all_true_boxes

In [30]:
print("정확도(mAP) 계산 중... (시간이 좀 걸립니다)")

# 1. 모든 예측값과 정답을 리스트로 변환
# threshold=0.05: 확신 5% 미만은 버림
# iou_threshold=0.5: NMS할 때 겹치는 박스 제거 기준
pred_boxes, true_boxes = get_bboxes(test_loader, model, iou_threshold=0.5, threshold=0.05, device=device)

# 2. mAP 계산 (IoU 0.5 기준)
mAP = mean_average_precision(pred_boxes, true_boxes, iou_threshold=0.5, num_classes=20)

print(f"========================================")
print(f"최종 모델 성능 (mAP @ IoU 0.5): {mAP.item()*100:.2f}%")
print(f"========================================")

정확도(mAP) 계산 중... (시간이 좀 걸립니다)
최종 모델 성능 (mAP @ IoU 0.5): 0.61%


학습은 일단 정상적으로 되는 것 같음.

근데 resnet을 몸체 갈아 끼우고, 에폭 10 하는데 1시간 반 걸렸는데 loss가 60인거 보고
음 얘부터는 노트북 레벨로는 할 수 없구나를 느낌.

학습 되는거만 확인하고 여기까지 하자...

피치 못할 사정으로 모델 정확도 검증 코드는 무지성 제미나이 갈김. 약간 찔린다.

근데 아무리 그래도 정확도 0.61%은 많이많이많이 뭔가긴 해.

이게 노트북의 한계다. 나중에 컴으로 학습 50에폭 때리고 해보자