In [1]:
!pip install torchvision

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0->torchvision)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch==2.6.0->torchvision)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch==2.6.0->torchvision)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch==2.6.0->torchvision)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import torch
from torch.utils.data import DataLoader
import torchvision
from torchvision import transforms
from torchvision.datasets import CocoDetection
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.utils import draw_bounding_boxes
from torchvision.transforms.functional import to_pil_image
from torchvision.ops import nms
import matplotlib.pyplot as plt

# 1) 환경 설정
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [4]:
# 2) 데이터 경로
img_dir = '/content/drive/MyDrive/Data/CV_dataset/Segmentation/images'           # 이미지가 저장된 폴더
ann_file = '/content/drive/MyDrive/Data/CV_dataset/Detection/annotations.json'   # COCO 주석 파일

In [5]:
# 4) CocoDetection Dataset 래핑
class CustomCocoDataset(CocoDetection):
    def __init__(self, img_folder, ann_file, transforms=None):
        super().__init__(img_folder, ann_file)
        self._transforms = transforms

    def __getitem__(self, idx):
        img, targets = super().__getitem__(idx)
        # ─────────────────────────────────────────────────────
        # COCO annotation → 모델 입력용 타깃 변환
        # ─────────────────────────────────────────────────────
        # 1) bbox·label 추출 (COCO 형식: [x_min, y_min, width, height])
        boxes_list  = [obj["bbox"]        for obj in targets]   # 예) [[140, 175, 307, 328]]
        labels_list = [obj["category_id"] for obj in targets]

        # 2) 리스트 → Tensor 변환
        boxes = torch.as_tensor(boxes_list, dtype=torch.float32)  # shape = (N, 4)

        # 3) 빈 annotation 여부 체크
        if boxes.numel() == 0:
            # 박스가 하나도 없을 때 : shape을 (0,4)로 맞춰 오류 방지
            boxes = boxes.view(0, 4)
        else:
            # ────────────────────────────────────────────────
            # COCO는 [x, y, w, h] 이지만
            # Faster R-CNN 등 torchvision 모델은 [x1, y1, x2, y2] 를 기대한다.
            #   x2 = x1 + w
            #   y2 = y1 + h
            # 즉, width·height를 끝점 좌표로 바꿔야 한다.
            # 아래 한 줄이 그 변환을 벡터화해 수행:
            #   boxes[:, 2] (width)  += boxes[:, 0] (x1)
            #   boxes[:, 3] (height) += boxes[:, 1] (y1)
            # 결과: [[x1, y1, x1+w, y1+h]]  >>  [[140, 175, 447, 503]]
            # ────────────────────────────────────────────────
            boxes[:, 2:] += boxes[:, :2]

        labels   = torch.as_tensor(labels_list, dtype=torch.int64)
        image_id = torch.tensor([idx])

        # 면적(area) 계산 (박스가 있을 때만)
        if boxes.numel() > 0:
            area = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
        else:
            area = torch.tensor([], dtype=torch.float32)

        iscrowd = torch.zeros((boxes.size(0),), dtype=torch.int64)

        target = {
            "boxes": boxes,          # [x1, y1, x2, y2] 형식
            "labels": labels,
            "image_id": image_id,
            "area": area,
            "iscrowd": iscrowd,
        }

        if self._transforms is not None:
            img = self._transforms(img)

        return img, target


In [6]:
# 5) 데이터로더 생성
dataset_train = CustomCocoDataset(img_dir, ann_file, transforms=transforms.ToTensor())
dataset_val   = CustomCocoDataset(img_dir, ann_file, transforms=transforms.ToTensor())

loading annotations into memory...
Done (t=3.28s)
creating index...
index created!
loading annotations into memory...
Done (t=0.00s)
creating index...
index created!


In [12]:
data_loader_train = DataLoader(dataset_train, batch_size=4, shuffle=True, num_workers=2, collate_fn=lambda x: tuple(zip(*x)))
data_loader_val   = DataLoader(dataset_val,   batch_size=1, shuffle=False, num_workers=2, collate_fn=lambda x: tuple(zip(*x)))

In [8]:
# ────────────────────────────────────────────────────────────────
# DataLoader가 미니배치를 만들 때 수행되는 과정
# ────────────────────────────────────────────────────────────────
# 1) Sampler가 이번 배치에 사용할 인덱스들(ex. [17, 42, 91, 103])을 뽑는다
# 2) 각 인덱스에 대해  dataset[idx]  를 호출하여 (img, target) 튜플을 얻는다
#       batch = [
#           (img_17,  target_17),
#           (img_42,  target_42),
#           (img_91,  target_91),
#           (img_103, target_103)
#       ]
#    여기서 batch가 곧 collate_fn의 인수 x 로 전달된다.
# 3) collate_fn 은 이 리스트를 (imgs, targets) 형태로 풀어 모델에 공급할 수 있게 한다.
# ────────────────────────────────────────────────────────────────

# collate_fn을 명시적 함수로 정의 (동일 기능의 lambda 버전을 가독성 있게 분리)
# def collate_fn(batch):
#     """
#     batch: [(img, target), (img, target), ...]  # DataLoader가 만든 리스트
#     반환:  ( (img1, img2, ...), (target1, target2, ...) )
#     """
#     # zip(*batch)  ➜  ( (img1,img2,..), (target1,target2,..) )
#     return tuple(zip(*batch))

# # DataLoader에 전달 ─ 이미지 4장씩, shuffle=True
# data_loader_train = DataLoader(dataset_train,
#                                batch_size=4,
#                                shuffle=True,
#                                num_workers=2,
#                                collate_fn=collate_fn)

# data_loader_val   = DataLoader(dataset_val,
#                                batch_size=4,
#                                shuffle=False,
#                                num_workers=2,
#                                collate_fn=collate_fn)

In [11]:
for x in data_loader_train:
    print(x)
    break

((tensor([[[0.8392, 0.6314, 0.8392,  ..., 0.2824, 0.3569, 0.9765],
         [0.9294, 0.6863, 0.6353,  ..., 0.6824, 0.8510, 0.7176],
         [0.4078, 0.3373, 0.7765,  ..., 0.2549, 0.6510, 0.8471],
         ...,
         [0.2824, 0.1843, 0.4824,  ..., 0.2824, 0.4549, 0.7294],
         [0.8588, 0.8745, 0.0471,  ..., 0.3490, 0.2941, 0.7804],
         [0.5020, 0.4196, 0.7686,  ..., 0.0118, 0.1725, 0.0196]],

        [[0.6039, 0.2118, 0.2314,  ..., 0.0706, 1.0000, 0.9373],
         [0.4039, 0.5804, 0.3255,  ..., 0.4039, 0.5647, 0.7176],
         [0.9412, 0.2431, 0.7412,  ..., 0.1137, 0.4941, 0.5137],
         ...,
         [0.9451, 0.4824, 0.7373,  ..., 0.2549, 0.1490, 0.4667],
         [0.1608, 0.8353, 0.0588,  ..., 0.5490, 0.9961, 0.9882],
         [0.5098, 0.5020, 0.5765,  ..., 0.8863, 0.8510, 0.0275]],

        [[0.8118, 0.5490, 0.3882,  ..., 0.8588, 0.5176, 0.9020],
         [0.3647, 0.5020, 0.3647,  ..., 0.7843, 0.0902, 0.9255],
         [0.0706, 0.0863, 0.1804,  ..., 0.3216, 0.2902, 

In [13]:
# 6) 모델 정의 (사전학습된 Faster R-CNN 불러오기)
model = fasterrcnn_resnet50_fpn(pretrained=True) # weights='DEFAULT'

in_features = model.roi_heads.box_predictor.cls_score.in_features
num_classes = 4   # background 포함

model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
model.to(device)

# 7) Optimizer & Scheduler
params = [p for p in model.parameters() if p.requires_grad]

optimizer = torch.optim.Adam(params, lr=1e-4)

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth
100%|██████████| 160M/160M [00:00<00:00, 232MB/s]


In [14]:
# 8) 학습 루프 (간단한 예제)
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for images, targets in data_loader_train:
        images = list(img.to(device) for img in images)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        loss_dict = model(images, targets)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

    # Validation 스텝 (optional)
    model.eval()
    # 여기서는 간단히 첫 배치 예측만 수행
    with torch.no_grad():
        imgs, tgts = next(iter(data_loader_val))
        imgs = list(img.to(device) for img in imgs)
        outputs = model(imgs)
    print(f"[Epoch {epoch+1}/{num_epochs}] Train Loss: {losses.item():.4f}")

print("Training finished.")


[Epoch 1/10] Train Loss: 0.3105
[Epoch 2/10] Train Loss: 0.2775
[Epoch 3/10] Train Loss: 0.2567
[Epoch 4/10] Train Loss: 0.2244
[Epoch 5/10] Train Loss: 0.1758
[Epoch 6/10] Train Loss: 0.2203
[Epoch 7/10] Train Loss: 0.2954
[Epoch 8/10] Train Loss: 0.2158
[Epoch 9/10] Train Loss: 0.1392
[Epoch 10/10] Train Loss: 0.1585
Training finished.


In [None]:
# 카테고리 ID → 이름 매핑
category_names = {1: "anomaly_1", 2: "anomaly_2", 3: "anomaly_3"}

model.eval()
images, targets = next(iter(data_loader_val))
images = [img.to(device) for img in images]

with torch.no_grad():
    outputs = model(images)

for idx in range(len(images)):
    img_tensor = images[idx].cpu()
    pred       = outputs[idx]
    boxes      = pred["boxes"].cpu()
    labels     = pred["labels"].cpu()
    scores     = pred["scores"].cpu()

    # ── 1) 점수(threshold) 필터 ──
    conf_keep = scores >= 0.3
    boxes, labels, scores = boxes[conf_keep], labels[conf_keep], scores[conf_keep]

    # ── 2) NMS 적용 ──
    if boxes.numel() > 0:
        nms_keep = nms(boxes, scores, iou_threshold=0.5)
        boxes, labels, scores = boxes[nms_keep], labels[nms_keep], scores[nms_keep]

    # ── 3) 시각화 ──
    texts = [f"{category_names[int(l)]}: {s:.2f}" for l, s in zip(labels, scores)]
    img_uint8 = (img_tensor * 255).to(torch.uint8)
    vis = draw_bounding_boxes(img_uint8, boxes, labels=texts, colors="red", width=2)

    plt.figure(figsize=(6, 6))
    plt.imshow(to_pil_image(vis))
    plt.axis("off")
    plt.title(f"Sample {idx+1}")
    plt.show()


Output hidden; open in https://colab.research.google.com to view.