Import & Setup

In [8]:
import torch
import torchvision
from torch.utils.data import DataLoader
from torchvision.transforms import transforms

import os
import cv2
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches


Clean Empty Annotation Files

In [9]:
# Label folders to scan; zero-byte annotation files found in them are deleted from disk.
annotation_dirs = [
    "QC V2.v6i.yolov11/train/labels",
    "QC V2.v6i.yolov11/valid/labels",
]

for ann_dir in annotation_dirs:
    entries = os.listdir(ann_dir)
    before = len(entries)

    for entry_name in entries:
        entry_path = os.path.join(ann_dir, entry_name)
        # Anything with content is kept untouched.
        if os.path.getsize(entry_path) != 0:
            continue
        os.remove(entry_path)

    # Re-list the directory so the reported count reflects what is actually on disk.
    after = len(os.listdir(ann_dir))
    print(f"{ann_dir}: {before} → {after} (cleaned)")


QC V2.v6i.yolov11/train/labels: 543 → 543 (cleaned)
QC V2.v6i.yolov11/valid/labels: 79 → 79 (cleaned)


Custom Dataset (YOLO → Faster R-CNN)

In [10]:
class CustomDataset(torch.utils.data.Dataset):
    """Object-detection dataset that converts YOLO label files to Faster R-CNN targets.

    Each ``*.txt`` file in ``label_dir`` is paired with an image of the same
    stem in ``image_dir``. Every non-blank label line must have the form
    ``class x_center y_center width height`` with coordinates normalised to
    the image size.

    Args:
        image_dir: Directory containing the images.
        label_dir: Directory containing the YOLO ``.txt`` annotation files.
        transform: Optional callable applied to the RGB ``numpy`` image. When
            omitted, ``transforms.ToTensor()`` is applied.

    Each item is ``(image, target)`` where ``target`` is a dict with
    ``"boxes"`` (float32, shape ``(N, 4)``, absolute ``xmin, ymin, xmax, ymax``)
    and ``"labels"`` (int64, shape ``(N,)``, YOLO class index + 1 so that 0 is
    left for background).
    """

    # Extensions tried, in order, when looking up the image for a label file.
    IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

    def __init__(self, image_dir, label_dir, transform=None):
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.transform = transform

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # index -> sample mapping stable between runs and machines.
        self.annotations = sorted(
            f for f in os.listdir(label_dir) if f.endswith(".txt")
        )

    def __len__(self):
        return len(self.annotations)

    def _find_image_path(self, img_id):
        """Return the path of the image whose stem is ``img_id``.

        Raises:
            FileNotFoundError: If no file with a known extension exists.
        """
        for ext in self.IMAGE_EXTENSIONS:
            candidate = os.path.join(self.image_dir, img_id + ext)
            if os.path.exists(candidate):
                return candidate
        raise FileNotFoundError(f"No image found for {img_id}")

    @staticmethod
    def _parse_yolo_labels(label_path, width, height):
        """Read one YOLO label file and return ``(boxes, labels)`` as lists.

        Boxes are converted to absolute ``[xmin, ymin, xmax, ymax]`` pixels and
        clamped to the image. Blank lines are skipped, and boxes that end up
        with non-positive width or height are dropped.

        Raises:
            ValueError: If a non-blank line does not have exactly five fields
                or a field is not numeric.
        """
        boxes = []
        labels = []

        with open(label_path, "r") as f:
            for line_no, line in enumerate(f, start=1):
                fields = line.split()
                if not fields:
                    continue  # tolerate blank / whitespace-only lines
                if len(fields) != 5:
                    raise ValueError(
                        f"{label_path}:{line_no}: expected 5 fields "
                        f"(class xc yc w h), got {len(fields)}"
                    )

                cls, xc, yc, w, h = map(float, fields)

                # Normalised centre/size -> absolute corners, clamped to the image.
                xmin = min(max((xc - w / 2) * width, 0.0), float(width))
                ymin = min(max((yc - h / 2) * height, 0.0), float(height))
                xmax = min(max((xc + w / 2) * width, 0.0), float(width))
                ymax = min(max((yc + h / 2) * height, 0.0), float(height))

                if xmax <= xmin or ymax <= ymin:
                    continue  # degenerate box: the detector rejects these

                boxes.append([xmin, ymin, xmax, ymax])
                labels.append(int(cls) + 1)  # shift class → background = 0

        return boxes, labels

    def __getitem__(self, idx):
        ann_name = self.annotations[idx]
        # splitext removes only the trailing extension; str.replace would also
        # strip any ".txt" occurring in the middle of the file name.
        img_id = os.path.splitext(ann_name)[0]
        img_path = self._find_image_path(img_id)

        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError(f"Failed to read image: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        height, width, _ = image.shape

        boxes, labels = self._parse_yolo_labels(
            os.path.join(self.label_dir, ann_name), width, height
        )

        # reshape keeps the (N, 4) layout even when there are no boxes (N == 0).
        boxes = torch.tensor(boxes, dtype=torch.float32).reshape(-1, 4)
        labels = torch.tensor(labels, dtype=torch.int64)

        image = self.transform(image) if self.transform else transforms.ToTensor()(image)

        target = {"boxes": boxes, "labels": labels}
        return image, target


Dataset & DataLoader

In [11]:
def _collate_detection_batch(samples):
    """Regroup ``[(image, target), ...]`` into ``(images, targets)`` tuples.

    The samples are only transposed, not stacked into a single tensor.
    """
    return tuple(zip(*samples))


# ToTensor holds no state, so one instance can serve both splits.
to_tensor = transforms.ToTensor()

train_dataset = CustomDataset(
    image_dir="QC V2.v6i.yolov11/train/images",
    label_dir="QC V2.v6i.yolov11/train/labels",
    transform=to_tensor,
)
valid_dataset = CustomDataset(
    image_dir="QC V2.v6i.yolov11/valid/images",
    label_dir="QC V2.v6i.yolov11/valid/labels",
    transform=to_tensor,
)

# Only the training split is shuffled; validation keeps dataset order.
train_loader = DataLoader(
    train_dataset,
    batch_size=1,
    shuffle=True,
    collate_fn=_collate_detection_batch,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=_collate_detection_batch,
)

(len(train_loader), len(valid_loader))


(543, 79)

Build Faster R-CNN Model

In [12]:
# Training configuration.
num_classes = 5  # background + 4 classes
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start from the COCO-pretrained detector and swap its box-classification head
# for a fresh one sized to this dataset's classes.
detection = torchvision.models.detection
model = detection.fasterrcnn_resnet50_fpn(weights="COCO_V1")
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = detection.faster_rcnn.FastRCNNPredictor(
    in_features, num_classes
)
model.to(device)

# Optimise only parameters that require gradients; the learning rate is
# multiplied by 0.1 every 3 scheduler steps.
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.SGD(params, lr=0.001, momentum=0.9)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)


Training Loop

In [13]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

    # The model is called with targets below and its output is summed as a loss dict.
    model.train()
    running_loss = 0

    for batch_images, batch_targets in train_loader:
        # Move each image and every tensor inside each target dict to the device.
        device_images = [image.to(device) for image in batch_images]
        device_targets = [
            {key: tensor.to(device) for key, tensor in target.items()}
            for target in batch_targets
        ]

        optimizer.zero_grad()

        # Total loss for the batch is the sum of all components the model returns.
        component_losses = model(device_images, device_targets)
        batch_loss = sum(component_losses.values())

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    # The learning-rate schedule advances once per epoch.
    scheduler.step()
    print(f"Train Loss: {running_loss / len(train_loader):.4f}")


Epoch 1/10
Train Loss: 0.2941
Epoch 2/10


KeyboardInterrupt: 

Save Model

In [None]:
# Persist only the learned parameters (the state_dict), not the model object itself.
checkpoint_path = "faster_rcnn_custom_4class.pth"
trained_weights = model.state_dict()
torch.save(trained_weights, checkpoint_path)
print("Model saved.")


136


Evaluation on Test Images

In [None]:
# Inference mode: the detector returns per-image predictions instead of losses.
model.eval()

# NOTE(review): training and validation read from "QC V2.v6i.yolov11/..."; confirm
# this directory really holds test images for the same classes.
test_dir = "Bear_Datasets/test/images"

# Keep only image files so stray entries (caches, label files, sub-folders)
# cannot crash Image.open.
image_exts = (".jpg", ".jpeg", ".png")
test_imgs = sorted(
    name for name in os.listdir(test_dir) if name.lower().endswith(image_exts)
)
if not test_imgs:
    raise FileNotFoundError(f"No test images found in {test_dir}")

transform = transforms.ToTensor()

score_threshold = 0.7  # detections scoring below this are not drawn
num_samples = min(2, len(test_imgs))

# Sample without replacement so the panels never show the same image twice.
sampled_names = np.random.choice(test_imgs, size=num_samples, replace=False)

# squeeze=False keeps `ax` an array even when only one image is available.
fig, ax = plt.subplots(1, num_samples, figsize=(10, 6), squeeze=False)
ax = ax.ravel()

for i, img_name in enumerate(sampled_names):
    img_name = str(img_name)
    img_path = os.path.join(test_dir, img_name)

    img = Image.open(img_path).convert("RGB")
    img_tensor = transform(img).unsqueeze(0).to(device)

    with torch.no_grad():
        pred = model(img_tensor)[0]

    img_np = np.array(img)

    ax[i].imshow(img_np)
    ax[i].set_title(img_name, fontsize=8)
    ax[i].axis("off")

    boxes = pred["boxes"].cpu().numpy()
    labels = pred["labels"].cpu().numpy()
    scores = pred["scores"].cpu().numpy()

    for box, label, score in zip(boxes, labels, scores):
        if score < score_threshold:
            continue

        xmin, ymin, xmax, ymax = box
        rect = patches.Rectangle(
            (xmin, ymin), xmax - xmin, ymax - ymin,
            linewidth=2, edgecolor="red", facecolor="none"
        )
        ax[i].add_patch(rect)
        # Subtract 1 to undo the +1 background shift applied to labels at training time.
        ax[i].text(xmin, ymin - 5, f"class {label-1} ({score:.2f})",
                   color="yellow", fontsize=12)

plt.show()
