<a href="https://colab.research.google.com/github/SaraElwatany/Lung-TumorDetection-Segmentation/blob/main/comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import cv2
import torch
import shutil
import yaml
import glob
import random
import numpy as np
import torch.nn as nn
from PIL import Image
from tqdm import tqdm
from google.colab import drive
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from IPython import get_ipython
from IPython.display import display
import torchvision.transforms as transforms
import torchvision.transforms as T
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, random_split

In [None]:
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

#### **DataLoading & Splitting**

In [None]:
dataset_path = '/content/drive/MyDrive/LungTumorDetectionAndSegmentation'

## **Validation on Whole Images**

In [None]:
class LungTumorSegmentationDataset(Dataset):
    """Paired image / mask dataset for lung-tumor segmentation.

    Expected layout::

        <root_path>/images/<subject>/<file>
        <root_path>/masks/<subject>/<file>

    Args:
        root_path: Directory that contains the ``images`` and ``masks`` folders.
        transform: Optional callable applied to the grayscale PIL image.
        mask_transform: Optional callable applied to the grayscale PIL mask.
            When provided, its result is thresholded at 0.5 into a binary
            float mask.

    Each item is the tuple ``(image_path, image, mask)``.
    """

    def __init__(self, root_path, transform=None, mask_transform=None):

        images_root = os.path.join(root_path, 'images')
        masks_root = os.path.join(root_path, 'masks')

        self.images = []
        self.masks = []

        # Sorted traversal keeps the sample order reproducible; os.listdir
        # makes no ordering guarantee.
        for subject in sorted(os.listdir(images_root)):
            subject_path = os.path.join(images_root, subject)

            # Skip stray files sitting next to the subject folders
            # (e.g. .DS_Store); listing them would raise NotADirectoryError.
            if not os.path.isdir(subject_path):
                continue

            for image_file in sorted(os.listdir(subject_path)):
                self.images.append(os.path.join(subject_path, image_file))
                # Build the mask path from its components instead of
                # str.replace('images', 'masks'): replace() rewrites *every*
                # occurrence, so a root such as ".../my_images/val" would
                # yield a mask path that does not exist.
                self.masks.append(os.path.join(masks_root, subject, image_file))

        self.transform = transform
        self.mask_transform = mask_transform


    def __len__(self):
        return len(self.images)


    def __getitem__(self, idx):

        image_path = self.images[idx]
        mask_path = self.masks[idx]

        image = Image.open(image_path).convert("L")  # grayscale image
        mask = Image.open(mask_path).convert("L")    # mask as single-channel

        if self.transform:
            image = self.transform(image)
        if self.mask_transform:
            mask = self.mask_transform(mask)
            mask = (mask > 0.5).float()  # Ensure binary values

        return image_path, image, mask

In [None]:
image_transform = T.Compose([
                              T.Resize((256, 256)),
                              T.ToTensor(),
                           ])

In [None]:
train_dataset = LungTumorSegmentationDataset(os.path.join(dataset_path, 'train'), image_transform, image_transform)
val_dataset = LungTumorSegmentationDataset(os.path.join(dataset_path, 'val'), image_transform, image_transform)

In [None]:
print('Length of the training data:', len(train_dataset))
print('Length of the validation data:', len(val_dataset))

Length of the training data: 1832
Length of the validation data: 98


In [None]:
print('Length of the training data:', len(train_dataset))
print('Length of the validation data:', len(val_dataset))

Length of the training data: 1832
Length of the validation data: 98


In [None]:
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=False)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [None]:
train_batches = iter(train_dataloader)
val_batches = iter(val_dataloader)

for train_sample, val_sample in zip(train_batches, val_batches):
    print('Shape of the first training sample & corresponding mask:', train_sample[1][0].shape, train_sample[2][0].shape)
    print('Shape of the first validation sample & corresponding mask:', val_sample[1][0].shape, val_sample[2][0].shape)
    break

Shape of the first training sample & corresponding mask: torch.Size([1, 256, 256]) torch.Size([1, 256, 256])
Shape of the first validation sample & corresponding mask: torch.Size([1, 256, 256]) torch.Size([1, 256, 256])


#### **Model Architecture**

In [None]:
def double_convolution(in_channels, out_channels):
    """
    Build the basic U-Net unit: two consecutive 3x3 convolutions, each
    followed by an in-place ReLU.

    Unlike the original paper, the convolutions are padded (padding=1) so
    the spatial size of the output equals that of the input.

    Returns an ``nn.Sequential`` of [Conv2d, ReLU, Conv2d, ReLU]; the first
    convolution maps ``in_channels -> out_channels`` and the second keeps
    ``out_channels``.
    """
    stages = []
    current_channels = in_channels
    for _ in range(2):
        stages.append(nn.Conv2d(current_channels, out_channels, kernel_size=3, padding=1))
        stages.append(nn.ReLU(inplace=True))
        # After the first convolution the channel count stays at out_channels.
        current_channels = out_channels
    return nn.Sequential(*stages)

In [None]:
class UNet(nn.Module):
    """U-Net for single-channel input built from `double_convolution` units.

    The encoder widens 1 -> 64 -> 128 -> 256 -> 512 -> 1024 channels with a
    2x2 max-pool between stages; the decoder mirrors it with stride-2
    transposed convolutions and skip connections. The final 1x1 convolution
    emits ``num_classes`` channels of raw scores (no activation is applied).

    Attribute names are part of the saved checkpoint's key names, so they
    must not be renamed.
    """

    def __init__(self, num_classes):
        super().__init__()

        self.max_pool2d = nn.MaxPool2d(kernel_size=2, stride=2)

        # Contracting path: every stage applies the convolution twice.
        self.down_convolution_1 = double_convolution(1, 64)
        self.down_convolution_2 = double_convolution(64, 128)
        self.down_convolution_3 = double_convolution(128, 256)
        self.down_convolution_4 = double_convolution(256, 512)
        self.down_convolution_5 = double_convolution(512, 1024)

        # Expanding path. Each transposed convolution halves the channel
        # count; concatenating the matching encoder output doubles it again,
        # which is why every up-convolution takes twice its output width.
        self.up_transpose_1 = nn.ConvTranspose2d(in_channels=1024, out_channels=512, kernel_size=2, stride=2)
        self.up_convolution_1 = double_convolution(1024, 512)

        self.up_transpose_2 = nn.ConvTranspose2d(in_channels=512, out_channels=256, kernel_size=2, stride=2)
        self.up_convolution_2 = double_convolution(512, 256)

        self.up_transpose_3 = nn.ConvTranspose2d(in_channels=256, out_channels=128, kernel_size=2, stride=2)
        self.up_convolution_3 = double_convolution(256, 128)

        self.up_transpose_4 = nn.ConvTranspose2d(in_channels=128, out_channels=64, kernel_size=2, stride=2)
        self.up_convolution_4 = double_convolution(128, 64)

        # Output head: one channel per class.
        self.out = nn.Conv2d(in_channels=64, out_channels=num_classes, kernel_size=1)

    def forward(self, x):
        """Run the encoder/decoder and return the raw per-pixel class scores."""
        # Encoder: keep each stage's output for the skip connections.
        enc_1 = self.down_convolution_1(x)
        enc_2 = self.down_convolution_2(self.max_pool2d(enc_1))
        enc_3 = self.down_convolution_3(self.max_pool2d(enc_2))
        enc_4 = self.down_convolution_4(self.max_pool2d(enc_3))

        # Bottleneck: deliberately NOT followed by a max-pool.
        bottleneck = self.down_convolution_5(self.max_pool2d(enc_4))

        # Decoder: upsample, concatenate the skip (skip first), convolve.
        dec = self.up_transpose_1(bottleneck)
        dec = self.up_convolution_1(torch.cat([enc_4, dec], 1))

        dec = self.up_transpose_2(dec)
        dec = self.up_convolution_2(torch.cat([enc_3, dec], 1))

        dec = self.up_transpose_3(dec)
        dec = self.up_convolution_3(torch.cat([enc_2, dec], 1))

        dec = self.up_transpose_4(dec)
        dec = self.up_convolution_4(torch.cat([enc_1, dec], 1))

        return self.out(dec)

In [None]:
def evaluate_model(test_loader, model, criterion, device):
    """Compute the mean per-batch loss of `model` over `test_loader`.

    Args:
        test_loader: Iterable yielding ``(paths, images, masks)`` batches.
            It does not need to support ``len()``.
        model: Module called as ``model(images)``; switched to eval mode here.
        criterion: Callable ``criterion(output, masks)`` returning a scalar tensor.
        device: Device the images and masks are moved to before the forward pass.

    Returns:
        The average of the per-batch losses as a float, or ``nan`` when the
        loader yields no batches (instead of raising ``ZeroDivisionError``).
    """
    model.eval()
    test_loss = 0.0   # Running sum of the per-batch losses
    num_batches = 0   # Counted while iterating so len(test_loader) is not required

    with torch.no_grad():  # Disable gradient computation
        for _, images, masks in test_loader:
            images, masks = images.to(device), masks.to(device)

            output = model(images)
            loss = criterion(output, masks)

            test_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        # Nothing was evaluated: report that explicitly rather than dividing by zero.
        print('Test Loss: nan (loader produced no batches)')
        return float('nan')

    test_loss /= num_batches  # Average over all batches
    print(f'Test Loss: {test_loss:.4f}')

    return test_loss

In [None]:
whole_img_model = UNet(num_classes=1).to(device)
whole_img_model.load_state_dict(torch.load(f"/content/drive/MyDrive/unet_lung_segmentation_0.009356894996017218.pth", map_location=device))
whole_img_model.to(device)

UNet(
  (max_pool2d): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (down_convolution_1): Sequential(
    (0): Conv2d(1, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
  )
  (down_convolution_2): Sequential(
    (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
  )
  (down_convolution_3): Sequential(
    (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
  )
  (down_convolution_4): Sequential(
    (0): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(51

In [None]:
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Evaluate on validation data
val_loss = evaluate_model(val_dataloader, whole_img_model, criterion, device)

Test Loss: 0.0094


## **Detection**

In [None]:
# Pinned to the release this notebook was last executed with (see the captured
# output below); %pip installs into the running kernel's environment.
%pip install -q ultralytics==8.3.143

Collecting ultralytics
  Downloading ultralytics-8.3.143-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading n

In [None]:
from ultralytics import YOLO

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [None]:
def convert_to_yolo_format(xmin, ymin, xmax, ymax, img_width, img_height):
    """Turn a pixel-space corner box into a normalised YOLO label row.

    Returns ``[class_id, x_center, y_center, width, height]`` where the class
    id is always 0 and the other four values are divided by the image width
    (x values) or the image height (y values).
    """
    mid_x, mid_y = (xmin + xmax) / 2, (ymin + ymax) / 2
    box_w, box_h = xmax - xmin, ymax - ymin
    return [
        0,
        mid_x / img_width,
        mid_y / img_height,
        box_w / img_width,
        box_h / img_height,
    ]

In [None]:
!unzip -q '/content/LungTumorDetectionAndSegmentation.zip' -d '/content/dataset'

In [None]:
def process_split(path,split):
    """Flatten one dataset split into the YOLO ``images/<split>`` / ``labels/<split>`` layout.

    Reads ``<path>/<split>/images/<subject>/<file>`` and the matching
    ``<path>/<split>/detections/<subject>/<stem>.txt`` (one
    ``xmin ymin xmax ymax`` box per line, comma- or whitespace-separated).

    For every ``.jpg`` / ``.jpeg`` / ``.png`` image it
      * copies the image to ``<path>/images/<split>/<subject>_<file>``, and
      * writes ``<path>/labels/<split>/<subject>_<stem>.txt`` with one
        normalised YOLO row per box. The label file is written even when the
        image has no boxes, in which case it is empty.

    Lines that do not contain exactly four values are skipped.
    """
    image_dir = os.path.join(path, split, "images")
    label_dir = os.path.join(path, split, "detections")

    out_image_dir = os.path.join(path,"images",split)
    out_label_dir = os.path.join(path,"labels",split)

    os.makedirs(out_image_dir, exist_ok=True)
    os.makedirs(out_label_dir, exist_ok=True)

    for subject in os.listdir(image_dir):

        subject_image_path = os.path.join(image_dir, subject)
        subject_label_path = os.path.join(label_dir, subject)

        # Unzipped archives can contain stray files next to the subject
        # folders (e.g. .DS_Store); os.listdir on one raises NotADirectoryError.
        if not os.path.isdir(subject_image_path):
            continue

        for img_file in os.listdir(subject_image_path):
            if not img_file.lower().endswith(('.jpg', '.jpeg', '.png')):
                continue

            img_path = os.path.join(subject_image_path, img_file)
            label_file = img_file.rsplit(".", 1)[0] + ".txt"
            label_path = os.path.join(subject_label_path, label_file)

            # Image size is needed to normalise the box coordinates.
            with Image.open(img_path) as img:
                w, h = img.size

            # Prefix with the subject so files from different subjects
            # cannot collide in the flat output folder.
            new_img_name = f"{subject}_{img_file}"
            new_img_path = os.path.join(out_image_dir, new_img_name)
            shutil.copy(img_path, new_img_path)

            yolo_lines = []
            if os.path.exists(label_path):
                with open(label_path, "r") as f:
                    for line in f:

                        vals = list(map(float, line.strip().replace(',', ' ').split()))
                        if len(vals) != 4:
                            continue
                        xmin, ymin, xmax, ymax = vals
                        yolo_vals = convert_to_yolo_format(xmin, ymin, xmax, ymax, w, h)
                        yolo_lines.append(" ".join(map(str, yolo_vals)))

            out_label_path = os.path.join(out_label_dir, new_img_name.rsplit(".", 1)[0] + ".txt")
            with open(out_label_path, "w") as f:
                f.write("\n".join(yolo_lines))

In [None]:
#path of dataset
path_name='/content/dataset'
process_split(path_name,'val')

In [None]:
with open("lung_tumor.yaml", "w") as f:
    f.write(f"""train: {os.path.join(path_name, 'images', "train")}
val: {os.path.join(path_name, 'images', "val")}
nc: 1
names: ['tumor']
""")

In [None]:
#path of model
model = YOLO('/content/best(2).pt')
metrics = model.val(data='lung_tumor.yaml', split='val')

Ultralytics 8.3.143 🚀 Python-3.11.12 torch-2.6.0+cu124 CPU (Intel Xeon 2.20GHz)
Model summary (fused): 72 layers, 3,005,843 parameters, 0 gradients, 8.1 GFLOPs
[34m[1mval: [0mFast image access ✅ (ping: 0.0±0.0 ms, read: 1132.3±277.1 MB/s, size: 34.7 KB)


[34m[1mval: [0mScanning /content/dataset/labels/val.cache... 98 images, 20 backgrounds, 0 corrupt: 100%|██████████| 98/98 [00:00<?, ?it/s]
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 7/7 [01:30<00:00, 12.97s/it]


                   all         98         86      0.873      0.605      0.688      0.389
Speed: 20.7ms preprocess, 884.3ms inference, 0.0ms loss, 1.3ms postprocess per image
Results saved to [1mruns/detect/val6[0m


In [None]:
# `metrics.speed` holds per-image timings in milliseconds (preprocess / inference /
# loss / postprocess, matching the "Speed:" line printed by `model.val`), so
# `speed['loss']` is the time spent computing the loss, NOT a loss value.
print(f"loss computation time (ms/image): {metrics.speed['loss']}")
print(f"mAP@0.5: {metrics.box.map50:.4f}")
print(f"mAP@0.5:0.95: {metrics.box.map:.4f}")

loss: 0.00014389795013608372
mAP@0.5: 0.6883
mAP@0.5:0.95: 0.3887


## **Validation on detected images (2 stages modelling)**

In [None]:
from ultralytics import YOLO

In [None]:
class DoubleConv(nn.Module):
    """Two (3x3 conv -> batch-norm -> ReLU) stages with channel dropout in between.

    The convolutions are padded (spatial size is preserved) and bias-free;
    a ``Dropout2d(0.15)`` sits between the first and the second stage.
    The layers live in ``self.conv`` in this fixed order:
    Conv2d, BatchNorm2d, ReLU, Dropout2d, Conv2d, BatchNorm2d, ReLU.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()

        def conv_stage(stage_in):
            # One conv -> batch-norm -> ReLU stage producing `out_channels`.
            return [
                nn.Conv2d(stage_in, out_channels, 3, padding=1, bias=False),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
            ]

        first_stage = conv_stage(in_channels)
        second_stage = conv_stage(out_channels)

        self.conv = nn.Sequential(
            *first_stage,
            nn.Dropout2d(0.15),  # Slightly increased dropout
            *second_stage,
        )

    def forward(self, x):
        """Apply the stacked layers to `x`."""
        return self.conv(x)

In [None]:
class UNet(nn.Module):
    """Configurable U-Net assembled from `DoubleConv` blocks.

    Args:
        in_channels: Number of channels of the input tensor.
        out_channels: Number of channels produced by the final 1x1 convolution.
        features: Channel width of each encoder level, shallowest first.
            The bottleneck uses twice the last width. The sequence is only
            read, never modified.

    ``self.ups`` alternates [ConvTranspose2d, DoubleConv] per decoder level,
    deepest level first. ``forward`` returns the raw output of the final
    convolution (no activation is applied).
    """

    def __init__(self, in_channels=3, out_channels=1, features=[32, 64, 128, 256]):
        super().__init__()
        self.downs = nn.ModuleList()
        self.ups = nn.ModuleList()

        # Downsampling path (encoder)
        for feature in features:
            self.downs.append(DoubleConv(in_channels, feature))
            in_channels = feature
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Bottleneck
        self.bottleneck = DoubleConv(features[-1], features[-1] * 2)

        # Upsampling path (decoder)
        for feature in reversed(features):
            self.ups.append(nn.ConvTranspose2d(feature * 2, feature, kernel_size=2, stride=2))
            self.ups.append(DoubleConv(feature * 2, feature))

        # Final output layer
        self.final_conv = nn.Conv2d(features[0], out_channels, kernel_size=1)


    def forward(self, x):

        skip_connections = []

        for down in self.downs:
            x = down(x)
            skip_connections.append(x)
            x = self.pool(x)

        x = self.bottleneck(x)
        skip_connections = skip_connections[::-1]  # deepest skip first

        for idx in range(0, len(self.ups), 2):
            x = self.ups[idx](x)  # transpose conv
            skip_connection = skip_connections[idx // 2]

            # Max-pooling floors odd sizes, so the upsampled map can be smaller
            # than its skip connection; resize it to match before concatenating.
            # `nn.functional` is used because this notebook never imports `F`
            # (the previous `F.interpolate` raised NameError on this branch).
            if x.shape != skip_connection.shape:
                x = nn.functional.interpolate(x, size=skip_connection.shape[2:])

            x = torch.cat((skip_connection, x), dim=1)
            x = self.ups[idx + 1](x)

        return self.final_conv(x)

In [None]:
def load_boxes_from_txt(txt_path):
    """Read ground-truth boxes from an annotation text file.

    Each non-empty line holds ``xmin ymin xmax ymax`` separated by commas
    and/or whitespace (the same formats the other annotation readers in this
    notebook accept). Blank lines and lines with fewer than four values are
    skipped; only the first four values of a line are used.

    Args:
        txt_path: Path of the annotation file.

    Returns:
        A list of ``(xmin, ymin, xmax, ymax)`` tuples with the coordinates
        truncated to ``int``.

    Raises:
        FileNotFoundError: If `txt_path` does not exist.
        ValueError: If one of the four values is not a number.
    """
    boxes = []

    with open(txt_path, 'r') as f:

        for line in f:
            fields = line.replace(',', ' ').split()
            if len(fields) < 4:
                # Blank (e.g. trailing newline) or truncated line: nothing to parse.
                continue
            xmin, ymin, xmax, ymax = map(float, fields[:4])
            boxes.append((int(xmin), int(ymin), int(xmax), int(ymax)))

    return boxes

In [None]:
def compute_iou(boxA, boxB):
    """Intersection-over-union of two ``(x1, y1, x2, y2)`` boxes.

    Each box side is floored at 1 when computing the individual areas, and
    1e-6 is added to the union so the division can never be by zero.
    """
    ax1, ay1, ax2, ay2 = boxA[0], boxA[1], boxA[2], boxA[3]
    bx1, by1, bx2, by2 = boxB[0], boxB[1], boxB[2], boxB[3]

    # Overlap rectangle; a negative extent means the boxes do not intersect.
    overlap_w = max(0, min(ax2, bx2) - max(ax1, bx1))
    overlap_h = max(0, min(ay2, by2) - max(ay1, by1))
    intersection = overlap_w * overlap_h

    area_a = max(1, (ax2 - ax1)) * max(1, (ay2 - ay1))
    area_b = max(1, (bx2 - bx1)) * max(1, (by2 - by1))

    union = float(area_a + area_b - intersection + 1e-6)
    return intersection / union

In [None]:
def segment_detections(val_dataset, detection_model, segmentation_model, conf=0.3, iou_thresh=0.5):
    """Two-stage evaluation: detect tumors, then segment each matched crop.

    For every sample of `val_dataset` (only the image path of each item is
    used) the detector is run on the image file. Every predicted box whose
    best IoU with a ground-truth box is at least `iou_thresh` is cropped from
    the image and from the ground-truth mask, resized to 64x64, passed
    through `segmentation_model`, and scored with ``BCEWithLogitsLoss``.

    Args:
        val_dataset: Iterable of ``(img_path, image, mask)`` items.
        detection_model: Callable as ``detection_model(img_path, conf=conf)``
            whose first result exposes ``.boxes`` with ``.xyxy`` coordinates.
        segmentation_model: Segmentation network taking a 3-channel crop.
        conf: Confidence threshold forwarded to the detector.
        iou_thresh: Minimum IoU with a ground-truth box for a crop to be scored.

    Returns:
        Mean loss over all scored crops, or 0 when no crop was scored.

    Images without a ground-truth box file are skipped, as are images whose
    file or mask cannot be read (a warning is printed for the latter).
    """
    segmentation_model.eval()

    loss_fn = nn.BCEWithLogitsLoss()
    # Looked up once: the model does not move between crops.
    seg_device = next(segmentation_model.parameters()).device

    total_loss = 0.0
    count = 0


    for img_path, _, _ in val_dataset:

        gt_boxes_path = img_path.replace('images', 'detections').replace('.png', '.txt')
        try:
            gt_boxes = load_boxes_from_txt(gt_boxes_path)
        except FileNotFoundError:
            continue

        img_cv2 = cv2.imread(img_path)
        mask_path = img_path.replace('images', 'masks')
        mask_gt = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)

        # cv2.imread returns None instead of raising when a file is missing
        # or unreadable; slicing None below would fail with a TypeError.
        if img_cv2 is None or mask_gt is None:
            print(f"Warning: could not read image or mask for {img_path}; skipping")
            continue

        results = detection_model(img_path, conf=conf)
        pred_boxes = results[0].boxes


        for pred_box in pred_boxes:

            x1, y1, x2, y2 = map(int, pred_box.xyxy[0])

            # Best overlap of this prediction with any ground-truth box
            # (0 when there are no ground-truth boxes).
            best_iou = max(
                (compute_iou((x1, y1, x2, y2), gt_box[:4]) for gt_box in gt_boxes),
                default=0,
            )

            if best_iou < iou_thresh:
                continue

            crop_img = img_cv2[y1:y2, x1:x2]
            crop_mask = mask_gt[y1:y2, x1:x2]

            if crop_img.size == 0 or crop_mask.size == 0:
                continue

            crop_img_rgb = cv2.cvtColor(crop_img, cv2.COLOR_BGR2RGB)
            image_pil = Image.fromarray(crop_img_rgb).resize((64, 64))

            # HWC uint8 -> CHW float in [0, 1], plus a batch dimension.
            input_tensor = torch.tensor(np.transpose(np.array(image_pil).astype(np.float32) / 255.0, (2, 0, 1)))
            input_tensor = input_tensor.unsqueeze(0).to(seg_device)

            with torch.no_grad():
                output = segmentation_model(input_tensor)

            # Drop the batch dimension only; the channel dimension is kept so
            # the prediction lines up with the (1, 64, 64) target below.
            pred_mask = output.squeeze(0)

            target_mask = Image.fromarray(crop_mask).resize((64, 64))
            target_tensor = torch.tensor(np.array(target_mask).astype(np.float32) / 255.0)
            target_tensor = target_tensor.unsqueeze(0).to(pred_mask.device)

            loss = loss_fn(pred_mask, target_tensor)
            total_loss += loss.item()
            count += 1

    avg_loss = total_loss / count if count > 0 else 0
    print(f"Average segmentation loss over dataset: {avg_loss:.4f}")

    return avg_loss

In [None]:
detection_model = YOLO('/content/best(2).pt')

In [None]:
segmentation_model = UNet().to(device)
segmentation_model.load_state_dict(torch.load(f"/content/drive/MyDrive/best_unet_cropped.pth", map_location=device))

<All keys matched successfully>

In [None]:
val_loss = segment_detections(val_dataset, detection_model, segmentation_model)


image 1/1 /content/drive/MyDrive/LungTumorDetectionAndSegmentation/val/images/Subject_59/90.png: 1024x1024 (no detections), 654.6ms
Speed: 10.5ms preprocess, 654.6ms inference, 0.9ms postprocess per image at shape (1, 3, 1024, 1024)

image 1/1 /content/drive/MyDrive/LungTumorDetectionAndSegmentation/val/images/Subject_59/88.png: 1024x1024 1 tumor, 600.4ms
Speed: 17.7ms preprocess, 600.4ms inference, 2.6ms postprocess per image at shape (1, 3, 1024, 1024)

image 1/1 /content/drive/MyDrive/LungTumorDetectionAndSegmentation/val/images/Subject_59/84.png: 1024x1024 1 tumor, 584.8ms
Speed: 15.0ms preprocess, 584.8ms inference, 1.4ms postprocess per image at shape (1, 3, 1024, 1024)

image 1/1 /content/drive/MyDrive/LungTumorDetectionAndSegmentation/val/images/Subject_59/82.png: 1024x1024 1 tumor, 613.2ms
Speed: 16.0ms preprocess, 613.2ms inference, 1.4ms postprocess per image at shape (1, 3, 1024, 1024)

image 1/1 /content/drive/MyDrive/LungTumorDetectionAndSegmentation/val/images/Subject_5

0.47511487282239473

## **Validation on detected images (2 stages modelling) – Faster R-CNN detector**

In [None]:
!unzip /content/drive/MyDrive/Models/config.zip

In [None]:
!unzip /content/drive/MyDrive/Models/final_faster_rcnn_lung_tumor.zip

Archive:  /content/drive/MyDrive/Models/final_faster_rcnn_lung_tumor.zip
  inflating: final_faster_rcnn_lung_tumor.pth  


In [None]:
def load_images_and_anns(im_dir, ann_dir):
    """
    Collect image metadata and tumor boxes for every ``<im_dir>/<subject>/*.png``.

    Annotations are read from ``<ann_dir>/<subject>/<img_id>.txt``, one box
    per line as ``xmin ymin xmax ymax`` (comma- or whitespace-separated).
    Boxes with negative origin or non-positive extent are dropped; accepted
    boxes are clamped to the image bounds and stored with label 1 (tumor).
    Lines whose first four values are not numeric trigger a warning and are
    skipped.

    Returns a list of dicts with the keys ``img_id``, ``filename``,
    ``subject``, ``width``, ``height`` and ``detections``.
    """
    im_infos = []
    for img_path in tqdm(glob.glob(os.path.join(im_dir, '*/*.png'))):
        img_id = os.path.splitext(os.path.basename(img_path))[0]
        subject = os.path.basename(os.path.dirname(img_path))

        # Only the size is needed, so the file is closed right away.
        with Image.open(img_path) as img:
            width, height = img.size

        detections = []
        ann_file = os.path.join(ann_dir, subject, f"{img_id}.txt")
        if os.path.exists(ann_file):
            with open(ann_file, 'r') as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue  # Skip empty lines

                    # Handle both comma and space separated values
                    coords = line.replace(',', ' ').split()
                    if len(coords) < 4:
                        continue

                    try:
                        xmin, ymin, xmax, ymax = map(float, coords[:4])
                    except ValueError:
                        print(f"Warning: Invalid annotation in {ann_file}: {line}")
                        continue

                    # Reject boxes with a negative origin or no extent.
                    if not (xmin >= 0 and ymin >= 0 and xmax > xmin and ymax > ymin):
                        continue

                    # Clamp to the image while keeping at least one pixel of extent.
                    xmin = max(0, min(xmin, width - 1))
                    ymin = max(0, min(ymin, height - 1))
                    xmax = max(xmin + 1, min(xmax, width))
                    ymax = max(ymin + 1, min(ymax, height))

                    detections.append({'label': 1, 'bbox': [xmin, ymin, xmax, ymax]})  # 1 for tumor

        im_infos.append({
            'img_id': img_id,
            'filename': img_path,
            'subject': subject,
            'width': width,
            'height': height,
            'detections': detections,
        })

    print('Total {} images found'.format(len(im_infos)))
    return im_infos

In [None]:
class TumorDataset(Dataset):
    """Detection dataset yielding ``(image_tensor, target)`` pairs.

    Image metadata and boxes come from `load_images_and_anns`. For the
    ``'train'`` split, images without any box are filtered out and each
    sample is horizontally flipped with probability 0.5.

    ``target`` is a dict with the keys ``boxes`` (N x 4 float32, xyxy),
    ``labels`` (int64), ``area`` (float32), ``iscrowd`` (int64, all zeros)
    and ``image_id`` (the dataset index).
    """

    def __init__(self, split, im_dir, ann_dir):
        self.split = split
        self.im_dir = im_dir
        self.ann_dir = ann_dir
        self.label2idx = {'background': 0, 'tumor': 1}  # Fixed order
        self.idx2label = {0: 'background', 1: 'tumor'}

        infos = load_images_and_anns(im_dir, ann_dir)
        if split == 'train':
            # Training only uses images that carry at least one annotation.
            infos = [info for info in infos if len(info['detections']) > 0]
            print(f"Filtered to {len(infos)} training images with annotations")
        self.images_info = infos

        self.transforms = T.ToTensor()

    def __len__(self):
        return len(self.images_info)

    def __getitem__(self, index):
        im_info = self.images_info[index]
        im = Image.open(im_info['filename']).convert("RGB")

        # Augmentation: the random draw only happens for the training split.
        to_flip = self.split == 'train' and random.random() < 0.5
        if to_flip:
            im = im.transpose(Image.FLIP_LEFT_RIGHT)

        im_tensor = self.transforms(im)

        # Image dimensions after the transform
        _, height, width = im_tensor.shape

        boxes_list = []
        labels_list = []
        for det in im_info['detections']:
            x1, y1, x2, y2 = det['bbox']

            # Clamp to the image while keeping at least one pixel of extent.
            x1 = max(0, min(x1, width - 1))
            y1 = max(0, min(y1, height - 1))
            x2 = max(x1 + 1, min(x2, width))
            y2 = max(y1 + 1, min(y2, height))

            # Only keep boxes with a valid area
            if x2 > x1 and y2 > y1:
                boxes_list.append([x1, y1, x2, y2])
                labels_list.append(det['label'])

        if boxes_list:
            boxes = torch.tensor(boxes_list, dtype=torch.float32)
            labels = torch.tensor(labels_list, dtype=torch.int64)
            # Area is height * width of each box.
            area = (boxes[:, 3] - boxes[:, 1]) * (boxes[:, 2] - boxes[:, 0])
            iscrowd = torch.zeros((len(boxes),), dtype=torch.int64)
        else:
            # No annotation (or none survived): empty tensors with the right shapes.
            boxes = torch.zeros((0, 4), dtype=torch.float32)
            labels = torch.zeros((0,), dtype=torch.int64)
            area = torch.zeros((0,), dtype=torch.float32)
            iscrowd = torch.zeros((0,), dtype=torch.int64)

        # Mirror the boxes when the image was flipped: the new x1 comes from
        # the old x2 and vice versa.
        if to_flip and boxes.numel() > 0:
            boxes[:, [0, 2]] = width - boxes[:, [2, 0]]

        target = {
            'boxes': boxes,
            'labels': labels,
            'area': area,
            'iscrowd': iscrowd,
            'image_id': torch.tensor(index, dtype=torch.int64)
        }

        return im_tensor, target

In [None]:
class RegionProposalNetwork(nn.Module):
    """
    Region Proposal Network (RPN) head.

    A 3x3 conv followed by two 1x1 convs predicts, for every feature-map
    location and every (scale, aspect ratio) anchor, one objectness logit and
    four box-regression deltas. ``forward`` turns these into filtered
    proposals per image and, in training mode with targets, RPN losses.

    Keys read from ``model_config``:
        rpn_bg_threshold, rpn_fg_threshold, rpn_nms_threshold,
        rpn_batch_size, rpn_pos_fraction,
        rpn_train_topk, rpn_train_prenms_topk,
        rpn_test_topk, rpn_test_prenms_topk (the two test keys fall back to
        the train values when absent).
    """

    def __init__(self, in_channels, scales, aspect_ratios, model_config):
        """
        :param in_channels: channel count of the incoming feature map
        :param scales: sequence of anchor sizes (in input-image pixels)
        :param aspect_ratios: sequence of anchor height/width ratios
        :param model_config: mapping with the ``rpn_*`` keys listed on the class
        """
        super(RegionProposalNetwork, self).__init__()
        self.scales = scales
        self.low_iou_threshold = model_config['rpn_bg_threshold']
        self.high_iou_threshold = model_config['rpn_fg_threshold']
        self.rpn_nms_threshold = model_config['rpn_nms_threshold']
        self.rpn_batch_size = model_config['rpn_batch_size']
        self.rpn_pos_count = int(model_config['rpn_pos_fraction'] * self.rpn_batch_size)

        # Store both the train and the test limits. ``self.training`` is always
        # True inside __init__, so picking one here would permanently lock the
        # module to the training values; the ``rpn_topk`` / ``rpn_prenms_topk``
        # properties below pick the right one at call time instead.
        self.rpn_train_topk = model_config['rpn_train_topk']
        self.rpn_test_topk = model_config.get('rpn_test_topk', self.rpn_train_topk)
        self.rpn_train_prenms_topk = model_config['rpn_train_prenms_topk']
        self.rpn_test_prenms_topk = model_config.get('rpn_test_prenms_topk',
                                                     self.rpn_train_prenms_topk)

        self.aspect_ratios = aspect_ratios
        self.num_anchors = len(self.scales) * len(self.aspect_ratios)

        # 3x3 conv layer
        self.rpn_conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)

        # 1x1 classification conv layer (one objectness logit per anchor)
        self.cls_layer = nn.Conv2d(in_channels, self.num_anchors, kernel_size=1, stride=1)

        # 1x1 regression conv layer (four deltas per anchor)
        self.bbox_reg_layer = nn.Conv2d(in_channels, self.num_anchors * 4, kernel_size=1, stride=1)

        for layer in [self.rpn_conv, self.cls_layer, self.bbox_reg_layer]:
            torch.nn.init.normal_(layer.weight, std=0.01)
            torch.nn.init.constant_(layer.bias, 0)

    @property
    def rpn_topk(self):
        """Max proposals kept after NMS for the current train/eval mode."""
        return self.rpn_train_topk if self.training else self.rpn_test_topk

    @property
    def rpn_prenms_topk(self):
        """Max proposals kept before NMS for the current train/eval mode."""
        return self.rpn_train_prenms_topk if self.training else self.rpn_test_prenms_topk

    def generate_anchors(self, image, feat):
        """
        Generate anchors for a single image.

        :param image: image tensor; only its last two dims (H, W) are used.
            A 4D input is reduced to its first element.
        :param feat: feature map; only its last two dims (grid_h, grid_w),
            dtype and device are used. A 4D input is reduced to its first element.
        :return: (grid_h * grid_w * num_anchors, 4) tensor of x1,y1,x2,y2
            anchors, ordered location-major then anchor-type
        """
        # Handle batch dimension - work with single image
        if image.dim() == 4:
            image = image[0]  # Take first image from batch
        if feat.dim() == 4:
            feat = feat[0]    # Take corresponding feature map

        grid_h, grid_w = feat.shape[-2:]
        image_h, image_w = image.shape[-2:]

        # Stride = how many image pixels one feature-map cell covers
        stride_h = torch.tensor(image_h // grid_h, dtype=torch.int64, device=feat.device)
        stride_w = torch.tensor(image_w // grid_w, dtype=torch.int64, device=feat.device)

        # Ensure scales and aspect_ratios are on the same device
        scales = torch.as_tensor(self.scales, dtype=feat.dtype, device=feat.device)
        aspect_ratios = torch.as_tensor(self.aspect_ratios, dtype=feat.dtype, device=feat.device)

        # For ratio r = h/w with unit area: h = sqrt(r), w = 1/sqrt(r)
        h_ratios = torch.sqrt(aspect_ratios)
        w_ratios = 1 / h_ratios

        ws = (w_ratios[:, None] * scales[None, :]).view(-1)
        hs = (h_ratios[:, None] * scales[None, :]).view(-1)

        # Zero-centred anchors, one per (aspect ratio, scale) pair
        base_anchors = torch.stack([-ws, -hs, ws, hs], dim=1) / 2
        base_anchors = base_anchors.round()

        # Shift of every feature-map cell expressed in image pixels
        shifts_x = torch.arange(0, grid_w, dtype=torch.int32, device=feat.device) * stride_w
        shifts_y = torch.arange(0, grid_h, dtype=torch.int32, device=feat.device) * stride_h

        shifts_y, shifts_x = torch.meshgrid(shifts_y, shifts_x, indexing="ij")
        shifts_x = shifts_x.reshape(-1)
        shifts_y = shifts_y.reshape(-1)

        shifts = torch.stack((shifts_x, shifts_y, shifts_x, shifts_y), dim=1)

        # (locations, 1, 4) + (1, anchors, 4) -> every anchor at every location
        anchors = (shifts.view(-1, 1, 4) + base_anchors.view(1, -1, 4))
        anchors = anchors.reshape(-1, 4)
        return anchors

    def assign_targets_to_anchors(self, anchors, gt_boxes):
        """
        Label every anchor as foreground / background / ignored.

        :param anchors: (A, 4) anchors
        :param gt_boxes: (G, 4) ground-truth boxes (may be empty)
        :return: (labels, matched_gt_boxes) where labels is a float tensor of
            shape (A,) with 1.0 = foreground, 0.0 = background, -1.0 = ignored,
            and matched_gt_boxes is (A, 4). With no ground truth every anchor
            is background and matched boxes are all zeros.
        """
        # Ensure gt_boxes is 2D
        if gt_boxes.dim() == 1:
            gt_boxes = gt_boxes.unsqueeze(0)

        # Handle empty ground truth case
        if gt_boxes.numel() == 0 or gt_boxes.shape[0] == 0:
            labels = torch.zeros(anchors.shape[0], dtype=torch.float32, device=anchors.device)
            matched_gt_boxes = torch.zeros_like(anchors)
            return labels, matched_gt_boxes

        # Get IOU matrix, shape (G, A)
        iou_matrix = get_iou(gt_boxes, anchors)

        # For each anchor get the gt box index with maximum overlap
        best_match_iou, best_match_gt_idx = iou_matrix.max(dim=0)
        best_match_gt_idx_pre_thresholding = best_match_gt_idx.clone()

        # Apply thresholds: -1 marks background, -2 marks "in between" (ignored)
        below_low_threshold = best_match_iou < self.low_iou_threshold
        between_thresholds = (best_match_iou >= self.low_iou_threshold) & (best_match_iou < self.high_iou_threshold)
        best_match_gt_idx[below_low_threshold] = -1
        best_match_gt_idx[between_thresholds] = -2

        # Add high quality matches: the best anchor(s) for each gt box stay
        # foreground even when their IoU is below the foreground threshold
        best_anchor_iou_for_gt, _ = iou_matrix.max(dim=1)
        gt_pred_pair_with_highest_iou = torch.where(iou_matrix == best_anchor_iou_for_gt[:, None])
        pred_inds_to_update = gt_pred_pair_with_highest_iou[1]
        best_match_gt_idx[pred_inds_to_update] = best_match_gt_idx_pre_thresholding[pred_inds_to_update]

        # Get matched gt boxes (negative markers are clamped to index 0)
        matched_gt_boxes = gt_boxes[best_match_gt_idx.clamp(min=0)]

        # Create labels
        labels = best_match_gt_idx >= 0
        labels = labels.to(dtype=torch.float32)

        background_anchors = best_match_gt_idx == -1
        labels[background_anchors] = 0.0

        ignored_anchors = best_match_gt_idx == -2
        labels[ignored_anchors] = -1.0

        return labels, matched_gt_boxes

    def filter_proposals(self, proposals, cls_scores, image_shape):
        """
        Filter proposals: pre-NMS top-k, clamp to image, drop small boxes,
        NMS, post-NMS top-k.

        :param proposals: (N, 4) proposal boxes
        :param cls_scores: objectness logits with N elements
        :param image_shape: shape whose last two entries are (H, W)
        :return: (proposals, scores) with scores being sigmoid probabilities,
            ordered by decreasing score
        """
        # Handle empty proposals
        if proposals.numel() == 0:
            return proposals, cls_scores

        # Pre NMS Filtering
        cls_scores = torch.sigmoid(cls_scores.reshape(-1))

        # Handle case where we have fewer proposals than requested
        num_proposals = min(self.rpn_prenms_topk, len(cls_scores))
        if num_proposals == 0:
            return proposals[:0], cls_scores[:0]

        _, top_n_idx = cls_scores.topk(num_proposals)
        cls_scores = cls_scores[top_n_idx]
        proposals = proposals[top_n_idx]

        # Clamp boxes to image boundary
        proposals = clamp_boxes_to_image_boundary(proposals, image_shape)

        # Filter small boxes
        min_size = 16
        ws, hs = proposals[:, 2] - proposals[:, 0], proposals[:, 3] - proposals[:, 1]
        keep = torch.where((ws >= min_size) & (hs >= min_size))[0]

        if len(keep) == 0:
            return proposals[:0], cls_scores[:0]

        proposals = proposals[keep]
        cls_scores = cls_scores[keep]

        # NMS returns the kept indices already sorted by decreasing score, so
        # the post-NMS top-k is a plain slice.
        keep_indices = torch.ops.torchvision.nms(proposals, cls_scores, self.rpn_nms_threshold)
        keep_indices = keep_indices[:self.rpn_topk]

        return proposals[keep_indices], cls_scores[keep_indices]

    def forward(self, images, features, targets=None):
        """
        Forward pass of RPN
        Args:
            images: batch of images [B, C, H, W]
            features: batch of feature maps [B, C, H', W']
            targets: list/tuple of targets for each image (for training);
                each is either a dict with a 'bboxes' or 'boxes' entry or a
                tensor of boxes
        Returns:
            dict with 'proposals' and 'scores' (one tensor per image) and,
            when training with targets, 'rpn_classification_loss' and
            'rpn_localization_loss' (tensors averaged over the batch)
        """
        batch_size = images.shape[0]

        # Process features through RPN layers
        rpn_feat = torch.relu(self.rpn_conv(features))
        cls_scores = self.cls_layer(rpn_feat)
        box_transform_pred = self.bbox_reg_layer(rpn_feat)

        compute_loss = self.training and targets is not None

        # Process each image in the batch
        all_proposals = []
        all_scores = []
        # Tensor accumulators so the returned losses are always tensors,
        # even when no image in the batch contributed a loss term.
        total_rpn_cls_loss = cls_scores.new_zeros(())
        total_rpn_loc_loss = cls_scores.new_zeros(())

        for i in range(batch_size):
            # Get single image data
            image = images[i:i+1]  # Keep batch dimension for compatibility
            feat = features[i:i+1]
            cls_score_i = cls_scores[i:i+1]
            box_pred_i = box_transform_pred[i:i+1]

            # Generate anchors for this image
            anchors = self.generate_anchors(image, feat)

            # Reshape predictions
            H, W = feat.shape[-2:]
            num_anchors_per_location = cls_score_i.size(1)

            # (1, A, H, W) -> (H*W*A, 1), matching the anchor ordering
            cls_score_i = cls_score_i.permute(0, 2, 3, 1).reshape(-1, 1)

            # (1, A*4, H, W) -> (H*W*A, 4), matching the anchor ordering
            box_pred_i = box_pred_i.view(1, num_anchors_per_location, 4, H, W)
            box_pred_i = box_pred_i.permute(0, 3, 4, 1, 2).reshape(-1, 4)

            # Generate proposals (detached: proposals are not back-propagated through)
            proposals = apply_regression_pred_to_anchors_or_proposals(
                box_pred_i.detach().reshape(-1, 1, 4),
                anchors
            )
            proposals = proposals.reshape(-1, 4)

            # Filter proposals
            proposals, scores = self.filter_proposals(
                proposals,
                cls_score_i.detach(),
                image.shape
            )

            all_proposals.append(proposals)
            all_scores.append(scores)

            if not compute_loss:
                continue

            # Get target for this image
            target = targets[i] if isinstance(targets, (list, tuple)) else targets

            # Handle different target formats
            if isinstance(target, dict):
                gt_boxes = target.get('bboxes', target.get('boxes', None))
            else:
                gt_boxes = target

            if gt_boxes is None:
                continue

            # Ensure non-empty gt_boxes is 2D
            if gt_boxes.numel() > 0 and gt_boxes.dim() == 1:
                gt_boxes = gt_boxes.reshape(-1, 4)

            # Assign targets to anchors. Images without any ground-truth box
            # are kept: every anchor becomes background, so they still
            # contribute negatives to the objectness loss.
            labels_for_anchors, matched_gt_boxes_for_anchors = self.assign_targets_to_anchors(
                anchors, gt_boxes
            )

            # Get regression targets
            regression_targets = boxes_to_transformation_targets(
                matched_gt_boxes_for_anchors, anchors
            )

            # Sample positive and negative anchors.
            # The helper returns (positive_mask, negative_mask) in that order.
            sampled_pos_idx_mask, sampled_neg_idx_mask = sample_positive_negative(
                labels_for_anchors,
                positive_count=self.rpn_pos_count,
                total_count=self.rpn_batch_size
            )

            sampled_idxs = torch.where(sampled_pos_idx_mask | sampled_neg_idx_mask)[0]

            # Localization loss: positives only, normalised by all sampled anchors
            if sampled_pos_idx_mask.any():
                localization_loss = torch.nn.functional.smooth_l1_loss(
                    box_pred_i[sampled_pos_idx_mask],
                    regression_targets[sampled_pos_idx_mask],
                    beta=1 / 9,
                    reduction="sum",
                ) / max(sampled_idxs.numel(), 1)
            else:
                localization_loss = cls_scores.new_zeros(())

            # Objectness loss over all sampled anchors
            if len(sampled_idxs) > 0:
                cls_loss = torch.nn.functional.binary_cross_entropy_with_logits(
                    cls_score_i[sampled_idxs].flatten(),
                    labels_for_anchors[sampled_idxs].flatten()
                )
            else:
                cls_loss = cls_scores.new_zeros(())

            total_rpn_cls_loss = total_rpn_cls_loss + cls_loss
            total_rpn_loc_loss = total_rpn_loc_loss + localization_loss

        # Prepare output
        rpn_output = {
            'proposals': all_proposals,
            'scores': all_scores
        }

        if compute_loss:
            rpn_output['rpn_classification_loss'] = total_rpn_cls_loss / batch_size
            rpn_output['rpn_localization_loss'] = total_rpn_loc_loss / batch_size

        return rpn_output



In [None]:
def get_iou(boxes1, boxes2):
    """
    Pairwise intersection-over-union of two box sets in x1,y1,x2,y2 format.

    :param boxes1: (N, 4) tensor
    :param boxes2: (M, 4) tensor
    :return: (N, M) tensor where entry [i, j] is IoU(boxes1[i], boxes2[j]);
        all zeros (float32) when either input has no elements
    """
    if boxes1.numel() == 0 or boxes2.numel() == 0:
        return torch.zeros((boxes1.shape[0], boxes2.shape[0]), dtype=torch.float32, device=boxes1.device)

    assert boxes1.shape[1] == 4, f"boxes1 should have 4 coordinates, got {boxes1.shape[1]}"
    assert boxes2.shape[1] == 4, f"boxes2 should have 4 coordinates, got {boxes2.shape[1]}"

    def _box_area(boxes):
        # (x2 - x1) * (y2 - y1); inverted boxes count as zero area
        return ((boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])).clamp(min=0)

    area1 = _box_area(boxes1)  # (N,)
    area2 = _box_area(boxes2)  # (M,)

    # Corners of the overlap rectangle for every (i, j) pair -> (N, M, 2)
    overlap_top_left = torch.max(boxes1[:, None, :2], boxes2[None, :, :2])
    overlap_bottom_right = torch.min(boxes1[:, None, 2:], boxes2[None, :, 2:])

    # Non-overlapping pairs yield negative extents; clamp them to zero
    overlap_wh = (overlap_bottom_right - overlap_top_left).clamp(min=0)
    intersection_area = overlap_wh[..., 0] * overlap_wh[..., 1]  # (N, M)

    union = area1[:, None] + area2 - intersection_area  # (N, M)

    # Small epsilon keeps the division defined when the union is zero
    return intersection_area / (union + 1e-6)


def boxes_to_transformation_targets(ground_truth_boxes, anchors_or_proposals):
    """
    Encode ground-truth boxes relative to their assigned anchors/proposals.

    Both inputs are x1,y1,x2,y2 boxes, row-aligned (row i of the ground truth
    is the box assigned to row i of the anchors/proposals).

    :param ground_truth_boxes: (K, 4) assigned ground-truth boxes
    :param anchors_or_proposals: (K, 4) anchor or proposal boxes
    :return: (K, 4) tensor of (tx, ty, tw, th) where tx, ty are centre offsets
        divided by the anchor width/height and tw, th are log size ratios
    """
    assert ground_truth_boxes.shape == anchors_or_proposals.shape, \
        f"GT boxes shape {ground_truth_boxes.shape} != anchors shape {anchors_or_proposals.shape}"
    assert ground_truth_boxes.shape[1] == 4, "Boxes should have 4 coordinates"

    def _centre_and_size(boxes):
        # Centres are derived from the raw extents; only the extents that are
        # later used as divisors / log arguments are clamped away from zero.
        raw_w = boxes[:, 2] - boxes[:, 0]
        raw_h = boxes[:, 3] - boxes[:, 1]
        cx = boxes[:, 0] + 0.5 * raw_w
        cy = boxes[:, 1] + 0.5 * raw_h
        return cx, cy, torch.clamp(raw_w, min=1e-6), torch.clamp(raw_h, min=1e-6)

    ref_cx, ref_cy, ref_w, ref_h = _centre_and_size(anchors_or_proposals)
    gt_cx, gt_cy, gt_w, gt_h = _centre_and_size(ground_truth_boxes)

    return torch.stack(
        (
            (gt_cx - ref_cx) / ref_w,   # tx
            (gt_cy - ref_cy) / ref_h,   # ty
            torch.log(gt_w / ref_w),    # tw
            torch.log(gt_h / ref_h),    # th
        ),
        dim=1,
    )


def apply_regression_pred_to_anchors_or_proposals(box_transform_pred, anchors_or_proposals):
    """
    Decode predicted (dx, dy, dw, dh) deltas into x1,y1,x2,y2 boxes.

    :param box_transform_pred: (num_anchors_or_proposals, num_classes, 4) or
        (num_anchors_or_proposals, 4) predicted deltas
    :param anchors_or_proposals: (num_anchors_or_proposals, 4) reference boxes
    :return pred_boxes: (num_anchors_or_proposals, num_classes, 4) for 3D
        input, (num_anchors_or_proposals, 4) for 2D input
    """
    # Imported locally: the notebook's import cell does not import `math`,
    # so relying on a module-level name raises NameError on a fresh kernel.
    import math

    # Largest dw/dh accepted before exponentiation (caps growth at 62.5x)
    max_log_scale = math.log(1000.0 / 16)

    # 2D input is treated as a single "class" and squeezed back at the end
    squeeze_output = box_transform_pred.dim() == 2
    if squeeze_output:
        box_transform_pred = box_transform_pred.unsqueeze(1)

    box_transform_pred = box_transform_pred.reshape(box_transform_pred.size(0), -1, 4)

    # Get cx, cy, w, h from x1,y1,x2,y2 (centres use the unclamped extents)
    w = anchors_or_proposals[:, 2] - anchors_or_proposals[:, 0]
    h = anchors_or_proposals[:, 3] - anchors_or_proposals[:, 1]
    center_x = anchors_or_proposals[:, 0] + 0.5 * w
    center_y = anchors_or_proposals[:, 1] + 0.5 * h

    # Keep widths and heights strictly positive
    w = torch.clamp(w, min=1e-6)
    h = torch.clamp(h, min=1e-6)

    # Each of dx, dy, dw, dh -> (num_anchors_or_proposals, num_classes)
    dx, dy, dw, dh = box_transform_pred.unbind(dim=-1)

    # Prevent sending too large values into torch.exp()
    dw = torch.clamp(dw, max=max_log_scale)
    dh = torch.clamp(dh, max=max_log_scale)

    pred_center_x = dx * w[:, None] + center_x[:, None]
    pred_center_y = dy * h[:, None] + center_y[:, None]
    pred_w = torch.exp(dw) * w[:, None]
    pred_h = torch.exp(dh) * h[:, None]

    # pred_boxes -> (num_anchors_or_proposals, num_classes, 4)
    pred_boxes = torch.stack((
        pred_center_x - 0.5 * pred_w,
        pred_center_y - 0.5 * pred_h,
        pred_center_x + 0.5 * pred_w,
        pred_center_y + 0.5 * pred_h),
        dim=2)

    if squeeze_output:
        pred_boxes = pred_boxes.squeeze(1)  # (num_anchors, 4)

    return pred_boxes


In [None]:
def sample_positive_negative(labels, positive_count, total_count):
    """
    Randomly pick a training subset of positive and negative entries.

    Entries with label >= 1 are positives, entries with label == 0 are
    negatives; any other label is never sampled. At most ``positive_count``
    positives are taken, and negatives fill the remainder up to
    ``total_count`` (as far as negatives are available).

    :param labels: (N,) tensor of labels
    :param positive_count: maximum number of positives to sample
    :param total_count: maximum number of samples overall
    :return: (pos_mask, neg_mask) boolean masks of shape (N,), in that order
    """
    assert labels.dim() == 1, f"Labels should be 1D tensor, got {labels.dim()}D"
    assert positive_count <= total_count, f"positive_count ({positive_count}) > total_count ({total_count})"
    assert total_count > 0, f"total_count should be positive, got {total_count}"

    def _random_subset_mask(candidate_idxs, how_many):
        # Boolean mask over `labels` with `how_many` random candidates set
        mask = torch.zeros_like(labels, dtype=torch.bool)
        if how_many > 0:
            order = torch.randperm(candidate_idxs.numel(), device=candidate_idxs.device)
            mask[candidate_idxs[order[:how_many]]] = True
        return mask

    positive = torch.where(labels >= 1)[0]
    negative = torch.where(labels == 0)[0]

    # Negatives take whatever budget the positives leave unused
    num_pos = min(positive.numel(), positive_count)
    num_neg = min(negative.numel(), total_count - num_pos)

    # Positives are drawn first, then negatives (keeps the RNG call order)
    sampled_pos_idx_mask = _random_subset_mask(positive, num_pos)
    sampled_neg_idx_mask = _random_subset_mask(negative, num_neg)

    return sampled_pos_idx_mask, sampled_neg_idx_mask


In [None]:
def clamp_boxes_to_image_boundary(boxes, image_shape):
    """
    Restrict boxes to lie inside the image.

    x coordinates are limited to [0, width - 1] and y coordinates to
    [0, height - 1]; afterwards x2/y2 are raised to at least x1/y1 so no box
    is inverted.

    :param boxes: (..., 4) tensor of boxes in (x1, y1, x2, y2) format
    :param image_shape: (H, W) or (..., H, W) shape of the image
    :return: clamped boxes with same shape as input (the input itself when it
        has no elements)
    :raises ValueError: if ``image_shape`` has fewer than 2 entries
    """
    if boxes.numel() == 0:
        return boxes

    if len(image_shape) < 2:
        raise ValueError(f"image_shape should have at least 2 dimensions, got {len(image_shape)}")
    height, width = image_shape[-2:]

    # Even columns hold x1/x2, odd columns hold y1/y2
    xs = boxes[..., 0::2].clamp(min=0, max=width - 1)
    ys = boxes[..., 1::2].clamp(min=0, max=height - 1)

    left, top = xs[..., 0], ys[..., 0]
    # Ensure x2 >= x1 and y2 >= y1 (valid boxes)
    right = torch.max(left, xs[..., 1])
    bottom = torch.max(top, ys[..., 1])

    return torch.stack((left, top, right, bottom), dim=-1)


In [None]:
def transform_boxes_to_original_size(boxes, new_size, original_size):
    """
    Rescale boxes from resized-image coordinates to original-image coordinates.

    x coordinates are multiplied by orig_width / new_width and y coordinates
    by orig_height / new_height.

    :param boxes: (N, 4) tensor of boxes in (x1, y1, x2, y2) format
    :param new_size: (H, W) size of the resized image
    :param original_size: (H, W) size of the original image
    :return: rescaled boxes (the input itself when it has no elements)
    :raises ValueError: if ``new_size`` contains a zero
    """
    if boxes.numel() == 0:
        return boxes

    assert len(new_size) == 2, f"new_size should have 2 elements, got {len(new_size)}"
    assert len(original_size) == 2, f"original_size should have 2 elements, got {len(original_size)}"
    assert boxes.shape[-1] == 4, f"boxes should have 4 coordinates, got {boxes.shape[-1]}"

    new_height, new_width = new_size
    orig_height, orig_width = original_size

    # A zero-sized resized image would make the ratios undefined
    if new_height == 0 or new_width == 0:
        raise ValueError(f"new_size cannot contain zeros: {new_size}")

    y_scale = float(orig_height) / float(new_height)
    x_scale = float(orig_width) / float(new_width)

    # One factor per coordinate (x1, y1, x2, y2), broadcast over all boxes
    per_coordinate_scale = torch.tensor(
        [x_scale, y_scale, x_scale, y_scale],
        dtype=boxes.dtype,
        device=boxes.device,
    )

    return boxes * per_coordinate_scale


In [None]:
class ROIHead(nn.Module):
    """
    ROI head on top of ROI pooling layer for generating
    classification and box transformation predictions
    """

    def __init__(self, model_config, num_classes, in_channels):
        super(ROIHead, self).__init__()
        self.num_classes = num_classes
        self.roi_batch_size = model_config['roi_batch_size']
        self.roi_pos_count = int(model_config['roi_pos_fraction'] * self.roi_batch_size)
        self.iou_threshold = model_config['roi_iou_threshold']
        self.low_bg_iou = model_config['roi_low_bg_iou']
        self.nms_threshold = model_config['roi_nms_threshold']
        self.topK_detections = model_config['roi_topk_detections']
        self.low_score_threshold = model_config['roi_score_threshold']
        self.pool_size = model_config['roi_pool_size']
        self.fc_inner_dim = model_config['fc_inner_dim']

        # FC layers
        self.fc6 = nn.Linear(in_channels * self.pool_size * self.pool_size, self.fc_inner_dim)
        self.fc7 = nn.Linear(self.fc_inner_dim, self.fc_inner_dim)
        self.cls_layer = nn.Linear(self.fc_inner_dim, self.num_classes)
        self.bbox_reg_layer = nn.Linear(self.fc_inner_dim, self.num_classes * 4)

        # Initialize weights
        torch.nn.init.normal_(self.cls_layer.weight, std=0.01)
        torch.nn.init.constant_(self.cls_layer.bias, 0)
        torch.nn.init.normal_(self.bbox_reg_layer.weight, std=0.001)
        torch.nn.init.constant_(self.bbox_reg_layer.bias, 0)

    def assign_target_to_proposals(self, proposals, gt_boxes, gt_labels):
        """
        Assign ground truth targets to proposals based on IoU
        """
        if gt_boxes.numel() == 0 or proposals.numel() == 0:
            # Handle empty cases
            labels = torch.zeros(proposals.shape[0], dtype=torch.int64, device=proposals.device)
            matched_gt_boxes = torch.zeros_like(proposals)
            return labels, matched_gt_boxes

        # Ensure inputs are 2D
        if gt_boxes.dim() == 1:
            gt_boxes = gt_boxes.reshape(-1, 4)
        if proposals.dim() == 1:
            proposals = proposals.reshape(-1, 4)
        if gt_labels.dim() == 0:
            gt_labels = gt_labels.unsqueeze(0)

        # Get IOU matrix
        iou_matrix = get_iou(gt_boxes, proposals)

        # For each proposal, find best matching gt box
        best_match_iou, best_match_gt_idx = iou_matrix.max(dim=0)

        # Classify proposals
        background_proposals = (best_match_iou < self.iou_threshold) & (best_match_iou >= self.low_bg_iou)
        ignored_proposals = best_match_iou < self.low_bg_iou

        # Update indices for background and ignored proposals
        best_match_gt_idx[background_proposals] = -1
        best_match_gt_idx[ignored_proposals] = -2

        # Get matched gt boxes (clamp to avoid negative indexing)
        matched_gt_boxes_for_proposals = gt_boxes[best_match_gt_idx.clamp(min=0)]

        # Get class labels
        labels = gt_labels[best_match_gt_idx.clamp(min=0)]
        labels = labels.to(dtype=torch.int64)

        # Set background and ignored labels
        labels[background_proposals] = 0  # Background class
        labels[ignored_proposals] = -1   # Ignored

        return labels, matched_gt_boxes_for_proposals

    def postprocess_detections(self, cls_scores, box_transform_pred, proposals, image_shape):
        """
        Post-process predictions to get final detections
        """
        device = cls_scores.device
        num_boxes, num_classes = cls_scores.shape

        # Reshape box predictions: (num_boxes, num_classes, 4)
        box_transform_pred = box_transform_pred.view(num_boxes, num_classes, 4)

        # Apply softmax to classification scores
        cls_probs = torch.softmax(cls_scores, dim=1)

        all_boxes = []
        all_scores = []
        all_labels = []

        # Process each class (skip background class 0)
        for class_idx in range(1, num_classes):
            # Get scores for this class
            class_scores = cls_probs[:, class_idx]

            # Filter by score threshold
            score_mask = class_scores > self.low_score_threshold
            if not score_mask.any():
                continue

            class_scores = class_scores[score_mask]
            class_proposals = proposals[score_mask]
            class_box_deltas = box_transform_pred[score_mask, class_idx]

            # Apply box transformations
            class_boxes = apply_regression_pred_to_anchors_or_proposals(
                class_box_deltas.unsqueeze(1),
                class_proposals
            ).squeeze(1)

            # Clamp boxes to image boundary
            class_boxes = clamp_boxes_to_image_boundary(class_boxes, image_shape)

            # Apply NMS
            keep_indices = torch.ops.torchvision.nms(
                class_boxes, class_scores, self.nms_threshold
            )

            if len(keep_indices) > 0:
                all_boxes.append(class_boxes[keep_indices])
                all_scores.append(class_scores[keep_indices])
                all_labels.append(torch.full((len(keep_indices),), class_idx,
                                           dtype=torch.int64, device=device))

        if len(all_boxes) == 0:
            # Return empty results
            return (torch.empty((0, 4), device=device),
                   torch.empty((0,), device=device),
                   torch.empty((0,), dtype=torch.int64, device=device))

        # Concatenate all detections
        final_boxes = torch.cat(all_boxes, dim=0)
        final_scores = torch.cat(all_scores, dim=0)
        final_labels = torch.cat(all_labels, dim=0)

        # Keep top K detections
        if len(final_scores) > self.topK_detections:
            _, top_indices = final_scores.topk(self.topK_detections)
            final_boxes = final_boxes[top_indices]
            final_scores = final_scores[top_indices]
            final_labels = final_labels[top_indices]

        return final_boxes, final_scores, final_labels

    def forward(self, features, proposals_list, image_shapes, targets=None):
        """
        Forward pass of ROI Head
        Args:
            features: batch of feature maps [B, C, H, W]
            proposals_list: list of proposals for each image
            image_shapes: list of image shapes for each image
            targets: list of targets (for training)
        """
        batch_size = len(proposals_list)
        device = features.device

        all_detections = []
        total_roi_cls_loss = 0.0
        total_roi_loc_loss = 0.0

        for i in range(batch_size):
            # Get data for this image
            feat = features[i:i+1]  # Keep batch dimension
            proposals = proposals_list[i]
            image_shape = image_shapes[i] if isinstance(image_shapes, list) else image_shapes

            # Skip if no proposals
            if proposals.numel() == 0:
                # Return empty detections
                empty_boxes = torch.empty((0, 4), device=device)
                empty_scores = torch.empty((0,), device=device)
                empty_labels = torch.empty((0,), dtype=torch.int64, device=device)
                all_detections.append({
                    'boxes': empty_boxes,
                    'scores': empty_scores,
                    'labels': empty_labels
                })
                continue

            # Ensure proposals are on correct device
            proposals = proposals.to(device)

            # Training: add ground truth and assign targets
            if self.training and targets is not None:
                target = targets[i] if isinstance(targets, list) else targets

                # Get ground truth data
                if isinstance(target, dict):
                    gt_boxes = target.get('bboxes', target.get('boxes', torch.empty((0, 4))))
                    gt_labels = target.get('labels', torch.empty((0,), dtype=torch.int64))
                else:
                    gt_boxes = target
                    gt_labels = torch.ones(gt_boxes.shape[0], dtype=torch.int64)  # Default to class 1

                # Ensure proper shapes and devices
                if gt_boxes.numel() > 0:
                    gt_boxes = gt_boxes.to(device)
                    if gt_boxes.dim() == 1:
                        gt_boxes = gt_boxes.reshape(-1, 4)

                    gt_labels = gt_labels.to(device).flatten()

                    # Add ground truth to proposals
                    proposals = torch.cat([proposals, gt_boxes], dim=0)

                    # Assign targets to proposals
                    labels, matched_gt_boxes_for_proposals = self.assign_target_to_proposals(
                        proposals, gt_boxes, gt_labels
                    )

                    # Sample positive and negative proposals
                    sampled_neg_idx_mask, sampled_pos_idx_mask = sample_positive_negative(
                        labels,
                        positive_count=self.roi_pos_count,
                        total_count=self.roi_batch_size
                    )

                    sampled_idxs = torch.where(sampled_pos_idx_mask | sampled_neg_idx_mask)[0]

                    if len(sampled_idxs) > 0:
                        # Keep only sampled proposals
                        proposals = proposals[sampled_idxs]
                        labels = labels[sampled_idxs]
                        matched_gt_boxes_for_proposals = matched_gt_boxes_for_proposals[sampled_idxs]
                        regression_targets = boxes_to_transformation_targets(
                            matched_gt_boxes_for_proposals, proposals
                        )

            # Skip if no proposals after sampling
            if proposals.numel() == 0:
                empty_boxes = torch.empty((0, 4), device=device)
                empty_scores = torch.empty((0,), device=device)
                empty_labels = torch.empty((0,), dtype=torch.int64, device=device)
                all_detections.append({
                    'boxes': empty_boxes,
                    'scores': empty_scores,
                    'labels': empty_labels
                })
                continue

            # Calculate spatial scale for ROI pooling
            feat_size = feat.shape[-2:]
            if isinstance(image_shape, (list, tuple)):
                img_h, img_w = image_shape[-2:]
            else:
                img_h, img_w = image_shape.shape[-2:]

            spatial_scale = min(feat_size[0] / img_h, feat_size[1] / img_w)

            # ROI pooling
            proposal_roi_pool_feats = torchvision.ops.roi_pool(
                feat,
                [proposals],
                output_size=self.pool_size,
                spatial_scale=spatial_scale
            )

            # Forward through FC layers
            proposal_roi_pool_feats = proposal_roi_pool_feats.flatten(start_dim=1)
            box_fc_6 = torch.relu(self.fc6(proposal_roi_pool_feats))
            box_fc_7 = torch.relu(self.fc7(box_fc_6))
            cls_scores = self.cls_layer(box_fc_7)
            box_transform_pred = self.bbox_reg_layer(box_fc_7)

            # Compute losses during training
            if self.training and targets is not None and 'labels' in locals():
                # Classification loss
                valid_mask = labels >= 0  # Exclude ignored samples
                if valid_mask.sum() > 0:
                    cls_loss = torch.nn.functional.cross_entropy(
                        cls_scores[valid_mask],
                        labels[valid_mask]
                    )
                else:
                    cls_loss = torch.tensor(0.0, device=device)

                # Regression loss (only for positive samples)
                pos_mask = labels > 0
                if pos_mask.sum() > 0:
                    # Get box predictions for the correct class
                    num_boxes = box_transform_pred.shape[0]
                    box_transform_pred_reshaped = box_transform_pred.view(num_boxes, self.num_classes, 4)

                    # Select predictions for ground truth classes
                    pos_labels = labels[pos_mask]
                    pos_box_preds = box_transform_pred_reshaped[pos_mask, pos_labels]
                    pos_regression_targets = regression_targets[pos_mask]

                    loc_loss = torch.nn.functional.smooth_l1_loss(
                        pos_box_preds,
                        pos_regression_targets,
                        beta=1.0,
                        reduction="mean"
                    )
                else:
                    loc_loss = torch.tensor(0.0, device=device)

                total_roi_cls_loss += cls_loss
                total_roi_loc_loss += loc_loss

            # Post-process predictions for inference
            if not self.training:
                boxes, scores, pred_labels = self.postprocess_detections(
                    cls_scores, box_transform_pred, proposals, image_shape
                )
                all_detections.append({
                    'boxes': boxes,
                    'scores': scores,
                    'labels': pred_labels
                })
            else:
                # During training, return raw predictions
                all_detections.append({
                    'cls_scores': cls_scores,
                    'box_predictions': box_transform_pred,
                    'proposals': proposals
                })

        # Prepare output
        roi_output = {
            'detections': all_detections
        }

        if self.training and targets is not None:
            roi_output['roi_classification_loss'] = total_roi_cls_loss / batch_size
            roi_output['roi_localization_loss'] = total_roi_loc_loss / batch_size

        return roi_output


In [None]:
class FasterRCNN(nn.Module):
    """Two-stage detector: VGG16 feature extractor -> RPN -> ROI head.

    Args:
        model_config: Mapping read with ``[]``; this class uses the keys
            'backbone_out_channels', 'scales', 'aspect_ratios',
            'min_im_size' and 'max_im_size' and forwards the whole mapping
            to ``RegionProposalNetwork`` and ``ROIHead``.
        num_classes: Forwarded to ``ROIHead``.
        debug: When True, ``forward`` prints shape/key diagnostics on every
            call. Off by default so training/evaluation loops are not
            flooded with output.
    """

    def __init__(self, model_config, num_classes, debug=False):
        super(FasterRCNN, self).__init__()
        self.model_config = model_config
        self.debug = debug

        # NOTE(review): `pretrained=True` is the legacy torchvision flag; newer
        # releases prefer `weights=...` - confirm against the installed version.
        vgg16 = torchvision.models.vgg16(pretrained=True)
        # Use every layer of the VGG16 feature stack except the last one.
        self.backbone = vgg16.features[:-1]
        self.rpn = RegionProposalNetwork(model_config['backbone_out_channels'],
                                         scales=model_config['scales'],
                                         aspect_ratios=model_config['aspect_ratios'],
                                         model_config=model_config)
        self.roi_head = ROIHead(model_config, num_classes, in_channels=model_config['backbone_out_channels'])

        # Freeze the first 10 backbone modules
        for layer in self.backbone[:10]:
            for p in layer.parameters():
                p.requires_grad = False

        # Per-channel normalisation statistics (3 channels) and resize bounds
        self.image_mean = [0.485, 0.456, 0.406]
        self.image_std = [0.229, 0.224, 0.225]
        self.min_size = model_config['min_im_size']
        self.max_size = model_config['max_im_size']

    def _log_debug(self, message):
        """Print `message` only when the instance was built with debug=True."""
        if self.debug:
            print(message)

    def normalize_resize_image_and_boxes(self, image, bboxes):
        """
        Normalize and resize image and corresponding bounding boxes
        Args:
            image: Tensor of shape (B, C, H, W) or (C, H, W)
            bboxes: Tensor of shape (B, N, 4) or (N, 4) or None
        Returns:
            resized_image: Tensor of shape (B, C, H', W')
            resized_bboxes: Tensor of shape (B, N, 4), or None when `bboxes`
                is None or contains no elements
        """
        dtype, device = image.dtype, image.device

        # Ensure image has batch dimension
        if image.dim() != 4:
            image = image.unsqueeze(0)

        batch_size = image.shape[0]

        # Normalize
        mean = torch.as_tensor(self.image_mean, dtype=dtype, device=device).view(1, 3, 1, 1)
        std = torch.as_tensor(self.image_std, dtype=dtype, device=device).view(1, 3, 1, 1)
        image = (image - mean) / std

        # One scale for both axes: bring the short side to min_size unless that
        # would push the long side past max_size.
        h, w = image.shape[-2:]
        scale = min(self.min_size / min(h, w), self.max_size / max(h, w))
        new_h = int(h * scale)
        new_w = int(w * scale)

        image = torch.nn.functional.interpolate(
            image,
            size=(new_h, new_w),
            mode="bilinear",
            align_corners=False
        )

        if bboxes is None or bboxes.numel() == 0:
            return image, None

        # Ensure bboxes have batch dimension
        if bboxes.dim() == 2:  # (N, 4)
            bboxes = bboxes.unsqueeze(0)  # (1, N, 4)

        # Expand to match batch size if needed
        if bboxes.shape[0] == 1 and batch_size > 1:
            bboxes = bboxes.expand(batch_size, -1, -1)

        # The multiplication allocates a fresh tensor, so the clamping below
        # never writes into the caller's boxes.
        resized_bboxes = bboxes * scale

        # Clamp x coordinates (columns 0, 2) and y coordinates (columns 1, 3)
        resized_bboxes[..., 0::2] = resized_bboxes[..., 0::2].clamp(0, new_w)
        resized_bboxes[..., 1::2] = resized_bboxes[..., 1::2].clamp(0, new_h)

        return image, resized_bboxes

    @staticmethod
    def _split_targets(target, resized_bboxes, batch_size, device):
        """Turn the batched `target` dict into one target dict per image.

        Boxes whose label is not > 0 and boxes with non-positive width or
        height are dropped. Keys other than 'bboxes'/'labels' are copied,
        indexed per image when the value is a tensor with at least one dim.
        """
        has_labels = 'labels' in target
        per_image = []

        for i in range(batch_size):
            img_target = {}

            if resized_bboxes is not None:
                img_bboxes = resized_bboxes[i]  # Shape: (N, 4)
                img_labels = target['labels'][i] if has_labels else None  # Shape: (N,)

                # Filter out padding/invalid boxes (assuming label 0 means padding/background)
                if img_labels is not None:
                    keep = img_labels > 0
                    img_bboxes = img_bboxes[keep]
                    img_labels = img_labels[keep]

                # Drop degenerate boxes (x2 <= x1 or y2 <= y1)
                if img_bboxes.numel() > 0:
                    keep = (img_bboxes[:, 2] > img_bboxes[:, 0]) & \
                           (img_bboxes[:, 3] > img_bboxes[:, 1])
                    img_bboxes = img_bboxes[keep]
                    if img_labels is not None:
                        img_labels = img_labels[keep]

                img_target['bboxes'] = img_bboxes
                if img_labels is not None:
                    img_target['labels'] = img_labels
            else:
                img_target['bboxes'] = torch.empty((0, 4), dtype=torch.float32, device=device)
                if has_labels:
                    img_target['labels'] = torch.empty((0,), dtype=torch.long, device=device)

            # Copy other target information
            for key, value in target.items():
                if key in ('bboxes', 'labels'):
                    continue
                if isinstance(value, torch.Tensor) and value.dim() > 0:
                    img_target[key] = value[i]
                else:
                    img_target[key] = value

            per_image.append(img_target)

        return per_image

    @staticmethod
    def _sanitize_proposals(proposals):
        """Validate proposal shapes and repair boxes with non-positive extent.

        Returns a new list. A tensor is cloned before being repaired so the
        RPN's own output is never modified in place.

        Raises:
            ValueError: if any entry is not of shape (N, 4).
        """
        sanitized = []
        for i, prop in enumerate(proposals):
            if prop.dim() != 2 or prop.shape[1] != 4:
                raise ValueError(f"Invalid proposal shape at batch {i}: {prop.shape}, expected (N, 4)")
            invalid_mask = (prop[:, 2] <= prop[:, 0]) | (prop[:, 3] <= prop[:, 1])
            if invalid_mask.any():
                print(f"Warning: Found {invalid_mask.sum()} invalid proposals in batch {i}")
                prop = prop.clone()
                # Give the offending boxes a 1x1 extent anchored at (x1, y1)
                prop[invalid_mask, 2] = prop[invalid_mask, 0] + 1
                prop[invalid_mask, 3] = prop[invalid_mask, 1] + 1
            sanitized.append(prop)
        return sanitized

    @staticmethod
    def _collect_losses(rpn_output, roi_output, device):
        """Gather the RPN/ROI losses as tensors and add their sum as 'total_loss'."""
        sources = (
            (rpn_output, 'rpn_classification_loss'),
            (rpn_output, 'rpn_localization_loss'),
            (roi_output, 'roi_classification_loss'),
            (roi_output, 'roi_localization_loss'),
        )
        losses = {}
        for output, name in sources:
            if name not in output:
                continue
            value = output[name]
            # Sub-modules may hand back plain Python numbers
            if not isinstance(value, torch.Tensor):
                value = torch.tensor(value, dtype=torch.float32, device=device)
            losses[name] = value

        # Start from a grad-enabled zero so the total can always be
        # back-propagated, even when no component carries a graph.
        total_loss = torch.tensor(0.0, dtype=torch.float32, device=device, requires_grad=True)
        for value in losses.values():
            total_loss = total_loss + value
        losses['total_loss'] = total_loss
        return losses

    @staticmethod
    def _print_failure_context(error, image, processed_target):
        """Print the image shape and per-image target summary for a failed forward pass."""
        print(f"Error in FasterRCNN forward pass: {error}")
        print(f"Image shape: {image.shape}")
        if processed_target:
            print(f"Target list length: {len(processed_target)}")
            for i, tgt in enumerate(processed_target):
                print(f"  Target {i} keys: {tgt.keys()}")
                if 'bboxes' in tgt:
                    print(f"  Target {i} bboxes shape: {tgt['bboxes'].shape}")

    def forward(self, image, target=None):
        """
        Forward pass of Faster R-CNN
        Args:
            image: Input image tensor (B, C, H, W)
            target: Dictionary with 'bboxes' and optionally 'labels'
                (used only in training mode)
        Returns:
            For training: Dictionary with the individual losses and 'total_loss'
            For inference: Shallow copy of the ROI head output with detection
                boxes mapped back to the input image size
        Raises:
            ValueError: if `image` is not 4D or a proposal is not (N, 4).
            RuntimeError: if the RPN output has no 'proposals' entry.
        """
        if image.dim() != 4:
            raise ValueError(f"Expected 4D image tensor (B,C,H,W), got shape: {image.shape}")

        original_image_size = image.shape[-2:]

        processed_target = None
        if self.training and target is not None:
            # Normalize and resize first, then split targets per image
            # (RPN expects list of targets, not batched)
            image, resized_bboxes = self.normalize_resize_image_and_boxes(image, target['bboxes'])
            processed_target = self._split_targets(
                target, resized_bboxes, image.shape[0], image.device
            )

            self._log_debug(f"Debug: Processed target for {len(processed_target)} images")
            for i, img_tgt in enumerate(processed_target):
                if 'bboxes' in img_tgt:
                    self._log_debug(f"  Image {i}: {img_tgt['bboxes'].shape[0]} boxes, shape: {img_tgt['bboxes'].shape}")
        else:
            # Inference mode
            image, _ = self.normalize_resize_image_and_boxes(image, None)

        try:
            features = self.backbone(image)
            self._log_debug(f"Debug: Features shape: {features.shape}")

            rpn_output = self.rpn(image, features, processed_target)
            self._log_debug(f"Debug: RPN output keys: {rpn_output.keys()}")

            if 'proposals' not in rpn_output:
                raise RuntimeError("RPN did not return proposals")

            proposals = rpn_output['proposals']
            self._log_debug(f"Debug: Proposals - {len(proposals)} batches, shapes: {[p.shape for p in proposals]}")
            proposals = self._sanitize_proposals(proposals)

            roi_output = self.roi_head(features, proposals, image.shape[-2:], processed_target)
            self._log_debug(f"Debug: ROI output keys: {roi_output.keys()}")

            if self.training:
                losses = self._collect_losses(rpn_output, roi_output, image.device)
                self._log_debug(f"Debug: Training losses computed: {list(losses.keys())}")
                return losses

            # Inference mode - return detections
            result = roi_output.copy()
            detections = result.get('detections')
            if isinstance(detections, dict):
                entries = [detections]
            elif isinstance(detections, list):
                entries = detections
            else:
                entries = []

            # Transform boxes back to original size
            for detection in entries:
                if isinstance(detection, dict) and 'boxes' in detection:
                    boxes = detection['boxes']
                    if boxes is not None and len(boxes) > 0:
                        detection['boxes'] = transform_boxes_to_original_size(
                            boxes, image.shape[-2:], original_image_size
                        )

            return result

        except Exception as e:
            # Best-effort diagnostics; the original exception is always re-raised.
            self._print_failure_context(e, image, processed_target)
            import traceback
            print("Full traceback:")
            traceback.print_exc()
            raise

In [None]:
# Load the experiment configuration (model + dataset parameters)
with open('/content/config/tumor.yaml', 'r') as file:
    config = yaml.safe_load(file)

# Evaluate model
model = FasterRCNN(config['model_params'], num_classes=config['dataset_params']['num_classes'])
# map_location='cpu' lets a checkpoint saved on a GPU load on a CPU-only
# runtime; the freshly built model lives on the CPU at this point anyway, so
# the loaded values are copied into the same place on GPU hosts too.
state_dict = torch.load(
    '/content/lung_tumor_exp1/final_faster_rcnn_lung_tumor.pth',
    map_location='cpu',
)
model.load_state_dict(state_dict)
evaluate_model(model, config)

100%|██████████| 98/98 [00:00<00:00, 9168.49it/s]


Total 98 images found


Evaluating:   0%|          | 0/98 [00:00<?, ?it/s]

Debug: Processed target for 1 images
  Image 0: 0 boxes, shape: torch.Size([0, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1421, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1421, 4])]


Evaluating:   1%|          | 1/98 [00:00<00:17,  5.49it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])


Evaluating:   2%|▏         | 2/98 [00:00<00:13,  7.21it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1096, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1096, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 0 boxes, shape: torch.Size([0, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1455, 4])]
Debug: ROI output keys: dict_keys(['detect

Evaluating:   3%|▎         | 3/98 [00:00<00:12,  7.67it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1455, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 0 boxes, shape: torch.Size([0, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])


Evaluating:   4%|▍         | 4/98 [00:00<00:11,  7.89it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1482, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1482, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 0 boxes, shape: torch.Size([0, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1455, 4])]
Debug: ROI output keys: dict_keys(['detect

Evaluating:   5%|▌         | 5/98 [00:00<00:11,  7.98it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1121, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1121, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_k

Evaluating:   7%|▋         | 7/98 [00:00<00:10,  8.98it/s]

Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1151, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 0 boxes, shape: torch.Size([0, 4])


Evaluating:   8%|▊         | 8/98 [00:00<00:10,  8.81it/s]

Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1428, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1428, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1

Evaluating:  10%|█         | 10/98 [00:01<00:09,  9.37it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1157, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1157, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_k

Evaluating:  11%|█         | 11/98 [00:01<00:09,  9.50it/s]

Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1181, 4])]
Debug: ROI output keys: dict_keys(['detections'])


Evaluating:  12%|█▏        | 12/98 [00:01<00:08,  9.62it/s]

Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1040, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1040, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_lo

Evaluating:  13%|█▎        | 13/98 [00:01<00:08,  9.68it/s]

Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1007, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 2 boxes, shape: torch.Size([2, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1051, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 

Evaluating:  15%|█▌        | 15/98 [00:01<00:08,  9.97it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1012, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 0 boxes, shape: torch.Size([0, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])


Evaluating:  16%|█▋        | 16/98 [00:01<00:08,  9.70it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1387, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1387, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1097, 4])]
Debug: ROI output keys: dict_keys(['detect

Evaluating:  17%|█▋        | 17/98 [00:01<00:08,  9.75it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1097, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1140, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1140, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image

Evaluating:  19%|█▉        | 19/98 [00:02<00:08,  9.50it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1420, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])


Evaluating:  20%|██        | 20/98 [00:02<00:08,  9.56it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1074, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1074, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1049, 4])]
Debug: ROI output keys: dict_keys(['detect

Evaluating:  21%|██▏       | 21/98 [00:02<00:07,  9.67it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1049, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1126, 4])]


Evaluating:  22%|██▏       | 22/98 [00:02<00:07,  9.72it/s]

Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1126, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1131, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 

Evaluating:  23%|██▎       | 23/98 [00:02<00:07,  9.62it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1131, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 0 boxes, shape: torch.Size([0, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1531, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])


Evaluating:  24%|██▍       | 24/98 [00:02<00:08,  9.23it/s]

Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1531, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1183, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])


Evaluating:  26%|██▌       | 25/98 [00:02<00:07,  9.37it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1183, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 0 boxes, shape: torch.Size([0, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])


Evaluating:  27%|██▋       | 26/98 [00:02<00:07,  9.03it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1413, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1413, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1129, 4])]
Debug: ROI output keys: dict_keys(['detect

Evaluating:  29%|██▊       | 28/98 [00:03<00:07,  9.51it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1103, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1103, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1028, 4])]
Debug: ROI output keys: dict_keys(['detect

Evaluating:  31%|███       | 30/98 [00:03<00:06,  9.81it/s]

Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1167, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1173, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1173, 4])]
Debug: ROI output keys: dict_keys(['detect

Evaluating:  34%|███▎      | 33/98 [00:03<00:06,  9.69it/s]

Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1388, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 2 boxes, shape: torch.Size([2, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1165, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN 

Evaluating:  36%|███▌      | 35/98 [00:03<00:06,  9.87it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1123, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1123, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1082, 4])]
Debug: ROI output keys: dict_keys(['detect

Evaluating:  38%|███▊      | 37/98 [00:03<00:06, 10.08it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1101, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1090, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1090, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image

Evaluating:  40%|███▉      | 39/98 [00:04<00:05, 10.00it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1369, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1369, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_k

Evaluating:  42%|████▏     | 41/98 [00:04<00:05,  9.89it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1319, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1319, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_k

Evaluating:  44%|████▍     | 43/98 [00:04<00:05,  9.40it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 0 boxes, shape: torch.Size([0, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1442, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1442, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_k

Evaluating:  46%|████▌     | 45/98 [00:04<00:05,  9.61it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1326, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1404, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1404, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image

Evaluating:  48%|████▊     | 47/98 [00:04<00:05,  9.73it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1378, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1378, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 0 boxes, shape: torch.Size([0, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_k

Evaluating:  50%|█████     | 49/98 [00:05<00:05,  9.35it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1409, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1409, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_k

Evaluating:  52%|█████▏    | 51/98 [00:05<00:04,  9.66it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1286, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1286, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_k

Evaluating:  54%|█████▍    | 53/98 [00:05<00:04,  9.17it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 0 boxes, shape: torch.Size([0, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1537, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1537, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_k

Evaluating:  56%|█████▌    | 55/98 [00:05<00:04,  9.59it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1300, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1364, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1364, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image

Evaluating:  58%|█████▊    | 57/98 [00:06<00:04,  9.72it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 2 boxes, shape: torch.Size([2, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1471, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1471, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 0 boxes, shape: torch.Size([0, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_k

Evaluating:  60%|██████    | 59/98 [00:06<00:04,  9.44it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1329, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1329, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_k

Evaluating:  62%|██████▏   | 61/98 [00:06<00:03,  9.59it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1305, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1305, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_k

Evaluating:  64%|██████▍   | 63/98 [00:06<00:03,  9.06it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 0 boxes, shape: torch.Size([0, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1445, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1445, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_k

Evaluating:  66%|██████▋   | 65/98 [00:06<00:03,  9.26it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1397, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1299, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1299, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image

Evaluating:  68%|██████▊   | 67/98 [00:07<00:03,  9.29it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1416, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1404, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1404, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image

Evaluating:  70%|███████   | 69/98 [00:07<00:03,  9.52it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1302, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1281, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1281, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image

Evaluating:  73%|███████▎  | 72/98 [00:07<00:02,  9.84it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1397, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1397, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_k

Evaluating:  76%|███████▌  | 74/98 [00:07<00:02,  9.87it/s]

Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1302, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1302, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_lo

Evaluating:  78%|███████▊  | 76/98 [00:08<00:02,  9.85it/s]

Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1321, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1321, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_lo

Evaluating:  79%|███████▊  | 77/98 [00:08<00:02,  9.84it/s]

Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1366, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1366, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_lo

Evaluating:  82%|████████▏ | 80/98 [00:08<00:01,  9.31it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1384, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1384, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 0 boxes, shape: torch.Size([0, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1334, 4])]
Debug: ROI output keys: dict_keys(['detect

Evaluating:  83%|████████▎ | 81/98 [00:08<00:01,  9.23it/s]

Debug: Processed target for 1 images
  Image 0: 2 boxes, shape: torch.Size([2, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1365, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1365, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_lo

Evaluating:  85%|████████▍ | 83/98 [00:08<00:01,  9.21it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 0 boxes, shape: torch.Size([0, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1298, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1298, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_k

Evaluating:  87%|████████▋ | 85/98 [00:08<00:01,  9.30it/s]

Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1213, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1213, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 0 boxes, shape: torch.Size([0, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_k

Evaluating:  89%|████████▉ | 87/98 [00:09<00:01,  9.16it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1325, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1361, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1361, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image

Evaluating:  91%|█████████ | 89/98 [00:09<00:00,  9.48it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1289, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 2 boxes, shape: torch.Size([2, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1369, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1369, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image

Evaluating:  92%|█████████▏| 90/98 [00:09<00:00,  9.11it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1472, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1296, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1296, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image

Evaluating:  95%|█████████▍| 93/98 [00:09<00:00,  9.39it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1261, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1398, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1398, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image

Evaluating:  97%|█████████▋| 95/98 [00:10<00:00,  9.37it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1425, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1312, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1312, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image

Evaluating:  99%|█████████▉| 97/98 [00:10<00:00,  9.54it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1289, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image 0: 1 boxes, shape: torch.Size([1, 4])
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores', 'rpn_classification_loss', 'rpn_localization_loss'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1350, 4])]
Debug: ROI output keys: dict_keys(['detections', 'roi_classification_loss', 'roi_localization_loss'])
Debug: Training losses computed: ['rpn_classification_loss', 'rpn_localization_loss', 'roi_classification_loss', 'roi_localization_loss', 'total_loss']
Debug: Features shape: torch.Size([1, 512, 32, 32])
Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1350, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Debug: Processed target for 1 images
  Image

Evaluating: 100%|██████████| 98/98 [00:10<00:00,  9.44it/s]

Debug: RPN output keys: dict_keys(['proposals', 'scores'])
Debug: Proposals - 1 batches, shapes: [torch.Size([1397, 4])]
Debug: ROI output keys: dict_keys(['detections'])
Evaluation Results:
  Average Test Loss: 2.1101
  Total Images: 98
  Average Detections per Image: 1.00





{'avg_loss': 2.1101264029133078, 'total_images': 98, 'avg_detections': 1.0}