In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
    # for filename in filenames:
    #     print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [36]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import xml.etree.ElementTree as ET
import random
from sklearn.model_selection import train_test_split

class DogFaceDataset(Dataset):
    def __init__(self, images, annotations, transform=None):
        """
        Args:
            images (list): List of image file paths.
            annotations (list): List of annotation file paths.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.images = images
        self.annotations = annotations
        self.transform = transform
        
    def parse_xml(self, xml_file):
        """Parse XML annotation file to extract bounding box information."""
        tree = ET.parse(xml_file)
        root = tree.getroot()
        boxes = []
        
        for obj in root.findall('object'):
            bbox = obj.find('bndbox')
            xmin = int(bbox.find('xmin').text)
            ymin = int(bbox.find('ymin').text)
            xmax = int(bbox.find('xmax').text)
            ymax = int(bbox.find('ymax').text)
            boxes.append([xmin, ymin, xmax, ymax])
            break
        
        return boxes if boxes else None  # Return None if no bounding box is found

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        img_path = self.images[idx]
        annotation_path = self.annotations[idx]
        
        image = Image.open(img_path).convert("RGB")
        boxes = self.parse_xml(annotation_path)
        
        target = {}
        if boxes:
            target['boxes'] = torch.tensor(boxes, dtype=torch.float32)
            target['labels'] = torch.tensor([1] * len(boxes), dtype=torch.int64)  # Class 1 for dog

        if not boxes:
            target['boxes'] = torch.empty(0, 4)  # No boxes
            target['labels'] = torch.empty((0,), dtype=torch.int64)
            
        if self.transform:
            image = self.transform(image)
            image = image.squeeze(1)
        
        return image, target


def split_dataset(image_paths, annotation_paths, val_size=0.1, test_size=0.1):
    """
    Split dataset into train, validation, and test sets.
    
    Args:
        image_paths (list): List of image file paths.
        annotation_paths (list): List of annotation file paths.
        val_size (float): Proportion of data to be used for validation.
        test_size (float): Proportion of data to be used for testing.
        
    Returns:
        Tuple of lists containing image and annotation file paths for train, val, and test sets.
    """
    # Split into train + temp (val + test)
    train_imgs, temp_imgs, train_ann, temp_ann = train_test_split(image_paths, annotation_paths, test_size=val_size + test_size, random_state=42)
    
    # Split temp into val and test
    val_imgs, test_imgs, val_ann, test_ann = train_test_split(temp_imgs, temp_ann, test_size=test_size / (val_size + test_size), random_state=42)
    
    return train_imgs, train_ann, val_imgs, val_ann, test_imgs, test_ann

def get_loaders(data_dir, batch_size=16, num_workers=4):
    image_dir = os.path.join(data_dir, "Images")
    annotation_dir = os.path.join(data_dir, "Annotations")
    
    # Get the list of image and annotation file paths
    image_paths = []
    annotation_paths = []
    
    for category in os.listdir(image_dir):
        category_path = os.path.join(image_dir, category)
        annotation_path = os.path.join(annotation_dir, category)
        
        if os.path.isdir(category_path):
            for img_name in os.listdir(category_path):
                img_path = os.path.join(category_path, img_name)
                annotation_file = os.path.join(annotation_path, img_name.replace('.jpg', ''))
                
                if os.path.exists(annotation_file):
                    image_paths.append(img_path)
                    annotation_paths.append(annotation_file)
    
    # Split the dataset into train, validation, and test sets
    train_imgs, train_ann, val_imgs, val_ann, test_imgs, test_ann = split_dataset(image_paths, annotation_paths)

    train_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
    ])

    test_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ])

    # Create datasets for each split
    train_dataset = DogFaceDataset(train_imgs, train_ann, transform=train_transform)
    val_dataset = DogFaceDataset(val_imgs, val_ann, transform=test_transform)
    test_dataset = DogFaceDataset(test_imgs, test_ann, transform=test_transform)

    # Create DataLoaders for each split
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, num_workers=num_workers)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=num_workers)

    return train_loader, val_loader, test_loader

In [37]:
train_loader, val_loader, test_loader = get_loaders("/kaggle/input/imagenet/bounding_box_data")

In [21]:
import torch
import torchvision
from torch import nn
from torchvision import transforms
from torchvision.models.detection import FasterRCNN, fasterrcnn_resnet50_fpn
from torch.utils.data import DataLoader
from tqdm import tqdm

In [22]:
# Load a pre-trained Faster R-CNN model (ResNet-50 backbone)
model = fasterrcnn_resnet50_fpn(pretrained=True)

# Modify the classifier (in this case, we only need 2 classes: background and dog)
num_classes = 2  # 1 class (dog) + background

# Get the input features from the pre-trained model
in_features = model.roi_heads.box_predictor.cls_score.in_features

# Replace the predictor with a new one for our custom dataset
model.roi_heads.box_predictor = torchvision.models.detection.faster_rcnn.FastRCNNPredictor(in_features, num_classes)


In [6]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(deevice)


FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=0.0)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=0.0)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, eps=0.0)
          (relu): ReLU(

In [7]:
device

device(type='cuda')

In [8]:
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(params, lr=1e-4)


In [9]:
from torch.optim.lr_scheduler import StepLR

lr_scheduler = StepLR(optimizer, step_size=3, gamma=0.1)


In [42]:
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    running_loss = 0.0
    
    # Iterate over the training data
    for images, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        if len(images) == 0 or len(targets) == 0:
            continue  # Skip this batch

        # # Move images and targets to the correct device (GPU or CPU)
        # images = [image.squeeze(1).to(device) for image in images]
        targets = [{k: v.to(device) for k, v in targets.items()}]
        images = images.to(device)
        print(targets)
        
        # Zero the parameter gradients
        optimizer.zero_grad()
        
        # Forward pass: Get predictions
        loss_dict = model(images, targets)
        
        # Get total loss (sum of all losses)
        losses = sum(loss for loss in loss_dict.values())
        
        # Backward pass: Compute gradients
        losses.backward()
        
        # Update the weights
        optimizer.step()
        
        # Track running loss
        running_loss += losses.item()
    
    # Print the loss for the current epoch
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}")

    # Step the learning rate scheduler (optional)
    lr_scheduler.step()

Epoch 1/10:   0%|          | 0/1029 [00:00<?, ?it/s]

[{'boxes': tensor([[[ 25.,  48., 373., 498.]],

        [[ 27., 155., 296., 416.]],

        [[ 40.,  32., 459., 455.]],

        [[114.,  11., 345., 350.]],

        [[  0.,  33., 471., 373.]],

        [[ 22.,  53., 263., 430.]],

        [[  1.,  17., 404., 498.]],

        [[291., 129., 477., 271.]],

        [[168.,  97., 383., 265.]],

        [[  0.,  10., 282., 231.]],

        [[ 22.,   6., 499., 341.]],

        [[203., 146., 384., 305.]],

        [[ 33.,  30., 336., 498.]],

        [[ 32., 150., 234., 283.]],

        [[225., 118., 299., 334.]],

        [[  1.,  56., 340., 498.]]], device='cuda:0'), 'labels': tensor([[1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1]], device='cuda:0')}]


Epoch 1/10:   0%|          | 0/1029 [00:01<?, ?it/s]


AssertionError: Expected target boxes to be a tensor of shape [N, 4], got torch.Size([16, 1, 4]).

In [None]:
model.eval()

In [None]:
image, _ = next(iter(train_loader))

model()