In [1]:
from pycocotools.coco import COCO
import numpy as np
import cv2
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import os
import json
from tqdm import tqdm
from PIL import Image


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


# Dataset

In [3]:
# Load the annotation file
annotation_file = 'data/COCO/annotations/instances_val2017.json'  # Replace with your annotation file
with open(annotation_file, 'r') as f:
    coco_data = json.load(f)

# Extract original category IDs and names
categories = coco_data['categories']
original_to_mapped = {category['id']: idx for idx, category in enumerate(categories)}
mapped_to_original = {v: k for k, v in original_to_mapped.items()}

# Display the mapping
print("Original to Mapped Category IDs:")
for orig_id, new_id in mapped_to_original.items():
    print(f"Original ID: {orig_id} -> Mapped ID: {new_id}")

Original to Mapped Category IDs:
Original ID: 0 -> Mapped ID: 1
Original ID: 1 -> Mapped ID: 2
Original ID: 2 -> Mapped ID: 3
Original ID: 3 -> Mapped ID: 4
Original ID: 4 -> Mapped ID: 5
Original ID: 5 -> Mapped ID: 6
Original ID: 6 -> Mapped ID: 7
Original ID: 7 -> Mapped ID: 8
Original ID: 8 -> Mapped ID: 9
Original ID: 9 -> Mapped ID: 10
Original ID: 10 -> Mapped ID: 11
Original ID: 11 -> Mapped ID: 13
Original ID: 12 -> Mapped ID: 14
Original ID: 13 -> Mapped ID: 15
Original ID: 14 -> Mapped ID: 16
Original ID: 15 -> Mapped ID: 17
Original ID: 16 -> Mapped ID: 18
Original ID: 17 -> Mapped ID: 19
Original ID: 18 -> Mapped ID: 20
Original ID: 19 -> Mapped ID: 21
Original ID: 20 -> Mapped ID: 22
Original ID: 21 -> Mapped ID: 23
Original ID: 22 -> Mapped ID: 24
Original ID: 23 -> Mapped ID: 25
Original ID: 24 -> Mapped ID: 27
Original ID: 25 -> Mapped ID: 28
Original ID: 26 -> Mapped ID: 31
Original ID: 27 -> Mapped ID: 32
Original ID: 28 -> Mapped ID: 33
Original ID: 29 -> Mapped ID:

In [4]:
class COCODataset(Dataset):
    def __init__(self, image_dir, annotation_file, image_features, max_bboxes=10, transform=None):
        self.image_dir = image_dir
        self.coco = COCO(annotation_file)
        self.image_ids = list(self.coco.imgs.keys())  # List of image IDs
        self.transform = transform
        self.image_features = image_features
        self.max_bboxes = max_bboxes
        self.num_classes = 80  # COCO has 80 categories, adjust if needed

    def __len__(self):
        return len(self.image_ids)
    
    def pad_bounding_boxes(self, bbox):
        """
        Pad or truncate bounding boxes to a fixed size.
        """
        bbox_tensor = torch.tensor(bbox, dtype=torch.float32)
        num_bboxes = bbox_tensor.size(0)

        if num_bboxes > self.max_bboxes:
            # Truncate bounding boxes
            bbox_tensor = bbox_tensor[:self.max_bboxes]
        else:
            # Pad with zeros
            pad_size = self.max_bboxes - num_bboxes
            padding = torch.zeros((pad_size, bbox_tensor.size(1)))
            bbox_tensor = torch.cat([bbox_tensor, padding], dim=0)

        return bbox_tensor
    
    def __getitem__(self, idx):
        # Get the image ID
        img_id = self.image_ids[idx]
        img_info = self.coco.loadImgs(img_id)[0]

        # Load image
        img_path = os.path.join(self.image_dir, img_info['file_name'])
        image = Image.open(img_path).convert('RGB')

        # Load annotations
        annotations = self.coco.getAnnIds(imgIds=img_id)
        annotations = self.coco.loadAnns(annotations)
        
        # Initialize an empty mask (will be filled in with object masks), bounding boxes, and category counts
        mask = np.zeros((image.size[1], image.size[0]), dtype=np.float32)  # HxW for mask
        bounding_boxes = []
        feature_vector = []
        category_counts = np.zeros(self.num_classes, dtype=np.float32)  # For counting the number of objects per class

        # Loop through annotations and add segmentation masks, bounding boxes, and category counts
        for ann in annotations:
            if 'segmentation' in ann:
                ann_mask = self.coco.annToMask(ann)
                mask = np.maximum(mask, ann_mask)  # Combine multiple masks (take max)
            # Bounding boxes
            if 'bbox' in ann:
                bbox = ann['bbox']  # [x, y, width, height]
                bounding_boxes.append(bbox)
            # Image features
            if 'coco_url' in ann: 
                feature_vector = self.image_features.get(ann['coco_url'], np.zeros(2048))  # Adjust default value if needed
            
            # Count the category
            category_id = ann['category_id']
            category_counts[original_to_mapped[category_id]] += 1  # Categories are 1-indexed in COCO, so subtract 1
        
        # Pad/truncate bounding boxes
        bounding_boxes = self.pad_bounding_boxes(bounding_boxes)

        # Apply transformations (if any)
        if self.transform:
            image, mask, bounding_boxes = self.transform(image, mask, bounding_boxes)
            
        # Convert to tensor
        feature_vector = torch.tensor(feature_vector, dtype=torch.float32)
        category_counts = torch.tensor(category_counts, dtype=torch.float32)
        bounding_boxes = torch.tensor(bounding_boxes, dtype=torch.float32)

        return image, mask, bounding_boxes, feature_vector, category_counts


In [5]:
# Data transformation
# transform = transforms.Compose([
#     transforms.Resize((224, 224)),  # Resize image to match ResNet input size
#     transforms.ToTensor(),
#     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # Normalize with ImageNet statistics
# ])

class CustomTransform:
    def __init__(self, size):
        self.size = size
        self.transform_image = transforms.Compose([
            transforms.Resize(self.size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize with ImageNet statistics
        ])

    def __call__(self, image, mask, bounding_boxes):
        # Get the original dimensions of the image
        original_width, original_height = image.size

        # Apply transformations to the image
        image = self.transform_image(image)
        
        # Resize mask (ensure it's the same size as the image)
        mask = Image.fromarray(mask)  # Convert mask to PIL Image for resizing
        mask = mask.resize((self.size[1], self.size[0]), Image.NEAREST)  # Use nearest neighbor to preserve the mask
        mask = np.array(mask)  # Convert back to NumPy array

        # Resize bounding boxes based on the new image size
        w_ratio = self.size[1] / float(original_width)  # Width resize ratio
        h_ratio = self.size[0] / float(original_height)  # Height resize ratio
        adjusted_bboxes = []
        for bbox in bounding_boxes:
            x, y, w, h = bbox
            x *= w_ratio
            y *= h_ratio
            w *= w_ratio
            h *= h_ratio
            adjusted_bboxes.append([x, y, w, h])

        # Convert mask to tensor
        mask = torch.tensor(mask, dtype=torch.float32)

        # Convert bounding boxes to tensor
        adjusted_bboxes = torch.tensor(adjusted_bboxes, dtype=torch.float32)

        # Return the transformed image, mask, and bounding boxes
        return image, mask, adjusted_bboxes


transform = CustomTransform(size=(224, 224))  # Resize to 224x224


In [None]:
data_type = "train"
dataset = "train2017"

# Set your paths
train_images_dir = f'data/COCO/{dataset}'
train_annotations_dir = f'data/COCO/annotations/instances_{dataset}.json'

# Load image feature
features_dir = os.path.join("data","COCO","extracted_features",f'{data_type}_image_features.json')
with open(features_dir, 'r') as f:
    image_features = json.load(f)

num_bboxes = 20

# Create dataset and dataloader
train_dataset = COCODataset(train_images_dir, train_annotations_dir,image_features,max_bboxes=num_bboxes, transform=transform)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

loading annotations into memory...


In [7]:
# Sanity check

print("train_dataset size:", len(train_dataset))
print("train_loader size:", len(train_dataloader))

train_dataset size: 118287
train_loader size: 3697


# Model

In [8]:
class MultiModalModel(nn.Module):
    def __init__(self, resnet_output_size=2048, num_bboxes=4, num_classes=80): 
        """
        Initialize the multi-modal model. 
        Args:
        - resnet_output_size: The size of the ResNet feature vector (2048 for ResNet-50).
        - num_bboxes: The number of bounding box entries (e.g., 4 for x, y, width, height).
        - num_classes: The number of output classes (i.e., the number of categories to predict counts for).
        """
        super(MultiModalModel, self).__init__()

        # Image feature processing (ResNet features)
        self.resnet_fc = nn.Linear(resnet_output_size, 512)

        # Segmentation mask processing (using a simple CNN)
        self.mask_conv1 = nn.Conv2d(1, 64, kernel_size=3, padding=1)
        self.mask_conv2 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.mask_fc = nn.Linear(128 * 64 * 64, 512)  # You may need to adjust the size depending on image resolution

        # Bounding box processing (fully connected layer)
        self.bbox_fc = nn.Linear(num_bboxes, 128)

        # Final count output (instead of classification, we predict a count for each category)
        self.fc_count = nn.Linear(512 + 512 + 128, num_classes)  # Output count for each class

    def forward(self, image, mask, bbox, feature_vector, category_ids):
        """
        Forward pass for multi-modal model
        Args:
        - image: Tensor of image data, shape [batch_size, channels, height, width]
        - mask: Tensor of segmentation mask data, shape [batch_size, height, width]
        - bbox: Tensor of bounding boxes, shape [batch_size, num_bboxes]
        - feature_vector: Tensor of feature vectors from ResNet, shape [batch_size, 2048]
        - category_ids: Tensor of category IDs, shape [batch_size, num_categories]
        """

        # Process the ResNet feature vector
        feature_vector_out = self.resnet_fc(feature_vector)  # Shape: [batch_size, 512]

        # Process the segmentation mask
        mask_out = F.relu(self.mask_conv1(mask))  # Apply convolution
        mask_out = F.relu(self.mask_conv2(mask_out))  # Apply second convolution
        mask_out = mask_out.view(mask_out.size(0), -1)  # Flatten for FC layer
        mask_out = self.mask_fc(mask_out)  # Shape: [batch_size, 512]

        # Process the bounding boxes
        bbox_out = F.relu(self.bbox_fc(bbox))  # Shape: [batch_size, 128]

        # Concatenate all the features
        combined_features = torch.cat((feature_vector_out, mask_out, bbox_out), dim=1)  # Shape: [batch_size, 1152]

        # Final count output (predicting the count for each category)
        count_output = self.fc_count(combined_features)  # Shape: [batch_size, num_classes]
        # Each value in count_output corresponds to the predicted count for a category

        return count_output


In [11]:
# Initialize the model, optimizer, and loss functions
num_epochs = 10
model = MultiModalModel(resnet_output_size=2048, num_bboxes=num_bboxes, num_classes=80).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.0001)

# Loss function: Mean Squared Error for regression
criterion_count = nn.MSELoss()  # For predicting counts

# Training loop with tqdm for progress bars
for epoch in range(num_epochs):
    model.train()

    # Use tqdm to iterate through the DataLoader
    for images, masks, bounding_boxes, feature_vectors, category_ids in tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch"):
        masks = masks.to(device)
        bounding_boxes = bounding_boxes.to(device)
        feature_vectors = feature_vectors.to(device)
        category_ids = category_ids.to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        count_output = model(masks, bounding_boxes, feature_vectors, category_ids)

        # Calculate the count loss (regression)
        loss_count = criterion_count(count_output, category_ids.float())  # category_ids should be the true count of each class
        
        # You may also calculate segmentation and bbox losses if you need them
        # loss_segmentation = criterion_segmentation(segmentation_output, masks)
        # loss_bbox = criterion_bbox(bbox_output, bounding_boxes)

        # Total loss (fusion of regression losses)
        total_loss = loss_count  # + loss_segmentation + loss_bbox (if necessary)

        # Backpropagation
        total_loss.backward()
        optimizer.step()

        # Print loss values for each epoch
        print(f'Epoch [{epoch+1}/{num_epochs}], '
              f'Count Loss: {loss_count:.4f}, Total Loss: {total_loss:.4f}')



  bounding_boxes = torch.tensor(bounding_boxes, dtype=torch.float32)
Epoch 1/10:   0%|          | 0/3697 [00:01<?, ?batch/s]


RuntimeError: stack expects each tensor to be equal size, but got [3, 4] at entry 0 and [2, 4] at entry 2