In [2]:
import torch

In [3]:
vitb16 = torch.hub.load('facebookresearch/dino:main', 'dino_vitb16')

Using cache found in /home/atricham/.cache/torch/hub/facebookresearch_dino_main


In [4]:
model = vitb16

In [6]:
resnet50 = torch.hub.load('facebookresearch/dino:main', 'dino_resnet50')

Using cache found in /home/atricham/.cache/torch/hub/facebookresearch_dino_main


In [7]:
model = resnet50

In [8]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

# Define a custom dataset class
class CustomDataset(Dataset):
    def __init__(self, csv_file, image_dir, transform=None):
        self.data = pd.read_csv(csv_file)
        self.image_dir = image_dir
        self.transform = transform

        # Map unique class labels to integer values
        unique_classes = self.data['class_name'].unique()
        self.class_to_int = {class_name: idx for idx, class_name in enumerate(unique_classes)}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_id = self.data.iloc[idx, 0]  # 'image_id' is the first column
        image_filename = f"{image_id}.jpg"  # Add ".jpg" extension
        image_path = os.path.join(self.image_dir, image_filename)
        image = Image.open(image_path).convert("RGB")

        # Extract width and height
        class_name = self.data.iloc[idx, 1]
        width = self.data.iloc[idx, 8]  # 'width' is in the ninth column
        height = self.data.iloc[idx, 9]  # 'height' is in the tenth column

        # Extract bounding box coordinates or calculate them if missing
        x_min = self.data.iloc[idx, 4]
        y_min = self.data.iloc[idx, 5]
        x_max = self.data.iloc[idx, 6]
        y_max = self.data.iloc[idx, 7]

        if pd.isna(x_min) or pd.isna(y_min) or pd.isna(x_max) or pd.isna(y_max):
            # Calculate bounding box coordinates as a percentage of image dimensions
            x_min = 0
            y_min = 0
            x_max = 1.0  # Set x_max to 100% of image width
            y_max = 1.0  # Set y_max to 100% of image height

        # bbox = [x_min, y_min, x_max, y_max]
        bbox = torch.tensor([x_min, y_min, x_max, y_max], dtype=torch.float32)


        # Map class name to integer label
        class_label = self.class_to_int[class_name]

        if self.transform:
            image = self.transform(image)
            
        # Return data as a dictionary
        data_dict = {
            'images': image,
            'targets': {
                'class_label': class_label,
                'bbox': bbox,
                'width': width,
                'height': height
            }
        }

        return data_dict

# Define a custom collate function for the DataLoader
def custom_collate_fn(batch):
    images = [item['images'] for item in batch]
    targets = [item['targets'] for item in batch]

    # Convert images and targets into a batch
    images = torch.stack(images, dim=0)
    class_labels = torch.tensor([item['class_label'] for item in targets], dtype=torch.long)
    bboxes = torch.stack([item['bbox'] for item in targets], dim=0) 
    widths = torch.tensor([item['width'] for item in targets], dtype=torch.float32)
    heights = torch.tensor([item['height'] for item in targets], dtype=torch.float32)

    return {
        'images': images,
        'class_labels': class_labels,
        'bboxes': bboxes,
        'widths': widths,
        'heights': heights
    }

# Specify the path to your CSV file and image directory
csv_file = 'train.csv'  # Replace with the actual path to your CSV file
image_dir = 'dataset-jpg/train'  # Replace with the actual path to your image directory

# Define any image transformations you want to apply (e.g., resizing, normalization)
transform = transforms.Compose([transforms.Resize((224, 224)),
                                transforms.ToTensor()])

# Create an instance of the custom dataset
custom_dataset = CustomDataset(csv_file, image_dir, transform=transform)

# Create a data loader with custom collate function
batch_size = 8  # Adjust the batch size according to your needs
train_data_loader = DataLoader(custom_dataset, batch_size=batch_size, shuffle=True, collate_fn=custom_collate_fn)


In [9]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image

# Define a custom dataset class for the test data
class TestDataset(Dataset):
    def __init__(self, csv_file, image_dir, transform=None):
        self.data = pd.read_csv(csv_file)
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        image_id = self.data.iloc[idx, 0]  # 'image_id' is the first column
        image_filename = f"{image_id}.jpg"  # Add ".jpg" extension
        image_path = os.path.join(self.image_dir, image_filename)
        image = Image.open(image_path).convert("RGB")

        # Extract width and height
        width = self.data.iloc[idx, 1]  # 'width' is in the second column
        height = self.data.iloc[idx, 2]  # 'height' is in the third column

        if self.transform:
            image = self.transform(image)

        # Create targets dictionary
        targets = {
            'width': width,
            'height': height
        }

        return image, targets

# Specify the path to your test CSV file and image directory
test_csv_file = 'test.csv'  # Replace with the actual path to your test CSV file
test_image_dir = 'dataset-jpg/test'  # Replace with the actual path to your test image directory

# Define any image transformations you want to apply (e.g., resizing, normalization)
transform = transforms.Compose([transforms.Resize((224, 224)),
                                transforms.ToTensor()])

# Create an instance of the custom test dataset
test_dataset = TestDataset(test_csv_file, test_image_dir, transform=transform)

# Create a data loader for the test data
batch_size = 8  # Adjust the batch size according to your needs
test_data_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Now you can load data using the test_data_loader and pass it to your model for bounding box detection
# To mount the data to a device, you can do something like this:
# for images, targets in test_data_loader:
#     images = images.to(device)
#     targets = {key: value.to(device) for key, value in targets.items()}


In [10]:
num_epochs = 3
# Define the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [17]:

# Define the number of object classes and bounding box parameters
num_classes = 14  # Replace with the number of object classes in your dataset
num_bbox_params = 4

In [32]:
# Define a custom object detection model
class ObjectDetectionModel(nn.Module):
    def __init__(self, backbone_model, num_classes, num_bbox_params=4):
        super(ObjectDetectionModel, self).__init__()

        # Load the ResNet-50 backbone
        self.resnet_backbone = backbone_model

        # Object classification head
        self.cls_head = nn.Sequential(
            nn.Linear(2048, 512),  # Adjust input features if needed
            nn.ReLU(),
            nn.Linear(512, num_classes)  # Output logits for each class
        )

        # Bounding box regression head
        self.bbox_head = nn.Sequential(
            nn.Linear(2048, 512),  # Adjust input features if needed
            nn.ReLU(),
            nn.Linear(512, num_bbox_params)  # Output bounding box parameters
        )

        def forward(self, x):
            # Backbone feature extraction
            x = self.resnet_backbone(x)

            # Object classification
            class_logits = self.cls_head(x)

            # Bounding box regression
            bbox_pred = self.bbox_head(x)

            return class_logits, bbox_pred



# Instantiate your custom object detection model with the ResNet-50 backbone
object_detection_model = ObjectDetectionModel(model, num_classes)

# Get the number of output channels for class_logits and bbox_pred from the custom object detection model
num_class_logits_channels = object_detection_model.cls_head[-1].out_features
num_bbox_pred_channels = object_detection_model.bbox_head[-1].out_features

# Print the number of output channels
print("Number of output channels for class_logits:", num_class_logits_channels)
print("Number of output channels for bbox_pred:", num_bbox_pred_channels)


Number of output channels for class_logits: 14
Number of output channels for bbox_pred: 4


In [20]:
# Define the loss functions
import torch.nn as nn
import torch.optim as optim
classification_criterion = nn.CrossEntropyLoss()
regression_criterion = nn.SmoothL1Loss()

learning_rate = 0.001 

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [41]:
from tqdm import tqdm  # Import tqdm

# Train the model
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    # Use tqdm to create a progress bar for the training loop
    tqdm_data_loader = tqdm(train_data_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")
    for data_dict in tqdm_data_loader:
        images = data_dict['images'].to(device)
        class_labels = data_dict['class_labels'].to(device)
        bboxes = data_dict['bboxes'].to(device)
        widths = data_dict['widths'].to(device)
        heights = data_dict['heights'].to(device)
        
        # Forward pass
        cls_logits = model(images)[:8]
        bbox_pred = model(images)[:8]

        # class_logits, bbox_pred = model(images)
        bbox_pred = bbox_pred[:, :4]

        # Calculate classification loss
        classification_loss = classification_criterion(cls_logits, class_labels)

        # Calculate regression loss
        regression_loss = regression_criterion(bbox_pred, bboxes)

        # Calculate the total loss for this batch
        batch_loss = classification_loss + regression_loss

        # Backpropagation
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # Accumulate the loss for this epoch
        total_loss += batch_loss.item()

        # Update the progress bar description with the current loss
        tqdm_data_loader.set_postfix(loss=batch_loss.item())

    # Print the average loss for the epoch
    average_loss = total_loss / len(train_data_loader)
    print(f"Epoch [{epoch + 1}/{num_epochs}] Loss: {average_loss}")


Epoch 1/3: 100%|██████████| 8490/8490 [26:23<00:00,  5.36it/s, loss=1.29e+3] 


Epoch [1/3] Loss: 667.5331004577475


Epoch 2/3: 100%|██████████| 8490/8490 [18:22<00:00,  7.70it/s, loss=453]    


Epoch [2/3] Loss: 651.836465227646


Epoch 3/3: 100%|██████████| 8490/8490 [18:31<00:00,  7.64it/s, loss=705]    

Epoch [3/3] Loss: 636.2471576672701



