In [15]:
import torch
import cv2
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sn
from torchvision import ops, transforms as trans
import torchvision
from torchvision.transforms import functional as F
import warnings
from ast import literal_eval

My proposed solution for this dataset has two parts:
- Find two-pet images using Torchvision *fasterrcnn_resnet50_fpn* and split them into two separate images, since some images contain two pets but the dataset provides no location information for them. The detector is used to decide whether an image contains one pet or two.
- Build a multi-headed classification and segmentation model that outputs the breed, the species (cat or dog, which follows from the breed since each breed belongs to a single species), and the segmentation mask.

In this notebook, I calculate and present performance metrics for each method.

In [2]:
# CSV listing the test images and their labels (path relative to the notebook).
test_path = 'inputs/csv/df_test.csv'
# Root directory of the image data: each test image lives at <img_dir>/<id>/image.jpg.
# Set the PET_DATA_DIR environment variable to point at the data on your machine
# instead of editing this cell; the original author's path is kept as the fallback.
# os.path.join(..., '') appends a trailing separator only when it is missing, so
# plain string concatenation onto img_dir keeps working.
img_dir = os.path.join(
    os.environ.get('PET_DATA_DIR', '/media/sophie/SDD_Data/Data/cats_and_dogs/data/'),
    '',
)

### Pet detection model performance
Torchvision *fasterrcnn_resnet50_fpn* object detection model

In [3]:
def calculate_area(bbox):
    """Return the area of a box given as (x1, y1, x2, y2) corner coordinates."""
    left, top, right, bottom = bbox
    return (right - left) * (bottom - top)

In [4]:
def find_two_biggest_bbox(bbox):
    """
    Return the (up to) two boxes with the largest area, biggest first.

    Args:
        bbox: iterable of boxes, each indexable as (x1, y1, x2, y2).

    Returns:
        A list with the two largest boxes. When fewer than two boxes are given,
        the list simply contains what is available (possibly nothing) instead of
        raising an IndexError.
    """
    # sorted() is stable, so boxes with equal area keep their input order.
    by_area_desc = sorted(
        bbox,
        key=lambda box: (box[2] - box[0]) * (box[3] - box[1]),
        reverse=True,
    )
    return list(by_area_desc[:2])

In [5]:
def calculate_iou(box1, box2):
    """
    Intersection-over-union of two axis-aligned boxes.

    Boxes are (x1, y1, x2, y2) corner coordinates and are treated as continuous
    coordinates, i.e. width = x2 - x1, the same convention `calculate_area` uses.
    The previous "+ 1" (inclusive integer pixel) convention gave a non-zero
    overlap to boxes that only touch or are up to one unit apart, which is wrong
    for floating-point detector output.

    Args:
        box1: first box, (x1, y1, x2, y2).
        box2: second box, (x1, y1, x2, y2).

    Returns:
        IoU in [0, 1]; 0.0 when the boxes do not overlap or when both boxes are
        degenerate (zero union area).
    """
    x1_a, y1_a, x2_a, y2_a = box1
    x1_b, y1_b, x2_b, y2_b = box2

    # Extent of the overlap rectangle, clamped at 0 when the boxes are disjoint.
    inter_w = max(0.0, min(x2_a, x2_b) - max(x1_a, x1_b))
    inter_h = max(0.0, min(y2_a, y2_b) - max(y1_a, y1_b))
    intersection_area = inter_w * inter_h

    area_a = (x2_a - x1_a) * (y2_a - y1_a)
    area_b = (x2_b - x1_b) * (y2_b - y1_b)

    # Union = sum of the areas minus the part that was counted twice.
    union_area = area_a + area_b - intersection_area
    if union_area <= 0:
        # Both boxes are degenerate; avoid dividing by zero.
        return 0.0

    return intersection_area / union_area


In [6]:
def get_num_of_pets(bbox, iou_threshold=0.5, area_ratio=2):
    """
    Set of rules to detect number of pets in an image.

    Args:
        bbox: sequence of detected boxes, each (x1, y1, x2, y2).
        iou_threshold: two boxes overlapping by more than this IoU are treated
            as duplicate detections of the same pet.
        area_ratio: if one box is more than `area_ratio` times smaller than the
            other, the small one is treated as a spurious detection.

    Returns:
        1 or 2. An image is assumed to always contain at least one pet, so zero
        detections still yields 1.
    """
    if len(bbox) > 2:
        # Only the two biggest detections are considered.
        bbox = find_two_biggest_bbox(bbox)

    if len(bbox) < 2:
        # Zero or one detection: we know there is at least one pet in the image
        # even if it is not detected.
        return 1

    first, second = bbox[0], bbox[1]

    # Heavily overlapping boxes are the same animal detected twice.
    if calculate_iou(first, second) > iou_threshold:
        return 1

    # If one bbox is much smaller than the other, it is a random bbox.
    area_0 = calculate_area(first)
    area_1 = calculate_area(second)
    if area_1 < area_0 / area_ratio or area_0 < area_1 / area_ratio:
        return 1

    return 2


In [7]:
def detect_num_of_pets(model, img, score_threshold=0.85, labels=None):
    """
    Run an object detector on one image and estimate how many pets it shows.

    Args:
        model: detection model called as `model(batch)` and returning, per image,
            a dict with 'boxes', 'scores' and 'labels' tensors (the torchvision
            detection output format). The caller is expected to have put it in
            eval mode.
        img: PIL image or array accepted by `F.to_tensor`.
        score_threshold: detections with a score at or below this are dropped.
        labels: optional iterable of class ids to keep. With the default `None`
            every class is kept, which is the original behaviour: any confident
            detection (not only cats/dogs) counts as a candidate pet.
            NOTE(review): pass the detector's cat/dog class ids here to restrict
            the count to pets; verify the ids against the weights' category list.

    Returns:
        1 or 2, as decided by `get_num_of_pets` on the retained boxes.
    """
    # The detector normalises with 3-channel statistics, so RGBA / palette /
    # grayscale PIL images are converted to RGB first.
    if isinstance(img, Image.Image) and img.mode != 'RGB':
        img = img.convert('RGB')

    # Add the batch dimension the model expects.
    image_tensor = F.to_tensor(img).unsqueeze(0)
    with torch.no_grad():
        prediction = model(image_tensor)[0]

    # Boolean mask of the detections to keep.
    keep = prediction['scores'] > score_threshold
    if labels is not None:
        predicted_labels = prediction['labels']
        wanted = torch.as_tensor(
            list(labels), dtype=predicted_labels.dtype, device=predicted_labels.device
        )
        # (N, 1) == (1, K) -> (N, K); a detection is kept if it matches any wanted id.
        keep = keep & (predicted_labels.unsqueeze(1) == wanted.unsqueeze(0)).any(dim=1)

    filtered_boxes = prediction['boxes'][keep]
    # .cpu() is a no-op on CPU tensors and makes .numpy() safe for GPU output.
    pets = get_num_of_pets(filtered_boxes.cpu().numpy())
    return pets

In [8]:
# Load the test split. The columns used further down are `id` (image folder name)
# and `breed_label` (a list stored as its string form, e.g. "[5]").
df_test = pd.read_csv(test_path)

In [36]:
# Preview the first rows to check the columns and the list-as-string label format.
df_test.head()

Unnamed: 0,id,pet_id,label,breed_label
0,2202c98d-3950-5138-9724-0175c63ba4c6,[634],0,[5]
1,8eac65ff-3695-56d4-b233-e8fd7f3275f8,[3455],0,[5]
2,3f1eaf17-25cb-5a77-8b38-c08ff7bf2efb,[4597],1,[16]
3,84982479-4c5c-5c2b-98b3-e09c7c6fa79f,[6232],1,[14]
4,4f8a373d-026f-5bed-b9b0-404e9485d281,[210],1,[21]


In [21]:
# Load the torchvision object detection model used to detect the number of pets
# in each image.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=`; switch once the installed version supports it.
od_model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
# Inference mode: the detector is only used to predict boxes, never trained here.
od_model.eval()

# Count images whose predicted pet count matches / differs from the label count.
correct = 0
wrong = 0
for i, row in df_test.iterrows():
    image_path = os.path.join(img_dir, row['id'], 'image.jpg')
    # Context manager so the file handle is released after every image instead
    # of staying open until garbage collection.
    with Image.open(image_path) as image:
        n_pets = detect_num_of_pets(od_model, image)

    # breed_label is stored as the string form of a list; one entry per pet.
    breed = np.array(literal_eval(row['breed_label']))
    if n_pets == len(breed):
        correct += 1
    else:
        wrong += 1


In [22]:
# Share of test images whose predicted pet count equals the labelled pet count.
n_test_images = df_test.shape[0]
accuracy = correct / n_test_images
accuracy

0.9065934065934066

In [37]:
# Number of test images labelled with more than one breed, i.e. more than one pet.
two_pet_images = (
    df_test['breed_label']
    .map(lambda raw: len(np.array(literal_eval(raw))))
    .gt(1)
    .sum()
)
two_pet_images

37

In [35]:
# Every remaining test image carries exactly one labelled pet.
single_pet_images = df_test.shape[0] - two_pet_images
single_pet_images

145

The object detection model achieves an accuracy of approximately 90%. While this metric provides some indication of performance, it may not be the most suitable measure because of the class imbalance between single-pet and two-pet images (145 vs. 37): a trivial baseline that always predicts a single pet would already reach about 80% (145/182). To gain better insights in real-world scenarios, alternative metrics such as per-class precision and recall should be considered. Nevertheless, when considering the images themselves, the accuracy metric does reflect the actual performance.

### Multi-headed segmentation classification model
I built a UNet architecture for both segmentation and classification tasks (pet mask and breed detection). The model was trained for 15 epochs, and the performance of the classification and segmentation heads was evaluated. It should be noted that due to time limitations, the model could not be trained for an extended period, resulting in unsatisfactory performance.
All metric calculations are done in *test.py*, and a comprehensive presentation of the metrics can be found in *output/metrics*.

I used the mean Dice coefficient to measure the performance of the segmentation head. The mean Dice coefficient is *0.76269*.

To evaluate the performance of the breed classification task, I use the classification report provided by the *sklearn* library. It allows a detailed analysis of the model's performance, including metrics such as precision, recall, F1 score, and accuracy. The overall weighted-average precision, recall, and F1 score are shown below. A comprehensive presentation of the metrics and the confusion matrix can be found in *output/metrics*.

 precision    recall  f1-score  
    0.34       0.28     0.27
                