In [1]:
import torch
import torchvision.models as models
import torch.nn as nn
import os

In [None]:
# Directory that receives every adapted component checkpoint saved by this notebook.
edited_components_dir = "edited_components"

# exist_ok=True keeps the cell idempotent on re-runs and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(edited_components_dir, exist_ok=True)

# YOLO

For our object detection we are using YOLOv8. We are actually using two separate models for person and face detection. To apply the unified backbone to the YOLOv8 model branches, we need to modify the YOLOv8 model to accept the backbone's output as its input. To do that, we need to remove the image input layers from the YOLOv8 model and replace them with the backbone's output. We will work on the backbone's output later; for now, we simply need to remove the image input layers from the YOLOv8 model and add a small adapter layer to replace them.

In [None]:
from ultralytics import YOLO

In [None]:
# Load the yolov8n checkpoint from the local component_models directory.
# Both names are notebook-level globals that later cells read.
model_path = 'component_models/yolov8n.pt'
yolo_model = YOLO(model_path)

In [None]:
# Dump the wrapped network's layer structure so the input layers can be identified.
print(yolo_model.model)

Printing out YOLO shows that the first two layers are for augmenting the image to be processed. We can remove these two layers and use the rest of the network to process the image. 

In [None]:
class CustomYOLO(nn.Module):
    """YOLO branch that consumes backbone feature maps instead of raw images.

    The first two layers of the wrapped YOLO network (its image-input layers)
    are dropped and replaced by a 1x1 convolution that maps the backbone's
    channels onto the 32 channels the third YOLO layer expects.

    Args:
        yolo_model: An object exposing the YOLO layer sequence. It may be the
            layer sequence itself or any wrapper that reaches it through one
            or more nested ``.model`` attributes (the ultralytics ``YOLO``
            object nests it two levels deep -- see the ultralytics reference).
        backbone_channels: Channel count of the feature map fed to ``forward``.

    Raises:
        TypeError: If no iterable layer sequence can be found in ``yolo_model``.
    """

    # Number of leading YOLO layers that are discarded.
    _REMOVED_LAYERS = 2

    def __init__(self, yolo_model, backbone_channels=512):
        super(CustomYOLO, self).__init__()

        layers = self._unwrap_layers(yolo_model)

        # Remove the first two layers of YOLO, which are for processing input.
        # Kept as nn.Sequential so state_dict keys stay ``yolo.<n>.*``.
        self.yolo = nn.Sequential(*layers[self._REMOVED_LAYERS:])

        # Adapter to match YOLO's expected features (32 channels)
        self.adapter = nn.Conv2d(backbone_channels, 32, kernel_size=1)

    @staticmethod
    def _unwrap_layers(yolo_model):
        """Follow nested ``.model`` attributes until a layer sequence is found."""
        candidate = yolo_model
        # A small bound is enough for wrapper -> network -> layers and guards
        # against objects whose ``.model`` refers back to themselves.
        for _ in range(4):
            if isinstance(candidate, (nn.Sequential, nn.ModuleList, list, tuple)):
                return list(candidate)
            inner = getattr(candidate, "model", None)
            if inner is None:
                break
            candidate = inner
        raise TypeError(
            "CustomYOLO could not find an iterable layer sequence in "
            f"{type(yolo_model).__name__}"
        )

    @staticmethod
    def _earlier_output(outputs, index, ref):
        """Return the stored output of the layer that ``ref`` points at.

        ``ref`` is an absolute layer index when non-negative, otherwise it is
        relative to the layer currently being evaluated (``index``).
        """
        key = ref if ref >= 0 else index + ref
        if key not in outputs:
            raise RuntimeError(
                f"Layer {index} needs the output of layer {key}, which is not "
                "available (it was removed or has not run yet)"
            )
        return outputs[key]

    def forward(self, backbone_features):
        """Run the adapted YOLO branch on a backbone feature map.

        Layers may carry ``f`` ("from") and ``i`` (absolute index) attributes
        describing skip connections; layers without them simply consume the
        previous layer's output, exactly as ``nn.Sequential`` would.

        Args:
            backbone_features: Tensor with ``backbone_channels`` channels.

        Returns:
            Whatever the last YOLO layer returns.
        """
        x = self.adapter(backbone_features)

        # Every layer output is kept so later multi-input layers can read it.
        outputs = {}
        for position, layer in enumerate(self.yolo):
            index = getattr(layer, "i", position + self._REMOVED_LAYERS)
            source = getattr(layer, "f", -1)
            if source != -1:
                if isinstance(source, int):
                    x = self._earlier_output(outputs, index, source)
                else:
                    # -1 inside a list means "the previous layer's output".
                    x = [x if ref == -1 else self._earlier_output(outputs, index, ref)
                         for ref in source]
            x = layer(x)
            outputs[index] = x
        return x

In [None]:
# Build the adapted person-detection branch from the yolov8n checkpoint and
# persist its weights next to the other edited components.
model_path = 'component_models/yolov8n.pt'
yolo_model = YOLO(model_path)
feature_extractor = CustomYOLO(yolo_model)
# Derive the path from edited_components_dir so it cannot drift from the
# directory created at the top of the notebook.
torch.save(feature_extractor.state_dict(), os.path.join(edited_components_dir, 'custom_yolo.pth'))

In [None]:
# Build the adapted face-detection branch from the yolov8n-face checkpoint and
# persist its weights next to the other edited components.
model_path = 'component_models/yolov8n-face.pt'
yolo_face_model = YOLO(model_path)
feature_extractor = CustomYOLO(yolo_face_model)
# Derive the path from edited_components_dir so it cannot drift from the
# directory created at the top of the notebook.
torch.save(feature_extractor.state_dict(), os.path.join(edited_components_dir, 'custom_yolo_face.pth'))

# AdaFace

# ViT Pose

# Backbone

For our backbone we chose ResNet50. We chose ResNet50 because it is a well-known and well-tested backbone that has been used in many object detection models. This class copies the ResNet50 model format, except for the last layer. Instead of a classifier output, we have branches depending on the task. The task is passed into the forward function, and only the relevant branch's connection is returned. The full multitask branch is not processed at this stage, but due to the separate architectures of YOLOv8, AdaFace, and ViT Pose, we need the backbone to have separate outputs to properly provide valid inputs to each model.

As opposed to downloading the backbone via `download_models.ipynb`, we can instead take the simpler approach of using the `torchvision` library to download the model. This is done by calling `torchvision.models.resnet50(pretrained=True)`. This will download the model and load the pretrained weights. We can then modify the model to have the desired outputs.

In [2]:
# `pretrained=True` is deprecated in recent torchvision releases in favour of
# the `weights=` argument; IMAGENET1K_V1 is the checkpoint `pretrained=True`
# selects, so the loaded weights are unchanged.
try:
    resnet_model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
except AttributeError:
    # Older torchvision without the weights enum: fall back to the legacy flag.
    resnet_model = models.resnet50(pretrained=True)



In [3]:
# Show the ResNet50 layer structure that the feature extractor below mirrors.
print(resnet_model)

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

As stated above, we copy the structure of Resnet50, but modify the last layer to have inputs for the individual branches.

In [None]:
class MultiTaskResNetFeatureExtractor(nn.Module):
    """ResNet-style trunk with one output head per downstream task.

    The convolutional stages and pooling layer are taken from
    ``original_model``; its classifier is not used. Instead, ``forward``
    returns the output of the single head that matches ``current_task``.

    Args:
        original_model: Model exposing ``conv1``, ``bn1``, ``relu``,
            ``maxpool``, ``layer1`` .. ``layer4`` and ``avgpool`` whose last
            stage produces 2048 channels (e.g. a torchvision ResNet50).
    """

    # Tasks served by the convolutional (spatial) YOLO head.
    DETECTION_TASKS = ('person_detection', 'face_detection')
    # Every task name ``forward`` accepts.
    SUPPORTED_TASKS = DETECTION_TASKS + ('face_identification', 'pose_estimation')

    def __init__(self, original_model):
        super(MultiTaskResNetFeatureExtractor, self).__init__()

        # Define the layers of the original model
        self.conv1 = original_model.conv1
        self.bn1 = original_model.bn1
        self.relu = original_model.relu
        self.maxpool = original_model.maxpool
        self.layer1 = original_model.layer1
        self.layer2 = original_model.layer2
        self.layer3 = original_model.layer3
        self.layer4 = original_model.layer4
        self.avgpool = original_model.avgpool
        self.flatten = nn.Flatten()

        # Define separate heads for different outputs, which have different sizes
        self.yolo_head = nn.Conv2d(2048, 512, kernel_size=1)
        self.ada_face_head = nn.Linear(2048, 1)  # TODO Change the output size to match ADA Face
        self.vit_pose_head = nn.Linear(2048, 1)  # TODO Change the output size to match ViT Pose

    def forward(self, x, current_task):
        """Run the trunk and the head selected by ``current_task``.

        Args:
            x: Image batch accepted by ``conv1``.
            current_task: One of ``SUPPORTED_TASKS``.

        Returns:
            A spatial feature map with 512 channels for the detection tasks,
            or the output of the matching linear head for the other tasks.

        Raises:
            ValueError: If ``current_task`` is not a supported task name.
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        # Plain Python dispatch: torch.where needs a tensor condition and would
        # evaluate every head and try to broadcast their differently shaped
        # outputs. Only the requested head is executed here.
        if current_task in self.DETECTION_TASKS:
            # The detector needs the spatial layout, so global average pooling
            # is deliberately skipped on this path.
            return self.yolo_forward(x)

        if current_task == 'face_identification':
            return self.ada_face_forward(self.avgpool(x))

        if current_task == 'pose_estimation':
            return self.vit_pose_forward(self.avgpool(x))

        raise ValueError(
            f"Task {current_task} not supported. "
            f"Available tasks: {', '.join(self.SUPPORTED_TASKS)}"
        )

    def yolo_forward(self, x):
        """Project a 2048-channel feature map to the 512 channels of the YOLO branches."""
        return self.yolo_head(x)

    def ada_face_forward(self, x):
        """Flatten pooled features and apply the AdaFace head."""
        x = self.flatten(x)
        return self.ada_face_head(x)

    def vit_pose_forward(self, x):
        """Flatten pooled features and apply the ViT Pose head."""
        x = self.flatten(x)
        return self.vit_pose_head(x)


In [9]:
# Wrap the pretrained ResNet50 in the multi-task extractor and persist its weights.
feature_extractor = MultiTaskResNetFeatureExtractor(resnet_model)
# Derive the path from edited_components_dir so it cannot drift from the
# directory created at the top of the notebook.
torch.save(feature_extractor.state_dict(), os.path.join(edited_components_dir, 'resnet_feature_extractor.pth'))

In [None]:
# Rebuild the extractor and print its structure to inspect the trunk and the task heads.
feature_extractor = MultiTaskResNetFeatureExtractor(resnet_model)
print(feature_extractor)

# Combined Model

In [None]:
class CombinedModel(nn.Module):
    """Shared backbone plus one task branch per supported task.

    Exactly one task is active at a time (``current_task``); ``forward`` runs
    the backbone for that task and then only the matching branch.

    Args:
        backbone: Module called as ``backbone(x, task_name)``.
        yolo_face: YOLO model wrapped by ``CustomYOLO`` for face detection.
        yolo_person: YOLO model wrapped by ``CustomYOLO`` for person detection.
        ada_face: Model handed to ``CustomAdaFace`` for face identification.
        vit_pose: Model handed to ``CustomVitPose`` for pose estimation.
    """

    # Task names accepted by set_task() and forward().
    SUPPORTED_TASKS = ('face_detection', 'person_detection', 'pose_estimation', 'face_identification')

    def __init__(self, backbone, yolo_face, yolo_person, ada_face, vit_pose):
        super(CombinedModel, self).__init__()
        self.current_task = 'person_detection'
        self.backbone = backbone
        self.yolo_face = CustomYOLO(yolo_face, backbone_channels=512)
        self.yolo_person = CustomYOLO(yolo_person, backbone_channels=512)

        # TODO Implement these task models
        # NOTE(review): CustomAdaFace / CustomVitPose are not defined in the
        # visible part of this notebook; constructing this class fails with
        # NameError until they exist.
        self.ada_face = CustomAdaFace(ada_face)
        self.vit_pose = CustomVitPose(vit_pose)

    def set_task(self, task_name):
        """Select the task that subsequent ``forward`` calls will run.

        Raises:
            ValueError: If ``task_name`` is not in ``SUPPORTED_TASKS``.
        """
        supported_tasks = list(self.SUPPORTED_TASKS)
        if task_name not in supported_tasks:
            raise ValueError(f"Task {task_name} not supported. Available tasks: {', '.join(supported_tasks)}")
        self.current_task = task_name

    def forward(self, x):
        """Run the backbone and the branch belonging to ``current_task``.

        Raises:
            ValueError: If ``current_task`` was set to an unsupported name
                without going through ``set_task``.
        """
        # Get features from backbone for the task
        features = self.backbone(x, self.current_task)

        # Plain Python dispatch: torch.where requires a tensor condition and
        # would run every branch. Pose estimation goes to the ViT Pose branch
        # and face identification to the AdaFace branch.
        if self.current_task == 'person_detection':
            return self.yolo_person(features)
        if self.current_task == 'face_detection':
            return self.yolo_face(features)
        if self.current_task == 'face_identification':
            return self.ada_face(features)
        if self.current_task == 'pose_estimation':
            return self.vit_pose(features)

        raise ValueError(
            f"Task {self.current_task} not supported. "
            f"Available tasks: {', '.join(self.SUPPORTED_TASKS)}"
        )

