# Library

In [9]:
import torch
import torchvision
from torchvision import models
from torch import nn
from torchvision.ops.roi_pool import RoIPool
import torch.nn.functional as F


# Faster R-CNN


Faster R-CNN was introduced in 2015 by Shaoqing Ren and his colleagues [link](https://arxiv.org/abs/1506.01497).
The previous R-CNN algorithm used selective search to generate region proposals and processed each ROI separately on the image, which was very time-consuming.
Faster R-CNN, however, performs region proposal on the feature map instead of the original image.
Moreover, it uses a Region Proposal Network (RPN) instead of traditional/manual methods.
The RPN is implemented as a neural network, which allows end-to-end learning and significantly improves localization accuracy

In [10]:
class feature_extract(nn.Module):
    def __init__(self ):
        super().__init__()
        model=models.resnet50(pretrained=True)
        self.model=nn.Sequential(*list(model.children())[:-2])
    
    def forward(self,x):
        x=self.model(x)
        return x


In this implementation, we used ResNet-50 and removed the last two layers, which were originally designed for classification, because our goal is to generate feature maps.

In [17]:
class rpn(nn.Module):
    def __init__(self, inchanel,midchanel,anchors):
        super().__init__()
        self.conv=nn.Conv2d(inchanel,midchanel,kernel_size=3,padding=2)
        self.cls=nn.Conv2d(midchanel,anchors*2,1)
        self.bbox=nn.Conv2d(midchanel,anchors*4,1)

    def forward(self,x):
        x=self.conv(x)
        cls=self.cls(x)
        box=self.bbox(x)
        return cls,box

In RPN, we place small fixed anchors on the feature map and assign them fixed sizes.
Then, using the cls output, we estimate the probability of an object existing in each anchor.
(Each anchor corresponds to a region relative to the original image.)

In [18]:
class head(nn.Module):
    def __init__(self,inchanel,num_anchor,num_class):
        super().__init__()
        self.roi=RoIPool(output_size=(7,7),spatial_scale=1/16)
        self.fc1=nn.Linear(inchanel*7*7,1024)
        self.fc2=nn.Linear(1024,1024)
        self.cls=nn.Linear(1024,num_anchor*num_class)
        self.bbox=nn.Linear(1024,num_anchor*4)

    def forward(self,x,rois):
        x=self.roi(x,rois)
        x = x.view(x.size(0), -1)  #flatten
        x=self.fc1(x)
        x=self.fc2(x)
        cls=self.cls(x)
        bbox=self.bbox(x)
        return cls,bbox



In the Head, each ROI is cropped from the feature map and resized to a fixed size.
It then passes through two fully connected layers (FC).
Finally, the network predicts the bounding box coordinates and the class p

In [21]:
class faster_rcnn(nn.Module):
    def __init__(self):
        super().__init__()
        self.feature=feature_extract()
        self.rpn=rpn(2048,512,9)
        self.head=head(2048,9,3)

    def forward(self,image,rois):
        feature=self.feature(image)
        rpn_cls,rpn_box=self.rpn(feature)
        cls_head,cls_box=self.head(feature,rois)
        return rpn_cls, rpn_box, cls_head, cls_box




RPN: looks at several small regions of the image (anchors) and says, "there might be an object here or not."

It does not yet know the objectâ€™s class or its precise coordinates.

Head (Fast R-CNN): works on the ROIs selected by the RPN and predicts the exact bounding box and the actual class of the object.

In [22]:

model = faster_rcnn()  
images = torch.randn(2, 3, 224, 224)
rois = torch.tensor([[0, 10, 10, 100, 100], [1, 50, 50, 150, 150]], dtype=torch.float)  # [batch_idx, x1, y1, x2, y2]
    
rpn_logits, rpn_bbox, cls_logits, bbox_reg = model(images, rois)
print("RPN logits:", rpn_logits.shape)
print("RPN bbox:", rpn_bbox.shape)
print("RCNN class logits:", cls_logits.shape)
print("RCNN bbox:", bbox_reg.shape)

RPN logits: torch.Size([2, 18, 9, 9])
RPN bbox: torch.Size([2, 36, 9, 9])
RCNN class logits: torch.Size([2, 27])
RCNN bbox: torch.Size([2, 36])


# Yolo

In 2015, Joseph Redmon introduced the YOLO (You Only Look Once) model. Unlike two-stage architectures such as Faster R-CNN, which first generate region proposals and then perform classification and regression on them, YOLO performs detection in a single stage (single-shot) and in one pass through the network. This approach is very fast and suitable for real-time applications.

In [23]:
class yolo(nn.Module):
    def __init__(self, s,c,b):
        super().__init__()
        self.s=s
        self.b=b
        self.c=c
       # model=models.resnet50(pretrained=True)
        self.feature=nn.Sequential(
            nn.Conv2d(3,16,3,padding=1),
            nn.ReLU(),
            nn.Conv2d(16,32,3),
            nn.MaxPool2d(2,2),       
        )

        self.classifier=nn.Sequential(
            nn.Flatten(),
            nn.Linear(64*self.s**2,516),
            nn.ReLU(),
            nn.Linear(512,self.s**2*(self.b*4+c))
        )


    def forward(self,x):
        x=self.feature(x)
        x=self.classifier(x)
        x=x.view(-1,self.s,self.s,self.b*5+self.c)
        return x



In [28]:
model = yolo(s=7, c=20, b=2)
x = torch.randn(2, 3, 224, 224)
with torch.no_grad():
    out = model.feature(x)
print(out.shape)



torch.Size([2, 32, 111, 111])


# SSD

The SSD (Single Shot MultiBox Detector) model was introduced in 2016 by Google researchers under the supervision of Wei Liu. This model is a single-shot architecture, meaning that object detection is performed in one step and directly.

Unlike YOLO (which is anchor-free in its newer versions), SSD is an anchor-based model. It can be considered conceptually between Faster R-CNN and YOLO:

. Like Faster R-CNN, it uses anchor boxes,

. But like YOLO, it is single-stage and faster.

The structure of SSD consists of two main parts:

1.Backbone for extracting feature maps

2.Prediction layers (classification + localization) for generating class scores and bounding box coordinates at multiple scales

In terms of speed, SSD is faster than Faster R-CNN, but slightly slower than YOLO.

In [35]:
class ssd_head(nn.Module):
    def __init__(self):
        super().__init__()
        self.anchors=4
        self.classes=4
        self.feature=nn.Sequential(

            nn.Conv2d(3,16,kernel_size=3,padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(16,32,kernel_size=3),
            nn.ReLU(),
            nn.Conv2d(32,64,kernel_size=3),
            nn.ReLU()


        )

    def forward(self,x):
        x=self.feature(x)
        return x

In [38]:
class SSD(nn.Module):
    def __init__(self,anchor=4,classes=3):
        super().__init__()
        self.feature=ssd_head()

        self.classes=nn.Conv2d(
            64,anchor*classes,kernel_size=3
        )

        self.anchors=nn.Conv2d(64,anchor*4,kernel_size=3)

    def forward(self,x):
        x=self.feature(x)
        classes=self.classes(x)
        anchor=self.anchors(x)
        return classes,anchor



In [39]:

x = torch.randn(2, 3, 224, 224)

model = SSD(anchor=4, classes=3)

classes_out, anchors_out = model(x)

print("Classes output shape:", classes_out.shape)
print("Anchors output shape:", anchors_out.shape)

Classes output shape: torch.Size([2, 12, 106, 106])
Anchors output shape: torch.Size([2, 16, 106, 106])


# RetinaNet

RetinaNet was introduced by the Facebook AI Research team and performs well in detecting small objects. Like SSD and YOLO, it is a single-stage detector, meaning it predicts both the location and class of objects in one step.
One cool feature of RetinaNet is the focal loss, which helps the model focus more on hard examples during training.
The architecture also includes two subnetworks: one for classification and another for box regression.


In [40]:
class Backbone(nn.Module):
    def __init__(self):
        super().__init__()
        resnet=models.resnet18(pretrained=True)
        self.back=nn.Sequential(*list(resnet.children())[:-2])
        self.out_channels=512

    def forward(self,x):
        back=self.back(x)
        return back


In [41]:
class ratinehead(nn.Module):
    def __init__(self,in_chanel,num_anchors,num_class):
        super().__init__()
        cls_subnet=[]
        anchor_subnet=[]
        for _ in range(5):
            cls_subnet.append(nn.Conv2d(in_chanel,in_chanel,3,padding=1))
            cls_subnet.append(nn.ReLU())
        
        self.cls_subnet=nn.Sequential(*cls_subnet)

        for _ in range(5):
            anchor_subnet.append(nn.Conv2d(in_chanel,in_chanel,3,padding=1))
            anchor_subnet.append(nn.ReLU())
        
        self.anchor_subnet=nn.Sequential(*anchor_subnet)

        self.cls=nn.Conv2d(in_chanel,num_anchors*num_class,kernel_size=3,padding=1)
        self.anchors=nn.Conv2d(in_chanel,num_anchors*4,kernel_size=3,padding=1)


    def forward(self,x):
        
        cls_subnet=self.cls_subnet(x)
        anchor_subnet=self.anchor_subnet(x)

        cls=self.cls(cls_subnet)
        anchor=self.anchors(anchor_subnet)
        return cls,anchor
    


In [44]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, logits, targets):
        ce = F.cross_entropy(logits, targets, reduction="none")
        pt = torch.exp(-ce)
        loss = self.alpha * (1 - pt) ** self.gamma * ce
        return loss.mean()

class RetinaNet(nn.Module):
    def __init__(self, num_classes):
        super().__init__()
        self.backbone = Backbone()
        self.head = ratinehead(self.backbone.out_channels, num_anchors=9,num_class=num_classes)

    def forward(self, x, targets=None):
        feats = self.backbone(x)
        cls_outs, reg_outs = self.head(feats)
        if targets is None:
            return cls_outs, reg_outs

In [45]:
model = RetinaNet(num_classes=3)
x = torch.randn(2, 3, 224, 224)
cls_outs, reg_outs = model(x)
print("Cls output shape:", cls_outs.shape)    # [B, 9*C, H, W]
print("Reg output shape:", reg_outs.shape)    # [B, 9*4, H, W]

Cls output shape: torch.Size([2, 27, 7, 7])
Reg output shape: torch.Size([2, 36, 7, 7])
