In [45]:
from ultralytics import YOLO
import torch.nn as nn
import copy
import torch
from ultralytics.nn.modules import Concat, C2f, Conv


pretrained_model = YOLO('yolov8m.pt').model
backbone = nn.Sequential(*list(pretrained_model.model.children())[:10])


In [46]:
class CustomBackbone(nn.Module):
    def __init__(self, layers, out_idx=[2, 4, 9]):
        super().__init__()
        self.layers = nn.ModuleList(layers)
        self.out_idx = out_idx
        
    def forward(self, x):
        outputs = []
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx in self.out_idx:
                outputs.append(x)
        return outputs

In [47]:
backbone_rgb = CustomBackbone(backbone)
backbone_ir = copy.deepcopy(backbone_rgb)

backbone_ir.layers[0].conv = nn.Conv2d(1, 48, kernel_size=3, stride=2, padding=1, bias=False)
print(backbone_rgb.layers[0].conv.weight.shape)  # Powinno być torch.Size([48, 3, 3, 3])
print(backbone_ir.layers[0].conv.weight.shape)   # Powinno być torch.Size([48, 1, 3, 3])

torch.Size([48, 3, 3, 3])
torch.Size([48, 1, 3, 3])


In [48]:
class CustomYOLO(nn.Module):
    def __init__(self, pretrained_model):
        super().__init__()
        # osobny backbone dla RGB i dla IR
        self.backbone_rgb = backbone_rgb
        self.backbone_ir = backbone_ir

        ### tutaj próbny podział, jeszcze nie udalo mi sie dokonczyc tej idei
        self.neck = nn.ModuleList(list(pretrained_model.model.children())[10:22])  # Warstwy necku
        self.head = list(pretrained_model.model.children())[22]  # Warstwa Detect

        # Reszta modelu (neck + head)
        self.neck_head = nn.Sequential(*list(pretrained_model.model.children())[10:])
        
    # def forward(self, x_rgb, x_ir):
    #     # Przetwórz obrazy przez backbone
    #     features_rgb = self.backbone_rgb(x_rgb)
    #     features_ir = self.backbone_ir(x_ir)
        
    #     for i, feat in enumerate(features_rgb):
    #         print(f"Kształt features_rgb[{i}]:", feat.shape)
    #     for i, feat in enumerate(features_ir):
    #         print(f"Kształt features_ir[{i}]:", feat.shape)
        
    #     fused_features = [f_rgb + f_ir for f_rgb, f_ir in zip(features_rgb, features_ir)]
        
    #     # Przekaż do neck/head
    #     return self.neck_head(fused_features)
    def forward(self, x_rgb, x_ir):
        # Przetwarzanie przez backbone
        p3_rgb, p4_rgb, p5_rgb = self.backbone_rgb(x_rgb)
        p3_ir, p4_ir, p5_ir = self.backbone_ir(x_ir)
        
        # Fuzja cech (suma)
        x = [
            p3_rgb + p3_ir,  # [1, 96, 160, 160]
            p4_rgb + p4_ir,  # [1, 192, 80, 80]
            p5_rgb + p5_ir   # [1, 576, 20, 20]
        ]

        # Mapowanie indeksów warstw necku
        layer_mapping = {
            0: ("upsample", 2),    # Upsample p5 (20x20 → 40x40)
            1: ("concat", [2, 1]), # Concat p5_upsampled + p4
            2: ("c2f", 768, 384),  # C2f(768→384)
            3: ("upsample", 2),    # Upsample do 80x80
            4: ("concat", [2, 0]), # Concat z p3
            5: ("c2f", 480, 192),  # C2f(480→192)
            6: ("conv", 192, 192, 3, 2), # Downsample 160x160 → 80x80
            7: ("concat", [1, 0]), # Concat z p4
            8: ("c2f", 576, 384),  # C2f(576→384)
            9: ("conv", 384, 384, 3, 2), # Downsample 80x80 → 40x40
            10: ("concat", [0, 2]), # Concat z p5
            11: ("c2f", 1152, 576) # C2f(1152→576)
        }

        # Ręczne przetwarzanie necku
        for i, layer in enumerate(self.neck):
            op_type = layer_mapping[i][0]
            
            if op_type == "upsample":
                x[-1] = layer(x[-1])  # Upsample ostatniego tensora
                
            elif op_type == "concat":
                idx1, idx2 = layer_mapping[i][1]
                x.append(layer([x[idx1], x[idx2]]))  # Concat
                
            elif op_type == "c2f":
                x[-1] = layer(x[-1])  # C2f
                
            elif op_type == "conv":
                x[-1] = layer(x[-1])  # Conv z downsamplingu

        return self.head(x[-3:])  # Ostatnie 3 tensory dla Detect

In [49]:
# Inicjalizacja modelu
model = CustomYOLO(pretrained_model)

x_rgb = torch.randn(1, 3, 640, 640)
x_ir = torch.randn(1, 1, 640, 640)

# Forward pass
output = model(x_rgb, x_ir)

RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 40 but got size 80 for tensor number 1 in the list.