In [None]:
from ultralytics import YOLO
import torch.nn as nn
import copy
import torch
from ultralytics.nn.modules import Concat, C2f, Conv


pretrained_model = YOLO('yolov8m.pt').model
backbone = nn.Sequential(*list(pretrained_model.model.children())[:10])


In [None]:
class CustomBackbone(nn.Module):
    def __init__(self, layers, out_idx=[2, 4, 9]):
        super().__init__()
        self.layers = nn.ModuleList(layers)
        self.out_idx = out_idx
        
    def forward(self, x):
        outputs = []
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx in self.out_idx:
                outputs.append(x)
        return outputs

In [None]:
backbone_rgb = CustomBackbone(backbone)
backbone_ir = copy.deepcopy(backbone_rgb)

backbone_ir.layers[0].conv = nn.Conv2d(1, 48, kernel_size=3, stride=2, padding=1, bias=False)
print(backbone_rgb.layers[0].conv.weight.shape)
print(backbone_ir.layers[0].conv.weight.shape)

torch.Size([48, 3, 3, 3])
torch.Size([48, 1, 3, 3])


In [None]:
class CustomNeck(nn.Module):
    def __init__(self, fused_channels=[96, 192, 576]):
        super().__init__()
        # Inicjalizacja warstw taka sama jak wcześniej
        self.upsample1 = nn.Upsample(scale_factor=2, mode='nearest')
        self.c2f_p4 = C2f(fused_channels[2] + fused_channels[1], 384, n=2)
        self.upsample2 = nn.Upsample(scale_factor=2, mode='nearest')
        self.c2f_p3 = C2f(384 + fused_channels[0], 192, n=2)
        self.conv_p3 = Conv(192, 192, k=3, s=2)
        self.c2f_d1 = C2f(192 + 384, 384, n=2)
        self.conv_p4 = Conv(384, 384, k=3, s=2)
        self.c2f_d2 = C2f(384 + 576, 576, n=2)

    def forward(self, fused_features):
        p3, p4, p5 = fused_features
        print(f"\nWejście do necku: P3={p3.shape}, P4={p4.shape}, P5={p5.shape}")

        # Top-down path (P5 -> P4)
        x = self.upsample1(p5)
        print(f"Po upsample1: {x.shape}")
        x = torch.cat([x, p4], 1)
        print(f"Po concat z P4: {x.shape}")
        x_p4 = self.c2f_p4(x)
        print(f"Po c2f_p4: {x_p4.shape}")

        # Top-down path (P4 -> P3)
        x = self.upsample2(x_p4)
        print(f"Po upsample2: {x.shape}")
        x = torch.cat([x, p3], 1)
        print(f"Po concat z P3: {x.shape}")
        x_p3 = self.c2f_p3(x)
        print(f"Po c2f_p3: {x_p3.shape}")

        # Bottom-up path (P3 -> P4)
        x = self.conv_p3(x_p3)
        print(f"Po conv_p3 (downsample): {x.shape}")
        x = torch.cat([x, x_p4], 1)
        print(f"Po concat z x_p4: {x.shape}")
        x_d1 = self.c2f_d1(x)
        print(f"Po c2f_d1: {x_d1.shape}")

        # Bottom-up path (P4 -> P5)
        x = self.conv_p4(x_d1)
        print(f"Po conv_p4 (downsample): {x.shape}")
        x = torch.cat([x, p5], 1)
        print(f"Po concat z P5: {x.shape}")
        x_d2 = self.c2f_d2(x)
        print(f"Po c2f_d2: {x_d2.shape}")

        return [x_p3, x_d1, x_d2]

In [None]:
class CustomYOLO(nn.Module):
    def __init__(self, pretrained_model, backbone_rgb, backbone_ir):
        super().__init__()
        self.backbone_rgb = backbone_rgb
        self.backbone_ir = backbone_ir
        
        # Inicjalizacja necku z odpowiednimi parametrami
        self.neck_head = CustomNeck(fused_channels=[96, 192, 576])
        
        # Przeniesienie warstwy Detect z pretrenowanego modelu
        self.detect = copy.deepcopy(pretrained_model.model[-1])
        
    def forward(self, x_rgb, x_ir):
        # Ekstrakcja cech
        features_rgb = self.backbone_rgb(x_rgb)
        features_ir = self.backbone_ir(x_ir)
        
        # Fuzja przez sumowanie
        fused = [f_rgb + f_ir for f_rgb, f_ir in zip(features_rgb, features_ir)]
        
        # Przetwarzanie przez neck
        neck_outputs = self.neck_head(fused)
        
        # Detekcja
        return self.detect(neck_outputs)

In [None]:
# Przykładowe dane wejściowe
x_rgb = torch.randn(1, 3, 640, 640)
x_ir = torch.randn(1, 1, 640, 640)

model = CustomYOLO(pretrained_model, backbone_rgb, backbone_ir)
outputs = model(x_rgb, x_ir)

# Sprawdzenie kształtów
for out in outputs:
    print(out.shape)  # Powinno być (batch, num_anchors, H, W)


Wejście do necku: P3=torch.Size([1, 192, 80, 80]), P4=torch.Size([1, 384, 40, 40]), P5=torch.Size([1, 576, 20, 20])
Po upsample1: torch.Size([1, 576, 40, 40])
Po concat z P4: torch.Size([1, 960, 40, 40])


RuntimeError: Given groups=1, weight of size [384, 768, 1, 1], expected input[1, 960, 40, 40] to have 768 channels, but got 960 channels instead

In [None]:
def validate_shapes(model, input_size=(640, 640)):
    # Generujemy dummy data
    x_rgb = torch.randn(1, 3, *input_size)
    x_ir = torch.randn(1, 1, *input_size)
    
    print("="*50)
    print("WALIDACJA KSZTAŁTÓW - START")
    print("="*50 + "\n")

    # Forward pass przez backbone
    with torch.no_grad():
        # ----------------------------
        # Etap 1: Backbone RGB
        # ----------------------------
        print("[BACKBONE RGB]")
        features_rgb = []
        x = x_rgb.clone()
        for idx, layer in enumerate(model.backbone_rgb.layers):
            x = layer(x)
            if idx in model.backbone_rgb.out_idx:
                features_rgb.append(x)
                print(f"Warstwa {idx}: {x.shape}")
        print("\n")

        # ----------------------------
        # Etap 2: Backbone IR
        # ----------------------------
        print("[BACKBONE IR]")
        features_ir = []
        x = x_ir.clone()
        for idx, layer in enumerate(model.backbone_ir.layers):
            x = layer(x)
            if idx in model.backbone_ir.out_idx:
                features_ir.append(x)
                print(f"Warstwa {idx}: {x.shape}")
        print("\n")

        # ----------------------------
        # Etap 3: Fuzja cech
        # ----------------------------
        print("[FUZJA CECH]")
        fused = []
        for i, (f_rgb, f_ir) in enumerate(zip(features_rgb, features_ir)):
            print(f"Przed fuzją - RGB[{i}]: {f_rgb.shape}, IR[{i}]: {f_ir.shape}")
            fused.append(f_rgb + f_ir)
            print(f"Po fuzji [{i}]: {fused[-1].shape}\n")
        print("\n")

        # ----------------------------
        # Etap 4: Przetwarzanie w necku
        # ----------------------------
        print("[NECK]")
        neck_outputs = model.neck_head(fused)
        print("\nKońcowe wyjścia z necku:")
        for i, out in enumerate(neck_outputs):
            print(f"Output {i}: {out.shape}")
        print("\n")

        # ----------------------------
        # Etap 5: Warstwa Detect
        # ----------------------------
        print("[DETECT]")
        print("Wejścia do Detect:", [o.shape for o in neck_outputs])
        detect_output = model.detect(neck_outputs)
        
        print("\nKońcowe wyjścia Detect:")
        if isinstance(detect_output, tuple):
            for i, out in enumerate(detect_output):
                print(f"Output {i}: {out.shape}")
        else:
            print(detect_output.shape)

    print("\n" + "="*50)
    print("WALIDACJA ZAKOŃCZONA")
    print("="*50)

# Inicjalizacja modelu
model = CustomYOLO(pretrained_model, backbone_rgb, backbone_ir)

# Uruchomienie walidacji
validate_shapes(model, input_size=(640, 640))

WALIDACJA KSZTAŁTÓW - START

[BACKBONE RGB]
Warstwa 2: torch.Size([1, 96, 160, 160])
Warstwa 4: torch.Size([1, 192, 80, 80])
Warstwa 9: torch.Size([1, 576, 20, 20])


[BACKBONE IR]
Warstwa 2: torch.Size([1, 96, 160, 160])
Warstwa 4: torch.Size([1, 192, 80, 80])
Warstwa 9: torch.Size([1, 576, 20, 20])


[FUZJA CECH]
Przed fuzją - RGB[0]: torch.Size([1, 96, 160, 160]), IR[0]: torch.Size([1, 96, 160, 160])
Po fuzji [0]: torch.Size([1, 96, 160, 160])

Przed fuzją - RGB[1]: torch.Size([1, 192, 80, 80]), IR[1]: torch.Size([1, 192, 80, 80])
Po fuzji [1]: torch.Size([1, 192, 80, 80])

Przed fuzją - RGB[2]: torch.Size([1, 576, 20, 20]), IR[2]: torch.Size([1, 576, 20, 20])
Po fuzji [2]: torch.Size([1, 576, 20, 20])



[NECK]

Wejście do necku: P3=torch.Size([1, 96, 160, 160]), P4=torch.Size([1, 192, 80, 80]), P5=torch.Size([1, 576, 20, 20])
Po upsample1: torch.Size([1, 576, 40, 40])


RuntimeError: Sizes of tensors must match except in dimension 1. Expected size 40 but got size 80 for tensor number 1 in the list.