In [16]:
from ultralytics import YOLO
import torch.nn as nn
import copy
import torch
from ultralytics.nn.modules import Concat, C2f, Conv, SPPF

In [17]:
pretrained_model = YOLO('yolov8m.pt').model
backbone = nn.Sequential(*list(pretrained_model.model.children())[:10])

In [18]:
class CustomBackbone(nn.Module):
    def __init__(self, layers, out_idx=[4, 6, 8]):
        super().__init__()
        self.layers = nn.ModuleList(layers)
        self.out_idx = out_idx
        
    def forward(self, x):
        outputs = []
        for idx, layer in enumerate(self.layers):
            x = layer(x)
            if idx in self.out_idx:
                outputs.append(x)
        return outputs

In [19]:
backbone_rgb = CustomBackbone(backbone)
backbone_ir = copy.deepcopy(backbone_rgb)

# Modyfikacja pierwszej konwolucji dla IR (1 kanał)
backbone_ir.layers[0].conv = nn.Conv2d(1, 48, kernel_size=3, stride=2, padding=1, bias=False)

print(backbone_rgb.layers[0].conv.weight.shape)
print(backbone_ir.layers[0].conv.weight.shape)

torch.Size([48, 3, 3, 3])
torch.Size([48, 1, 3, 3])


In [20]:
# Definicja CustomNeck
class CustomNeck(nn.Module):
    def __init__(self, fused_channels):
        super().__init__()
        # fused_channels to liczba kanałów po konkatenacji, np. [192, 384, 576]
        self.layer9 = SPPF(fused_channels[2], fused_channels[2] // 2)  # SPPF dla najgłębszej skali
        self.layer10 = nn.Upsample(scale_factor=2, mode='nearest')
        self.layer11 = Concat()
        self.layer12 = C2f(fused_channels[2] // 2 + fused_channels[1], fused_channels[1] // 2, n=2)
        self.layer13 = nn.Upsample(scale_factor=2, mode='nearest')
        self.layer14 = Concat()
        self.layer15 = C2f(fused_channels[1] // 2 + fused_channels[0], fused_channels[0] // 2, n=2)
        self.layer16 = Conv(fused_channels[0] // 2, fused_channels[0] // 2, 3, s=2)
        self.layer17 = Concat()
        self.layer18 = C2f(fused_channels[0] // 2 + fused_channels[1], fused_channels[1] // 2, n=2)
        self.layer19 = Conv(fused_channels[1] // 2, fused_channels[1] // 2, 3, s=2)
        self.layer20 = Concat()
        self.layer21 = C2f(fused_channels[1] // 2 + fused_channels[2], fused_channels[2] // 2, n=2)

    def forward(self, fused):
        feat1, feat2, feat3 = fused  # feat1: warstwa 4, feat2: warstwa 6, feat3: warstwa 8

        x = self.layer9(feat3)
        x = self.layer10(x)
        x = self.layer11([x, feat2])
        x = self.layer12(x)
        x = self.layer13(x)
        x = self.layer14([x, feat1])
        feat_shallow = self.layer15(x)

        x = self.layer16(feat_shallow)
        x = self.layer17([x, feat2])
        feat_mid = self.layer18(x)
        x = self.layer19(feat_mid)
        x = self.layer20([x, feat3])
        feat_deep = self.layer21(x)

        return [feat_shallow, feat_mid, feat_deep]

In [21]:
# Definicja CustomYOLO
class CustomYOLO(nn.Module):
    def __init__(self, pretrained_model, backbone_rgb, backbone_ir):
        super().__init__()
        self.backbone_rgb = backbone_rgb
        self.backbone_ir = backbone_ir
        
        # Standardowe kanały dla YOLOv8m po warstwach 4, 6, 8 po konkatenacji: [192, 384, 576]
        self.neck_head = CustomNeck(fused_channels=[192, 384, 576])
        
        # Przeniesienie warstwy Detect z pretrenowanego modelu
        self.detect = copy.deepcopy(pretrained_model.model[-1])
        self.args = pretrained_model.args
        
    def forward(self, x_rgb, x_ir):
        # Ekstrakcja cech
        features_rgb = self.backbone_rgb(x_rgb)
        features_ir = self.backbone_ir(x_ir)
        
        # Fuzja przez konkatenację wzdłuż wymiaru kanałów
        fused = [torch.cat([f_rgb, f_ir], dim=1) for f_rgb, f_ir in zip(features_rgb, features_ir)]
        
        # Przetwarzanie przez neck
        neck_outputs = self.neck_head(fused)
        
        # Detekcja
        return self.detect(neck_outputs)

# Inicjalizacja modelu
custom_model = CustomYOLO(pretrained_model, backbone_rgb, backbone_ir)

In [22]:
# import xml.etree.ElementTree as ET
# import os

# def convert_voc_to_yolo(xml_path, output_path):
#     tree = ET.parse(xml_path)
#     root = tree.getroot()
#     size = root.find('size')
#     img_width = int(size.find('width').text)
#     img_height = int(size.find('height').text)
    
#     with open(output_path, 'w') as f:
#         for obj in root.findall('object'):
#             class_id = 0  # Dla klasy 'person', dostosuj jeśli masz inne klasy
#             bbox = obj.find('bndbox')
#             xmin = float(bbox.find('xmin').text)
#             ymin = float(bbox.find('ymin').text)
#             xmax = float(bbox.find('xmax').text)
#             ymax = float(bbox.find('ymax').text)
            
#             # Obliczenie znormalizowanych współrzędnych
#             x_center = (xmin + xmax) / 2 / img_width
#             y_center = (ymin + ymax) / 2 / img_height
#             width = (xmax - xmin) / img_width
#             height = (ymax - ymin) / img_height
            
#             f.write(f"{class_id} {x_center} {y_center} {width} {height}\n")

# # Przykład użycia dla folderu z adnotacjami
# annotations_dir = 'LLVIP/Annotations'  # Zmień na swój folder
# for xml_file in os.listdir(annotations_dir):
#     if xml_file.endswith('.xml'):
#         xml_path = os.path.join(annotations_dir, xml_file)
#         txt_path = xml_path.replace('.xml', '.txt')
#         convert_voc_to_yolo(xml_path, txt_path)

In [23]:
from torch.utils.data import Dataset, DataLoader
import cv2
import os
import torch

class MultimodalYOLODataset(Dataset):
    def __init__(self, rgb_dir, ir_dir, annotations_dir):
        self.rgb_dir = rgb_dir
        self.ir_dir = ir_dir
        self.annotations_dir = annotations_dir
        self.img_files = sorted(os.listdir(rgb_dir))  # Lista plików obrazów

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, idx):
        img_name = self.img_files[idx]
        rgb_path = os.path.join(self.rgb_dir, img_name)
        ir_path = os.path.join(self.ir_dir, img_name)
        label_path = os.path.join(self.annotations_dir, img_name.replace('.jpg', '.txt'))

        # Wczytanie obrazów
        img_rgb = cv2.imread(rgb_path)
        img_rgb = cv2.cvtColor(img_rgb, cv2.COLOR_BGR2RGB)  # Konwersja BGR na RGB
        img_ir = cv2.imread(ir_path, cv2.IMREAD_GRAYSCALE)  # IR jako skala szarości
        img_ir = img_ir[..., None]  # Dodanie wymiaru kanału: (H, W) -> (H, W, 1)

        # Konwersja na tensory i normalizacja (dostosuj rozmiar, np. 640x640, jeśli potrzeba)
        img_rgb = torch.from_numpy(img_rgb).permute(2, 0, 1).float() / 255.0
        img_ir = torch.from_numpy(img_ir).permute(2, 0, 1).float() / 255.0

        # Wczytanie adnotacji
        if os.path.exists(label_path):
            with open(label_path, 'r') as f:
                labels = [list(map(float, line.split())) for line in f.readlines()]
            labels = torch.tensor(labels) if labels else torch.zeros((0, 5))
        else:
            labels = torch.zeros((0, 5))  # Brak adnotacji

        return img_rgb, img_ir, labels

# Utworzenie dataloadera dla zbioru treningowego
train_dataset = MultimodalYOLODataset(
    rgb_dir='LLVIP/visible/train',
    ir_dir='LLVIP/infrared/train',
    annotations_dir='LLVIP/Annotations'
)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [24]:
from ultralytics import YOLO

# Wczytaj model (np. YOLOv8)
model = YOLO('yolov8m.pt')

# Sprawdź dostępne atrybuty modelu
print(dir(model.model))

['T_destination', '__annotations__', '__call__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattr__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_apply', '_backward_hooks', '_backward_pre_hooks', '_buffers', '_call_impl', '_clip_augmented', '_compiled_call_impl', '_descale_pred', '_forward_hooks', '_forward_hooks_always_called', '_forward_hooks_with_kwargs', '_forward_pre_hooks', '_forward_pre_hooks_with_kwargs', '_get_backward_hooks', '_get_backward_pre_hooks', '_get_name', '_is_full_backward_hook', '_load_from_state_dict', '_load_state_dict_post_hooks', '_load_state_dict_pre_hooks', '_maybe_warn_non_full_backward_hook', '_modules', '_named_members', '_non_persistent_buffers_set', '_parameters'

In [25]:
import torch
from ultralytics.utils.loss import v8DetectionLoss

# Zakładamy, że masz zdefiniowany model (custom_model)
model = custom_model
model.train()
loss_fn = v8DetectionLoss(model)  # Funkcja straty YOLO
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 50
for epoch in range(num_epochs):
    for batch in train_loader:
        img_rgb, img_ir, targets = batch
        optimizer.zero_grad()
        outputs = model(img_rgb, img_ir)  # Model przyjmuje oba obrazy
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
    print(f"Epoka {epoch+1}/{num_epochs}, Strata: {loss.item()}")

AttributeError: 'CustomYOLO' object has no attribute 'model'