In [None]:
from ultralytics import YOLO
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import os
import glob
import cv2

In [None]:
# Local-module imports are disabled: YOLOv8FeatureExtractor and ROLO_LSTM are
# defined in later cells of this notebook instead.
# NOTE(review): RoloSequenceDataset is not defined in the visible cells — confirm
# where it comes from before re-enabling anything that uses it.
#from rolo_model import ROLO_LSTM
#from yolo_feature_extractor import YOLOv8FeatureExtractor
#from rolo_dataset import RoloSequenceDataset

# Load the YOLOv8-small checkpoint (downloaded if it is not present locally);
# this instance is only used further down to print the network architecture.
model = YOLO('yolov8s.pt')

Creating new Ultralytics Settings v0.0.6 file  
View Ultralytics Settings with 'yolo settings' or at 'C:\Users\stefa\AppData\Roaming\Ultralytics\settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8s.pt to 'yolov8s.pt'...


100%|██████████| 21.5M/21.5M [00:01<00:00, 22.0MB/s]


In [None]:
class YOLOv8FeatureExtractor:
    """Run YOLOv8 on one image and return its best box plus a pooled feature vector.

    The feature vector is the spatially averaged output of one internal layer
    of the detection network, captured during the same forward pass that
    produces the detections.
    """

    def __init__(self, model_path='yolov8s.pt', device=None, feature_layer=-2):
        """
        model_path: path to the YOLOv8 weights (e.g. 'yolov8s.pt')
        device: 'cuda' or 'cpu'; when omitted, CUDA is used if available
        feature_layer: index (negative indices allowed) into the network's
            layer sequence whose output is used as the feature map
        """
        self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = YOLO(model_path).to(self.device)
        self.model.eval()
        self.feature_layer = feature_layer

    def extract(self, image):
        """
        Process one image and return the top detection with its feature vector.

        image: numpy array (HxWx3).
            NOTE(review): Ultralytics documents numpy inputs as BGR (OpenCV
            channel order); the previous docstring said RGB — confirm which
            order callers actually pass.
        returns: (bbox, feature_vector) where bbox is the normalized
            (x_center, y_center, width, height) of the highest-confidence
            detection (no class filtering) and feature_vector is a 1-D numpy
            array; (None, None) when nothing is detected.
        raises: RuntimeError if the selected layer produced no output during
            inference.
        """
        # self.model is the Ultralytics wrapper, self.model.model the
        # DetectionModel, and self.model.model.model the nn.Sequential of
        # layers. Slicing and calling that Sequential directly does not work
        # because the neck layers take inputs from several earlier layers, so
        # the wanted layer's output is captured with a forward hook instead.
        layers = self.model.model.model
        captured = []

        def _capture(_module, _inputs, output):
            captured.append(output)

        handle = layers[self.feature_layer].register_forward_hook(_capture)
        try:
            # Step 1: single forward pass (preprocessing is handled by predict).
            result = self.model.predict(image, verbose=False)[0]
        finally:
            # Always detach the hook so repeated calls do not accumulate hooks.
            handle.remove()

        # Step 2: read the detections.
        detections = result.boxes
        if detections is None or len(detections.conf) == 0:
            return None, None  # no detection

        # Keep the most confident detection, regardless of class.
        best_idx = int(detections.conf.argmax())
        bbox = detections.xywhn[best_idx].cpu().numpy()

        # Step 3: turn the captured feature map into a fixed-size vector.
        if not captured:
            raise RuntimeError(
                "Feature layer %r produced no output during inference" % (self.feature_layer,)
            )
        # Use the most recent capture: a warm-up pass with a dummy input may
        # have triggered the hook before the real image was processed.
        feat = captured[-1]
        if isinstance(feat, (list, tuple)):
            feat = feat[-1]

        with torch.no_grad():
            # Global average pooling gives one value per channel.
            pooled = torch.nn.functional.adaptive_avg_pool2d(feat, 1)
            feature_vector = pooled.flatten().cpu().numpy()

        return bbox, feature_vector

In [None]:
class ROLO_LSTM(nn.Module):
    """Recurrent head that regresses a bounding box at every step of a sequence."""

    def __init__(self, feature_dim=1024, hidden_dim=512, num_layers=1):
        """
        feature_dim: size of the per-frame feature vector (D)
        hidden_dim: size of the LSTM hidden state
        num_layers: number of stacked LSTM layers
        """
        super().__init__()

        bbox_dim = 4  # (x, y, w, h)

        # Each time step is the feature vector concatenated with a bbox.
        self.input_dim = feature_dim + bbox_dim
        self.hidden_dim = hidden_dim

        lstm_config = dict(
            input_size=self.input_dim,
            hidden_size=self.hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.lstm = nn.LSTM(**lstm_config)

        # Projects every hidden state to a predicted bbox.
        self.fc = nn.Linear(self.hidden_dim, bbox_dim)

    def forward(self, x, hidden=None):
        """
        x: input tensor of shape (batch_size, seq_len, input_dim)
        hidden: optional initial LSTM state
        returns: (predictions of shape (batch_size, seq_len, 4), final LSTM state)
        """
        sequence_states, final_state = self.lstm(x, hidden)
        return self.fc(sequence_states), final_state

In [2]:
print(model.model)  # Show the full network architecture

DetectionModel(
  (model): Sequential(
    (0): Conv(
      (conv): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(32, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (1): Conv(
      (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
      (act): SiLU(inplace=True)
    )
    (2): C2f(
      (cv1): Conv(
        (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
      (cv2): Conv(
        (conv): Conv2d(96, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.03, affine=True, track_running_stats=True)
        (act): SiLU(inplace=True)
      )
    