In [9]:
import torch
import torch.nn as nn

# --------------------------
# Bloques base
# --------------------------


class Conv(nn.Module):
    """Conv 2D + SiLU"""

    def __init__(self, in_ch, out_ch, k=1, s=1, p=None):
        super().__init__()
        if p is None:
            p = k // 2
        self.conv = nn.Conv2d(
            in_ch, out_ch, kernel_size=k, stride=s, padding=p, bias=False
        )
        self.bn = nn.BatchNorm2d(out_ch)
        self.act = nn.SiLU()

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))


class Bottleneck(nn.Module):
    """Mini-Bottleneck: 1x1 + 3x3 + Residual"""

    def __init__(self, in_ch, out_ch, shortcut=True, expansion=0.5):
        super().__init__()
        hidden_ch = int(out_ch * expansion)
        self.cv1 = Conv(in_ch, hidden_ch, k=1, s=1)
        self.cv2 = Conv(hidden_ch, out_ch, k=3, s=1)
        self.use_shortcut = shortcut and in_ch == out_ch

    def forward(self, x):
        y = self.cv1(x)
        y = self.cv2(y)
        if self.use_shortcut:
            y = y + x
        return y


class C2f(nn.Module):
    """C2f block: Conv 1x1 -> n bottlenecks -> concatenation -> Conv 1x1"""

    def __init__(self, in_ch, out_ch, n=2, expansion=0.5):
        super().__init__()
        hidden_ch = int(out_ch * expansion)
        self.cv1 = Conv(in_ch, hidden_ch, k=1, s=1)
        self.bottlenecks = nn.ModuleList(
            [
                Bottleneck(hidden_ch, hidden_ch, shortcut=True, expansion=1.0)
                for _ in range(n)
            ]
        )
        self.cv2 = Conv(hidden_ch * (n + 1), out_ch, k=1, s=1)

    def forward(self, x):
        x1 = self.cv1(x)
        outputs = [x1]
        for b in self.bottlenecks:
            x1 = b(x1)
            outputs.append(x1)
        y = torch.cat(outputs, dim=1)
        return self.cv2(y)


# --------------------------
# Modelo YOLOv8n-cls completo
# --------------------------


class YOLOv8nCls(nn.Module):
    def __init__(self, num_classes=1000):
        super().__init__()
        # --------------------------
        # Backbone
        # --------------------------
        self.stem = Conv(3, 16, k=3, s=2)  # 224x224x3 -> 112x112x16
        self.conv1 = Conv(16, 32, k=3, s=2)  # 112x112x16 -> 56x56x32
        self.c2f1 = C2f(32, 32, n=1)  # 56x56x32 -> 56x56x32
        self.conv2 = Conv(32, 64, k=3, s=2)  # 56x56x32 -> 28x28x64
        self.c2f2 = C2f(64, 64, n=2)  # 28x28x64 -> 28x28x64
        self.conv3 = Conv(64, 128, k=3, s=2)  # 28x28x64 -> 14x14x128
        self.c2f3 = C2f(128, 128, n=3)  # 14x14x128 -> 14x14x128
        self.conv4 = Conv(128, 256, k=3, s=2)  # 14x14x128 -> 7x7x256
        self.c2f4 = C2f(256, 256, n=1)  # 7x7x256 -> 7x7x256

        # --------------------------
        # Head (clasificación)
        # --------------------------
        self.global_pool = nn.AdaptiveAvgPool2d(1)  # 7x7x256 -> 1x1x256
        self.fc = nn.Linear(256, num_classes)

    def forward(self, x):
        x = self.stem(x)
        x = self.conv1(x)
        x = self.c2f1(x)
        x = self.conv2(x)
        x = self.c2f2(x)
        x = self.conv3(x)
        x = self.c2f3(x)
        x = self.conv4(x)
        x = self.c2f4(x)
        x = self.global_pool(x)
        x = x.view(x.size(0), -1)
        x = self.fc(x)
        return x


# --------------------------
# Test rápido
# --------------------------

if __name__ == "__main__":
    model = YOLOv8nCls(num_classes=10)
    x = torch.randn(1, 3, 224, 224)
    y = model(x)
    print(y.shape)  # torch.Size([1, 10])

torch.Size([1, 10])


In [16]:
import torchvision.transforms as transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

# Transformaciones: ajustar tamaño y normalizar
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),
        transforms.ColorJitter(brightness=0.2, contrast=0.3, saturation=0.3),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
    ]
)

val_transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
    ]
)
val_dataset = ImageFolder(root="dataset_split/val", transform=val_transform)


train_dataset = ImageFolder(root="dataset_split/train", transform=transform)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [21]:
import torch
import torch.nn as nn
import torch.optim as optim

# 1. Definir el modelo
model = YOLOv8nCls()  # tu modelo recreado
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# 2. Definir la función de pérdida
criterion = nn.CrossEntropyLoss()  # porque es clasificación

# 3. Definir el optimizador
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 4. Bucle de entrenamiento
num_epochs = 50  # ajusta según tu dataset

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()  # resetear gradientes
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    train_acc = 100 * correct / total
    print(
        f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader):.4f}, Accuracy: {train_acc:.2f}%"
    )

# 5. Guardar pesos entrenados
torch.save(model.state_dict(), "modelo_entrenado.pth")

Epoch 1/50, Loss: 4.9588, Accuracy: 63.88%
Epoch 2/50, Loss: 2.2107, Accuracy: 71.62%
Epoch 3/50, Loss: 0.8896, Accuracy: 77.25%
Epoch 4/50, Loss: 0.6387, Accuracy: 77.25%
Epoch 5/50, Loss: 0.4997, Accuracy: 79.12%
Epoch 6/50, Loss: 0.4941, Accuracy: 77.25%
Epoch 7/50, Loss: 0.4776, Accuracy: 79.12%
Epoch 8/50, Loss: 0.4337, Accuracy: 82.62%
Epoch 9/50, Loss: 0.4650, Accuracy: 80.12%
Epoch 10/50, Loss: 0.4438, Accuracy: 79.38%
Epoch 11/50, Loss: 0.4200, Accuracy: 81.62%
Epoch 12/50, Loss: 0.4238, Accuracy: 80.00%
Epoch 13/50, Loss: 0.4142, Accuracy: 81.75%
Epoch 14/50, Loss: 0.4110, Accuracy: 82.00%
Epoch 15/50, Loss: 0.4062, Accuracy: 83.00%
Epoch 16/50, Loss: 0.4062, Accuracy: 80.62%
Epoch 17/50, Loss: 0.4019, Accuracy: 82.00%
Epoch 18/50, Loss: 0.3719, Accuracy: 85.12%
Epoch 19/50, Loss: 0.3624, Accuracy: 83.88%
Epoch 20/50, Loss: 0.4071, Accuracy: 82.62%
Epoch 21/50, Loss: 0.3982, Accuracy: 82.00%
Epoch 22/50, Loss: 0.3773, Accuracy: 83.38%
Epoch 23/50, Loss: 0.3651, Accuracy: 84.5

In [None]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in val_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()


val_acc = 100 * correct / total
print(f"Validation Accuracy: {val_acc:.2f}%")

Validation Accuracy: 80.60%


In [23]:
import os
from PIL import Image

folder = "Images_test"
model.eval()

with torch.no_grad():
    for filename in os.listdir(folder):
        if filename.endswith((".jpg", ".png", ".jpeg")):  # filtra solo imágenes
            img_path = os.path.join(folder, filename)
            img = Image.open(img_path)
            img = transform(img).unsqueeze(0).to(device)  # agrega batch dimension

            output = model(img)
            pred_class = output.argmax(dim=1).item()

            print(f"{filename} --> Predicción: {train_dataset.classes[pred_class]}")

10.jpeg --> Predicción: bird
11.jpg --> Predicción: bird
12.jpg --> Predicción: bird
13.jpg --> Predicción: no_bird
14.jpg --> Predicción: no_bird
16.jpg --> Predicción: no_bird
3.jpg --> Predicción: no_bird
349664072_639634714750284_7488197136792291295_n.jpg --> Predicción: bird
4.jpg --> Predicción: bird
5.jpg --> Predicción: bird
6.jpeg --> Predicción: bird
65a9d7da0d6bb119203b1c13.jpg --> Predicción: no_bird
7.jpeg --> Predicción: bird
8.jpg --> Predicción: bird
9.jpg --> Predicción: bird
images.jpg --> Predicción: no_bird
images_1.jpeg --> Predicción: bird
orig-1437426411440.jpg --> Predicción: bird
pantanos-scaled.jpg --> Predicción: bird
paseo-en-catamaran-insonoro.jpg --> Predicción: no_bird
