In [28]:
#library imports
import os
import numpy as np
from PIL import Image
from torch.utils.data import Dataset
import torch
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch.optim as optim
from torchvision import transforms

In [29]:
#Load concept names
concept_names = []
with open("../aml-2025-feathers-in-focus/attributes.txt") as f:
    for line in f:
        cid, cname = line.strip().split(" ", 1)
        concept_names.append(cname)

#load concept values per image
concept_values = np.load("../aml-2025-feathers-in-focus/attributes.npy", allow_pickle=True)

In [30]:
#Set up a dataset class to load and prepare the bird images with concepts
class BirdConceptDataset(Dataset):
    
    def __init__(self, csv_path, img_root, concept_file, transform=None):
        # load the relevant data
        self.df = pd.read_csv(csv_path)
        self.img_root = img_root
        self.transform = transform
        self.concepts = np.load(concept_file)
        
    def __len__(self):
        # Return the number of samples
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img = Image.open(self.img_root + row.image_path).convert("RGB")
        
        # Always convert to tensor (required by DataLoader)
        if self.transform:
            x = self.transform(img)
        else:
            x = transforms.ToTensor()(img)  # convert PIL to tensor
        
        y_class = torch.tensor(row.label, dtype=torch.long)
    
        # Use class label to get concept vector (assuming 1 row per class in attributes.npy)
        y_concept = torch.tensor(self.concepts[row.label - 1], dtype=torch.float32)
        return x, y_concept, y_class

In [31]:
# Create a CNN model for concept prediction
class ConceptPredictorCNN(nn.Module):
    def __init__(self, num_concepts):
        super().__init__()
        # Convolutional layers
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        
        # Adaptive pooling to get fixed-size feature map
        self.adaptive_pool = nn.AdaptiveAvgPool2d((4, 4))  # 4x4 feature map

        # Fully connected layers
        self.fc1 = nn.Linear(128 * 4 * 4, 512)
        self.fc2 = nn.Linear(512, num_concepts)
        
        # Activation for continuous concepts
        self.activation = nn.Sigmoid()
    
    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = F.relu(self.conv3(x))
        x = self.pool(x)
        
        # Adaptive pooling ensures output is always 128x4x4
        x = self.adaptive_pool(x)

        # Flatten
        x = x.view(x.size(0), -1)
        
        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = self.activation(self.fc2(x))
        return x

In [32]:
# Number of concepts
num_concepts = len(concept_names)

# Number of bird classes
num_classes = 200

# Create label predictor model based on the concept vector
class LabelPredictor(nn.Module):
    """
    Takes a concept vector and predicts the bird class.
    """
    def __init__(self, num_concepts, num_classes):
        super().__init__()
        # Fully connected layers
        self.fc1 = nn.Linear(num_concepts, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, num_classes)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        logits = self.fc3(x)
        return logits

In [33]:
# Concept predictor CNN
concept_model = ConceptPredictorCNN(num_concepts=num_concepts)

# Label predictor MLP
label_model = LabelPredictor(num_concepts=num_concepts, num_classes=num_classes)

In [34]:
# Hyperparameters
batch_size = 32
lr = 1e-3
num_epochs = 10

# Dataset & DataLoader
train_dataset = BirdConceptDataset(
    csv_path="../aml-2025-feathers-in-focus/train_images.csv",
    img_root="../aml-2025-feathers-in-focus",
    concept_file="../aml-2025-feathers-in-focus/attributes.npy",
    transform=None  # or add transforms if desired
)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Model & optimizer
concept_model = ConceptPredictorCNN(num_concepts=num_concepts)
optimizer = optim.Adam(concept_model.parameters(), lr=lr)
criterion = nn.MSELoss()  # for continuous concept regression

# Training loop
concept_model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for imgs, y_concept, _ in train_loader:  # ignore class label here
        optimizer.zero_grad()
        outputs = concept_model(imgs.float())  # ensure float tensors
        loss = criterion(outputs, y_concept)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader):.4f}")

RuntimeError: stack expects each tensor to be equal size, but got [3, 375, 500] at entry 0 and [3, 334, 500] at entry 1