In [2]:
# imports
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as T
import torchvision.models as models
from torchvision import transforms
from torchvision.models import convnext_tiny, ConvNeXt_Tiny_Weights




In [3]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cpu


In [7]:
# Training manifest; later cells read its `image_path` and `label` columns.
df_train = pd.read_csv("../aml-2025-feathers-in-focus/train_images.csv")

# Per-class concept attributes, indexed by `class_key` so a label can be
# looked up directly with `.loc[label]`.
complete_bird_attributes = pd.read_csv("complete_bird_attributes.csv", index_col='class_key')

# Preview the manifest. A bare expression is only rendered when it is the
# last statement of the cell, so it has to come after both loads.
df_train.head()

In [4]:
# One model output per attribute column.
num_concepts = len(complete_bird_attributes.columns)

# Smoke test: fetch the concept row that belongs to the first training label.
example_label = df_train["label"].iat[0]
example_concepts = complete_bird_attributes.loc[example_label]
print(example_concepts)


has_bill_shape::curved_(up_or_down)    0
has_bill_shape::dagger                 0
has_bill_shape::hooked                 0
has_bill_shape::needle                 0
has_bill_shape::hooked_seabird         1
                                      ..
has_crown_color::buff                  0
has_wing_pattern::solid                1
has_wing_pattern::spotted              0
has_wing_pattern::striped              0
has_wing_pattern::multi-colored        0
Name: 1, Length: 312, dtype: int64


In [5]:
class BirdConceptDataset(Dataset):
    """Dataset yielding ``(image, concept_vector)`` pairs.

    Args:
        csv_df: DataFrame with an ``image_path`` and a ``label`` column, one
            row per image.
        attributes_df: DataFrame of concept values indexed by label; the row
            matching an image's label becomes its target vector.
        images_root: Directory containing the image files. Only the base name
            of each ``image_path`` is used and joined onto this directory.
        transform: Optional callable applied to the RGB ``PIL.Image``. When
            omitted, images are resized to 224x224 and converted to a tensor.
    """

    def __init__(self, csv_df, attributes_df, images_root, transform=None):
        self.df = csv_df
        self.attributes = attributes_df
        self.images_root = images_root

        if transform is None:
            # Default preprocessing: fixed-size resize followed by ToTensor.
            transform = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
            ])
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the image table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the sample at position ``idx``.

        Returns:
            tuple: ``(image, concept_vec)`` where ``image`` is the transformed
            picture and ``concept_vec`` is a ``float32`` tensor holding the
            attribute row of the image's label.
        """
        # Materialise the row once instead of once per column access.
        row = self.df.iloc[idx]
        label = row["label"]

        # Concept targets for this label
        concept_vec = torch.tensor(
            self.attributes.loc[label].values, dtype=torch.float32
        )

        # Any directory part stored in the CSV is discarded; the file is
        # looked up directly under images_root.
        img_path = os.path.join(self.images_root, os.path.basename(row["image_path"]))

        # convert() returns a new image, so the source file can be closed
        # as soon as the block exits.
        with Image.open(img_path) as img:
            image = img.convert("RGB")
        image = self.transform(image)

        return image, concept_vec


In [6]:
train_images_dir = "../aml-2025-feathers-in-focus/train_images/cropped_train_images/"

# Full paths of all entries in the folder whose (lower-cased) name ends in a
# known image extension.
train_images = [
    os.path.join(train_images_dir, entry)
    for entry in os.listdir(train_images_dir)
    if entry.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))
]


In [7]:
# Tie the training table, the concept table and the image folder together.
train_dataset = BirdConceptDataset(df_train, complete_bird_attributes, train_images_dir)

# Mini-batches of 32 samples, reshuffled at every epoch.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

In [5]:
# proven working model, computationally efficient

class ConceptNet(nn.Module):
    """ResNet-18 feature extractor followed by a small concept-prediction head.

    Args:
        num_outputs: Number of concept logits produced per image.
    """

    def __init__(self, num_outputs):
        super().__init__()

        # `weights=None` is the current spelling of the deprecated
        # `pretrained=False`: the backbone starts from random initialisation.
        self.backbone = models.resnet18(weights=None)

        # Read the embedding width from the stock head before discarding it,
        # instead of hard-coding it.
        in_features = self.backbone.fc.in_features
        self.backbone.fc = nn.Identity()   # remove classification head

        self.fc = nn.Sequential(
            nn.Linear(in_features, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, num_outputs),
            # No sigmoid layer: forward() returns raw logits.
        )

    def forward(self, x):
        """Return concept logits of shape ``(batch, num_outputs)``."""
        x = self.backbone(x)
        x = self.fc(x)
        return x


In [None]:
#Load ConvNext-Tiny

class ConceptNet(nn.Module):
    """Pretrained ConvNeXt-Tiny feature extractor with a concept-prediction head.

    Args:
        num_outputs: Number of concept logits produced per image.
    """

    def __init__(self, num_outputs):
        super().__init__()

        # Load ConvNeXt-Tiny with torchvision's default pretrained weights.
        backbone = convnext_tiny(weights=ConvNeXt_Tiny_Weights.DEFAULT)

        # Width of the embedding consumed by the stock linear classifier
        # (768 for ConvNeXt-Tiny); read it rather than hard-coding it.
        in_features = backbone.classifier[-1].in_features

        # Only the convolutional trunk is registered on this module, so the
        # stock classifier is simply left behind. Global average pooling and
        # flattening turn the feature map into a (B, in_features) embedding.
        self.backbone = nn.Sequential(
            backbone.features,
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
        )

        self.fc = nn.Sequential(
            nn.Linear(in_features, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, num_outputs),
            # No sigmoid layer: forward() returns raw logits.
        )

    def forward(self, x):
        """Return concept logits of shape ``(batch, num_outputs)``."""
        x = self.backbone(x)
        x = self.fc(x)
        return x


In [None]:
# Load efficientnet-B2

class ConceptNet(nn.Module):
    """EfficientNet-B2 (IMAGENET1K_V1 weights) with a multilabel concept head."""

    def __init__(self, num_outputs):
        super().__init__()

        pretrained_weights = models.EfficientNet_B2_Weights.IMAGENET1K_V1
        self.backbone = models.efficientnet_b2(weights=pretrained_weights)

        # Input width of the stock classifier's linear layer, i.e. the size of
        # the embedding the backbone produces.
        feature_dim = self.backbone.classifier[1].in_features

        # Swap the stock head for a pass-through so the backbone returns the
        # embedding itself.
        self.backbone.classifier = nn.Identity()

        head_layers = [
            nn.Linear(feature_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, num_outputs),  # raw logits; no sigmoid layer
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward(self, x):
        """Map an image batch to concept logits of shape ``(batch, num_outputs)``."""
        features = self.backbone(x)
        return self.fc(features)


In [9]:
# Build the network and move its parameters onto the selected device.
model = ConceptNet(num_outputs=num_concepts).to(device)

# BCEWithLogitsLoss applies the sigmoid internally, so the model has to emit
# raw logits.
criterion = nn.BCEWithLogitsLoss().to(device)

optimizer = optim.Adam(model.parameters(), lr=1e-4)



In [10]:
num_epochs = 100

# Training mode keeps the Dropout layer in the head active.
model.train()

for epoch in range(num_epochs):
    # Sum of the per-batch losses over the epoch (a sum, not an average).
    total_loss = 0

    for images, targets in train_loader:
        images, targets = images.to(device), targets.to(device)

        optimizer.zero_grad()
        loss = criterion(model(images), targets)  # logits: (batch_size x num_concepts)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss:.4f}")

    # Overwritten after every epoch, so the file always holds the weights of
    # the most recently completed epoch.
    torch.save(model.state_dict(), "bestest_model.pth")


Epoch 1/100, Loss: 45.0372
Epoch 2/100, Loss: 27.0394
Epoch 3/100, Loss: 24.2288
Epoch 4/100, Loss: 20.8821
Epoch 5/100, Loss: 18.1358
Epoch 6/100, Loss: 15.9824
Epoch 7/100, Loss: 14.2028
Epoch 8/100, Loss: 12.7480
Epoch 9/100, Loss: 11.6040
Epoch 10/100, Loss: 10.6893
Epoch 11/100, Loss: 9.7507
Epoch 12/100, Loss: 8.8752
Epoch 13/100, Loss: 8.2316
Epoch 14/100, Loss: 7.5299
Epoch 15/100, Loss: 7.0479
Epoch 16/100, Loss: 6.4913
Epoch 17/100, Loss: 5.9905
Epoch 18/100, Loss: 5.5940
Epoch 19/100, Loss: 5.1662
Epoch 20/100, Loss: 4.8246
Epoch 21/100, Loss: 4.4484
Epoch 22/100, Loss: 4.2689
Epoch 23/100, Loss: 3.9854
Epoch 24/100, Loss: 3.7824
Epoch 25/100, Loss: 3.5111
Epoch 26/100, Loss: 3.3451
Epoch 27/100, Loss: 3.1954
Epoch 28/100, Loss: 2.9836
Epoch 29/100, Loss: 2.8257
Epoch 30/100, Loss: 2.7014
Epoch 31/100, Loss: 2.5912
Epoch 32/100, Loss: 2.4520
Epoch 33/100, Loss: 2.3547
Epoch 34/100, Loss: 2.2490
Epoch 35/100, Loss: 2.1455
Epoch 36/100, Loss: 2.1027
Epoch 37/100, Loss: 1.9870


In [None]:
# Write the model's current state dict to best_model.pth.
torch.save(obj=model.state_dict(), f="best_model.pth")

In [8]:
test_images_dir = "../aml-2025-feathers-in-focus/test_images/cropped_test_images/"

# NOTE(review): os.listdir returns names in arbitrary order, and the
# submission cell further down assigns ids purely by row position — confirm
# the resulting order against the official id-to-image mapping.
test_images = [
    os.path.join(test_images_dir, f)
    for f in os.listdir(test_images_dir)
    if f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))
]

print(f"Found {len(test_images)} test images")


# Resize to a fixed square and convert to a tensor.
img_size = 224
test_transform = transforms.Compose([
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor()
])


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# make sure model structure matches training
model = ConceptNet(num_outputs=complete_bird_attributes.shape[1])
model.load_state_dict(torch.load("bestest_model.pth", map_location=device))
model = model.to(device)
model.eval()  # inference mode: disables the Dropout layer in the head


threshold = 0.5
pred_list = []

with torch.no_grad():
    for img_path in tqdm(test_images, desc="Predicting test images"):
        # convert() returns a new image, so the source file can be closed as
        # soon as the block exits.
        with Image.open(img_path) as img:
            x = test_transform(img.convert("RGB")).unsqueeze(0).to(device)

        logits = model(x)
        # Remove only the batch axis so the result is always 1-D: (num_concepts,)
        probs = torch.sigmoid(logits).squeeze(0).cpu().numpy()

        # Every concept whose probability reaches the threshold becomes 1
        pred_vec = (probs >= threshold).astype(int)

        pred_list.append(pred_vec)


pred_array = np.stack(pred_list, axis=0)  # shape: (num_images, num_concepts)

# Use column names from complete_bird_attributes
pred_df = pd.DataFrame(pred_array, columns=complete_bird_attributes.columns)
pred_df.insert(0, "image_path", [os.path.basename(f) for f in test_images])

# One extra column for image_path: (num_images, num_concepts + 1)
print(pred_df.shape)

pred_df.to_csv("test_predictions.csv", index=False)

# Preview the predictions. A bare expression is only rendered when it is the
# last statement of the cell.
pred_df.head()


Found 4000 test images


Predicting test images: 100%|██████████| 4000/4000 [03:07<00:00, 21.37it/s]


(4000, 313)


In [10]:
# 1-nearest-neighbour lookup: match every predicted concept vector to the
# class whose attribute row is closest, and use that class as the label.
from sklearn.neighbors import NearestNeighbors

# Concept vectors only; the image_path column is not a feature.
test_features = pred_df.drop(columns=["image_path"]).values  # (num_images, num_concepts)

# Reference matrix: one attribute row per class, plus the matching class keys.
train_features = complete_bird_attributes.values  # (num_classes, num_concepts)
train_index = complete_bird_attributes.index.values  # class_key values

# NOTE(review): a prediction with no active concept is an all-zero vector,
# which has no direction for the cosine metric — confirm how such rows should
# be handled.
k = 1
nn_model = NearestNeighbors(n_neighbors=k, metric='cosine')  # can use 'cosine' or 'euclidean'
nn_model.fit(train_features)

distances, indices = nn_model.kneighbors(test_features)  # indices shape (num_images, k)

# Class key of the single nearest attribute row for each test image.
nearest_class_keys = train_index[indices[:, 0]]  # shape (num_images,)

# Submission table: rows in reverse prediction order, ids 1..N assigned by
# position, predicted class key stored as `label`.
# NOTE(review): the ids depend entirely on the order of the prediction rows —
# verify that this ordering matches the expected submission ids.
output_df = pd.DataFrame({
    "id": range(1, len(nearest_class_keys) + 1),
    "label": nearest_class_keys[::-1],
})

output_df.to_csv("test_nearest_neighbors.csv", index=False)

