Data Preparation

In [4]:
import os

def scan_dataset(root_dir):
    """
    Collect image files from a directory laid out as one sub-folder per class.

    root_dir: 'data/train' or 'data/val' or 'data/test'
    Returns: (image_paths, labels) - two parallel lists; labels[i] is the name
             of the sub-folder that contains image_paths[i].

    Entries directly under root_dir that are not directories are skipped, and
    only files ending in .jpg / .jpeg / .png (case-insensitive) are kept.
    Both folders and files are visited in sorted order.
    """
    image_extensions = ('.jpg', '.jpeg', '.png')
    samples = []  # (path, class_name) pairs, in visiting order

    for entry in sorted(os.listdir(root_dir)):
        folder = os.path.join(root_dir, entry)
        if os.path.isdir(folder):
            samples.extend(
                (os.path.join(folder, file_name), entry)
                for file_name in sorted(os.listdir(folder))
                if file_name.lower().endswith(image_extensions)
            )

    image_paths = [path for path, _ in samples]
    labels = [class_name for _, class_name in samples]
    return image_paths, labels

# Scan every split; each call yields parallel lists of paths and class names.
train_paths, train_labels = scan_dataset("data/train")
val_paths, val_labels = scan_dataset("data/val")
test_paths, _ = scan_dataset("data/test")  # class names of the test split are discarded

# Class vocabulary comes from the training split only, in alphabetical order,
# e.g. ['eiffel','stonehenge',...]; each class gets its position as its index.
classes = sorted(set(train_labels))
class_to_idx = dict(zip(classes, range(len(classes))))

# Integer-encode the labels. A validation class that never occurs in the
# training split raises KeyError here.
train_labels_idx = [class_to_idx[name] for name in train_labels]
val_labels_idx = [class_to_idx[name] for name in val_labels]


In [5]:
from torch.utils.data import Dataset
from PIL import Image
from torchvision import transforms

class LandmarkDataset(Dataset):
    """Dataset over a list of image files.

    Each item is (image, label) when labels were given, or (image, image_path)
    when labels is None (used for the test/query set).
    """

    def __init__(self, image_paths, labels=None, transform=None):
        """Store the image paths, the optional labels and the optional transform."""
        self.image_paths = image_paths
        self.labels = labels  # None for test set
        self.transform = transform

    def __len__(self):
        """Number of images in the dataset."""
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Open image `idx` as RGB, apply the transform if one is set, and pair it
        with its label (or with its path when the dataset has no labels)."""
        path = self.image_paths[idx]
        image = Image.open(path).convert("RGB")
        if self.transform:
            image = self.transform(image)

        # Unlabelled data: hand back the path so results can be traced to a file.
        second = path if self.labels is None else self.labels[idx]
        return image, second


In [6]:
from torchvision import transforms

# Per-channel statistics used for normalisation in both pipelines (the standard
# ImageNet values that torchvision's pretrained models expect).
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

# Training pipeline: random crop / flip / colour jitter for augmentation.
train_transform = transforms.Compose([
    transforms.Resize(size=256),
    transforms.RandomResizedCrop(size=224),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

# Evaluation pipeline: deterministic resize + centre crop, same normalisation.
val_transform = transforms.Compose([
    transforms.Resize(size=256),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])


In [7]:
from torch.utils.data import DataLoader

# Datasets: train/val carry integer labels; the test set has none, so its items
# come back as (image, path). Only the training set uses the augmenting transform.
train_dataset = LandmarkDataset(
    image_paths=train_paths, labels=train_labels_idx, transform=train_transform
)
val_dataset = LandmarkDataset(
    image_paths=val_paths, labels=val_labels_idx, transform=val_transform
)
test_dataset = LandmarkDataset(
    image_paths=test_paths, labels=None, transform=val_transform
)

# Loaders: only the training loader shuffles; the test loader yields one image
# per batch and uses the default number of workers.
train_loader = DataLoader(
    dataset=train_dataset, batch_size=16, shuffle=True, num_workers=4
)
val_loader = DataLoader(
    dataset=val_dataset, batch_size=16, shuffle=False, num_workers=4
)
test_loader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)

DELF

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models, transforms
from torch.utils.data import DataLoader
from PIL import Image
import numpy as np


Define DELF Backbone (ResNet50) <br>
DELF uses a CNN backbone (ResNet50 in TensorFlow). We can remove the fully connected layers and use feature maps from the last convolutional block.

In [None]:
class DELFBackbone(nn.Module):
    """ResNet50 trunk that outputs a spatial feature map instead of class logits."""

    def __init__(self, pretrained=True):
        """Build the trunk; `pretrained` is forwarded to torchvision's resnet50."""
        super().__init__()
        resnet_layers = list(models.resnet50(pretrained=pretrained).children())
        # Drop the last two children (avgpool and fc) so only the
        # convolutional stages remain.
        self.features = nn.Sequential(*resnet_layers[:-2])

    def forward(self, x):
        """Map images [B,3,H,W] to a feature map [B, 2048, H/32, W/32]."""
        return self.features(x)

GeM Pooling Layer (for global features if needed) <br>
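GeM (generalized-mean) pooling is what DELG, the successor to DELF, uses to build global descriptors; the original DELF paper defines local features only. For local features, we can skip GeM and just apply attention over the feature maps.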

In [None]:
class GeM(nn.Module):
    """Generalized-mean pooling over the last two (spatial) dimensions.

    p:   initial value of the pooling exponent, stored as a learnable parameter.
    eps: lower clamp applied to the input before it is raised to the power p.
    """

    def __init__(self, p=3.0, eps=1e-6):
        super().__init__()
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps

    def forward(self, x):
        """Pool x over its full spatial extent, e.g. [B,C,H,W] -> [B,C,1,1]."""
        height, width = x.size(-2), x.size(-1)
        powered = x.clamp(min=self.eps).pow(self.p)
        # One pooling window covering the whole map gives the mean of x**p.
        pooled = F.avg_pool2d(powered, (height, width))
        return pooled.pow(1. / self.p)


Simple Attention Module (optional) <br>
DELF applies attention to feature maps to pick keypoints.

In [None]:
class AttentionModule(nn.Module):
    """Two 1x1 convolutions that score every spatial location of a feature map."""

    def __init__(self, in_channels=2048):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels, 512, kernel_size=1)
        self.conv2 = nn.Conv2d(512, 1, kernel_size=1)

    def forward(self, x):
        """Return the attention map [B,1,H,W] with values squashed into (0, 1)."""
        hidden = self.conv1(x).relu()
        return self.conv2(hidden).sigmoid()


DELF Model Combining Backbone + Attention

In [None]:
class DELF(nn.Module):
    """Backbone followed by an attention head that re-weights its feature map."""

    def __init__(self, pretrained=True):
        super().__init__()
        self.backbone = DELFBackbone(pretrained)
        self.attention = AttentionModule()

    def forward(self, x):
        """Run images [B,3,H,W] through the backbone and the attention head.

        Returns (descriptors, att_map): the backbone feature map multiplied
        element-wise by the attention map (broadcast over channels), and the
        attention map itself [B,1,H/32,W/32].
        """
        feature_map = self.backbone(x)        # [B,2048,H/32,W/32]
        scores = self.attention(feature_map)  # [B,1,H/32,W/32]
        return feature_map * scores, scores


Feature Extraction Loop

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DELF(pretrained=True)
model = model.to(device)
model.eval()  # inference mode

# Example DataLoader
# from dataset import train_loader
features_list = []  # one array of shape [B, H*W, C] per batch

with torch.no_grad():
    for images, labels in train_loader:
        images = images.to(device)
        desc, att = model(images)
        # Collapse the spatial grid into a single axis and make the channel
        # axis last, so every row is the descriptor of one location.
        B, C, H, W = desc.shape
        desc = desc.reshape(B, C, H * W).transpose(1, 2)  # [B, H*W, C]
        features_list.append(desc.cpu().numpy())


In [None]:
def _feature_file_stem(identifier, sample_index):
    """Return a flat, per-sample file stem that is safe to join onto output_dir.

    identifier:   one element of a batch's second item - an image path (str)
                  for unlabelled data, otherwise a label (tensor or plain value).
    sample_index: running position of the sample in iteration order.
    """
    if isinstance(identifier, str):
        # Collapse directory separators: left in place they would point into
        # sub-folders that do not exist under output_dir, and an absolute path
        # would make os.path.join discard output_dir altogether.
        stem = os.path.normpath(identifier)
        for separator in ("\\", "/", ":"):
            stem = stem.replace(separator, "_")
        stem = stem.lstrip("_")
        return stem if stem else f"{sample_index:06d}"

    # Labels repeat across images, so a label on its own would make every image
    # of a class overwrite the same file. Prefix the running index to keep
    # names unique.
    if getattr(identifier, "ndim", None) == 0:
        identifier = identifier.item()  # 0-d tensor / numpy scalar -> Python number
    return f"{sample_index:06d}_{identifier}"


def extract_features(model, dataloader, device, output_dir):
    """Run `model` over `dataloader` and save each image's features to disk.

    model:      called as model(imgs); its result is checked for the keys
                'global' and 'local'. feats['global'][i] is saved as
                "<name>_global.npy"; feats['local']['descriptors'][i] and
                feats['local']['attention'][i] are saved together as
                "<name>_local.npz".
    dataloader: iterable of (imgs, labels_or_paths) batches.
    device:     device the image batch is moved to before the forward pass.
    output_dir: directory that receives the files (created if missing).

    File naming: for path identifiers, <name> is the path with its directory
    separators replaced by "_". For label identifiers it is
    "<running index>_<label>", where the index counts samples in iteration
    order (so with a shuffling loader it does not match dataset order).

    NOTE(review): `tqdm` is not imported in the cells visible here; it must be
    in scope from another cell for this function to run - confirm.
    """
    os.makedirs(output_dir, exist_ok=True)
    model.eval()

    sample_index = 0  # counts samples across batches to keep file names unique
    with torch.no_grad():
        for imgs, labels_or_paths in tqdm(dataloader):
            imgs = imgs.to(device)
            feats = model(imgs)

            # Save features per image
            for i in range(imgs.size(0)):
                img_name = _feature_file_stem(labels_or_paths[i], sample_index)
                sample_index += 1

                if 'global' in feats:
                    gfeat = feats['global'][i].cpu().numpy()
                    np.save(os.path.join(output_dir, f"{img_name}_global.npy"), gfeat)

                if 'local' in feats:
                    ldesc = feats['local']['descriptors'][i].cpu().numpy()  # [num_kp, C]
                    att = feats['local']['attention'][i].cpu().numpy()
                    np.savez(os.path.join(output_dir, f"{img_name}_local.npz"),
                             descriptors=ldesc, attention=att)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): DELG is not defined in the cells visible here - confirm that it
# is defined or imported elsewhere before running this cell.
model = DELG(pretrained=True, use_global=True, use_local=True)
model = model.to(device)

# Extract features for train, val, test - one output folder per split.
for split_name, split_loader in (
    ("train", train_loader),
    ("val", val_loader),
    ("test", test_loader),
):
    extract_features(model, split_loader, device, output_dir=f"./features/{split_name}")