In [1]:
import os
import json
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import DataLoader, Dataset
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import pairwise_distances
import numpy as np

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create label to index mapping
labels = ['calling', 'clapping', 'cycling', 'dancing', 'drinking', 'eating', 'fighting',
          'hugging', 'laughing', 'listening_to_music', 'running', 'sitting', 'sleeping',
          'texting', 'using_laptop']
label_to_idx = {label: idx for idx, label in enumerate(labels)}

# Custom Dataset class to handle JSON label mapping
class HumanActivityDataset(Dataset):
    def __init__(self, image_dir, image_info_path, transform=None):
        self.image_dir = image_dir
        self.transform = transform
        with open(image_info_path, 'r') as f:
            self.image_info = json.load(f)
        self.image_paths = list(self.image_info.keys())

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        while True:
            image_path = self.image_paths[idx]
            full_image_path = os.path.join(self.image_dir, image_path)

            if not os.path.exists(full_image_path):
                print(f"File not found: {full_image_path}")
                idx = (idx + 1) % len(self)
                continue

            try:
                image = Image.open(full_image_path).convert('RGB')
                break  # Exit loop if image is successfully loaded
            except Exception as e:
                print(f"Error loading image: {e}")
                idx = (idx + 1) % len(self)

        label = label_to_idx[self.image_info[image_path]]  # Convert string label to index

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label)

# Data loading and preprocessing
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

train_dataset = HumanActivityDataset(
    '/content/drive/MyDrive/human_activity_retrieval_dataset (1)/human_activity_retrieval_dataset/train',
    '/content/drive/MyDrive/human_activity_retrieval_dataset (1)/human_activity_retrieval_dataset/train_image_info.json',
    transform=transform
)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

query_dataset = HumanActivityDataset(
    '/content/drive/MyDrive/human_activity_retrieval_dataset (1)/human_activity_retrieval_dataset/query_images',
    '/content/drive/MyDrive/human_activity_retrieval_dataset (1)/human_activity_retrieval_dataset/test_image_info.json',
    transform=transform
)
query_loader = DataLoader(query_dataset, batch_size=1, shuffle=False)

gallery_dataset = HumanActivityDataset(
    '/content/drive/MyDrive/human_activity_retrieval_dataset (1)/human_activity_retrieval_dataset/gallery',
    '/content/drive/MyDrive/human_activity_retrieval_dataset (1)/human_activity_retrieval_dataset/test_image_info.json',
    transform=transform
)
gallery_loader = DataLoader(gallery_dataset, batch_size=1, shuffle=False)

# Model definition
class SiameseNetwork(nn.Module):
    def __init__(self):
        super(SiameseNetwork, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )
        self.fc = nn.Sequential(
            nn.Linear(256 * 28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 128)
        )

    def forward_once(self, x):
        output = self.cnn(x)
        output = output.view(output.size()[0], -1)
        output = self.fc(output)
        return output

    def forward(self, input1, input2):
        output1 = self.forward_once(input1)
        output2 = self.forward_once(input2)
        return output1, output2

# Contrastive loss function
def contrastive_loss(output1, output2, label, margin=1.0):
    euclidean_distance = nn.functional.pairwise_distance(output1, output2)
    loss = torch.mean((1 - label) * torch.pow(euclidean_distance, 2) +
                      (label) * torch.pow(torch.clamp(margin - euclidean_distance, min=0.0), 2))
    return loss

# Training the model
model = SiameseNetwork().to(device)
criterion = contrastive_loss
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    for img1, label1 in tqdm(train_loader):
        if img1 is None or label1 is None:
            continue
        img1, label1 = img1.to(device), label1.to(device, dtype=torch.float)

        # Create pairs of images (for demonstration, using the same images; modify as needed)
        img2, label2 = img1, label1
        label = (label1 == label2).float().to(device)

        optimizer.zero_grad()
        output1, output2 = model(img1, img2)
        loss = criterion(output1, output2, label)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Extracting features and evaluating similarity
def extract_features(model, data_loader):
    model.eval()
    features = []
    labels = []
    with torch.no_grad():
        for img, label in data_loader:
            if img is None or label is None:
                continue
            img = img.to(device)
            feature = model.forward_once(img)
            features.append(feature.cpu().numpy())
            labels.append(label.cpu().numpy())
    return np.vstack(features), np.hstack(labels)

query_features, query_labels = extract_features(model, query_loader)
gallery_features, gallery_labels = extract_features(model, gallery_loader)

# Compute cosine similarity between query and gallery features
similarity_matrix = 1 - pairwise_distances(query_features, gallery_features, metric='cosine')

# def calculate_map_at_k(similarity_matrix, query_labels, gallery_labels, k):
#     num_queries = similarity_matrix.shape[0]
#     average_precisions = []
#     for i in range(num_queries):
#         relevant_indices = np


def calculate_map_at_k(similarity_matrix, query_labels, gallery_labels, k):
    num_queries = similarity_matrix.shape[0]
    average_precisions = []
    for i in range(num_queries):
        relevant_indices = np.where(gallery_labels == query_labels[i])[0]
        sorted_indices = np.argsort(-similarity_matrix[i])[:k]
        hits = np.isin(sorted_indices, relevant_indices).astype(int)
        precision_at_k = np.cumsum(hits) / (np.arange(k) + 1)
        average_precisions.append(np.sum(precision_at_k * hits) / len(relevant_indices))
    return np.mean(average_precisions)

map_at_1 = calculate_map_at_k(similarity_matrix, query_labels, gallery_labels, 1)
map_at_10 = calculate_map_at_k(similarity_matrix, query_labels, gallery_labels, 10)
map_at_50 = calculate_map_at_k(similarity_matrix, query_labels, gallery_labels, 50)

print(f"mAP@1: {map_at_1:.4f}")
print(f"mAP@10: {map_at_10:.4f}")
print(f"mAP@50: {map_at_50:.4f}")

# Calculate mean rank
def calculate_mean_rank(similarity_matrix, query_labels, gallery_labels):
    ranks = []
    for i in range(similarity_matrix.shape[0]):
        relevant_indices = np.where(gallery_labels == query_labels[i])[0]
        sorted_indices = np.argsort(-similarity_matrix[i])
        rank = np.min([np.where(sorted_indices == relevant_index)[0][0] + 1 for relevant_index in relevant_indices])
        ranks.append(rank)
    return np.mean(ranks)

mean_rank = calculate_mean_rank(similarity_matrix, query_labels, gallery_labels)
print(f"Mean Rank: {mean_rank:.4f}")

100%|██████████| 358/358 [57:24<00:00,  9.62s/it]


Epoch 1, Loss: 0.9999772906303406


100%|██████████| 358/358 [01:35<00:00,  3.76it/s]


Epoch 2, Loss: 0.9999772906303406


100%|██████████| 358/358 [01:34<00:00,  3.79it/s]


Epoch 3, Loss: 0.9999772906303406


100%|██████████| 358/358 [01:34<00:00,  3.80it/s]


Epoch 4, Loss: 0.9999772906303406


100%|██████████| 358/358 [01:34<00:00,  3.79it/s]


Epoch 5, Loss: 0.9999772906303406


100%|██████████| 358/358 [01:34<00:00,  3.79it/s]


Epoch 6, Loss: 0.9999772906303406


100%|██████████| 358/358 [01:34<00:00,  3.78it/s]


Epoch 7, Loss: 0.9999772906303406


100%|██████████| 358/358 [01:34<00:00,  3.79it/s]


Epoch 8, Loss: 0.9999772906303406


100%|██████████| 358/358 [01:34<00:00,  3.79it/s]


Epoch 9, Loss: 0.9999772906303406


100%|██████████| 358/358 [01:34<00:00,  3.80it/s]


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
File not found: /content/drive/MyDrive/human_activity_retrieval_dataset (1)/human_activity_retrieval_dataset/gallery/Image_3437.jpg
File not found: /content/drive/MyDrive/human_activity_retrieval_dataset (1)/human_activity_retrieval_dataset/gallery/Image_7923.jpg
File not found: /content/drive/MyDrive/human_activity_retrieval_dataset (1)/human_activity_retrieval_dataset/gallery/Image_10587.jpg
File not found: /content/drive/MyDrive/human_activity_retrieval_dataset (1)/human_activity_retrieval_dataset/gallery/Image_10454.jpg
File not found: /content/drive/MyDrive/human_activity_retrieval_dataset (1)/human_activity_retrieval_dataset/gallery/Image_6714.jpg
File not found: /content/drive/MyDrive/human_activity_retrieval_dataset (1)/human_activity_retrieval_dataset/gallery/Image_4680.jpg
File not found: /content/drive/MyDrive/human_activity_retrieval_dataset (1)/human_activity_retrieval_dataset/gallery/Image_5258.jpg
File not 