In [1]:
import os
import torch
import torchvision
from torchvision import datasets
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torchvision.transforms import ToTensor, Lambda
import matplotlib.pyplot as plt
import requests
from zipfile import ZipFile
from io import BytesIO
import numpy as np
import zipfile
import os


zip_file_path = r'C:\Users\nicol\Documents\PoliTo\AdvancedML\project\SPair-71k.zip' 
extract_dir = r'C:\Users\nicol\Documents\PoliTo\AdvancedML\project\SPair-71k_extracted'

# Crea la directory di estrazione se non esiste
os.makedirs(extract_dir, exist_ok=True)

# Estrai il file ZIP solo se esiste
if os.path.exists(zip_file_path):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    print(f"File '{zip_file_path}' estratto con successo nella directory '{extract_dir}'")
    print(f"Contenuti della directory '{extract_dir}':\n{os.listdir(extract_dir)}")
else:
    print(f"File zip '{zip_file_path}' non trovato. Assicurati che il dataset sia estratto in '{extract_dir}'.")



File 'C:\Users\nicol\Documents\PoliTo\AdvancedML\project\SPair-71k.zip' estratto con successo nella directory 'C:\Users\nicol\Documents\PoliTo\AdvancedML\project\SPair-71k_extracted'
Contenuti della directory 'C:\Users\nicol\Documents\PoliTo\AdvancedML\project\SPair-71k_extracted':
['SPair-71k']


In [2]:
from PIL import Image
import glob
import json


class Normalize(object):
    def __init__(self, image_keys):
        self.image_keys = image_keys
        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    def __call__(self, image):
        for key in self.image_keys:
            image[key] /= 255.0
            image[key] = self.normalize(image[key])
        return image


def read_img(path):
    img = np.array(Image.open(path).convert('RGB'))

    return torch.tensor(img.transpose(2, 0, 1).astype(np.float32))


class SPairDataset(Dataset):
    def __init__(self, pair_ann_path, layout_path, image_path, dataset_size, pck_alpha, datatype):

        self.datatype = datatype
        self.pck_alpha = pck_alpha
        self.ann_files = open(os.path.join(layout_path, dataset_size, datatype + '.txt'), "r").read().split('\n')
        self.ann_files = self.ann_files[:len(self.ann_files) - 1]
        self.pair_ann_path = pair_ann_path
        self.image_path = image_path
        self.categories = list(map(lambda x: os.path.basename(x), glob.glob('%s/*' % image_path)))
        self.categories.sort()
        self.transform = Normalize(['src_img', 'trg_img'])

    def __len__(self):
        return len(self.ann_files)

    def __getitem__(self, idx):
        
        raw_line = self.ann_files[idx]
        ann_filename = raw_line.replace(':', '_')
        ann_file = ann_filename + '.json'
        json_path = os.path.join(self.pair_ann_path, self.datatype, ann_file)

        with open(json_path) as f:
            annotation = json.load(f)

        category = annotation['category']
        src_img = read_img(os.path.join(self.image_path, category, annotation['src_imname']))
        trg_img = read_img(os.path.join(self.image_path, category, annotation['trg_imname']))

        trg_bbox = annotation['trg_bndbox']
        pck_threshold = max(trg_bbox[2] - trg_bbox[0],  trg_bbox[3] - trg_bbox[1]) * self.pck_alpha

        sample = {'pair_id': annotation['pair_id'],
                  'filename': annotation['filename'],
                  'src_imname': annotation['src_imname'],
                  'trg_imname': annotation['trg_imname'],
                  'src_imsize': src_img.size(),
                  'trg_imsize': trg_img.size(),

                  'src_bbox': annotation['src_bndbox'],
                  'trg_bbox': annotation['trg_bndbox'],
                  'category': annotation['category'],

                  'src_pose': annotation['src_pose'],
                  'trg_pose': annotation['trg_pose'],

                  'src_img': src_img,
                  'trg_img': trg_img,
                  'src_kps': torch.tensor(annotation['src_kps']).float(),
                  'trg_kps': torch.tensor(annotation['trg_kps']).float(),

                  'mirror': annotation['mirror'],
                  'vp_var': annotation['viewpoint_variation'],
                  'sc_var': annotation['scale_variation'],
                  'truncn': annotation['truncation'],
                  'occlsn': annotation['occlusion'],

                  'pck_threshold': pck_threshold}

        if self.transform:
            sample = self.transform(sample)

        return sample

if __name__ == '__main__':
    base_dir = r"C:\Users\nicol\Documents\PoliTo\AdvancedML\project\SPair-71k_extracted\SPair-71k\SPair-71k"    
    pair_ann_path = os.path.join(base_dir, 'PairAnnotation')
    layout_path = os.path.join(base_dir, 'Layout')
    image_path = os.path.join(base_dir, 'JPEGImages')
    dataset_size = 'large'
    pck_alpha = 0.05
    
    # Verifica che i percorsi esistano prima di creare il dataset
    if os.path.exists(pair_ann_path) and os.path.exists(layout_path) and os.path.exists(image_path):
        trn_dataset = SPairDataset(pair_ann_path, layout_path, image_path, dataset_size, pck_alpha, datatype='trn')
        val_dataset = SPairDataset(pair_ann_path, layout_path, image_path, dataset_size, pck_alpha, datatype='val')
        test_dataset = SPairDataset(pair_ann_path, layout_path, image_path, dataset_size, pck_alpha, datatype='test')

        trn_dataloader = DataLoader(trn_dataset, num_workers=0)
        val_dataloader = DataLoader(val_dataset, num_workers=0)
        test_dataloader = DataLoader(test_dataset, num_workers=0)
        print("Dataset caricati correttamente.")
    else:
        print(f"Errore: Impossibile trovare i percorsi del dataset in '{base_dir}'.\nVerifica l'estrazione e controlla se la struttura delle cartelle corrisponde.")

Dataset caricati correttamente.


In [3]:
# PCK PER POINT - OFFICIAL DINOv2 VERSION (WITH PADDING FIX)
import torch
import math 
import numpy as np
import torch.nn.functional as F
from tqdm import tqdm

# 1. LOAD OFFICIAL MODEL (Replaces Hugging Face)
print("Loading Official DINOv2 Model from Torch Hub...")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitb14')
model.to(device)
model.eval() 

print(f"Model loaded on: {device}")

# Helper function for Padding
def pad_to_multiple(x, k=14):
    """
    Pads the image (bottom and right) so that H and W are multiples of k.
    """
    h, w = x.shape[-2:]
    new_h = math.ceil(h / k) * k #  return the ceiling value of a number
    new_w = math.ceil(w / k) * k
    
    pad_bottom = new_h - h
    pad_right = new_w - w
    
    if pad_bottom == 0 and pad_right == 0:
        return x
    return F.pad(x, (0, pad_right, 0, pad_bottom), value=0)

# Initialize counters
class_pck_data = {}
class_pck_image = {}

with torch.no_grad(): # Disable gradients
    for i, data in enumerate(tqdm(test_dataloader, desc="Evaluation")):
        
        category = data['category'][0]
        if category not in class_pck_data:
            class_pck_data[category] = {
                'total_keypoints': 0,
                'correct_kps_0_05': 0,
                'correct_kps_0_1': 0,
                'correct_kps_0_2': 0

            }
        if category not in class_pck_image:
            class_pck_image[category] = {
                'total_image': 0,
                'image_value_sum_0_05': 0, # Accumulatore per le medie delle singole immagini
                'image_value_sum_0_1': 0,
                'image_value_sum_0_2': 0
            }
            # Counters specific for THIS image
        img_tot_keypoints = 0
        img_correct_keypoints_0_05 = 0
        img_correct_keypoints_0_1 = 0
        img_correct_keypoints_0_2 = 0

        src_img = data['src_img'].to(device) # torch.Size([1, 3, 333, 500])
       
        trg_img = data['trg_img'].to(device)

        # --- FIX: APPLY PADDING ---
        # Ensure dimensions are multiples of 14 to avoid AssertionError
        src_img_padded = pad_to_multiple(src_img, 14)
        trg_img_padded = pad_to_multiple(trg_img, 14)

        
        # We pass the PADDED images
        dict_src = model.forward_features(src_img_padded) # Python dictionary. 
        dict_trg = model.forward_features(trg_img_padded)
        
        feats_src = dict_src["x_norm_patchtokens"] # [Batch_Size, Num_Patches, Dimension]
        feats_trg = dict_trg["x_norm_patchtokens"]
        
        # --- IMPORTANT: GRID CALCULATION ---
        # We must use PADDED dimensions for the grid, otherwise indices will drift
        _, _, H_padded, W_padded = src_img_padded.shape 
        
        # We keep ORIGINAL dimensions for valid boundary checks
        _, _, H_orig, W_orig = data['src_img'].shape

        patch_size = 14
        w_grid = W_padded // patch_size 
        h_grid = H_padded // patch_size

        kps_list_src = data['src_kps'][0] 
        trg_kps_gt = data['trg_kps'][0] 
        
        bbox_list = data['trg_bbox'] 

        # Estraiamo i 4 valori scalari per l'immagine corrente (indice batch 0)
        x_min = bbox_list[0][0].item()
        y_min = bbox_list[1][0].item()
        x_max = bbox_list[2][0].item()
        y_max = bbox_list[3][0].item()

        w_bbox = x_max - x_min
        h_bbox = y_max - y_min
        # La dimensione di riferimento è il lato massimo della BBox
        max_side = max(w_bbox, h_bbox)
        
        # Calcoliamo le 3 soglie in pixel
        thr_05 = max_side * 0.05
        thr_10 = max_side * 0.10
        thr_20 = max_side * 0.20
        # Get threshold value
        #         
        for n_keypoint, keypoint_src in enumerate(kps_list_src):

            x_src_val = keypoint_src[0].item()
            y_src_val = keypoint_src[1].item()

            # NaN Check
            if math.isnan(x_src_val) or math.isnan(y_src_val):
                continue
            
            x_pixel_src = int(x_src_val)
            y_pixel_src = int(y_src_val)

            # Boundary Check on ORIGINAL image (ignore points in padded area if any)
            if not (0 <= x_pixel_src < W_orig and 0 <= y_pixel_src < H_orig):
                continue

            # Grid Clamp
            x_patch_src = min(x_pixel_src // patch_size, w_grid - 1)
            y_patch_src = min(y_pixel_src // patch_size, h_grid - 1)

            # 3. INDEX CALCULATION
            patch_index_src = (y_patch_src * w_grid) + x_patch_src

            # Extract Vector
            source_vec = feats_src[0, patch_index_src, :]

            # Cosine Similarity
            similarity_map = torch.cosine_similarity(source_vec, feats_trg[0], dim=-1)

            # Prediction
            patch_idx_spatial = torch.argmax(similarity_map).item()

            # Convert Index -> Grid -> Pixel
            x_col_pred = patch_idx_spatial % w_grid
            y_row_pred = patch_idx_spatial // w_grid

            x_pred_pixel = x_col_pred * patch_size + (patch_size // 2)
            y_pred_pixel = y_row_pred * patch_size + (patch_size // 2)

            # Ground Truth Check
            gt_x = trg_kps_gt[n_keypoint, 0].item()
            gt_y = trg_kps_gt[n_keypoint, 1].item()

            if math.isnan(gt_x) or math.isnan(gt_y):
                continue
            if not (0 <= gt_x < W_orig and 0 <= gt_y < H_orig):
                continue

            # Distance & Update
            distance = math.sqrt((x_pred_pixel - gt_x)**2 + (y_pred_pixel - gt_y)**2)

           
            is_correct_05 = distance <= thr_05
            is_correct_10 = distance <= thr_10
            is_correct_20 = distance <= thr_20

            # Update Category Data
            class_pck_data[category]['total_keypoints'] += 1
            if is_correct_05: class_pck_data[category]['correct_kps_0_05'] += 1
            if is_correct_10: class_pck_data[category]['correct_kps_0_1'] += 1
            if is_correct_20: class_pck_data[category]['correct_kps_0_2'] += 1

            # Update Image Data
            img_tot_keypoints += 1
            if is_correct_05: img_correct_keypoints_0_05 += 1
            if is_correct_10: img_correct_keypoints_0_1 += 1
            if is_correct_20: img_correct_keypoints_0_2 += 1
        
        # AGGIORNAMENTO DATI CATEGORIA (PCK PER IMAGE)
        # Se l'immagine aveva almeno un punto valido, calcoliamo la sua accuratezza
        if img_tot_keypoints > 0:
            image_accuracy_0_05 = img_correct_keypoints_0_05 / img_tot_keypoints
            image_accuracy_0_1 = img_correct_keypoints_0_1 / img_tot_keypoints
            image_accuracy_0_2 = img_correct_keypoints_0_2 / img_tot_keypoints

            
            class_pck_image[category]['total_image'] += 1
            class_pck_image[category]['image_value_sum_0_05'] += image_accuracy_0_05
            class_pck_image[category]['image_value_sum_0_1'] += image_accuracy_0_1
            class_pck_image[category]['image_value_sum_0_2'] += image_accuracy_0_2
       

Loading Official DINOv2 Model from Torch Hub...


Using cache found in C:\Users\nicol/.cache\torch\hub\facebookresearch_dinov2_main


Model loaded on: cuda


Evaluation: 100%|██████████| 12234/12234 [26:38<00:00,  7.65it/s]


In [4]:
# CALCOLO PCK PER IMAGE
print("--- PCK per Class ---")
class_pck_0_05_list = []
class_pck_0_1_list = []
class_pck_0_2_list = []

for category, data in class_pck_image.items():
    total_image = data['total_image']
    correct_image_0_05 = data['image_value_sum_0_05']
    correct_image_0_1 = data['image_value_sum_0_1']
    correct_image_0_2 = data['image_value_sum_0_2']

    pck_0_05 = (correct_image_0_05 / total_image) * 100 if total_image > 0 else 0
    pck_0_1 = (correct_image_0_1 / total_image) * 100 if total_image > 0 else 0
    pck_0_2 = (correct_image_0_2 / total_image) * 100 if total_image > 0 else 0

    print(f"Category: {category}")
    print(f"  PCK@0.05: {pck_0_05:.2f}% ({correct_image_0_05}/{total_image})")
    print(f"  PCK@0.1: {pck_0_1:.2f}% ({correct_image_0_1}/{total_image})")
    print(f"  PCK@0.2: {pck_0_2:.2f}% ({correct_image_0_2}/{total_image})")
    print("-" * 20)

    if total_image> 0: # Only add to the list if there were keypoints for this class
        class_pck_0_05_list.append(pck_0_05)
        class_pck_0_1_list.append(pck_0_1)
        class_pck_0_2_list.append(pck_0_2)

# 4. Calculate and Display Overall Mean PCK
print("\n--- Overall Mean PCK ---")
overall_mean_pck_0_05 = sum(class_pck_0_05_list) / len(class_pck_0_05_list) if class_pck_0_05_list else 0
overall_mean_pck_0_1 = sum(class_pck_0_1_list) / len(class_pck_0_1_list) if class_pck_0_1_list else 0
overall_mean_pck_0_2 = sum(class_pck_0_2_list) / len(class_pck_0_2_list) if class_pck_0_2_list else 0

print(f"Overall Mean PCK@0.05: {overall_mean_pck_0_05:.2f}%")
print(f"Overall Mean PCK@0.1: {overall_mean_pck_0_1:.2f}%")
print(f"Overall Mean PCK@0.2: {overall_mean_pck_0_2:.2f}%")



--- PCK per Class ---
Category: aeroplane
  PCK@0.05: 37.37% (256.69845381656506/687)
  PCK@0.1: 49.32% (338.8228772259731/687)
  PCK@0.2: 62.51% (429.413891255803/687)
--------------------
Category: bicycle
  PCK@0.05: 20.13% (126.84765512265517/630)
  PCK@0.1: 31.50% (198.46590909090895/630)
  PCK@0.2: 39.78% (250.62063492063461/630)
--------------------
Category: bird
  PCK@0.05: 30.57% (214.6260572760573/702)
  PCK@0.1: 45.11% (316.65251692751707/702)
  PCK@0.2: 57.62% (404.49004329004305/702)
--------------------
Category: boat
  PCK@0.05: 16.18% (105.18730158730165/650)
  PCK@0.1: 26.84% (174.43888888888884/650)
  PCK@0.2: 43.84% (284.94722222222214/650)
--------------------
Category: bottle
  PCK@0.05: 7.96% (66.94325396825404/841)
  PCK@0.1: 15.51% (130.40357142857152/841)
  PCK@0.2: 29.87% (251.22896825396813/841)
--------------------
Category: bus
  PCK@0.05: 24.98% (159.13415799886394/637)
  PCK@0.1: 36.02% (229.4485494081082/637)
  PCK@0.2: 46.52% (296.3500358301827/637)
--

In [6]:
#PCK per point
print("--- PCK per Class ---")
class_pck_0_05_list = []
class_pck_0_1_list = []
class_pck_0_2_list = []

for category, data in class_pck_data.items():
    total_kps = data['total_keypoints']
    correct_kps_0_05 = data['correct_kps_0_05']
    correct_kps_0_1 = data['correct_kps_0_1']
    correct_kps_0_2 = data['correct_kps_0_2']

    pck_0_05 = (correct_kps_0_05 / total_kps) * 100 if total_kps > 0 else 0
    pck_0_1 = (correct_kps_0_1 / total_kps) * 100 if total_kps > 0 else 0
    pck_0_2 = (correct_kps_0_2 / total_kps) * 100 if total_kps > 0 else 0

    print(f"Category: {category}")
    print(f"  PCK@0.05: {pck_0_05:.2f}% ({correct_kps_0_05}/{total_kps})")
    print(f"  PCK@0.1: {pck_0_1:.2f}% ({correct_kps_0_1}/{total_kps})")
    print(f"  PCK@0.2: {pck_0_2:.2f}% ({correct_kps_0_2}/{total_kps})")
    print("-" * 20)

    if total_kps > 0: # Only add to the list if there were keypoints for this class
        class_pck_0_05_list.append(pck_0_05)
        class_pck_0_1_list.append(pck_0_1)
        class_pck_0_2_list.append(pck_0_2)

# 4. Calculate and Display Overall Mean PCK
print("\n--- Overall Mean PCK ---")
overall_mean_pck_0_05 = sum(class_pck_0_05_list) / len(class_pck_0_05_list) if class_pck_0_05_list else 0
overall_mean_pck_0_1 = sum(class_pck_0_1_list) / len(class_pck_0_1_list) if class_pck_0_1_list else 0
overall_mean_pck_0_2 = sum(class_pck_0_2_list) / len(class_pck_0_2_list) if class_pck_0_2_list else 0

print(f"Overall Mean PCK@0.05: {overall_mean_pck_0_05:.2f}%")
print(f"Overall Mean PCK@0.1: {overall_mean_pck_0_1:.2f}%")
print(f"Overall Mean PCK@0.2: {overall_mean_pck_0_2:.2f}%")



--- PCK per Class ---
Category: aeroplane
  PCK@0.05: 41.03% (2141/5218)
  PCK@0.1: 52.70% (2750/5218)
  PCK@0.2: 65.06% (3395/5218)
--------------------
Category: bicycle
  PCK@0.05: 24.74% (869/3513)
  PCK@0.1: 38.14% (1340/3513)
  PCK@0.2: 47.14% (1656/3513)
--------------------
Category: bird
  PCK@0.05: 32.53% (1292/3972)
  PCK@0.1: 47.96% (1905/3972)
  PCK@0.2: 59.82% (2376/3972)
--------------------
Category: boat
  PCK@0.05: 16.64% (502/3016)
  PCK@0.1: 27.98% (844/3016)
  PCK@0.2: 45.62% (1376/3016)
--------------------
Category: bottle
  PCK@0.05: 8.89% (481/5409)
  PCK@0.1: 16.60% (898/5409)
  PCK@0.2: 31.93% (1727/5409)
--------------------
Category: bus
  PCK@0.05: 31.64% (1322/4178)
  PCK@0.1: 44.85% (1874/4178)
  PCK@0.2: 55.84% (2333/4178)
--------------------
Category: car
  PCK@0.05: 25.84% (866/3352)
  PCK@0.1: 38.87% (1303/3352)
  PCK@0.2: 49.67% (1665/3352)
--------------------
Category: cat
  PCK@0.05: 44.33% (2602/5870)
  PCK@0.1: 53.48% (3139/5870)
  PCK@0.2: 66