In [None]:
import os
import torch
import torchvision
from torchvision import datasets
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torchvision.transforms import ToTensor, Lambda
import matplotlib.pyplot as plt
import requests
from zipfile import ZipFile
from io import BytesIO
import numpy as np
import zipfile
import os


zip_file_path = r'C:\Users\nicol\Documents\PoliTo\AdvancedML\project\SPair-71k.zip' 
extract_dir = r'C:\Users\nicol\Documents\PoliTo\AdvancedML\project\SPair-71k_extracted'

# Crea la directory di estrazione se non esiste
os.makedirs(extract_dir, exist_ok=True)

# Estrai il file ZIP solo se esiste
if os.path.exists(zip_file_path):
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    print(f"File '{zip_file_path}' estratto con successo nella directory '{extract_dir}'")
    print(f"Contenuti della directory '{extract_dir}':\n{os.listdir(extract_dir)}")
else:
    print(f"File zip '{zip_file_path}' non trovato. Assicurati che il dataset sia estratto in '{extract_dir}'.")



In [6]:
from PIL import Image
import glob
import json


class Normalize(object):
    def __init__(self, image_keys):
        self.image_keys = image_keys
        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    def __call__(self, image):
        for key in self.image_keys:
            image[key] /= 255.0
            image[key] = self.normalize(image[key])
        return image


def read_img(path):
    img = np.array(Image.open(path).convert('RGB'))

    return torch.tensor(img.transpose(2, 0, 1).astype(np.float32))


class SPairDataset(Dataset):
    def __init__(self, pair_ann_path, layout_path, image_path, dataset_size, pck_alpha, datatype):

        self.datatype = datatype
        self.pck_alpha = pck_alpha
        self.ann_files = open(os.path.join(layout_path, dataset_size, datatype + '.txt'), "r").read().split('\n')
        self.ann_files = self.ann_files[:len(self.ann_files) - 1]
        self.pair_ann_path = pair_ann_path
        self.image_path = image_path
        self.categories = list(map(lambda x: os.path.basename(x), glob.glob('%s/*' % image_path)))
        self.categories.sort()
        self.transform = Normalize(['src_img', 'trg_img'])

    def __len__(self):
        return len(self.ann_files)

    def __getitem__(self, idx):
        # 1. Recupera la riga dal file di layout
        # Esempio: "000001...:aeroplane"
        raw_line = self.ann_files[idx]

        # 2. LA FIX: Sostituisci i due punti ':' con l'underscore '_'
        # Da "000001...:aeroplane" diventa "000001..._aeroplane"
        ann_filename = raw_line.replace(':', '_')

        # 3. Aggiungi l'estensione .json
        ann_file = ann_filename + '.json'

        # 4. Costruisci il percorso completo usando self.datatype (trn/val/test)
        # Il percorso finale sarà: .../PairAnnotation/trn/000001..._aeroplane.json
        json_path = os.path.join(self.pair_ann_path, self.datatype, ann_file)

        with open(json_path) as f:
            annotation = json.load(f)

        category = annotation['category']
        src_img = read_img(os.path.join(self.image_path, category, annotation['src_imname']))
        trg_img = read_img(os.path.join(self.image_path, category, annotation['trg_imname']))

        trg_bbox = annotation['trg_bndbox']
        pck_threshold = max(trg_bbox[2] - trg_bbox[0],  trg_bbox[3] - trg_bbox[1]) * self.pck_alpha

        sample = {'pair_id': annotation['pair_id'],
                  'filename': annotation['filename'],
                  'src_imname': annotation['src_imname'],
                  'trg_imname': annotation['trg_imname'],
                  'src_imsize': src_img.size(),
                  'trg_imsize': trg_img.size(),

                  'src_bbox': annotation['src_bndbox'],
                  'trg_bbox': annotation['trg_bndbox'],
                  'category': annotation['category'],

                  'src_pose': annotation['src_pose'],
                  'trg_pose': annotation['trg_pose'],

                  'src_img': src_img,
                  'trg_img': trg_img,
                  'src_kps': torch.tensor(annotation['src_kps']).float(),
                  'trg_kps': torch.tensor(annotation['trg_kps']).float(),

                  'mirror': annotation['mirror'],
                  'vp_var': annotation['viewpoint_variation'],
                  'sc_var': annotation['scale_variation'],
                  'truncn': annotation['truncation'],
                  'occlsn': annotation['occlusion'],

                  'pck_threshold': pck_threshold}

        if self.transform:
            sample = self.transform(sample)

        return sample

if __name__ == '__main__':
    base_dir = r"C:\Users\nicol\Documents\PoliTo\AdvancedML\project\SPair-71k_extracted\SPair-71k\SPair-71k"    
    pair_ann_path = os.path.join(base_dir, 'PairAnnotation')
    layout_path = os.path.join(base_dir, 'Layout')
    image_path = os.path.join(base_dir, 'JPEGImages')
    dataset_size = 'large'
    pck_alpha = 0.2
    
    # Verifica che i percorsi esistano prima di creare il dataset
    if os.path.exists(pair_ann_path) and os.path.exists(layout_path) and os.path.exists(image_path):
        trn_dataset = SPairDataset(pair_ann_path, layout_path, image_path, dataset_size, pck_alpha, datatype='trn')
        val_dataset = SPairDataset(pair_ann_path, layout_path, image_path, dataset_size, pck_alpha, datatype='val')
        test_dataset = SPairDataset(pair_ann_path, layout_path, image_path, dataset_size, pck_alpha, datatype='test')

        trn_dataloader = DataLoader(trn_dataset, num_workers=0)
        val_dataloader = DataLoader(val_dataset, num_workers=0)
        test_dataloader = DataLoader(test_dataset, num_workers=0)
        print("Dataset caricati correttamente.")
    else:
        print(f"Errore: Impossibile trovare i percorsi del dataset in '{base_dir}'.\nVerifica l'estrazione e controlla se la struttura delle cartelle corrisponde.")

Dataset caricati correttamente.


In [None]:
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import requests
import torch
import math 
from transformers import AutoModel
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained('facebook/dinov2-base').to(device)
model.eval() # il modello in modalità valutazione (congela dropout, ecc)

print(f"Modello caricato su: {device}")

# Inizializza i contatori per la metrica PCK
total_keypoints = 0
#correct_kps_0_05 = 0
#correct_kps_0_1 = 0
correct_kps_0_2 = 0
class_pck_data = {}

with torch.no_grad(): # Disabilita il calcolo dei gradienti (risparmia memoria RAM/VRAM)
    for i, data in enumerate(tqdm(test_dataloader, desc="Valutazione")):

        # Retrieve the category for the current item
        category = data['category'][0]

        # Initialize category entry in class_pck_data if it doesn't exist
        if category not in class_pck_data:
            class_pck_data[category] = {
                'total_keypoints': 0,
                'correct_kps_0_2': 0
            }

        # Estraiamo le immagini sorgente e target dal dizionario
        src_img = data['src_img'].to(device)
        trg_img = data['trg_img'].to(device)

        outputs_src = model(pixel_values=src_img)
        outputs_trg = model(pixel_values=trg_img)

        # Estrazione delle feature
        feats_src = outputs_src.last_hidden_state
        feats_trg = outputs_trg.last_hidden_state
        
        _, _, H, W = data['src_img'].shape # Dimensioni dell'immagine sorgente

        patch_size = 14
        w_grid = W // patch_size # Numero di patch in orizzontale

        kps_list_src = data['src_kps'][0] # Keypoints dell'immagine sorgente
        trg_kps_gt = data['trg_kps'][0] # Keypoints ground truth dell'immagine target

        pck_threshold = data['pck_threshold']
        
        # Loop sui singoli keypoint
        for n_keypoint, keypoint_src in enumerate(kps_list_src):

            # A. PRENDI LE COORDINATE (Pixel) del keypoint sorgente
            x_pixel_src = int(keypoint_src[0].item())
            y_pixel_src = int(keypoint_src[1].item())

            # B. CALCOLA L'INDICE DELLA PATCH (Matematica della griglia) nell'immagine sorgente
            x_patch_src = x_pixel_src // patch_size
            y_patch_src = y_pixel_src // patch_size

            patch_index_src = 1 + (y_patch_src * w_grid) + x_patch_src

            # Controllo di sicurezza per non uscire dai bordi
            max_patches_src = feats_src.shape[1]
            if patch_index_src >= max_patches_src:
                patch_index_src = max_patches_src - 1

            # C. ESTRAI IL VETTORE (Feature) del keypoint sorgente
            source_vec = feats_src[0, patch_index_src, :]

            # D. COSINE SIMILARITY con tutte le patch del target
            similarity_map = torch.cosine_similarity(source_vec, feats_trg[0], dim=-1)

            # Trova l'indice del massimo valore di somiglianza
            max_sim_idx_from_map = torch.argmax(similarity_map).item()

            # Converti questo indice nel 0-based index della patch visiva per il calcolo delle coordinate
            if max_sim_idx_from_map == 0: # Se il CLS token è il più simile
                patch_idx_0based_for_grid_calc = 0
            else:
                patch_idx_0based_for_grid_calc = max_sim_idx_from_map - 1

            # Converti l'indice 0-based della patch visiva in coordinate (colonna, riga) della griglia
            x_col_max_patch = patch_idx_0based_for_grid_calc % w_grid
            y_row_max_patch = patch_idx_0based_for_grid_calc // w_grid

            # Converti le coordinate della griglia in coordinate pixel predette (al centro della patch)
            x_pred_pixel = x_col_max_patch * patch_size + (patch_size // 2)
            y_pred_pixel = y_row_max_patch * patch_size + (patch_size // 2)

            # E. Confronto con il Ground Truth e calcolo PCK@T
            gt_x = trg_kps_gt[n_keypoint, 0].item()
            gt_y = trg_kps_gt[n_keypoint, 1].item()

            # Calcola la distanza euclidea tra predetto e GT
            distance = math.sqrt((x_pred_pixel - gt_x)**2 + (y_pred_pixel - gt_y)**2)

            # Aggiorna i contatori PCK (CLASS-SPECIFIC)
            class_pck_data[category]['total_keypoints'] += 1
            if distance <= pck_threshold:
                class_pck_data[category]['correct_kps_0_2'] += 1


Modello caricato su: cuda


Valutazione: 100%|██████████| 12234/12234 [23:36<00:00,  8.64it/s]


In [None]:
# 3. Calculate and Display PCK for key point per Class
print("--- PCK per Class ---")
#class_pck_0_05_list = []
#class_pck_0_1_list = []
class_pck_0_2_list = []

for category, data in class_pck_data.items():
    total_kps = data['total_keypoints']
    #correct_kps_0_05 = data['correct_kps_0_05']
    #correct_kps_0_1 = data['correct_kps_0_1']
    correct_kps_0_2 = data['correct_kps_0_2']

    #pck_0_05 = (correct_kps_0_05 / total_kps) * 100 if total_kps > 0 else 0
    #pck_0_1 = (correct_kps_0_1 / total_kps) * 100 if total_kps > 0 else 0
    pck_0_2 = (correct_kps_0_2 / total_kps) * 100 if total_kps > 0 else 0

    print(f"Category: {category}")
    #print(f"  PCK@0.05: {pck_0_05:.2f}% ({correct_kps_0_05}/{total_kps})")
    #print(f"  PCK@0.1: {pck_0_1:.2f}% ({correct_kps_0_1}/{total_kps})")
    print(f"  PCK@0.2: {pck_0_2:.2f}% ({correct_kps_0_2}/{total_kps})")
    print("-" * 20)

    if total_kps > 0: # Only add to the list if there were keypoints for this class
        #class_pck_0_05_list.append(pck_0_05)
        #class_pck_0_1_list.append(pck_0_1)
        class_pck_0_2_list.append(pck_0_2)

# 4. Calculate and Display Overall Mean PCK
print("\n--- Overall Mean PCK ---")
#overall_mean_pck_0_05 = sum(class_pck_0_05_list) / len(class_pck_0_05_list) if class_pck_0_05_list else 0
#overall_mean_pck_0_1 = sum(class_pck_0_1_list) / len(class_pck_0_1_list) if class_pck_0_1_list else 0
overall_mean_pck_0_2 = sum(class_pck_0_2_list) / len(class_pck_0_2_list) if class_pck_0_2_list else 0

#print(f"Overall Mean PCK@0.05: {overall_mean_pck_0_05:.2f}%")
#print(f"Overall Mean PCK@0.1: {overall_mean_pck_0_1:.2f}%")
print(f"Overall Mean PCK@0.2: {overall_mean_pck_0_2:.2f}%")

# 5. Final Task: Summarize the results
##print("\n--- Summary ---")
#print("The analysis provides detailed PCK scores for each object category at different thresholds (0.05, 0.1, 0.2).")
#print("This class-specific breakdown highlights the model's performance on various object types, indicating categories where the model performs strongly or weakly.")
#print("For instance, a low PCK score for a specific category might suggest that the model struggles with keypoint localization for objects in that category.")
#print("The overall mean PCK scores then provide a consolidated view of the model's average performance across all categories.")
#print(f"The model achieved an overall mean PCK@0.05 of {overall_mean_pck_0_05:.2f}%, PCK@0.1 of {overall_mean_pck_0_1:.2f}%, and PCK@0.2 of {overall_mean_pck_0_2:.2f}%.")

--- PCK per Class ---
Category: aeroplane
  PCK@0.05: 38.22% (2103/5502)
--------------------
Category: bicycle
  PCK@0.05: 22.74% (879/3866)
--------------------
Category: bird
  PCK@0.05: 30.61% (1331/4348)
--------------------
Category: boat
  PCK@0.05: 14.94% (506/3386)
--------------------
Category: bottle
  PCK@0.05: 7.35% (472/6422)
--------------------
Category: bus
  PCK@0.05: 29.32% (1313/4478)
--------------------
Category: car
  PCK@0.05: 24.39% (876/3592)
--------------------
Category: cat
  PCK@0.05: 42.63% (2642/6198)
--------------------
Category: chair
  PCK@0.05: 6.35% (232/3652)
--------------------
Category: cow
  PCK@0.05: 23.67% (1268/5358)
--------------------
Category: dog
  PCK@0.05: 30.61% (1470/4802)
--------------------
Category: horse
  PCK@0.05: 8.35% (393/4704)
--------------------
Category: motorbike
  PCK@0.05: 25.84% (879/3402)
--------------------
Category: person
  PCK@0.05: 23.23% (998/4296)
--------------------
Category: pottedplant
  PCK@0.05: 4.8