In [26]:
from torch.utils.data import Dataset
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import numpy as np
import os
import glob
import json


class Normalize(object):
    def __init__(self, image_keys):
        self.image_keys = image_keys
        self.normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    def __call__(self, image):
        for key in self.image_keys:
            image[key] /= 255.0
            image[key] = self.normalize(image[key])
        return image


def read_img(path):
    img = np.array(Image.open(path).convert('RGB'))

    return torch.tensor(img.transpose(2, 0, 1).astype(np.float32))


class SPairDataset(Dataset):
    def __init__(self, pair_ann_path, layout_path, image_path, dataset_size, pck_alpha, datatype):

        self.datatype = datatype
        self.pck_alpha = pck_alpha
        self.ann_files = open(os.path.join(layout_path, dataset_size, datatype + '.txt'), "r").read().split('\n')
        self.ann_files = self.ann_files[:len(self.ann_files) - 1]
        self.pair_ann_path = pair_ann_path
        self.image_path = image_path
        self.categories = list(map(lambda x: os.path.basename(x), glob.glob('%s/*' % image_path)))
        self.categories.sort()
        self.transform = Normalize(['src_img', 'trg_img'])

    def __len__(self):
        return len(self.ann_files)

    def __getitem__(self, idx):
        
        ann_filename = self.ann_files[idx]
        ann_file = ann_filename + '.json'
        json_path = os.path.join(self.pair_ann_path, self.datatype, ann_file)

        with open(json_path) as f:
            annotation = json.load(f)

        category = annotation['category']
        src_img = read_img(os.path.join(self.image_path, category, annotation['src_imname']))
        trg_img = read_img(os.path.join(self.image_path, category, annotation['trg_imname']))

        trg_bbox = annotation['trg_bndbox']
        pck_threshold = max(trg_bbox[2] - trg_bbox[0],  trg_bbox[3] - trg_bbox[1]) * self.pck_alpha

        sample = {'pair_id': annotation['pair_id'],
                  'filename': annotation['filename'],
                  'src_imname': annotation['src_imname'],
                  'trg_imname': annotation['trg_imname'],
                  'src_imsize': src_img.size(),
                  'trg_imsize': trg_img.size(),

                  'src_bbox': annotation['src_bndbox'],
                  'trg_bbox': annotation['trg_bndbox'],
                  'category': annotation['category'],

                  'src_pose': annotation['src_pose'],
                  'trg_pose': annotation['trg_pose'],

                  'src_img': src_img,
                  'trg_img': trg_img,
                  'src_kps': torch.tensor(annotation['src_kps']).float(),
                  'trg_kps': torch.tensor(annotation['trg_kps']).float(),

                  'mirror': annotation['mirror'],
                  'vp_var': annotation['viewpoint_variation'],
                  'sc_var': annotation['scale_variation'],
                  'truncn': annotation['truncation'],
                  'occlsn': annotation['occlusion'],

                  'pck_threshold': pck_threshold}

        if self.transform:
            sample = self.transform(sample)

        return sample

if __name__ == '__main__':
    base_dir = os.path.abspath(os.path.curdir)
    dataset_dir = os.path.join(base_dir, 'dataset')
    pair_ann_path = os.path.join(dataset_dir, 'PairAnnotation')
    layout_path = os.path.join(dataset_dir, 'Layout')
    image_path = os.path.join(dataset_dir, 'JPEGImages')
    dataset_size = 'large'
    pck_alpha = 0.05
    
    # Verifica che i percorsi esistano prima di creare il dataset
    if os.path.exists(pair_ann_path) and os.path.exists(layout_path) and os.path.exists(image_path):
        trn_dataset = SPairDataset(pair_ann_path, layout_path, image_path, dataset_size, pck_alpha, datatype='trn')
        val_dataset = SPairDataset(pair_ann_path, layout_path, image_path, dataset_size, pck_alpha, datatype='val')
        test_dataset = SPairDataset(pair_ann_path, layout_path, image_path, dataset_size, pck_alpha, datatype='test')

        trn_dataloader = DataLoader(trn_dataset, num_workers=0)
        val_dataloader = DataLoader(val_dataset, num_workers=0)
        test_dataloader = DataLoader(test_dataset, num_workers=0)
        print("Dataset caricati correttamente.")
    else:
        print(f"Errore: Impossibile trovare i percorsi del dataset in '{base_dir}'.\nVerifica l'estrazione e controlla se la struttura delle cartelle corrisponde.")

Dataset caricati correttamente.


In [27]:
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import requests
import torch
import math 
from transformers import AutoModel
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained('facebook/dinov2-base').to(device)
model.eval() # il modello in modalità valutazione (congela dropout, ecc)

print(f"Modello caricato su: {device}")

# Inizializza i contatori per la metrica PCK
total_keypoints = 0
#correct_kps_0_05 = 0
#correct_kps_0_1 = 0
correct_kps_0_2 = 0
class_pck_data = {}

with torch.no_grad(): # Disabilita il calcolo dei gradienti (risparmia memoria RAM/VRAM)
    for i, data in enumerate(tqdm(test_dataloader, desc="Valutazione")):
        category = data['category'][0]
        if category not in class_pck_data:
            class_pck_data[category] = {
                'total_keypoints': 0,
                'correct_kps_0_2': 0
            }

        src_img = data['src_img'].to(device)
        trg_img = data['trg_img'].to(device)

        outputs_src = model(pixel_values=src_img)
        outputs_trg = model(pixel_values=trg_img)

        feats_src = outputs_src.last_hidden_state
        feats_trg = outputs_trg.last_hidden_state
        
        _, _, H, W = data['src_img'].shape 

        patch_size = 14
        w_grid = W // patch_size # Numero di patch in orizzontale

        kps_list_src = data['src_kps'][0] # Keypoints dell'immagine sorgente
        trg_kps_gt = data['trg_kps'][0] # Keypoints ground truth dell'immagine target

        pck_threshold = data['pck_threshold']
        
        for n_keypoint, keypoint_src in enumerate(kps_list_src):

            x_pixel_src = int(keypoint_src[0].item())
            y_pixel_src = int(keypoint_src[1].item())

            # CALCOLA L'INDICE DELLA PATCH nell'immagine sorgente
            x_patch_src = x_pixel_src // patch_size
            y_patch_src = y_pixel_src // patch_size

            patch_index_src = 1 + (y_patch_src * w_grid) + x_patch_src

            # Controllo di sicurezza per non uscire dai bordi
            max_patches_src = feats_src.shape[1]
            if patch_index_src >= max_patches_src:
                patch_index_src = max_patches_src - 1

            # ESTRAI IL VETTORE (Feature) del keypoint sorgente
            source_vec = feats_src[0, patch_index_src, :]

            # COSINE SIMILARITY con tutte le patch del target
            similarity_map = torch.cosine_similarity(source_vec, feats_trg[0], dim=-1)

            max_sim_idx_from_map = torch.argmax(similarity_map).item()

            if max_sim_idx_from_map == 0: # Se il CLS token è il più simile
                patch_idx_0based_for_grid_calc = 0
            else:
                patch_idx_0based_for_grid_calc = max_sim_idx_from_map - 1

            # Converti l'indice 0-based della patch visiva in coordinate (colonna, riga) 
            x_col_max_patch = patch_idx_0based_for_grid_calc % w_grid
            y_row_max_patch = patch_idx_0based_for_grid_calc // w_grid

            # Converti le coordinate della griglia in coordinate pixel predette (al centro della patch)
            x_pred_pixel = x_col_max_patch * patch_size + (patch_size // 2)
            y_pred_pixel = y_row_max_patch * patch_size + (patch_size // 2)

            gt_x = trg_kps_gt[n_keypoint, 0].item()
            gt_y = trg_kps_gt[n_keypoint, 1].item()

            # Calcola la distanza euclidea tra predetto e GT
            distance = math.sqrt((x_pred_pixel - gt_x)**2 + (y_pred_pixel - gt_y)**2)

            # Aggiorna i contatori PCK 
            class_pck_data[category]['total_keypoints'] += 1
            if distance <= pck_threshold:
                class_pck_data[category]['correct_kps_0_2'] += 1


Modello caricato su: cuda


Valutazione:  16%|█▋        | 1993/12234 [09:32<49:03,  3.48it/s]  


KeyboardInterrupt: 

In [None]:
# CALCOLO PCK PER IMAGE
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import requests
import torch
import math 
from transformers import AutoModel
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModel.from_pretrained('facebook/dinov2-base').to(device)
model.eval() # il modello in modalità valutazione (congela dropout, ecc)

print(f"Modello caricato su: {device}")

# Inizializza i contatori per la metrica PCK
correct_kps_0_05 = 0
#correct_kps_0_1 = 0
#correct_kps_0_2 = 0
class_pck_data = {}

with torch.no_grad(): # Disabilita il calcolo dei gradienti
    for i, data in enumerate(tqdm(test_dataloader, desc="Valutazione")):

        # Retrieve the category for the current item
        category = data['category'][0]

        # Initialize category entry in class_pck_data if it doesn't exist
        if category not in class_pck_data:
            class_pck_data[category] = {
                'total_image': 0,
                'image_value_0_05': 0,
            }
        class_pck_data[category]['total_image'] +=1

        tot_num_keypoint = 0
        correct_keypoints = 0

        src_img = data['src_img'].to(device)
        trg_img = data['trg_img'].to(device)

        outputs_src = model(pixel_values=src_img)
        outputs_trg = model(pixel_values=trg_img)

        feats_src = outputs_src.last_hidden_state
        feats_trg = outputs_trg.last_hidden_state
        
        _, _, H, W = data['src_img'].shape 

        patch_size = 14
        w_grid = W // patch_size # Numero di patch in orizzontale

        kps_list_src = data['src_kps'][0] # Keypoints dell'immagine sorgente
        trg_kps_gt = data['trg_kps'][0] # Keypoints ground truth dell'immagine target

        pck_threshold = data['pck_threshold']
        
        for n_keypoint, keypoint_src in enumerate(kps_list_src):
            
            x_pixel_src = int(keypoint_src[0].item())
            y_pixel_src = int(keypoint_src[1].item())

            # CALCOLA L'INDICE DELLA PATCH nell'immagine sorgente
            x_patch_src = x_pixel_src // patch_size
            y_patch_src = y_pixel_src // patch_size

            patch_index_src = 1 + (y_patch_src * w_grid) + x_patch_src

            # Controllo di sicurezza per non uscire dai bordi
            max_patches_src = feats_src.shape[1]
            if patch_index_src >= max_patches_src:
                patch_index_src = max_patches_src - 1

            # ESTRAI IL VETTORE (Feature) del keypoint sorgente
            source_vec = feats_src[0, patch_index_src, :]

            # COSINE SIMILARITY con tutte le patch del target
            similarity_map = torch.cosine_similarity(source_vec, feats_trg[0], dim=-1)

            max_sim_idx_from_map = torch.argmax(similarity_map).item()

            # Converti questo indice nel 0-based index della patch visiva per il calcolo delle coordinate
            if max_sim_idx_from_map == 0: # Se il CLS token è il più simile
                patch_idx_0based_for_grid_calc = 0
            else:
                patch_idx_0based_for_grid_calc = max_sim_idx_from_map - 1

            # Converti l'indice 0-based della patch visiva in coordinate (colonna, riga) della griglia
            x_col_max_patch = patch_idx_0based_for_grid_calc % w_grid
            y_row_max_patch = patch_idx_0based_for_grid_calc // w_grid

            # Converti le coordinate della griglia in coordinate pixel predette (al centro della patch)
            x_pred_pixel = x_col_max_patch * patch_size + (patch_size // 2)
            y_pred_pixel = y_row_max_patch * patch_size + (patch_size // 2)

            gt_x = trg_kps_gt[n_keypoint, 0].item()
            gt_y = trg_kps_gt[n_keypoint, 1].item()

            # Calcola la distanza euclidea tra predetto e GT
            distance = math.sqrt((x_pred_pixel - gt_x)**2 + (y_pred_pixel - gt_y)**2)

            # Aggiorna i contatori PCK (CLASS-SPECIFIC)
            tot_num_keypoint += 1
            if distance <= pck_threshold:
                correct_keypoints += 1
        class_pck_data[category]['image_value_0_05'] += correct_keypoints/tot_num_keypoint
        del outputs_src, outputs_trg, feats_src, feats_trg


Modello caricato su: cuda


Valutazione: 100%|██████████| 12234/12234 [21:32<00:00,  9.47it/s]


In [10]:
# CALCOLO PCK PER IMAGE
print("--- PCK per Class ---")
class_pck_0_05_list = []
#class_pck_0_1_list = []
#class_pck_0_2_list = []

for category, data in class_pck_data.items():
    total_image = data['total_image']
    correct_image_0_05 = data['image_value_0_05']
    #correct_image_0_1 = data['image_value_0_1']
    #correct_image_0_2 = data['image_value_0_2']

    pck_0_05 = (correct_image_0_05 / total_image) * 100 if total_image > 0 else 0
    #pck_0_1 = (correct_image_0_1 / total_image) * 100 if total_image > 0 else 0
    #pck_0_2 = (correct_image_0_2 / total_image) * 100 if total_image > 0 else 0

    print(f"Category: {category}")
    print(f"  PCK@0.05: {pck_0_05:.2f}% ({correct_image_0_05}/{total_image})")
    #print(f"  PCK@0.1: {pck_0_1:.2f}% ({correct_image_0_1}/{total_image})")
    #print(f"  PCK@0.2: {pck_0_2:.2f}% ({correct_image_0_2}/{total_image})")
    print("-" * 20)

    if total_image> 0: # Only add to the list if there were keypoints for this class
        class_pck_0_05_list.append(pck_0_05)
        #class_pck_0_1_list.append(pck_0_1)
        #class_pck_0_2_list.append(pck_0_2)

# 4. Calculate and Display Overall Mean PCK
print("\n--- Overall Mean PCK ---")
overall_mean_pck_0_05 = sum(class_pck_0_05_list) / len(class_pck_0_05_list) if class_pck_0_05_list else 0
#overall_mean_pck_0_1 = sum(class_pck_0_1_list) / len(class_pck_0_1_list) if class_pck_0_1_list else 0
#overall_mean_pck_0_2 = sum(class_pck_0_2_list) / len(class_pck_0_2_list) if class_pck_0_2_list else 0

print(f"Overall Mean PCK@0.05: {overall_mean_pck_0_05:.2f}%")
#print(f"Overall Mean PCK@0.1: {overall_mean_pck_0_1:.2f}%")
#print(f"Overall Mean PCK@0.2: {overall_mean_pck_0_2:.2f}%")



--- PCK per Class ---
Category: aeroplane
  PCK@0.05: 36.34% (250.7318228598027/690)
--------------------
Category: bicycle
  PCK@0.05: 19.50% (126.73318903318913/650)
--------------------
Category: bird
  PCK@0.05: 31.14% (218.62080142080143/702)
--------------------
Category: boat
  PCK@0.05: 16.00% (112.33492063492066/702)
--------------------
Category: bottle
  PCK@0.05: 7.54% (65.61230158730164/870)
--------------------
Category: bus
  PCK@0.05: 23.66% (152.33914010825796/644)
--------------------
Category: car
  PCK@0.05: 19.56% (110.30733571983582/564)
--------------------
Category: cat
  PCK@0.05: 42.09% (252.52851315351265/600)
--------------------
Category: chair
  PCK@0.05: 6.06% (39.1277417027417/646)
--------------------
Category: cow
  PCK@0.05: 23.97% (153.42627019523312/640)
--------------------
Category: dog
  PCK@0.05: 28.64% (171.85326062826076/600)
--------------------
Category: horse
  PCK@0.05: 7.92% (47.50466477966479/600)
--------------------
Category: motorbike

In [None]:
#PCK per point
print("--- PCK per Class ---")
#class_pck_0_05_list = []
#class_pck_0_1_list = []
class_pck_0_2_list = []

for category, data in class_pck_data.items():
    total_kps = data['total_keypoints']
    #correct_kps_0_05 = data['correct_kps_0_05']
    #correct_kps_0_1 = data['correct_kps_0_1']
    correct_kps_0_2 = data['correct_kps_0_2']

    #pck_0_05 = (correct_kps_0_05 / total_kps) * 100 if total_kps > 0 else 0
    #pck_0_1 = (correct_kps_0_1 / total_kps) * 100 if total_kps > 0 else 0
    pck_0_2 = (correct_kps_0_2 / total_kps) * 100 if total_kps > 0 else 0

    print(f"Category: {category}")
    #print(f"  PCK@0.05: {pck_0_05:.2f}% ({correct_kps_0_05}/{total_kps})")
    #print(f"  PCK@0.1: {pck_0_1:.2f}% ({correct_kps_0_1}/{total_kps})")
    print(f"  PCK@0.2: {pck_0_2:.2f}% ({correct_kps_0_2}/{total_kps})")
    print("-" * 20)

    if total_kps > 0: # Only add to the list if there were keypoints for this class
        #class_pck_0_05_list.append(pck_0_05)
        #class_pck_0_1_list.append(pck_0_1)
        class_pck_0_2_list.append(pck_0_2)

# 4. Calculate and Display Overall Mean PCK
print("\n--- Overall Mean PCK ---")
#overall_mean_pck_0_05 = sum(class_pck_0_05_list) / len(class_pck_0_05_list) if class_pck_0_05_list else 0
#overall_mean_pck_0_1 = sum(class_pck_0_1_list) / len(class_pck_0_1_list) if class_pck_0_1_list else 0
overall_mean_pck_0_2 = sum(class_pck_0_2_list) / len(class_pck_0_2_list) if class_pck_0_2_list else 0

#print(f"Overall Mean PCK@0.05: {overall_mean_pck_0_05:.2f}%")
#print(f"Overall Mean PCK@0.1: {overall_mean_pck_0_1:.2f}%")
print(f"Overall Mean PCK@0.2: {overall_mean_pck_0_2:.2f}%")



--- PCK per Class ---
Category: aeroplane
  PCK@0.2: 60.03% (3303/5502)
--------------------
Category: bicycle
  PCK@0.2: 42.55% (1645/3866)
--------------------
Category: bird
  PCK@0.2: 55.15% (2398/4348)
--------------------
Category: boat
  PCK@0.2: 41.55% (1407/3386)
--------------------
Category: bottle
  PCK@0.2: 26.77% (1719/6422)
--------------------
Category: bus
  PCK@0.2: 53.15% (2380/4478)
--------------------
Category: car
  PCK@0.2: 46.94% (1686/3592)
--------------------
Category: cat
  PCK@0.2: 63.42% (3931/6198)
--------------------
Category: chair
  PCK@0.2: 20.37% (744/3652)
--------------------
Category: cow
  PCK@0.2: 51.49% (2759/5358)
--------------------
Category: dog
  PCK@0.2: 60.08% (2885/4802)
--------------------
Category: horse
  PCK@0.2: 27.89% (1312/4704)
--------------------
Category: motorbike
  PCK@0.2: 53.64% (1825/3402)
--------------------
Category: person
  PCK@0.2: 51.86% (2228/4296)
--------------------
Category: pottedplant
  PCK@0.2: 21.41% (