In [None]:
# Monta Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Percorsi
zip_su_drive = '/content/drive/MyDrive/semantic_correspondence.zip'
zip_locale = '/content/semantic_correspondence.zip'
cartella_destinazione = '/content/'

# Copia lo zip in locale
import shutil
shutil.copy(zip_su_drive, zip_locale)

'/content/semantic_correspondence.zip'

In [None]:
# Estrai lo zip
import zipfile, os
os.makedirs(cartella_destinazione, exist_ok=True)
with zipfile.ZipFile(zip_locale, 'r') as z:
    z.extractall(cartella_destinazione)


In [None]:
# 5. Verify GPU
import torch
print(f"\n✓ GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'No GPU'}")



✓ GPU: Tesla T4


In [None]:
!nvidia-smi

Thu Jan 15 16:34:44 2026       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   45C    P8             11W /   70W |       2MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
import os
os.chdir('/content/semantic_correspondence')

In [6]:
import torch



def extract_dense_features(model, img_tensor, training=False):
    """Extract dense features from DINOv2 model given an input image tensor."""
    context = torch.no_grad() if not training else torch.enable_grad()

    with context:
        #get tokens
        features_dict = model.forward_features(img_tensor)
        patch_tokens = features_dict['x_norm_patchtokens']  # [B, N_patches, D]

        #reshaping to dense feature map
        B, N, D = patch_tokens.shape
        H_patches = W_patches = int(N ** 0.5)  # per img 518x518 con patch 14: 37x37
        dense_features = patch_tokens.reshape(B, H_patches, W_patches, D)
    return dense_features


def pixel_to_patch_coord(x, y, original_size, patch_size=14, resized_size=518):
    """convert pixel coordinates to patch coordinates"""
    #scale to resized image
    scale_x = resized_size / original_size[0]
    scale_y = resized_size / original_size[1]
    x_resized = x * scale_x
    y_resized = y * scale_y

    #compute patch coordinates
    patch_x = int(x_resized // patch_size)
    patch_y = int(y_resized // patch_size)

    #clamp to valid range
    max_patch = resized_size // patch_size - 1
    patch_x = min(max(patch_x, 0), max_patch)
    patch_y = min(max(patch_y, 0), max_patch)

    return patch_x, patch_y


def patch_to_pixel_coord(patch_x, patch_y, original_size, patch_size=14, resized_size=518):
    """Convert patch coordinates back to pixel coordinates with a centering strategy"""
    #center of the patch in resized image
    x_resized = patch_x * patch_size + patch_size / 2
    y_resized = patch_y * patch_size + patch_size / 2

    #scale back to original image size
    scale_x = original_size[0] / resized_size
    scale_y = original_size[1] / resized_size
    x = x_resized * scale_x
    y = y_resized * scale_y

    return x, y

In [7]:

base = '/content/semantic_correspondence/SPair71k'

In [None]:
%cd /content/semantic_correspondence/models/segment_anything
!pip install -e .

/content/semantic_correspondence/models/segment_anything
Obtaining file:///content/semantic_correspondence/models/segment_anything
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: segment_anything
  Attempting uninstall: segment_anything
    Found existing installation: segment_anything 1.0
    Uninstalling segment_anything-1.0:
      Successfully uninstalled segment_anything-1.0
  Running setup.py develop for segment_anything
Successfully installed segment_anything-1.0


In [8]:
%cd /content/semantic_correspondence/

/content/semantic_correspondence


In [10]:
import shutil # Added for copying to Google Drive
from finetuning.simple_eval import simple_evaluate_SAM
import json
from collections import defaultdict
import numpy as np
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import os
from datetime import datetime
import torch.nn.functional as F
import sys
import time

# Add the parent directory of the 'segment_anything' package to sys.path
# sys.path.append(os.path.abspath('/content/semantic_correspondence/models')) # Removed this line as it was not needed for the original error

from SPair71k.devkit.SPairDataset import SPairDataset
from helper_functions import extract_dense_features_SAM, pixel_to_patch_coord, patch_to_pixel_coord
from finetuning.simple_eval import simple_evaluate_SAM
from matching_strategies import find_best_match_argmax
from pck import compute_pck_spair71k
from models.segment_anything.segment_anything import SamPredictor, sam_model_registry # Reverted to original import path

def freeze_model(model):
    """Freeze all model parameters"""
    for param in model.parameters():
        param.requires_grad = False


def unfreeze_last_n_blocks(model, n_blocks):
    """
    Unfreeze the last n_blocks transformer blocks + final norm layer of the SAM image encoder.

    Args:
        model: SAM model
        n_blocks: number of blocks to unfreeze (counting from the end)
    """
    # Access the image encoder part of the SAM model
    image_encoder = model.image_encoder

    total_blocks = len(image_encoder.blocks)

    # Unfreeze last n blocks
    for i in range(total_blocks - n_blocks, total_blocks):
        for param in image_encoder.blocks[i].parameters():
            param.requires_grad = True

    # Also unfreeze the final normalization layer
    # For SAM's ViT, this is typically model.image_encoder.neck.ln_final
    if hasattr(image_encoder, 'neck') and hasattr(image_encoder.neck, 'ln_final'):
        for param in image_encoder.neck.ln_final.parameters():
            param.requires_grad = True
        print(f"Unfrozen last {n_blocks} blocks + final norm layer of SAM image encoder")
    else:
        print(f"Unfrozen last {n_blocks} blocks of SAM image encoder. Final norm layer not found or accessible via 'neck.ln_final'.")


def compute_cross_entropy_loss(src_features, tgt_features, src_kps, trg_kps,
                               src_original_size, tgt_original_size, img_size, patch_size, temperature=10.0):
    """
    Compute cross-entropy loss for semantic correspondence.
    Treats correspondence as a classification problem where each target patch is a class.

    Args:
        src_features: [1, H, W, D] source dense features
        tgt_features: [1, H, W, D] target dense features
        src_kps: [N, 2] source keypoints in pixel coordinates
        trg_kps: [N, 2] target keypoints in pixel coordinates
        src_original_size: (width, height) of original source image
        tgt_original_size: (width, height) of original target image
        img_size: resizing size used during feature extraction
        patch_size: size of each patch
        temperature: softmax temperature (higher = more peaked distribution)

    Returns:
        loss: mean cross-entropy loss across all keypoints
    """
    _, H, W, D = tgt_features.shape
    tgt_flat = tgt_features.reshape(H * W, D)  # [H*W, D]

    losses = []

    for i in range(src_kps.shape[0]):
        src_x, src_y = src_kps[i]
        tgt_x, tgt_y = trg_kps[i]

        # Get source feature at keypoint location
        src_patch_x, src_patch_y = pixel_to_patch_coord(src_x, src_y, src_original_size, patch_size=patch_size, resized_size=img_size)
        src_feature = src_features[0, src_patch_y, src_patch_x, :]  # [D]

        # Get ground truth target patch coordinates
        tgt_patch_x, tgt_patch_y = pixel_to_patch_coord(tgt_x, tgt_y, tgt_original_size, patch_size=patch_size, resized_size=img_size)
        # Compute cosine similarities with all target patches
        similarities = F.cosine_similarity(
            src_feature.unsqueeze(0),  # [1, D]
            tgt_flat,  # [H*W, D]
            dim=1
        )  # [H*W]

        # Convert similarities to log-probabilities
        log_probs = F.log_softmax(similarities * temperature, dim=0)

        # Ground truth index (flatten 2D coordinates to 1D)
        gt_idx = tgt_patch_y * W + tgt_patch_x

        # Negative log-likelihood loss
        loss = -log_probs[gt_idx]
        losses.append(loss)

    return torch.stack(losses).mean()


def train_epoch(model, dataloader, optimizer, device, epoch, img_size, patch_size, temperature=10.0):
    """
    Train for one epoch

    Args:
        model: SAM model
        dataloader: training data loader
        optimizer: optimizer
        device: 'cuda' or 'cpu'
        epoch: current epoch number
        img_size: size to which images are resized for feature extraction
        patch_size: size of each patch
        temperature: softmax temperature for loss

    Returns:
        avg_loss: average loss over the epoch
    """
    model.train()
    total_loss = 0
    num_batches = 0

    for idx, sample in enumerate(dataloader):
        # Prepare data
        src_tensor = sample['src_img'].to(device)  # [1, 3, H, W]
        tgt_tensor = sample['trg_img'].to(device)  # [1, 3, H, W]

        # Resize to 518x518 (DINOv2 expects this size)
        src_tensor = F.interpolate(src_tensor, size=(img_size, img_size), mode='bilinear', align_corners=False)
        tgt_tensor = F.interpolate(tgt_tensor, size=(img_size, img_size), mode='bilinear', align_corners=False)

        #match SAM half precision
        # src_tensor = src_tensor.half() # Removed .half()
        # tgt_tensor = tgt_tensor.half() # Removed .half()

        # Store original sizes for coordinate conversion
        src_original_size = (sample['src_imsize'][2], sample['src_imsize'][1])
        tgt_original_size = (sample['trg_imsize'][2], sample['trg_imsize'][1])

        # Get keypoints
        src_kps = sample['src_kps'].numpy()[0]  # [N, 2]
        trg_kps = sample['trg_kps'].numpy()[0]  # [N, 2]

        # Extract dense features
        src_features = extract_dense_features_SAM(model, src_tensor, image_size=img_size, training=True)
        tgt_features = extract_dense_features_SAM(model, tgt_tensor, image_size=img_size, training=True)

        # Compute loss
        loss = compute_cross_entropy_loss(
            src_features, tgt_features,
            src_kps, trg_kps,
            src_original_size, tgt_original_size,
            img_size, patch_size,
            temperature=temperature
        )

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

        if idx == 50:
            print(f"Epoch {epoch}, Batch {idx + 1}/{len(dataloader)}, Loss: {loss.item():.4f}")
            break  # DEBUG: limit to 50 batches per epoch

        # Print progress
        if (idx + 1) % 100 == 0:
            print(f"Epoch {epoch}, Batch {idx + 1}/{len(dataloader)}, Loss: {loss.item():.4f}")

    avg_loss = total_loss / num_batches
    return avg_loss


def main():
    """Main training and evaluation pipeline"""

    # ========== CONFIGURATION ==========
    n_blocks = 2  #to try: 1, 2, 3, 4
    num_epochs = 1
    learning_rate = 1e-4
    batch_size = 1  #SPair-71k has variable-sized images
    temperature = 5  #softmax temperature for cross-entropy loss try 5,10,15
    img_size = 512
    patch_size = 16
    weight_decay = 0.01

    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Create results_SPair71k directory with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    results_dir = f'results_colab/SAM/lr_{learning_rate}_t_{temperature}_blocks_{n_blocks}_{timestamp}'
    # results_dir = f'results_SPair71k/dinov3_base_finetuned_{timestamp}'
    os.makedirs(results_dir, exist_ok=True)
    print(f"Results will be saved to: {results_dir}")

    # ========== LOAD DATASETS ==========
    print("\nLoading SPair-71k dataset...")
    pair_ann_path = f'{base}/PairAnnotation'
    layout_path = f'{base}/Layout'
    image_path = f'{base}/JPEGImages'
    dataset_size = 'large'
    pck_alpha = 0.1 #mock, it's not used in evaluation

    train_dataset = SPairDataset(
        pair_ann_path,
        layout_path,
        image_path,
        dataset_size,
        pck_alpha,  # dummy pck_alpha, not used during training
        datatype='trn'  # training split
    )

    val_dataset = SPairDataset(
        pair_ann_path,
        layout_path,
        image_path,
        dataset_size,
        pck_alpha,
        datatype='val'
    )

    print(f"Training samples: {len(train_dataset)}")
    print(f"Val samples: {len(val_dataset)}")

    # Create data loader
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=1,
        pin_memory=True if device == 'cuda' else False
    )

    # for n_blocks in [1,2,3,4]:
    print("\n" + "=" * 80)
    print(f"FINETUNING WITH LAST {n_blocks} BLOCKS UNFROZEN")
    print("=" * 80)
    # ========== LOAD MODEL ==========
    print("\nLoading SAM model...")
    model_type = "vit_b"
    checkpoint_path = "models/segment_anything/weights/sam_vit_b_01ec64.pth"
    sam_model = sam_model_registry[model_type](checkpoint=checkpoint_path)
    sam_model.to(device)
    # sam_model.half() # Removed .half()

    # freeze entire model, then unfreeze last N blocks
    freeze_model(sam_model)
    unfreeze_last_n_blocks(sam_model, n_blocks)

    # count trainable parameters
    trainable_params = sum(p.numel() for p in sam_model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in sam_model.parameters())
    print(f"\nTrainable parameters: {trainable_params:,} / {total_params:,} "
            f"({100 * trainable_params / total_params:.2f}%)")


    # ========== OPTIMIZER ==========
    optimizer = optim.AdamW(
        filter(lambda p: p.requires_grad, sam_model.parameters()),
        lr=learning_rate,
        weight_decay=weight_decay
    )

    # Optional: Learning rate scheduler
    # scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)

    # ========== TRAINING LOOP ==========
    print("\n" + "=" * 60)
    print("STARTING TRAINING")
    print("=" * 60)

    # best_pck = -1.0
    # best_epoch = -1
    training_history = []

    for epoch in range(num_epochs):
        print(f"\n{'=' * 60}")
        print(f"Epoch {epoch + 1}/{num_epochs}")
        print('=' * 60)

        # Train for one epoch
        train_loss = train_epoch(
            sam_model, train_loader, optimizer, device, epoch + 1, img_size, patch_size, temperature=temperature
        )
        print(f"\nAverage training loss: {train_loss:.4f}")

        # Update learning rate
        # scheduler.step()
        current_lr = optimizer.param_groups[0]['lr']
        print(f"Learning rate: {current_lr:.6f}")

        # Validate on val set
        print("\nEvaluating on test set...")
        results_val, per_image_metrics = simple_evaluate_SAM(sam_model, val_dataset, device, img_size, patch_size)

        pck_005 = results_val['pck@0.05']['mean']
        pck_010 = results_val['pck@0.10']['mean']
        pck_020 = results_val['pck@0.20']['mean']

        print(f"Val Results:")
        print(f"  PCK@0.05: {pck_005:.2f}%")
        print(f"  PCK@0.10: {pck_010:.2f}%")
        print(f"  PCK@0.20: {pck_020:.2f}%")


        # Save model checkpoint
        # Save checkpoint for this epoch
        ckpt_path = f'{results_dir}/epoch_{epoch + 1}.pth'
        torch.save({
            'epoch': epoch + 1,
            'model_state_dict': sam_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'n_blocks': n_blocks,
            'temperature': temperature,
            'learning_rate': learning_rate,
            'val_pck@0.05': pck_005,
            'val_pck@0.10': pck_010,
            'val_pck@0.20': pck_020,
        }, ckpt_path)
        print(f"\u2713 Checkpoint saved: {ckpt_path}")

        # Track best by PCK@0.10
        # if pck_010 > best_pck:
        #     best_pck = pck_010
        #     best_epoch = epoch + 1
        #     best_ckpt_path = f'{results_dir}/best_model.pth'
        #     torch.save({
        #         'epoch': best_epoch,
        #         'model_state_dict': model.state_dict(),
        #         'optimizer_state_dict': optimizer.state_dict(),
        #         'n_blocks': n_blocks,
        #         'temperature': temperature,
        #         'learning_rate': learning_rate,
        #         'val_pck@0.05': pck_005,
        #         'val_pck@0.10': pck_010,
        #         'val_pck@0.20': pck_020,
        #     }, best_ckpt_path)
        #     print(f"\u2713 Best model saved: {best_ckpt_path}")

        # Store training history
        training_history.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            'learning_rate': current_lr,
            'val_pck@0.05': pck_005,
            'val_pck@0.10': pck_010,
            'val_pck@0.20': pck_020,
        })

        # Save intermediate results_SPair71k
        # with open(f'{results_dir}/training_history.json', 'w') as f:
        #     json.dump(training_history, f, indent=2)

        # ========== FINAL RESULTS ==========
        print("\n" + "=" * 60)
        print("TRAINING COMPLETED")
        print("=" * 60)
        # print(f"Best PCK@0.1: {best_pck:.2f}% (Epoch {best_epoch})")
        print(f"Results saved to: {results_dir}")


        # Save metadata for comparison
        metadata = {
            'n_blocks': n_blocks,
            'temperature': temperature,
            'learning_rate': learning_rate,
            'num_epochs': num_epochs,
            # 'best_epoch': best_epoch,
            # 'best_pck@0.05': float(training_history[best_epoch - 1]['val_pck@0.05']),
            # 'best_pck@0.10': float(best_pck),
            # 'best_pck@0.20': float(training_history[best_epoch - 1]['val_pck@0.20']),
            'pck@0.05': float(training_history[-1]['val_pck@0.05']),
            'pck@0.10': float(training_history[-1]['val_pck@0.10']),
            'pck@0.20': float(training_history[-1]['val_pck@0.20']),
            'training_history': training_history,
        }

        with open(f'{results_dir}/metadata.json', 'w') as f:
            json.dump(metadata, f, indent=2)
        print(f"\u2713 Metadata saved: {results_dir}/metadata.json")

    # Automatically copy results to Google Drive
    drive_results_base_path = '/content/drive/MyDrive/Colab_dinov3_finetuning_temp_validation_results_prova/'
    drive_destination_path = os.path.join(drive_results_base_path, os.path.basename(results_dir))

    try:
        if not os.path.exists(drive_results_base_path):
            os.makedirs(drive_results_base_path, exist_ok=True)
        shutil.copytree(results_dir, drive_destination_path)
        print(f"\n\u2713 Successfully copied results to Google Drive: {drive_destination_path}")
    except Exception as e:
        print(f"\n\u2717 Error copying results to Google Drive: {e}")



if __name__ == "__main__":
    main()

Using device: cuda
Results will be saved to: results_colab/SAM/lr_0.0001_t_5_blocks_2_20260115_170020

Loading SPair-71k dataset...
Training samples: 53340
Val samples: 5384

FINETUNING WITH LAST 2 BLOCKS UNFROZEN

Loading SAM model...
Unfrozen last 2 blocks of SAM image encoder. Final norm layer not found or accessible via 'neck.ln_final'.

Trainable parameters: 14,195,456 / 93,735,472 (15.14%)

STARTING TRAINING

Epoch 1/1
Epoch 1, Batch 51/53340, Loss: 5.6469

Average training loss: 5.9605
Learning rate: 0.000100

Evaluating on test set...
Evaluating on 5384 image pairs...


KeyboardInterrupt: 

In [None]:

# Smonta il Drive
drive.flush_and_unmount()