# Matcher: Segment Anything with One Shot Using Feature Matching

This notebook evaluates the Matcher framework on image pairs from the `Images` folder. For each subfolder containing at least two images, we take the first two and use each one in turn as the reference for the other to generate segmentation masks.

In [1]:
# # Install the required packages for the Matcher framework
# %pip install future==0.18.2 gradio==3.32.0 gradio-client==0.2.5 POT omegaconf iopath
# # Additional important dependencies that may be needed
# %pip install matplotlib torch torchvision opencv-python timm numpy tqdm

## Setup and Import Dependencies

In [2]:
import os
import sys
import torch
import numpy as np
import matplotlib.pyplot as plt
import cv2
from PIL import Image

# Report whether PyTorch can see a GPU (and which one).
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

# Directories of the local Matcher checkout (relative to the current working
# directory) that must be importable: the repo root, its bundled
# segment_anything package, and its utils package.
matcher_path = os.path.join(os.getcwd(), "Matcher")
sam_path = os.path.join(matcher_path, "segment_anything")
utils_path = os.path.join(matcher_path, "utils")

# Append each directory once, so re-running the cell does not add duplicates.
for _extra_path in (matcher_path, sam_path, utils_path):
    if _extra_path in sys.path:
        continue
    sys.path.append(_extra_path)
    print(f"Added {_extra_path} to sys.path")

CUDA available: False
Added c:\Users\vikra\OneDrive\Desktop\CSE344-CV\A3\Q5\Matcher to sys.path
Added c:\Users\vikra\OneDrive\Desktop\CSE344-CV\A3\Q5\Matcher\segment_anything to sys.path
Added c:\Users\vikra\OneDrive\Desktop\CSE344-CV\A3\Q5\Matcher\utils to sys.path


## Download and Setup Models

We'll load the necessary foundation models used by the Matcher framework: DINOv2 and SAM (Segment Anything Model).

In [3]:
# Fetch the DINOv2 ViT-L/14 encoder through torch.hub and put it in eval mode.
print("Loading DINOv2 model...")
dinov2_vitl14 = torch.hub.load('facebookresearch/dinov2', 'dinov2_vitl14').eval()
# Run the encoder on the GPU whenever one is available.
if torch.cuda.is_available():
    dinov2_vitl14 = dinov2_vitl14.cuda()
print("DINOv2 model loaded successfully!")

Loading DINOv2 model...


Using cache found in C:\Users\vikra/.cache\torch\hub\facebookresearch_dinov2_main


DINOv2 model loaded successfully!


In [4]:
# Download SAM model if not already available
import requests

# Create directory for SAM checkpoint
sam_checkpoint_dir = "sam_checkpoints"
os.makedirs(sam_checkpoint_dir, exist_ok=True)

# Download SAM model (vit_h)
sam_url = "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth"
# NOTE: this rebinds `sam_path`, which the setup cell used for the
# segment_anything source directory; from here on it is the checkpoint file.
sam_path = os.path.join(sam_checkpoint_dir, "sam_vit_h_4b8939.pth")

if not os.path.exists(sam_path):
    print(f"Downloading SAM checkpoint to {sam_path}...")
    # Stream into a temporary ".part" file and rename only after the whole
    # body has been written. An interrupted download therefore never leaves a
    # truncated file at `sam_path` that a later run would treat as complete.
    sam_partial_path = sam_path + ".part"
    try:
        # (connect timeout, read timeout) in seconds, so a stalled connection
        # fails instead of hanging the kernel indefinitely.
        with requests.get(sam_url, stream=True, timeout=(10, 60)) as r:
            r.raise_for_status()
            with open(sam_partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        os.replace(sam_partial_path, sam_path)
    finally:
        # Only present here if the download failed part-way through.
        if os.path.exists(sam_partial_path):
            os.remove(sam_partial_path)
    print("SAM checkpoint downloaded.")
else:
    print("SAM checkpoint already exists.")

SAM checkpoint already exists.


## Import Matcher Components

Now we'll import the necessary components from the Matcher framework.

In [5]:
# pip install scikit-learn

In [6]:
# %pip install torchvision

In [None]:
# Import required modules
# Import SAM (Segment Anything Model)
from segment_anything.build_sam import sam_model_registry
from segment_anything.predictor import SamPredictor

# Import from Matcher repo
from matcher.Matcher import Matcher

# For DINOv2 image preprocessing
from torchvision import transforms

print("Successfully imported modules from Matcher!")

Successfully imported modules from Matcher!


In [28]:
import cv2
import numpy as np

def read_image(path):
    """Read an image from disk with OpenCV.

    Args:
        path: Path of the image file to read.

    Returns:
        The array returned by ``cv2.imread`` (OpenCV's default channel order,
        i.e. BGR for colour images).

    Raises:
        FileNotFoundError: If ``path`` is not an existing file.
        ValueError: If the file exists but OpenCV could not decode it.
    """
    image = cv2.imread(path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising;
        # fail here with the offending path rather than later with an
        # unrelated AttributeError on `.shape`.
        if not os.path.isfile(path):
            raise FileNotFoundError(f"Image file not found: {path}")
        raise ValueError(f"Could not decode image file: {path}")
    return image

def image_resize(image, height=None, width=None, multiple=14):
    """Resize ``image`` to a target height or width, keeping the aspect ratio.

    The resulting height and width are then rounded down to a multiple of
    ``multiple`` (14 by default, as required by the original code's
    "divisible by 14" constraint), but never below one full multiple, so the
    output is never zero-sized.

    Args:
        image: Array with ``shape[:2] == (height, width)``.
        height: Target height in pixels. Takes precedence over ``width``.
        width: Target width in pixels; used only when ``height`` is None.
        multiple: Both output dimensions are made divisible by this value.

    Returns:
        The resized image, or ``image`` itself when neither ``height`` nor
        ``width`` is given.

    Raises:
        ValueError: If ``multiple`` is smaller than 1.
    """
    if multiple < 1:
        raise ValueError(f"multiple must be >= 1, got {multiple}")

    src_h, src_w = image.shape[:2]

    if height is not None:
        scale = height / float(src_h)
        new_h, new_w = height, int(src_w * scale)
    elif width is not None:
        scale = width / float(src_w)
        new_h, new_w = int(src_h * scale), width
    else:
        return image

    # Round each side down to a multiple of `multiple`, but keep at least one
    # multiple: small targets would otherwise round down to 0, which
    # cv2.resize rejects.
    new_h = max(multiple, new_h - new_h % multiple)
    new_w = max(multiple, new_w - new_w % multiple)

    # cv2.resize expects the size as (width, height).
    return cv2.resize(image, (new_w, new_h))



def ensure_same_size(img1, img2):
    """Resize both images to the smaller of their heights and widths.

    Returns:
        A ``(img1_resized, img2_resized)`` tuple; both share the same size.
    """
    target_h = min(img1.shape[0], img2.shape[0])
    target_w = min(img1.shape[1], img2.shape[1])
    target_size = (target_w, target_h)  # cv2.resize takes (width, height)
    first_resized = cv2.resize(img1, target_size)
    second_resized = cv2.resize(img2, target_size)
    return first_resized, second_resized


In [29]:
# Build a SAM model from a checkpoint file
def get_sam_model(checkpoint, model_type="vit_h"):
    """Build the SAM variant ``model_type`` from ``checkpoint``.

    The model is moved to CUDA when a GPU is available; otherwise it is
    returned as constructed by the registry entry.
    """
    build_sam_variant = sam_model_registry[model_type]
    sam_model = build_sam_variant(checkpoint=checkpoint)
    if not torch.cuda.is_available():
        return sam_model
    sam_model.to(device="cuda")
    return sam_model

# Run the DINOv2 encoder and pull out its patch-token features
def get_dino_feat_maps(dino_model, dino_img):
    """Return the ``"x_norm_patchtokens"`` entry of ``dino_model.forward_features``.

    The forward pass runs under ``torch.no_grad()``, so no autograd graph is
    recorded.
    """
    with torch.no_grad():
        encoder_outputs = dino_model.forward_features(dino_img)
    return encoder_outputs["x_norm_patchtokens"]

# Turn an image into a normalized, batched tensor for DINOv2
def adapt_img_to_dino(img):
    """Convert ``img`` to a normalized tensor with a leading batch dimension.

    NumPy arrays are first cast to uint8 and wrapped in a PIL image; any other
    input is passed to the transform pipeline unchanged.
    """
    channel_mean = (0.485, 0.456, 0.406)
    channel_std = (0.229, 0.224, 0.225)
    pipeline = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=channel_mean, std=channel_std),
    ])

    pipeline_input = (
        Image.fromarray(img.astype('uint8')) if isinstance(img, np.ndarray) else img
    )
    return pipeline(pipeline_input).unsqueeze(0)

# Reuse the DINOv2 encoder loaded in the "Download and Setup Models" section
# instead of calling torch.hub.load a second time. A second load rebinds
# `dinov2_vitl14` to a fresh copy that has had neither .eval() nor .cuda()
# applied, while perform_one_shot_segmentation moves its input tensors to CUDA
# when a GPU is available. It also pays for a redundant model load.
dinov2_vitl14 = dinov2_vitl14.eval()
if torch.cuda.is_available():
    dinov2_vitl14 = dinov2_vitl14.cuda()

# Build the Matcher instance used for feature matching
def get_feature_match_model():
    """Create a ``Matcher`` around the notebook-level DINOv2 encoder.

    NOTE(review): the recorded run of this notebook stops with
    "AttributeError: 'NoneType' object has no attribute 'predictor'".
    Presumably ``Matcher`` also needs a SAM-backed component passed at
    construction time, and the filter keys below should be checked against
    what ``Matcher`` actually reads. Confirm against the Matcher class.
    """
    filter_settings = dict(score_thresh=0.05, iou_thresh=0.7, min_area=100)
    matcher = Matcher(encoder=dinov2_vitl14, score_filter_cfg=filter_settings)
    return matcher



# Initialize models
# `sam_path` is the checkpoint file path set in the SAM download cell (that
# cell rebinds the name used earlier for the segment_anything source folder).
print("Initializing SAM model...")
sam = get_sam_model(checkpoint=sam_path)
print("SAM model initialized!")

# The matcher is built around the notebook-level `dinov2_vitl14` encoder.
print("\nInitializing Feature Matching model...")
matching_model = get_feature_match_model()
print("Feature Matching model initialized!")

Using cache found in C:\Users\vikra/.cache\torch\hub\facebookresearch_dinov2_main


Initializing SAM model...
SAM model initialized!

Initializing Feature Matching model...
Feature Matching model initialized!
SAM model initialized!

Initializing Feature Matching model...
Feature Matching model initialized!


## Define Helper Functions

Let's define helper functions to perform one-shot segmentation using the Matcher framework.

In [60]:
def perform_one_shot_segmentation(ref_image_path, src_image_path):
    """Perform one-shot segmentation from ref → src using SAM and Matcher.

    Both images are read, resized to the matcher's input height, forced to a
    common size and converted to normalized DINOv2 input tensors. The
    reference prompt is a mask with a single pixel set at the centre of the
    reference image.

    Args:
        ref_image_path: Path of the reference image.
        src_image_path: Path of the image to segment.

    Returns:
        dict with keys:
            "reference_image": resized reference image, in the channel order
                returned by ``read_image`` (OpenCV BGR).
            "source_image": resized source image, same channel order.
            "masks": whatever ``matching_model.predict`` returns.
    """
    # Read images
    ref_image = read_image(ref_image_path)
    src_image = read_image(src_image_path)

    # Resize to the matcher's expected height, then to one common size.
    required_size = matching_model.input_size[-1]  # e.g., 518
    ref_image = image_resize(ref_image, height=required_size)
    src_image = image_resize(src_image, height=required_size)
    ref_image, src_image = ensure_same_size(ref_image, src_image)

    # cv2.imread yields BGR, while the encoder preprocessing expects RGB, so
    # convert before building tensors. adapt_img_to_dino applies the same
    # normalization the rest of this notebook defines for DINOv2, replacing
    # the previous ad-hoc 0.5/0.5 statistics.
    ref_dino_img = adapt_img_to_dino(cv2.cvtColor(ref_image, cv2.COLOR_BGR2RGB))
    src_dino_img = adapt_img_to_dino(cv2.cvtColor(src_image, cv2.COLOR_BGR2RGB))

    use_cuda = torch.cuda.is_available()
    if use_cuda:
        ref_dino_img = ref_dino_img.cuda()
        src_dino_img = src_dino_img.cuda()

    # Only the source feature map is consumed below. get_dino_feat_maps
    # already runs under torch.no_grad().
    src_feat_map = get_dino_feat_maps(dinov2_vitl14, src_dino_img)

    # -------- Reference Setup -------- #
    # Binary mask with a single pixel set at the image centre.
    h, w = ref_image.shape[:2]
    mask = np.zeros((h, w), dtype=np.uint8)
    mask[h // 2, w // 2] = 1

    # Shape (1, 1, h, w), float.
    mask_tensor = torch.from_numpy(mask).unsqueeze(0).unsqueeze(0).float()
    if use_cuda:
        mask_tensor = mask_tensor.cuda()

    # NOTE(review): the recorded run fails after the "Direction 1" message
    # with "AttributeError: 'NoneType' object has no attribute 'predictor'",
    # most likely inside one of the three Matcher calls below. Confirm
    # against the Matcher class that set_target accepts a feature map and
    # that predict accepts `sam=`.
    matching_model.set_reference(ref_dino_img, mask_tensor)
    matching_model.set_target(src_feat_map)
    all_sam_masks = matching_model.predict(sam=sam)

    # Return results
    return {
        "reference_image": ref_image,
        "source_image": src_image,
        "masks": all_sam_masks
    }


## Process All Image Pairs

Now let's process all image pairs in the Images folder.

In [61]:
# Get list of all subfolders in the Images directory
images_dir = "Images"
# os.listdir returns entries in arbitrary order; sort so that the
# `subfolders[:3]` / `subfolders[3:]` splits used later select the same
# folders on every machine and every run.
subfolders = sorted(
    entry
    for entry in os.listdir(images_dir)
    if os.path.isdir(os.path.join(images_dir, entry))
)
print(f"Found {len(subfolders)} subfolders in the Images directory:")
print(subfolders)

Found 17 subfolders in the Images directory:
['backpack', 'backpack_dog', 'barn', 'bear_plushie', 'berry_bowl', 'can', 'candle', 'cat', 'cat2', 'cat_statue', 'chair', 'clock', 'colorful_sneaker', 'colorful_teapot', 'dog', 'dog2', 'dog3']


In [62]:
# Function to process a single subfolder
def process_subfolder(subfolder):
    """Run one-shot segmentation in both directions on a subfolder's image pair.

    The first two image files (sorted by name, extension matched
    case-insensitively) of ``images_dir/subfolder`` are used. Each one serves
    once as the reference and once as the source.

    Args:
        subfolder: Name of a directory inside ``images_dir``.

    Returns:
        None. Progress is printed, and each result is passed to
        ``visualize_segmentation_results``. If fewer than two images are
        found, the subfolder is skipped.
    """
    print(f"\nProcessing subfolder: {subfolder}")
    subfolder_path = os.path.join(images_dir, subfolder)
    # Sort for a deterministic pair (os.listdir order is arbitrary) and
    # lowercase the name so that e.g. "IMG.JPG" is not silently ignored.
    image_files = sorted(
        name
        for name in os.listdir(subfolder_path)
        if name.lower().endswith(('.jpg', '.jpeg', '.png'))
    )

    if len(image_files) < 2:
        print(f"Skipping {subfolder} - insufficient images")
        return

    # For this demo, we'll use just the first two images
    first_name, second_name = image_files[:2]
    print(f"Image pair: {first_name} and {second_name}")

    # Direction 1 uses the first image as reference; direction 2 swaps roles.
    directions = ((first_name, second_name), (second_name, first_name))
    for number, (ref_name, src_name) in enumerate(directions, start=1):
        print(f"\nDirection {number}: {ref_name} (ref) → {src_name} (src)")
        ref_path = os.path.join(subfolder_path, ref_name)
        src_path = os.path.join(subfolder_path, src_name)
        results = perform_one_shot_segmentation(ref_path, src_path)
        # NOTE(review): visualize_segmentation_results is not defined in the
        # visible part of this notebook. Confirm it is defined before this
        # function is called.
        visualize_segmentation_results(results, ref_path, src_path)

In [63]:
# Process a few selected subfolders (you can change this to process all).
# The slice boundary (3) must stay in sync with the `subfolders[3:]` slice in
# the "Process All Remaining Subfolders" cell, or folders will be skipped or
# processed twice.
selected_subfolders = subfolders[:3]  # Start with first 3 for demonstration
print(f"Processing {len(selected_subfolders)} selected subfolders: {selected_subfolders}")

# Each call runs both reference/source directions for the subfolder's image pair.
for subfolder in selected_subfolders:
    process_subfolder(subfolder)

Processing 3 selected subfolders: ['backpack', 'backpack_dog', 'barn']

Processing subfolder: backpack
Image pair: 00.jpg and 05.jpg

Direction 1: 00.jpg (ref) → 05.jpg (src)


AttributeError: 'NoneType' object has no attribute 'predictor'

# CLIP

## Process All Remaining Subfolders

Run this cell to process all remaining subfolders.

In [None]:
# Process remaining subfolders: everything after the first three, which the
# earlier demonstration cell handles via `subfolders[:3]`.
remaining_subfolders = subfolders[3:]
print(f"Processing {len(remaining_subfolders)} remaining subfolders: {remaining_subfolders}")

# Each call runs both reference/source directions for the subfolder's image pair.
for subfolder in remaining_subfolders:
    process_subfolder(subfolder)

## Conclusion

In this notebook, we demonstrated the use of the Matcher framework for one-shot segmentation. For each pair of images, we performed segmentation in both directions (using each image as the reference for the other). The results show how well the framework can transfer segmentation from a reference image to a target image using feature matching between DINOv2 features, followed by mask generation with the Segment Anything Model (SAM).