In [None]:
from PIL import Image
from transformers import AutoImageProcessor, AutoModel
import torch
import torch.nn.functional as F

# Extract dense DINOv2 patch features for two images and resize them to the
# resolution of the segmentation masks.
#
# NOTE(review): `device`, `mask1` and `mask2` are not defined in this cell; they
# must already exist in the kernel. Define them in an earlier cell so the
# notebook survives "Restart Kernel -> Run All".
# NOTE(review): both paths below point at the same file, so the two feature maps
# are computed from the same image -- confirm this is intended.
img1_path = "/viscam/projects/sfs/mast3r/mast3r_outputs/imgs/7e253ff522632d5118acbecbc33591b9.jpg"
img2_path = "/viscam/projects/sfs/mast3r/mast3r_outputs/imgs/7e253ff522632d5118acbecbc33591b9.jpg"

processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
dinov2_model = AutoModel.from_pretrained('facebook/dinov2-base').to(device)

img1 = Image.open(img1_path)
img2 = Image.open(img2_path)
# as_tensor accepts arrays, lists and tensors alike, so re-running this cell
# (when mask1/mask2 are already tensors) no longer triggers torch.tensor's
# copy-construct warning.
mask1 = torch.as_tensor(mask1, dtype=torch.bool, device=device)
mask2 = torch.as_tensor(mask2, dtype=torch.bool, device=device)
# Both feature maps are resized to mask1's resolution (mask1 must be 2-D).
height, width = mask1.shape

# Pure feature extraction: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    inputs1 = processor(images=img1, return_tensors="pt").to(device)
    out1 = dinov2_model(**inputs1)
    last_hidden1 = out1[0]

    inputs2 = processor(images=img2, return_tensors="pt").to(device)
    last_hidden2 = dinov2_model(**inputs2)[0]

# Remove the [CLS] token so only patch tokens remain: [1, num_patches, feature_dim]
last_hidden1 = last_hidden1[:, 1:, :]
last_hidden2 = last_hidden2[:, 1:, :]

# The patch tokens form a square grid. Its side is derived from the token count
# instead of being hard-coded (the previous literal 16 contradicted the
# "14x14 patches" comment that accompanied it).
num_patches = last_hidden1.shape[1]
grid_size = int(round(num_patches ** 0.5))
if grid_size * grid_size != num_patches:
    raise ValueError(f"Expected a square patch grid, got {num_patches} patch tokens")
feature_dim = last_hidden1.shape[-1]

# Previous (misleading / misspelled) names kept as aliases in case other cells
# still reference them.
patch_size = grid_size
feautre_dim = feature_dim


def _tokens_to_feature_map(tokens, grid, out_size):
    """Reshape [1, grid*grid, C] patch tokens to [1, C, grid, grid] and bilinearly resize to out_size."""
    feature_map = tokens.reshape(1, grid, grid, tokens.shape[-1]).permute(0, 3, 1, 2)
    return F.interpolate(feature_map, size=out_size, mode='bilinear')


reshaped_output1 = _tokens_to_feature_map(last_hidden1, grid_size, (height, width))
reshaped_output2 = _tokens_to_feature_map(last_hidden2, grid_size, (height, width))


In [None]:
from PIL import Image
import matplotlib.pyplot as plt
from transformers import AutoImageProcessor, AutoModel
import torch
import torch.nn.functional as F

def display_feature_maps(image_paths):
    """
    Display a list of images alongside their feature norm maps computed using DINOv2.

    Each image is run through 'facebook/dinov2-base'; the patch tokens are
    reshaped into a square grid, bilinearly upsampled to the image's own
    resolution, and the per-pixel L2 norm over the feature dimension is shown
    next to the original image.

    Args:
        image_paths (list of str): List of file paths to the input images.
            An empty list is a no-op.

    Returns:
        None. The figure is rendered with ``plt.show()``.

    Raises:
        ValueError: If the number of patch tokens is not a perfect square.
    """
    image_paths = list(image_paths)
    if not image_paths:
        # Nothing to draw; avoids loading the model and a 0-row subplot grid.
        return

    # Define the device (use GPU if available)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Load the DINOv2 processor and model
    processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
    dinov2_model = AutoModel.from_pretrained('facebook/dinov2-base').to(device)

    # Load every image as RGB. convert() returns an in-memory copy, so the file
    # handle can be released immediately by the context manager.
    imgs = []
    for img_path in image_paths:
        with Image.open(img_path) as handle:
            imgs.append(handle.convert('RGB'))

    inputs = processor(images=imgs, return_tensors="pt").to(device)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = dinov2_model(**inputs)

    # Drop the [CLS] token -> [batch_size, num_patches, hidden_dim]
    patch_tokens = outputs.last_hidden_state[:, 1:, :]

    # Determine the side of the (square) patch grid and verify it really is square.
    batch_size, num_patches, feature_dim = patch_tokens.shape
    patch_grid_size = int(round(num_patches ** 0.5))
    if patch_grid_size * patch_grid_size != num_patches:
        raise ValueError(f"Expected a square patch grid, got {num_patches} patch tokens")

    # [batch_size, grid, grid, hidden_dim] -> [batch_size, hidden_dim, grid, grid]
    feature_grids = patch_tokens.reshape(
        batch_size, patch_grid_size, patch_grid_size, feature_dim
    ).permute(0, 3, 1, 2)

    # Set up the plot: one row per image, two columns (image | norm map)
    num_images = len(imgs)
    plt.figure(figsize=(12, 6 * num_images))

    for idx, img in enumerate(imgs):
        # PIL reports size as (width, height); interpolate expects (height, width).
        width, height = img.size
        upsampled_output = F.interpolate(
            feature_grids[idx:idx + 1], size=(height, width), mode='bilinear'
        )

        # Feature norm at each spatial location. Only the batch axis is squeezed
        # so a 1-pixel-wide or 1-pixel-tall image keeps its 2-D shape.
        feature_norm = upsampled_output.norm(dim=1).squeeze(0).cpu().numpy()

        # Min-max normalize for visualization; skip the division for a constant
        # map, where it would otherwise produce NaNs.
        feature_norm -= feature_norm.min()
        peak = feature_norm.max()
        if peak > 0:
            feature_norm /= peak

        # Display the original image and the feature norm map side by side
        plt.subplot(num_images, 2, 2 * idx + 1)
        plt.imshow(img)
        plt.axis('off')
        plt.title(f'Original Image {idx + 1}')

        plt.subplot(num_images, 2, 2 * idx + 2)
        plt.imshow(feature_norm, cmap='viridis')
        plt.axis('off')
        plt.title(f'Feature Norm Map {idx + 1}')

    plt.tight_layout()
    plt.show()
    
# Render image / feature-norm pairs for a fixed set of sample images.
display_feature_maps(
    image_paths=[
        "/viscam/projects/sfs/mast3r/mast3r_outputs/imgs/7dcb71ac3a054e86805a0aadbdd58f4d.jpg",
        "/viscam/projects/sfs/mast3r/mast3r_outputs/imgs/7cd086cc7da4a2f55ccff02868c47437.jpg",
        "/viscam/projects/sfs/mast3r/mast3r_outputs/imgs/7d10b9b39d3cf22d78fcd74c859b907e.jpg",
        "/viscam/projects/sfs/mast3r/mast3r_outputs/imgs/7d20e2c3ba8ee0b25e05702a767e985c.jpg",
        "/viscam/projects/sfs/mast3r/mast3r_outputs/imgs/7d37a071a0500ca58351240c8190abae.jpg",
        "/viscam/projects/sfs/mast3r/mast3r_outputs/imgs/7d71e81b5d039c7be0cd6d5d82e66098.jpg",
        "/viscam/projects/sfs/mast3r/mast3r_outputs/imgs/7dc00dac796e3329195508d8ca2b924d.jpg",
    ]
)

In [None]:
import sfs_util
from PIL import Image
# Run sfs_util.pil_to_tensor on one sample image; as the last expression of the
# cell, its return value is shown as the cell output.
# NOTE(review): a file-path string is passed here, while the helper's name
# suggests it may expect a PIL image (`Image` is imported above but unused) --
# confirm against sfs_util.pil_to_tensor's signature.
sfs_util.pil_to_tensor("/viscam/projects/sfs/mast3r/mast3r_outputs/imgs/ffe43230ae908c5a39d003b733661f4c.jpg")

In [None]:
import json
import sfs
from PIL import Image
import torch
import os

# Instantiate sfs.Mast3r with the thresholds used for this experiment.
mast3r = sfs.Mast3r(
    num_responses_per_query=15,
    min_conf_threshold=1.5,
    matching_conf_threshold=2.0,
)

# Keywords stored under the "bad" key of the shared CLIP keyword file.
with open("/viscam/projects/sfs/mast3r/clip_keywords.json", 'r') as file:
    clip_keywords = json.load(file)["bad"]

path = '/viscam/projects/sfs/mast3r/mast3r_outputs/imgs/'
# os.scandir is efficient for large directories but yields entries in arbitrary
# order. Sort the paths so the [20:30] slice selects the same ten images on
# every run, and use the context manager so the directory handle is released.
with os.scandir(path) as entries:
    img_paths = sorted(
        entry.path for entry in entries
        if entry.is_file() and entry.name.endswith('.jpg')
    )[20:30]

# NOTE(review): the trailing 10 is presumably the number of keywords kept per
# image -- confirm against sfs.Mast3r.get_clip_scores2.
scores_dicts = mast3r.get_clip_scores2(img_paths, clip_keywords, 10)

In [8]:

# Display the images along with their corresponding dictionaries from scores_dicts
import matplotlib.pyplot as plt

# Show each image next to the text dump of its scores dictionary.
# NOTE(review): relies on `img_paths`, `scores_dicts` and `Image` left in the
# kernel by the previous cell; scores_dicts is assumed to be aligned one-to-one
# with img_paths.

# copy() loads the pixel data, so each file handle can be closed right away.
images = []
for img_path in img_paths:
    with Image.open(img_path) as handle:
        images.append(handle.copy())

if images:
    # squeeze=False keeps `axes` two-dimensional even for a single image, so
    # axes[idx, col] indexing below always works.
    fig, axes = plt.subplots(len(images), 2, figsize=(12, 6 * len(images)), squeeze=False)

    for idx, (img, scores_dict) in enumerate(zip(images, scores_dicts)):
        image_ax, text_ax = axes[idx, 0], axes[idx, 1]

        # Display image
        image_ax.imshow(img)
        image_ax.axis('off')
        image_ax.set_title(f'Image {idx+1}')

        # Display scores dictionary, one "keyword: score" pair per line
        text_ax.axis('off')
        scores_text = '\n'.join([f'{k}: {v}' for k, v in scores_dict.items()])
        text_ax.text(0.1, 0.5, scores_text, fontsize=12, va='center')
        text_ax.set_title(f'Scores Dict for Image {idx+1}')

    plt.tight_layout()
    plt.show()
else:
    # plt.subplots(0, 2) would raise; report the empty selection instead.
    print("No images to display.")

In [None]:
import json
import sfs
from PIL import Image
import torch
import matplotlib.pyplot as plt
import os

# Instantiate sfs.Mast3r with the thresholds used for this experiment.
mast3r = sfs.Mast3r(
    num_responses_per_query=15,
    min_conf_threshold=1.5,
    matching_conf_threshold=2.0,
)

# Keywords stored under the "classes" key of the shared CLIP keyword file.
with open("/viscam/projects/sfs/mast3r/clip_keywords.json", 'r') as file:
    clip_keywords = json.load(file)["classes"]

path = '/viscam/projects/sfs/mast3r/mast3r_outputs/imgs/'
# os.scandir yields entries in arbitrary order; sort so every run walks the
# images in the same order, and close the directory handle when done.
with os.scandir(path) as entries:
    all_img_paths = sorted(
        entry.path for entry in entries
        if entry.is_file() and entry.name.endswith('.jpg')
    )

imgs_per_iter = 5  # images per figure (loop-invariant, so defined once)

for i in range(10):
    img_paths = all_img_paths[imgs_per_iter * i:imgs_per_iter * (i + 1)]
    if not img_paths:
        # Fewer images than 10 * imgs_per_iter: nothing left to score or plot.
        break

    # NOTE(review): the trailing 10 is presumably the number of keywords kept
    # per image, and the result is assumed to be aligned with img_paths --
    # confirm against sfs.Mast3r.get_clip_scores.
    scores_dicts = mast3r.get_clip_scores(img_paths, clip_keywords, 10)

    # copy() loads the pixel data, so each file handle can be closed right away.
    images = []
    for img_path in img_paths:
        with Image.open(img_path) as handle:
            images.append(handle.copy())

    # squeeze=False keeps `axes` two-dimensional even when the last batch
    # contains a single image.
    fig, axes = plt.subplots(len(images), 2, figsize=(12, 6 * len(images)), squeeze=False)

    for idx, (img, scores_dict) in enumerate(zip(images, scores_dicts)):
        image_ax, text_ax = axes[idx, 0], axes[idx, 1]

        # Display image
        image_ax.imshow(img)
        image_ax.axis('off')
        image_ax.set_title(f'Image {idx+1}')

        # Display scores dictionary, one "keyword: score" pair per line
        text_ax.axis('off')
        scores_text = '\n'.join([f'{k}: {v}' for k, v in scores_dict.items()])
        text_ax.text(0.1, 0.5, scores_text, fontsize=12, va='center')
        text_ax.set_title(f'Scores Dict for Image {idx+1}')

    plt.tight_layout()
    plt.show()
    # Release the figure so memory does not grow with every iteration.
    plt.close(fig)

In [None]:
from transformers import AutoProcessor, AutoTokenizer, CLIPModel, CLIPProcessor
import os
import json
from PIL import Image

import datetime
import matplotlib.pyplot as plt
import os

import torch

# Load the three keyword groups that every image is scored against.
with open("/viscam/projects/sfs/mast3r/clip_keywords.json", 'r') as file:
    json_file = json.load(file)
    class_keywords = json_file["classes"]
    meta_class_keywords = json_file["meta_classes"]
    quality_keywords = json_file["quality"]

path = '/viscam/projects/sfs/mast3r/mast3r_outputs/imgs/'
# os.scandir is efficient for large directories but yields entries in arbitrary
# order; sort so every run processes the same images in the same order, and
# close the directory handle when done.
with os.scandir(path) as entries:
    all_img_paths = sorted(
        entry.path for entry in entries
        if entry.is_file() and entry.name.endswith('.jpg')
    )

# Each run writes its plots into a fresh timestamped directory.
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
save_dir = f'clip_test_imgs/clip_analysis_{timestamp}'
os.makedirs(save_dir, exist_ok=True)

model_name = "openai/clip-vit-base-patch32"
# Fall back to CPU when no GPU is available instead of failing on .to("cuda").
clip_device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained(model_name).to(clip_device)
processor = CLIPProcessor.from_pretrained(model_name)

imgs_per_iter = 5  # images per figure
top_k = 10         # highest-probability keywords kept per group and image
keyword_groups = {
    "quality": quality_keywords,
    "meta_classes": meta_class_keywords,
    "classes": class_keywords,
}

for i in range(20):
    img_paths = all_img_paths[imgs_per_iter * i:imgs_per_iter * (i + 1)]
    if not img_paths:
        # Fewer images than 20 * imgs_per_iter: nothing left to process.
        break

    # copy() loads the pixel data, so each file handle can be closed right away.
    imgs = []
    for image_path in img_paths:
        with Image.open(image_path) as handle:
            imgs.append(handle.copy())

    # One entry per image, mapping group name -> {keyword: probability}.
    # Previously a single flat list collected one dict per (group, image) pair
    # and was then zipped against the images, so only the first group's scores
    # were ever displayed and the other two groups were silently dropped.
    top_k_scores = [{} for _ in imgs]
    for group_name, clip_keywords in keyword_groups.items():
        inputs = processor(text=clip_keywords, images=imgs, return_tensors="pt", padding=True).to(clip_device)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = model(**inputs)
        # Softmax over this group's keywords, rounded to one decimal, as plain floats.
        imgs_probs = outputs.logits_per_image.softmax(dim=1).round(decimals=1).to("cpu").tolist()
        for image_scores, probs in zip(top_k_scores, imgs_probs):
            scores = dict(zip(clip_keywords, probs))
            # Descending sort: keep the top_k most probable keywords.
            ranked = sorted(scores.items(), reverse=True, key=lambda x: x[1])[:top_k]
            image_scores[group_name] = dict(ranked)

    # squeeze=False keeps `axes` two-dimensional even when the last batch
    # contains a single image.
    fig, axes = plt.subplots(len(imgs), 2, figsize=(12, 6 * len(imgs)), squeeze=False)

    for idx, (img, image_scores) in enumerate(zip(imgs, top_k_scores)):
        image_ax, text_ax = axes[idx, 0], axes[idx, 1]

        # Display image
        image_ax.imshow(img)
        image_ax.axis('off')
        image_ax.set_title(f'Image {idx+1}')

        # Display the scores of every keyword group, one titled section per group
        text_ax.axis('off')
        sections = []
        for group_name, scores_dict in image_scores.items():
            section_lines = [f'[{group_name}]']
            section_lines.extend(f'{k}: {v:.1f}' for k, v in scores_dict.items())
            sections.append('\n'.join(section_lines))
        scores_text = '\n\n'.join(sections)
        # Smaller font than a single-group listing so three sections fit in one row.
        text_ax.text(0.1, 0.5, scores_text, fontsize=9, va='center')
        text_ax.set_title(f'Scores Dict for Image {idx+1}')

    plt.tight_layout()

    # Save plot to the timestamped directory (before show, which may clear the figure)
    save_path = os.path.join(save_dir, f'plot_{i}.png')
    fig.savefig(save_path, bbox_inches='tight', dpi=300)
    plt.show()
    # Release the figure so up to 20 large figures do not pile up in memory.
    plt.close(fig)
