## Test CLIP interpret

In [1]:
import clip
import torch
import torch.nn as nn
from typing import List
from PIL import Image
from captum.attr import IntegratedGradients, visualization

backbone_ckpt = "/home/ksas/Public/model_zoo/clip"
backbone_name = "ViT-B/32"
device = "cuda:1" if torch.cuda.is_available() else "cpu"

backbone, preprocess = clip.load(backbone_name, device=device, download_root=backbone_ckpt)
backbone = backbone.float()\
            .to("cuda")\
            .eval()

In [2]:
image:torch.Tensor = preprocess(Image.open("data/images/glasses.png")).unsqueeze(0).to(device)
text = clip.tokenize(["African with sunglasses", "Asisan", "European", "Dog"]).to(device)

image_features = backbone.encode_image(image)
text_features = backbone.encode_text(text)

logits_per_image, logits_per_text = backbone(image, text)
print(logits_per_image)
print(logits_per_text)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument weight in method wrapper_CUDA__cudnn_convolution)

In [None]:
class ClipVisualWithSimilarity(nn.Module):
    def __init__(self, clip_model, comparison_text:List[str]):
        super(ClipVisualWithSimilarity, self).__init__()
        self.clip_model = clip_model
        self.comparison_text = clip.tokenize(comparison_text).to(device)

    def forward(self, image):
        logits_per_image, _ = self.clip_model(image, self.comparison_text)
        return logits_per_image

In [None]:
clip_sim = ClipVisualWithSimilarity(backbone, ["a man with eyeglasses",
                                               "smile"])
ig = IntegratedGradients(clip_sim)

# 对图像的 Integrated Gradients 计算
attributions = ig.attribute(image, target=0)
_ = visualization.visualize_image_attr(attributions.squeeze().permute((1, 2, 0)).detach().cpu().numpy(), 
                                       image.squeeze().permute((1, 2, 0)).detach().cpu().numpy(), 
                                       "blended_heat_map")

attributions = ig.attribute(image, target=1)
_ = visualization.visualize_image_attr(attributions.squeeze().permute((1, 2, 0)).detach().cpu().numpy(), 
                                       image.squeeze().permute((1, 2, 0)).detach().cpu().numpy(), 
                                       "blended_heat_map")

## Manually interpret

In [None]:
# !cd scripts/CIFAR_10/embedding_training/ && bash _interpret_base.sh "cifar10" "outputs/trains/cifar10_open_clip:RN50_saliency_map_adversarial_saliency_guided_training/adversarial_saliency_guided_training-open_clip:RN50.pth"    "open_clip:RN50"    "layer_grad_cam"    "cat"    "sharp claws"    "--save-100-local --exp-name=visual_cifar10_open_clipRN50_saliency_map_asgt --zip"
# !cd scripts/CIFAR_10/embedding_training/ && bash _interpret_base.sh "cifar10" "outputs/trains/cifar10_open_clip:RN50_saliency_map_topK=5_concept_KL_div/topK=5-open_clip:RN50.pth"    "open_clip:RN50"    "layer_grad_cam"    "cat"    "sharp claws"    "--save-100-local --exp-name=visual_cifar10_open_clipRN50_saliency_map_topK=5_concept_KL_div_topK=5 --zip"
# !cd scripts/CIFAR_10/embedding_training/ && bash _interpret_base.sh "cifar10" "outputs/trains/cifar10_open_clip:RN50_saliency_map_adversarial_saliency_guided_training_concept_KL_div/adversarial_saliency_guided_training-open_clip:RN50.pth"    "open_clip:RN50"    "layer_grad_cam"    "cat"    "sharp claws"    "--save-100-local --exp-name=visual_cifar10_open_clip:RN50_saliency_map_adversarial_saliency_guided_training_concept_KL_div --zip"
# !cd scripts/CIFAR_10/embedding_training/ && bash _interpret_base.sh "cifar10" "outputs/clip_adversarial_saliency_guided_training_3e-1/adversarial_saliency_guided_training-open_clip:RN50.pth"    "open_clip:RN50"    "layer_grad_cam"    "cat"    "sharp claws"    "--save-100-local --exp-name=visual_cifar10_open_clipRN50_saliency_map_adversarial_saliency_guided_training_3e-1 --zip"
# !cd scripts/CIFAR_10/embedding_training/ && bash _interpret_base.sh "cifar10" "outputs/trains/cifar10_open_clip:RN50_saliency_map_specific_ocnept_debug-test_concept_KL_div/debug-test-open_clip:RN50.pth"    "open_clip:RN50"    "layer_grad_cam"    "cat"    "sharp claws"    "--save-100-local --exp-name=visual_debug_test --zip"



2024-11-21 20:47:27,766   INFO  {
    "universal_seed": 24,
    "backbone_ckpt": "outputs/trains/cifar10_open_clip:RN50_saliency_map_adversarial_saliency_guided_training_concept_KL_div/adversarial_saliency_guided_training-open_clip:RN50.pth",
    "backbone_name": "open_clip:RN50",
    "concept_bank": "/home/ksas/Public/datasets/cifar10_concept_bank/multimodal_concept_clip:RN50_cifar10_recurse:1.pkl",
    "pcbm_ckpt": "data/ckpt/CIFAR_10/pcbm_cifar10__clip:RN50__multimodal_concept_clip:RN50_cifar10_recurse:1__lam:0.0002__alpha:0.99__seed:42.ckpt",
    "explain_method": "layer_grad_cam",
    "concept_pooling": "max_pooling_class_wise",
    "concept_target": "sharp claws",
    "class_target": "cat",
    "dataset": "cifar10",
    "device": "cuda",
    "batch_size": 1,
    "num_workers": 4,
    "exp_name": "visual_cifar10_open_clip:RN50_saliency_map_adversarial_saliency_guided_training_concept_KL_div",
    "save_100_local": true,
    "zip": true,
    "save_path": "./outputs/evals/visual_cif