## Test CLIP

In [2]:
import pickle as pkl
import argparse
import os
import pickle
import random
import numpy as np
import torch
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score

from pcbm.data import get_dataset
from pcbm.concepts import ConceptBank
from pcbm.models import PosthocLinearCBM, get_model
from pcbm.training_tools import load_or_compute_projections

# --- Experiment configuration ---
# Seed handed to set_random_seed for every RNG used in this notebook.
UNIVERSAL_SEED = 2024
# DataLoader settings.
BATCH_SIZE = 64
NUM_WORKERS = 4
# Pickled concept bank; its file name says it was built with clip:RN50.
CONCEPT_BANK_PATH = "/home/ksas/Public/datasets/cifar10_concept_bank/multimodal_concept_clip:RN50_cifar10_recurse:1.pkl"
OUT_PUT_DIR_PATH = "exps/test"
# Post-hoc CBM checkpoint; its file name says it was trained on clip:RN50.
CKPT_PATH = "data/ckpt/CIFAR_10/pcbm_cifar10__clip:RN50__multimodal_concept_clip:RN50_cifar10_recurse:1__lam:0.0002__alpha:0.99__seed:42.ckpt"
DATASET_PATH = "/home/ksas/Public/datasets/cifar10_concept_bank"
# Must match the backbone named in CKPT_PATH / CONCEPT_BANK_PATH. It was
# "clip:ViT-B/32", whose image embeddings have a different width than the
# RN50 embeddings the concept bank and checkpoint were built from.
BACKBONE_NAME = "clip:RN50"
DEVICE = "cuda"

In [3]:
def set_random_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and make cuDNN deterministic.

    Args:
        seed: Value passed unchanged to every seeding function below.
    """
    # Same call order as before: random, numpy, torch, torch.cuda.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Request deterministic cuDNN kernels and switch off the benchmark auto-tuner.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
# Seed all RNGs once with the notebook-wide seed.
set_random_seed(UNIVERSAL_SEED)

In [None]:
# Load the pickled concept dictionary. The context manager closes the file
# handle that the bare open(...) call used to leave dangling.
# NOTE: pickle can execute arbitrary code -- only load banks from trusted sources.
with open(CONCEPT_BANK_PATH, 'rb') as bank_file:
    all_concepts = pkl.load(bank_file)
all_concept_names = list(all_concepts.keys())
print(f"Bank path: {CONCEPT_BANK_PATH}. {len(all_concept_names)} concepts will be used.")
concept_bank = ConceptBank(all_concepts, DEVICE)

import clip
# BACKBONE_NAME has the form "<family>:<architecture>"; clip.load takes the second part.
clip_backbone_name = BACKBONE_NAME.split(":")[1]
backbone, preprocess = clip.load(clip_backbone_name, device=DEVICE, download_root="/home/ksas/Public/model_zoo/clip")
# One pass instead of the repeated eval()/to() calls: cast to fp32, place on
# DEVICE and switch to inference mode.
backbone = backbone.float().to(DEVICE).eval()
model = None

# Last expression: show the module tree as the cell output (as before).
backbone

In [None]:
# Display the `visual` submodule of the loaded CLIP backbone to inspect its layers.
backbone.visual

In [4]:
# Dump the concept bank's concept names to cifar10_concept.txt,
# one "<index>\t-<name>" line per concept.
with open("cifar10_concept.txt", "w+") as concept_file:
    concept_file.writelines(
        f"{idx}\t-{concept_name}\n"
        for idx, concept_name in enumerate(concept_bank.concept_info.concept_names)
    )

In [None]:
# Restore the checkpointed object (annotated as a PosthocLinearCBM) onto DEVICE.
# NOTE(review): recent PyTorch releases default torch.load to weights_only=True,
# which may reject a fully pickled module -- confirm against the installed version.
posthoc_layer: PosthocLinearCBM = torch.load(CKPT_PATH, map_location=DEVICE)

# Report the classifier analysis (k=5), the stored names and how many names there are.
print(
    posthoc_layer.analyze_classifier(k=5),
    posthoc_layer.names,
    len(posthoc_layer.names),
    sep="\n",
)

In [None]:
from torchvision import datasets
from pcbm.learn_concepts_multimodal import *
# CIFAR-10 splits share everything except the `train` flag; images go through
# the CLIP `preprocess` pipeline.
cifar_kwargs = dict(root=DATASET_PATH, download=True, transform=preprocess)
trainset = datasets.CIFAR10(train=True, **cifar_kwargs)
testset = datasets.CIFAR10(train=False, **cifar_kwargs)

# Label lookups in both directions, derived from the training split's class list.
classes = trainset.classes
class_to_idx = dict(zip(classes, range(len(classes))))
idx_to_class = {index: label for label, index in class_to_idx.items()}

# Only the training loader shuffles.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
train_loader = torch.utils.data.DataLoader(trainset, shuffle=True, **loader_kwargs)
test_loader = torch.utils.data.DataLoader(testset, shuffle=False, **loader_kwargs)

In [None]:
def show_image(images: torch.Tensor, nrow: int = 8):
    """Render a batch of images as one grid figure.

    Args:
        images: Batch passed to ``torchvision.utils.make_grid``. The caller in
            this notebook passes a detached CPU batch; CUDA or grad-tracking
            tensors are also handled.
        nrow: Number of images per grid row, forwarded to ``make_grid``.
            Defaults to 8 (the previously hard-coded value), which lays a
            64-image batch out as an 8x8 grid.
    """
    import torchvision
    import matplotlib.pyplot as plt

    # Tile the batch into a single image; normalize=True rescales values for display.
    grid_img = torchvision.utils.make_grid(images, nrow=nrow, normalize=True)

    # matplotlib needs a host-side array without autograd history, in [H, W, C] order.
    grid_hwc = grid_img.detach().cpu().permute(1, 2, 0).numpy()
    plt.imshow(grid_hwc)
    plt.axis('off')  # hide the axes
    plt.show()

# Inspect the first training batches: run them through the CLIP backbone and the
# post-hoc CBM head, then report tensor shapes and batch accuracy.
# The loop used to stop in pdb.set_trace() after every batch, which hangs
# "Run All"; the batch cap below replaces that manual pause.
MAX_INSPECT_BATCHES = 1
TOP_K_CONCEPTS = 5
accuracy_idx = []

for idx, data in enumerate(train_loader):
    if idx >= MAX_INSPECT_BATCHES:
        break

    print(len(data))
    print(f"x: {data[0].size()}")
    print(f"y: {data[1].size()}")
    batch_X, batch_Y = data
    batch_X = batch_X.to(DEVICE)
    batch_Y = batch_Y.to(DEVICE)

    # NOTE(review): nothing in this cell calls backward(); requires_grad_ is kept
    # in case a later attribution step relies on it -- confirm or drop.
    batch_X.requires_grad_(True)
    embeddings = backbone.encode_image(batch_X)
    projs = posthoc_layer.compute_dist(embeddings)
    predicted_Y = posthoc_layer.forward_projs(projs)
    accuracy = (predicted_Y.argmax(1) == batch_Y).float().mean().item()
    accuracy_idx.append(accuracy)

    # Names of the TOP_K_CONCEPTS largest projections per sample. Not printed;
    # kept as a variable for interactive inspection.
    _, topk_indices = torch.topk(projs, TOP_K_CONCEPTS, dim=1)
    topk_concept = [
        [posthoc_layer.names[concept_idx] for concept_idx in row]
        for row in topk_indices.tolist()
    ]

    show_image(batch_X.detach().cpu())
    print(f"embeddings: {embeddings.size()}")
    print(f"projections: {projs.size()}")
    print(f"predicted_Y: {predicted_Y.size()}")
    print(f"accuracy: {accuracy}")

print(accuracy_idx)
    

In [None]:
import pickle as pkl
import os
from constants import dataset_cosntants
from pcbm.data.cub import CUBConceptDataset, get_concept_dicts
from pcbm.concepts import ConceptBank

CUB_CONCEPT_BANK_PATH =  "/home/ksas/Public/datasets/cub_concept_bank/cub_resnet18_cub_0.1_100.pkl"
DEVICE = "cuda"

# Load the pickled training metadata. The context manager closes the file
# handle that the bare open(...) call used to leave dangling.
# NOTE: pickle can execute arbitrary code -- only load trusted files.
TRAIN_PKL = os.path.join(dataset_cosntants.CUB_PROCESSED_DIR, "train.pkl")
with open(TRAIN_PKL, "rb") as metadata_file:
    metadata = pkl.load(metadata_file)

concept_info = get_concept_dicts(metadata=metadata)
# Last expression: show the keys of the first entry as the cell output.
concept_info[0].keys()

In [None]:
# Sizes of the concept_info container and of entries 0 and 1 under its first element.
print(len(concept_info))
print(len(concept_info[0][0]))
print(len(concept_info[0][1]))

## Test Layer Grad CAM

In [None]:
import clip
from clip.model import CLIP, ModifiedResNet, VisionTransformer
import torch
from PIL import Image
import torchvision.transforms as transforms

BACKBONE_NAME = "clip:ViT-B/32"
DEVICE = "cuda"

# Text after the colon is the architecture name clip.load expects.
clip_backbone_name = BACKBONE_NAME.partition(":")[2]
backbone, preprocess = clip.load(clip_backbone_name, device=DEVICE, download_root="/home/ksas/Public/model_zoo/clip")
backbone = backbone.eval().float()

# Split the CLIP pipeline: its last transform becomes `normalizer`, everything
# before it stays in `preprocess`, so the pre-normalization image remains
# available (it is what the later cells hand to viz_attn).
pipeline_steps = list(preprocess.transforms)
normalizer = transforms.Compose(pipeline_steps[-1:])
preprocess = transforms.Compose(pipeline_steps[:-1])
print(preprocess)

In [4]:
from explain_utils import *
from captum.attr import visualization, GradientAttribution, LayerAttribution

# Residual blocks of the visual transformer, in registration order.
image_attn_blocks = list(backbone.visual.transformer.resblocks.children())
# Attribution target: the `ln_1` submodule of the final residual block.
last_blocks = image_attn_blocks[-1].ln_1
layer_grad_cam = layer_grad_cam_vit(backbone, last_blocks)

In [5]:
from visual_utils import *

# Un-normalized image batch of size 1, plus the tokenized prompts.
prompts = ["car", "a dog", "a cat"]
raw_image = Image.open("data/images/cat_and_dog.jpg")
image: torch.Tensor = preprocess(raw_image).unsqueeze(0).to(DEVICE)
text = clip.tokenize(prompts).to(DEVICE)

# Model output for the normalized image against every prompt.
print(backbone(normalizer(image), text))

# Attribution for target index 1 (presumably the "a dog" prompt -- confirm
# against layer_grad_cam_vit), resized to the image's spatial size.
attributions: torch.Tensor = layer_grad_cam.attribute(
    normalizer(image), 1, additional_args={"text": text}
)
print(attributions)
upsampled_attr = LayerAttribution.interpolate(
    attributions, image.shape[-2:], interpolate_mode="bicubic"
)

viz_attn(image, upsampled_attr, blur=False, save_to=None)


In [None]:
image: torch.Tensor = preprocess(Image.open("data/images/multi_dog_and_cat.jpg")).unsqueeze(0).to(DEVICE)
text = clip.tokenize(["cat and dog", "dog", "cat", "many dogs", "many cats"]).to(DEVICE)
print(backbone(normalizer(image), text))

# One attribution map per target index, in the original order (2, then 1).
# The loop replaces two copy-pasted attribute/interpolate/visualize stanzas.
# Presumably each index selects the prompt at that position -- confirm against
# layer_grad_cam_vit.
for target_idx in (2, 1):
    attributions: torch.Tensor = layer_grad_cam.attribute(
        normalizer(image), target_idx, additional_args={"text": text}
    )
    upsampled_attr = LayerAttribution.interpolate(
        attributions, image.size()[-2:], interpolate_mode="bicubic"
    )
    viz_attn(image,
             upsampled_attr,
             blur=False,
             save_to=None)


In [None]:
image: torch.Tensor = preprocess(Image.open("data/layer_grad_cam/propellers_images/946-original_image.jpg")).unsqueeze(0).to(DEVICE)
text = clip.tokenize(["airplane", "propellers", "landing gear"]).to(DEVICE)
print(backbone(normalizer(image), text))

# One blurred attribution map per target index, in the original order (0, 1, 2).
# The loop replaces three copy-pasted attribute/interpolate/visualize stanzas.
# Presumably each index selects the prompt at that position -- confirm against
# layer_grad_cam_vit.
for target_idx in (0, 1, 2):
    attributions: torch.Tensor = layer_grad_cam.attribute(
        normalizer(image), target_idx, additional_args={"text": text}
    )
    upsampled_attr = LayerAttribution.interpolate(
        attributions, image.size()[-2:], interpolate_mode="bicubic"
    )
    viz_attn(image,
             upsampled_attr,
             blur=True,
             save_to=None)


In [None]:
# Un-normalized image batch of size 1, plus the tokenized prompts.
prompts = ["car", "dog", "cat"]
raw_image = Image.open("data/images/dog_cat.jpg")
image: torch.Tensor = preprocess(raw_image).unsqueeze(0).to(DEVICE)
text = clip.tokenize(prompts).to(DEVICE)

# Model output for the normalized image against every prompt.
print(backbone(normalizer(image), text))

# Attribution for target index 2 (presumably the "cat" prompt -- confirm
# against layer_grad_cam_vit), resized to the image's spatial size.
attributions: torch.Tensor = layer_grad_cam.attribute(
    normalizer(image), 2, additional_args={"text": text}
)
print(attributions)
upsampled_attr = LayerAttribution.interpolate(
    attributions, image.shape[-2:], interpolate_mode="bicubic"
)

viz_attn(image, upsampled_attr, blur=False, save_to=None)

In [None]:
# Un-normalized image batch of size 1, plus the single tokenized prompt.
prompts = ["man with eyeglasses"]
raw_image = Image.open("data/images/glasses.png")
image: torch.Tensor = preprocess(raw_image).unsqueeze(0).to(DEVICE)
text = clip.tokenize(prompts).to(DEVICE)

# Model output for the normalized image against the prompt.
print(backbone(normalizer(image), text))

# Attribution for target index 0, resized to the image's spatial size and
# drawn with blurring enabled.
attributions: torch.Tensor = layer_grad_cam.attribute(
    normalizer(image), 0, additional_args={"text": text}
)
upsampled_attr = LayerAttribution.interpolate(
    attributions, image.shape[-2:], interpolate_mode="bicubic"
)

viz_attn(image, upsampled_attr, blur=True, save_to=None)

## Test OPEN_CLIP

In [1]:
import open_clip
# Show the (architecture, pretrained-tag) pairs open_clip can load.
open_clip.list_pretrained()

[('RN50', 'openai'),
 ('RN50', 'yfcc15m'),
 ('RN50', 'cc12m'),
 ('RN50-quickgelu', 'openai'),
 ('RN50-quickgelu', 'yfcc15m'),
 ('RN50-quickgelu', 'cc12m'),
 ('RN101', 'openai'),
 ('RN101', 'yfcc15m'),
 ('RN101-quickgelu', 'openai'),
 ('RN101-quickgelu', 'yfcc15m'),
 ('RN50x4', 'openai'),
 ('RN50x16', 'openai'),
 ('RN50x64', 'openai'),
 ('ViT-B-32', 'openai'),
 ('ViT-B-32', 'laion400m_e31'),
 ('ViT-B-32', 'laion400m_e32'),
 ('ViT-B-32', 'laion2b_e16'),
 ('ViT-B-32', 'laion2b_s34b_b79k'),
 ('ViT-B-32', 'datacomp_xl_s13b_b90k'),
 ('ViT-B-32', 'datacomp_m_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_clip_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_laion_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_image_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_text_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_basic_s128m_b4k'),
 ('ViT-B-32', 'commonpool_m_s128m_b4k'),
 ('ViT-B-32', 'datacomp_s_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_clip_s13m_b4k'),
 ('ViT-B-32', 'commonpool_s_laion_s13m_b4k'),
 ('ViT-B-32', 'commonpool_

In [6]:
import torch
from PIL import Image
import open_clip

# NOTE: this rebinds `preprocess`, `image` and `text`, which the CLIP / Grad-CAM
# cells above also use.
model, _, preprocess = open_clip.create_model_and_transforms(
    'ViT-B-32',
    pretrained='openai',
    cache_dir="/home/ksas/Public/model_zoo/clip",
)
# The model starts in train mode, which matters for models with BatchNorm or
# stochastic depth; switch to eval before inference.
model.eval()
tokenizer = open_clip.get_tokenizer('ViT-B-32')

prompts = ["a diagram", "a dog", "a cat"]
image = preprocess(Image.open("data/images/cat_and_dog.jpg")).unsqueeze(0)
text = tokenizer(prompts)
print(preprocess)

with torch.no_grad():
    with torch.cuda.amp.autocast():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)

        # Scale both embeddings to unit L2 norm along the feature axis.
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)

        # Scaled image-text similarities, softmaxed over the prompts.
        similarity_logits = 100.0 * image_features @ text_features.T
        text_probs = similarity_logits.softmax(dim=-1)

print("Label probs:", text_probs)

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=True)
    CenterCrop(size=(224, 224))
    <function _convert_to_rgb at 0x797c2cbf7240>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)
Label probs: tensor([[0.1177, 0.5364, 0.3459]])
