In [10]:
import torch
import clip
from torchvision.datasets import VOCSegmentation
from torch.utils.data import DataLoader
from torchvision import transforms
import numpy as np
from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt
import cv2

from torchvision.transforms import Compose, Resize, ToTensor, Normalize
from torchvision.transforms import InterpolationMode
BICUBIC = InterpolationMode.BICUBIC


device = "cuda" if torch.cuda.is_available() else "cpu"
model, _ = clip.load("CS-RN101", device=device)
preprocess_img =  Compose([Resize((224, 224), interpolation=BICUBIC), ToTensor(),
    Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))])


preprocess_target =  Compose([Resize((224, 224), interpolation=BICUBIC), ToTensor()])

class VOCSegmentationCustom(VOCSegmentation):
    def __getitem__(self, index):
        img = self.images[index]
        target = self.masks[index]

        img = Image.open(img) #.convert('RGB')
        
        cv2_img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
        pre_target = Image.open(target)
        target = Image.open(target) #.convert('L')
        
        
        if self.transforms is not None:
            img, target, pre_target = self.transforms(img, target, pre_target)

        return img, cv2_img, target, pre_target

# Load the dataset with the preprocess transformation
test_dataset = VOCSegmentationCustom(
    root='path/to/VOC2012', year='2012', image_set='val', download=False, 
    transforms=lambda img,target,pre_target: (preprocess_img(img), preprocess_target(target), preprocess_target(pre_target))
)

test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=4)

In [11]:
# VOC classes, excluding the background class
voc_classes = [
    "aeroplane", "bicycle", "bird", "boat", "bottle", 
    "bus", "car", "cat", "chair", "cow", 
    "dining table", "dog", "horse", "motorbike", "person", 
    "potted plant", "sheep", "sofa", "train", "tv/monitor"
]

all_texts = voc_classes 


In [3]:
# import torch

# def calculate_iou(mask1, mask2):
#     # Ensure masks are boolean tensors
#     mask1 = mask1.bool()
#     mask2 = mask2.bool()

#     # Compute intersection and union
#     intersection = (mask1 & mask2).sum().item()
#     union = (mask1 | mask2).sum().item()

#     # Compute IoU
#     iou = intersection / union if union != 0 else 0
#     return iou

# threshold = 0.5
# miou_list = []
# with torch.no_grad():
#     i = 0
#     for images, cv2_img, targets, pre_target in tqdm(test_loader):
        

#         ############### This code below will give the GT classes/labels present in the image #############
#         sa = pre_target[0].permute(1,2,0).cpu().detach().numpy()
#         unique_pixels = np.unique(sa) *255
#         true_label = unique_pixels[1:-1]
        
#         ############### get image and text features: and obtain similarity maps (same as Surgery) #############
        
#         images = images.to(device)
#         targets = targets.to(device)
        
#         image_features = model.encode_image(images)
        
#         features = image_features @ text_features.t()
#         similarity = clip.clip_feature_surgery(image_features, text_features)
#         similarity_map = clip.get_similarity_map(similarity[:, 1:, :], 224)

#         #### Now for each class in each image we find the IoU ####
#         miou = []
#         for idx in true_label:
            

#             ## This gives the simialrity between prompts of class idx and image
#             preds = similarity_map[:,:,:,int(idx-1)].view_as(targets)
#             ## Next is just making the pixel values as the class number or the idx. Commonly done in segmentation problems. The class number (0 or 1 - aeroplane will have pixel values as 0 or 1)
#             preds = (preds> threshold).float()
#             preds = preds * idx
        
#             preds = preds.to(torch.uint8)
#             preds_img =preds[0].permute(1,2,0).cpu().detach().numpy() 
#             # print('PREDS', np.unique(preds.cpu().detach().numpy()))
            
#             # plt.imshow(preds_img)  ## 0 is the classname
#             # plt.show()
#             ## Here, in the target we seperate the index, that is segment only the class or idx. 
            
#             tar_img = targets[0].permute(1,2,0).cpu().detach().numpy() * 255
#             tar_img_mask = (tar_img == idx).astype(int) * idx
            
#             tar_img_mask = 255 * tar_img_mask.astype(np.uint8)
#             # print('tar_img_mask', np.unique(tar_img_mask))
            
#             # plt.imshow(tar_img_mask)
#             # plt.show()

#             mask1 = torch.tensor(preds_img, dtype  = torch.uint8)
#             mask2 = torch.tensor(tar_img_mask, dtype=torch.uint8)

#             ## iou is between prediction and target for one class or index. 

#             iou = calculate_iou(mask1, mask2)
#             print('IoU for class {} is {}'.format(idx,iou))
#             miou.append(iou)
        
#         miou_list.append(np.mean(np.array(miou)))
#         print('-'*25)    
#         # i = i+1
#         # if i >5:
#         #     break        
# # mean_iou = np.nanmean(iou_scores)
# # print(f'Mean IoU on PASCAL VOC 2012 test set: {mean_iou:.4f}')


In [4]:
# np.nanmean(np.array(miou_list))

In [5]:
# def calculate_iou(pred, target, num_classes=21):
#     ious = []
#     pred = pred.view(-1)
#     target = target.view(-1)
    
#     for cls in range(num_classes):
#         pred_inds = pred == cls
#         target_inds = target == cls
#         intersection = torch.sum(pred_inds[target_inds])
#         union = torch.sum(pred_inds) + torch.sum(target_inds) - intersection
        
#         if union == 0:
#             ious.append(float('nan'))
#         else:
#             ious.append(float(intersection) / max(union, 1))
    
#     return np.nanmean(ious)


# miou_list = []

# threshold = 0.5
# with torch.no_grad():
#     i = 0
#     for images, cv2_img, targets, pre_target in tqdm(test_loader):
        
#         iou_scores = []

#         ############### This code below will give the GT classes/labels present in the image #############
#         sa = pre_target[0].permute(1,2,0).cpu().detach().numpy()
#         unique_pixels = np.unique(sa) *255
#         filtered_numbers = [num for num in unique_pixels if num not in [0, 255]]

#         true_label = filtered_numbers

        
#         ############### get image and text features: and obtain similarity maps (same as Surgery) #############
        
#         images = images.to(device)
#         targets = targets.to(device)
        
#         image_features = model.encode_image(images)
        
#         features = image_features @ text_features.t()
#         similarity = clip.clip_feature_surgery(image_features, text_features)
#         similarity_map = clip.get_similarity_map(similarity[:, 1:, :], 224)

#         #### Now for each class in each image we find the IoU ####
#         mask_idx = []
#         mask_target = []

#         for idx in true_label:

#             ## This gives the simialrity between prompts of class idx and image
#             preds = similarity_map[:,:,:,int(idx-1)].view_as(targets)

#             ## Next is just making the pixel values as the class number or the idx. Commonly done in segmentation problems. The class number (0 or 1 - aeroplane will have pixel values as 0 or 1)
#             preds = (preds> threshold).float()
#             preds = preds * idx
        
#             preds = preds.to(torch.uint8)
#             preds_img =preds[0].permute(1,2,0).cpu().detach().numpy() 
#             # print('PREDS', np.unique(preds.cpu().detach().numpy()))
            
#             # plt.imshow(preds_img)  ## 0 is the classname
#             # plt.show()

#             ## Here, in the target we seperate the index, that is segment only the class or idx. 
            
#             tar_img = targets[0].permute(1,2,0).cpu().detach().numpy() * 255
#             tar_img_mask = (tar_img == idx).astype(int) * idx
            
#             tar_img_mask = tar_img_mask.astype(np.uint8)
#             # print('tar_img_mask', np.unique(tar_img_mask))
            
#             # plt.imshow(tar_img_mask)
#             # plt.show()

#             mask1 = torch.tensor(preds_img, dtype  = torch.uint8)
#             mask2 = torch.tensor(tar_img_mask, dtype=torch.uint8)

#             mask_idx.append(mask1)
#             mask_target.append(mask2)
        
        
#         summed_mask_idx = torch.sum(torch.stack(mask_idx), dim=0)
#         summed_mask_target = torch.sum(torch.stack(mask_target), dim=0)
#         # plt.imshow(summed_mask_idx)  ## 0 is the classname
#         # plt.show()
#         # print('summed_mask_idx', np.unique(summed_mask_idx.cpu().detach().numpy()))
#         # # plt.imshow(summed_mask_target)  ## 0 is the classname
#         # # plt.show()
#         # print('summed_mask_target', np.unique(summed_mask_target.cpu().detach().numpy()))
        
#         # ## iou is between prediction and target for one class or index. 
#         for p in range(224):
#             iou = calculate_iou(summed_mask_idx[p].cpu(), summed_mask_target[p].cpu(), num_classes=21)
#             iou_scores.append(iou)
        
#         mean_iou = np.nanmean(iou_scores)
#         miou_list.append(mean_iou)


In [6]:
# np.nanmean(np.array(miou_list))

In [7]:
# ## Experiment:

# def calculate_iou(pred, target, num_classes=21):
#     ious = []
#     pred = pred.view(-1)
#     target = target.view(-1)
    
#     for cls in range(num_classes):
#         pred_inds = pred == cls
#         target_inds = target == cls
#         intersection = torch.sum(pred_inds[target_inds])
#         union = torch.sum(pred_inds) + torch.sum(target_inds) - intersection
        
#         if union == 0:
#             ious.append(float('nan'))
#         else:
#             ious.append(float(intersection) / max(union, 1))
    
#     return np.nanmean(ious)


# miou_list = []

# threshold = 0.5
# with torch.no_grad():
#     i = 0
#     for idy, (images, cv2_img, targets, pre_target) in enumerate(tqdm(test_loader)):
        
#         iou_scores = []

#         ############### This code below will give the GT classes/labels present in the image #############
#         sa = pre_target[0].permute(1,2,0).cpu().detach().numpy()
#         unique_pixels = np.unique(sa) *255
#         filtered_numbers = [num for num in unique_pixels if num not in [0, 255]]

#         true_label = filtered_numbers

#         ############### get image and text features: and obtain similarity maps (same as Surgery) #############

#         images = images.to(device)
#         targets = targets.to(device)

#         image_features = model.encode_image(images)

#         features = image_features @ text_features.t()
#         similarity = clip.clip_feature_surgery(image_features, text_features)
#         similarity_map = clip.get_similarity_map(similarity[:, 1:, :], 224)

#         #### Now for each class in each image we find the IoU ####
#         mask_idx = []
#         mask_target = []
#         for idx in true_label:

#             ## This gives the simialrity between prompts of class idx and image
#             preds = similarity_map[:,:,:,int(idx-1)].view_as(targets)

#             ## Next is just making the pixel values as the class number or the idx. Commonly done in segmentation problems. The class number (0 or 1 - aeroplane will have pixel values as 0 or 1)
#             preds = (preds> threshold).float()
#             preds = preds * idx

#             preds = preds.to(torch.uint8)
#             preds_img =preds[0].permute(1,2,0).cpu().detach().numpy()
            

#             ## Here, in the target we seperate the index, that is segment only the class or idx. 

#             tar_img = targets[0].permute(1,2,0).cpu().detach().numpy() * 255
#             tar_img_mask = (tar_img == idx).astype(int) * idx

#             tar_img_mask = tar_img_mask.astype(np.uint8)
#             # print('tar_img_mask', np.unique(tar_img_mask))

#             mask1 = torch.tensor(preds_img, dtype=torch.uint8)
#             mask2 = torch.tensor(tar_img_mask, dtype=torch.uint8)
            

#             mask_idx.append(mask1)
#             mask_target.append(mask2)
        
        
#         summed_mask_idx = torch.sum(torch.stack(mask_idx), dim=0)
#         summed_mask_target = torch.sum(torch.stack(mask_target), dim=0)
# #         plt.imshow(summed_mask_idx)  ## 0 is the classname
# #         plt.show()
#         # print('summed_mask_idx', np.unique(summed_mask_idx.cpu().detach().numpy()))
# #         plt.imshow(summed_mask_target)  ## 0 is the classname
# #         plt.show()
#         # print('summed_mask_target', np.unique(summed_mask_target.cpu().detach().numpy()))

#         # ## iou is between prediction and target for one class or index. 
#         iou = calculate_iou(summed_mask_idx.cpu(), summed_mask_target.cpu(), num_classes=21)
#         iou_scores.append(iou)

#     mean_iou = np.nanmean(iou_scores)
#     miou_list.append(mean_iou)
#     # print(f'Mean IoU on PASCAL VOC 2012 test set: {mean_iou:.4f}')
#     # print('-'*25)

In [8]:
# np.mean(miou_list)

In [10]:
import torch
from tqdm import tqdm

def calculate_iou(pred, target, num_classes=20):
    ious = []
    pred = pred.view(-1)
    target = target.view(-1)
    
    for cls in range(num_classes):
        pred_inds = pred == cls
        target_inds = target == cls
        intersection = torch.sum(pred_inds[target_inds])
        union = torch.sum(pred_inds) + torch.sum(target_inds) - intersection
        
        if union == 0:
            ious.append(float('nan'))
        else:
            ious.append(float(intersection) / max(union, 1))
    # print(ious)
#     print(np.nanmean(ious))
    return np.nanmean(ious)


miou_list = []

with torch.no_grad():
    i = 0
    iou_scores = []
    for images, cv2_img, targets, pre_target in tqdm(test_loader):
        
        ############### This code below will give the GT classes/labels present in the image #############
        sa = pre_target[0].permute(1,2,0).cpu().detach().numpy()
        unique_pixels = np.unique(sa) *255
        filtered_numbers = [num for num in unique_pixels if num not in [0, 255]]

        true_label = filtered_numbers
        # print(true_label)
        
        ############### get image and text features: and obtain similarity maps (same as Surgery) #############

        images = images.to(device)
        targets = targets.to(device)

        image_features = model.encode_image(images)

        features = image_features @ text_features.t()
        similarity = clip.clip_feature_surgery(image_features, text_features)
        similarity_map = clip.get_similarity_map(similarity[:, 1:, :], 224)

        pred = similarity_map.argmax(dim = -1)
        targets = 255.*targets

        # target_mask = torch.zeros_like(targets)
        # pred_mask = torch.zeros_like(pred)
        target_mask_list = []
        pred_mask_list = []
        for val in true_label:
            target_mask_list.append((targets.squeeze(0) == val).to(int)*val)
            pred_mask_list.append((pred == val-1).to(int)*val)
            
        
        target_mask = sum(target_mask_list)
        pred_mask = sum(pred_mask_list)
        
        ans = calculate_iou(pred_mask.cpu(), target_mask.cpu())
        
        miou_list.append(ans)

print(np.nanmean(miou_list))

100%|██████████| 1449/1449 [00:40<00:00, 35.66it/s]

0.39863354497171954





## Loop Over only the argmax classes

In [3]:
import torch
from tqdm import tqdm

def calculate_iou(pred, target, num_classes=21):
    ious = []
    pred = pred.view(-1)
    target = target.view(-1)
    
    for cls in range(1, num_classes): ## start from 1 to ignore background
        pred_inds = pred == cls
        target_inds = target == cls
        intersection = torch.sum(pred_inds[target_inds])
        union = torch.sum(pred_inds) + torch.sum(target_inds) - intersection
        
        if union == 0:
            ious.append(float('nan'))
        else:
            ious.append(float(intersection) / max(union, 1))
#     print(ious)
#     print(np.nanmean(ious))
    return np.nanmean(ious)


miou_list = []

with torch.no_grad():
    i = 0
    iou_scores = []
    for images, cv2_img, targets, pre_target in tqdm(test_loader):
        
        ############### This code below will give the GT classes/labels present in the image #############
        sa = pre_target[0].permute(1,2,0).cpu().detach().numpy()
        unique_pixels = np.unique(sa) *255
        filtered_numbers = [num for num in unique_pixels if num not in [0, 255]]

        true_label = filtered_numbers
        # print(true_label)
        
        ############### get image and text features: and obtain similarity maps (same as Surgery) #############

        images = images.to(device)
        targets = targets.to(device)

        image_features = model.encode_image(images)

        features = image_features @ text_features.t()
        similarity = clip.clip_feature_surgery(image_features, text_features)
        similarity_map = clip.get_similarity_map(similarity[:, 1:, :], 224)

        pred = similarity_map.argmax(dim = -1)
        pred = pred + 1 ## making sure that 0 is background
        
        targets = 255.*targets
        targets[targets == 255] = 0 ## only keep 0 for background, 255 was border, made it background

        
#         print(torch.unique(pred))
#         plt.imshow((pred == 1).permute(1,2,0).cpu())
#         plt.show()
        
#         print(torch.unique(targets))
#         plt.imshow((targets.squeeze(0) == 1).permute(1,2,0).cpu())
#         plt.show()
#         break
        
        target_mask_list = []
        pred_mask_list = []
            
        for val in torch.unique(pred):
            target_mask_list.append((targets.squeeze(0) == val).to(int)*(val))
            pred_mask_list.append((pred == val).to(int)*val)
            
        
        target_mask = sum(target_mask_list)
        pred_mask = sum(pred_mask_list)
        
#         print(torch.unique(target_mask))
        
#         plt.imshow(target_mask.permute(1,2,0).cpu())
#         plt.show()
        
#         plt.imshow(pred_mask.permute(1,2,0).cpu())
#         plt.show()
        
        
        ans = calculate_iou(pred_mask.cpu(), target_mask.cpu())
#         print(ans)
        
        miou_list.append(ans)

print(np.nanmean(miou_list))

100%|██████████| 1449/1449 [00:23<00:00, 60.59it/s]

0.012222481767219597





## Loop over all classes

In [3]:
with torch.no_grad():
    text_features = clip.encode_text_with_prompt_ensemble(model, all_texts, device)

In [5]:
import torch
from tqdm import tqdm

def calculate_iou(pred, target, num_classes=21):
    ious = []
    pred = pred.view(-1)
    target = target.view(-1)
    
    for cls in range(1, num_classes): ## start from 1 to ignore background
        pred_inds = pred == cls
        target_inds = target == cls
        intersection = torch.sum(pred_inds[target_inds])
        union = torch.sum(pred_inds) + torch.sum(target_inds) - intersection
        
        if union == 0:
            ious.append(float('nan'))
        else:
            ious.append(float(intersection) / max(union, 1))
#     print(ious)
#     print(np.nanmean(ious))
    return np.nanmean(ious)

miou_list = []
        

with torch.no_grad():
    i = 0
    iou_scores = []
    for images, cv2_img, targets, pre_target in tqdm(test_loader):

        images = images.to(device)
        targets = targets.to(device)

        image_features = model.encode_image(images)
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        
        features = image_features @ text_features.t()
        similarity = clip.clip_feature_surgery(image_features, text_features)
        similarity_map = clip.get_similarity_map(similarity[:, 1:, :], 224)

        pred = similarity_map.argmax(dim = -1)
        pred = pred + 1 ## making sure that 0 is background
        
        targets = 255.*targets
        targets[targets == 255] = 0 ## only keep 0 for background, 255 was border, made it background
        
        target_mask_list = []
        pred_mask_list = []
            
        for val in range(1,21):
            target_mask_list.append((targets.squeeze(0) == val).to(int)*(val))
            pred_mask_list.append((pred == val).to(int)*val)
            
        target_mask = sum(target_mask_list)
        pred_mask = sum(pred_mask_list)
        
        ans = calculate_iou(pred_mask.cpu(), target_mask.cpu())
        # print(ans)
        miou_list.append(ans)

print(np.nanmean(miou_list))

100%|██████████| 1449/1449 [00:35<00:00, 40.53it/s]

0.009240414121136459





In [6]:
import torch
from tqdm import tqdm

def calculate_iou(pred, target, num_classes=21):
    ious = []
    pred = pred.view(-1)
    target = target.view(-1)
    
    for cls in range( num_classes):
        pred_inds = pred == cls
        target_inds = target == cls
        intersection = torch.sum(pred_inds[target_inds])
        union = torch.sum(pred_inds) + torch.sum(target_inds) - intersection
        
        if union == 0:
            ious.append(float('nan'))
        else:
            ious.append(float(intersection) / max(union, 1))
#     print(ious)
#     print(np.nanmean(ious))
    return np.nanmean(ious)


miou_list = []

with torch.no_grad():
    i = 0
    iou_scores = []
    for images, cv2_img, targets, pre_target in tqdm(test_loader):
        
        images = images.to(device)
        targets = targets.to(device)

        image_features = model.encode_image(images)
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
    

        features = image_features @ text_features.t()
        similarity = clip.clip_feature_surgery(image_features, text_features)
        similarity_map = clip.get_similarity_map(similarity[:, 1:, :], 224)

        pred = similarity_map.argmax(dim = -1)
        pred = pred + 1 ## making sure that 0 is background
        
        targets = 255.*targets
        targets[targets == 255] = 0 ## only keep 0 for background, 255 was border, made it background

        
        target_mask_list = []
        pred_mask_list = []
            
        for val in torch.unique(pred):
            target_mask_list.append((targets.squeeze(0) == val).to(int)*(val))
            pred_mask_list.append((pred == val).to(int)*val)
            
        
        target_mask = sum(target_mask_list)
        pred_mask = sum(pred_mask_list)
        
        ans = calculate_iou(pred_mask.cpu(), target_mask.cpu())
        
        miou_list.append(ans)

print(np.nanmean(miou_list))

100%|██████████| 1449/1449 [00:30<00:00, 48.18it/s]

0.008751337843497195





## Ours with segemetnation

In [12]:
import torch.nn as nn
sizes = [512, 384, 256]

layers_text = []
for i in range(len(sizes) - 2):
    layers_text.append(nn.Linear(sizes[i], sizes[i + 1], bias=False))
    layers_text.append(nn.BatchNorm1d(sizes[i + 1]))
    layers_text.append(nn.ReLU(inplace=True))
layers_text.append(nn.Linear(sizes[-2], sizes[-1], bias=False))
text_projector = nn.Sequential(*layers_text)

size_img = [512, 256]
layers_img = []
# for i in range(len(sizes) - 2):
#     layers_img.append(nn.Linear(size_img[i], size_img[i + 1], bias=False))
#     layers_img.append(nn.BatchNorm1d(197))
#     layers_img.append(nn.ReLU(inplace=True))
layers_img.append(nn.Linear(size_img[-2], size_img[-1], bias=False))
text_projector = nn.Sequential(*layers_text).to(device)

image_projector = nn.Sequential(*layers_img).to(device)


# model_path = '/home/samyakr2/Redundancy/DualCoOp/output/coco_with_SSL_90_0.003R/model_best.pth.tar'
model_path = '/home/samyakr2/Redundancy/DualCoOp/output/coco_with_SSL_90_0.002R/model_best.pth.tar'
state_dict = torch.load(model_path)

projector_weights_text = {}
projector_weights_img = {}

for kays in state_dict['state_dict'].keys():
    if 'text_projector' in kays:
        projector_weights_text[kays[15:]] = state_dict['state_dict'][kays]
    if 'image_projector' in kays:
        projector_weights_img[kays[16:]] = state_dict['state_dict'][kays]


text_projector.load_state_dict(projector_weights_text)
image_projector.load_state_dict(projector_weights_img)

<All keys matched successfully>

### CLIP Surgery

In [13]:
import torch
from tqdm import tqdm

def calculate_iou(pred, target, num_classes=21):
    ious = []
    pred = pred.view(-1)
    target = target.view(-1)
    
    for cls in range( num_classes):
        pred_inds = pred == cls
        target_inds = target == cls
        intersection = torch.sum(pred_inds[target_inds])
        union = torch.sum(pred_inds) + torch.sum(target_inds) - intersection
        
        if union == 0:
            ious.append(float('nan'))
        else:
            ious.append(float(intersection) / max(union, 1))
#     print(ious)
#     print(np.nanmean(ious))
    return np.nanmean(ious)


miou_list = []

with torch.no_grad():
    i = 0
    iou_scores = []
    text_feats = clip.encode_text_with_prompt_ensemble(model, all_texts, device)
        
    for images, cv2_img, targets, pre_target in tqdm(test_loader):

        images = images.to(device)
        targets = targets.to(device)

        image_features = model.encode_image(images)
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        

        # text_features = text_projector(text_feats)
        text_features = text_feats
    

        features = image_features @ text_features.t()
        similarity = clip.clip_feature_surgery(image_features, text_features)
        similarity_map = clip.get_similarity_map(similarity[:, 1:, :], 224)


        pred = similarity_map.argmax(dim = -1)
        pred = pred + 1 ## making sure that 0 is background
        
        targets = 255.*targets
        targets[targets == 255] = 0 ## only keep 0 for background, 255 was border, made it background
        
        target_mask_list = []
        pred_mask_list = []
            
        for val in range(1,21):
            target_mask_list.append((targets.squeeze(0) == val).to(int)*(val))
            pred_mask_list.append((pred == val).to(int)*val)
            
        target_mask = sum(target_mask_list)
        pred_mask = sum(pred_mask_list)
        
        ans = calculate_iou(pred_mask.cpu(), target_mask.cpu())
        miou_list.append(ans)

print(np.nanmean(miou_list))

100%|██████████| 1449/1449 [00:25<00:00, 56.90it/s]

0.008664675723514367





### Ours

In [14]:
import torch
from tqdm import tqdm

def calculate_iou(pred, target, num_classes=21):
    ious = []
    pred = pred.view(-1)
    target = target.view(-1)
    
    for cls in range( num_classes):
        pred_inds = pred == cls
        target_inds = target == cls
        intersection = torch.sum(pred_inds[target_inds])
        union = torch.sum(pred_inds) + torch.sum(target_inds) - intersection
        
        if union == 0:
            ious.append(float('nan'))
        else:
            ious.append(float(intersection) / max(union, 1))
#     print(ious)
#     print(np.nanmean(ious))
    return np.nanmean(ious)


miou_list = []

with torch.no_grad():
    i = 0
    iou_scores = []
    text_feats = clip.encode_text_with_prompt_ensemble(model, all_texts, device)
        
    for images, cv2_img, targets, pre_target in tqdm(test_loader):

        images = images.to(device)
        targets = targets.to(device)

        image_features = model.encode_image(images)
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        img_feat = image_projector(image_features)
        image_features = img_feat

        text_features = text_projector(text_feats)
    

        features = image_features @ text_features.t()
        # similarity = clip.clip_feature_surgery(image_features, text_features)
        # similarity_map = clip.get_similarity_map(similarity[:, 1:, :], 224)

        similarity_map = clip.get_similarity_map(features[:, 1:, :], 224)

        pred = similarity_map.argmax(dim = -1)
        pred = pred + 1 ## making sure that 0 is background
        
        targets = 255.*targets
        targets[targets == 255] = 0 ## only keep 0 for background, 255 was border, made it background
        
        target_mask_list = []
        pred_mask_list = []
            
        for val in range(1,21):
            target_mask_list.append((targets.squeeze(0) == val).to(int)*(val))
            pred_mask_list.append((pred == val).to(int)*val)
            
        target_mask = sum(target_mask_list)
        pred_mask = sum(pred_mask_list)
        
        ans = calculate_iou(pred_mask.cpu(), target_mask.cpu())
        miou_list.append(ans)

print(np.nanmean(miou_list))

100%|██████████| 1449/1449 [00:24<00:00, 58.06it/s]

0.005253465879289109





#### Ours + CLIP surgery

In [15]:
import torch
from tqdm import tqdm

def calculate_iou(pred, target, num_classes=21):
    ious = []
    pred = pred.view(-1)
    target = target.view(-1)
    
    for cls in range( num_classes):
        pred_inds = pred == cls
        target_inds = target == cls
        intersection = torch.sum(pred_inds[target_inds])
        union = torch.sum(pred_inds) + torch.sum(target_inds) - intersection
        
        if union == 0:
            ious.append(float('nan'))
        else:
            ious.append(float(intersection) / max(union, 1))
#     print(ious)
#     print(np.nanmean(ious))
    return np.nanmean(ious)


miou_list = []

with torch.no_grad():
    i = 0
    iou_scores = []
    text_feats = clip.encode_text_with_prompt_ensemble(model, all_texts, device)
        
    for images, cv2_img, targets, pre_target in tqdm(test_loader):

        images = images.to(device)
        targets = targets.to(device)

        image_features = model.encode_image(images)
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        img_feat = image_projector(image_features)
        image_features = img_feat

        text_features = text_projector(text_feats)
    

        features = image_features @ text_features.t()
        similarity = clip.clip_feature_surgery(image_features, text_features)
        similarity_map = clip.get_similarity_map(similarity[:, 1:, :], 224)

        pred = similarity_map.argmax(dim = -1)
        pred = pred + 1 ## making sure that 0 is background
        
        targets = 255.*targets
        targets[targets == 255] = 0 ## only keep 0 for background, 255 was border, made it background
        
        target_mask_list = []
        pred_mask_list = []
            
        for val in range(1,21):
            target_mask_list.append((targets.squeeze(0) == val).to(int)*(val))
            pred_mask_list.append((pred == val).to(int)*val)
            
        target_mask = sum(target_mask_list)
        pred_mask = sum(pred_mask_list)
        
        ans = calculate_iou(pred_mask.cpu(), target_mask.cpu())
        miou_list.append(ans)

print(np.nanmean(miou_list))

100%|██████████| 1449/1449 [00:24<00:00, 58.16it/s]

0.019492460235222737





## Explanablity

In [4]:
import numpy as np
from sklearn.metrics import jaccard_score

def min_max_normalize(segmentation_map):
    """
    Normalize the segmentation map to the range [0, 1] using Min-Max normalization.
    """
    min_val = np.min(segmentation_map)
    max_val = np.max(segmentation_map)
    return (segmentation_map - min_val) / (max_val - min_val)

def compute_iou(ground_truth, binarized_map):
    """
    Compute the Intersection over Union (IoU) between the ground truth and the binarized segmentation map.
    """
    # Flatten the arrays to compute IoU
    return jaccard_score(ground_truth.flatten(), binarized_map.flatten(), average='macro')

def find_best_threshold(normalized_map, ground_truth, step_size=0.01):
    """
    Perform a grid search to find the optimal threshold for a single class segmentation map.
    """
    best_threshold = 0
    best_iou = 0

    thresholds = np.arange(0, 1 + step_size, step_size)

    for threshold in thresholds:
        binarized_map = (normalized_map >= threshold).astype(np.uint8)
        iou = compute_iou(ground_truth, binarized_map)

        if iou > best_iou:
            best_iou = iou
            best_threshold = threshold

    return best_threshold, best_iou

def process_segmentation_map(segmentation_map, ground_truth, num_classes, step_size=0.01):
    """
    Apply the grid search strategy to find the best threshold for each class in the segmentation map.
    """
    best_thresholds = []
    best_ious = []

    for class_idx in num_classes:
        # Extract the segmentation map and ground truth for the current class
        print()
        class_segmentation_map = segmentation_map[class_idx]
        class_ground_truth = ground_truth[class_idx]

        # Normalize the segmentation map
        normalized_map = min_max_normalize(class_segmentation_map)

        # Find the best threshold using grid search
        best_threshold, best_iou = find_best_threshold(normalized_map, class_ground_truth, step_size)

        best_thresholds.append(best_threshold)
        best_ious.append(best_iou)

        print(f"Class {class_idx + 1}: Best Threshold = {best_threshold:.2f}, Best IoU = {best_iou:.4f}")

    return best_thresholds, best_ious


### CLIP Surgery

In [6]:
import torch
from tqdm import tqdm


with torch.no_grad():
    i = 0
    iou_scores = []
    text_feats = clip.encode_text_with_prompt_ensemble(model, all_texts, device)
        
    for images, cv2_img, targets, pre_target in tqdm(test_loader):

        images = images.to(device)
        targets = targets.to(device)

        image_features = model.encode_image(images)
        image_features = image_features / image_features.norm(dim=1, keepdim=True)
        img_feat = image_projector(image_features)
        image_features = img_feat

        text_features = text_projector(text_feats)
    

        features = image_features @ text_features.t()
        similarity = clip.clip_feature_surgery(image_features, text_features)
        similarity_map = clip.get_similarity_map(similarity[:, 1:, :], 224)

        targets = 255.*targets
        targets[targets == 255] = 0 ## only keep 0 for background, 255 was border, made it background        
        num_classes = torch.unique(target_mask)[1:].tolist()

        
        best_thresholds, best_ious = process_segmentation_map(pred_mask.cpu(), target_mask.cpu(), num_classes)
        break


  0%|          | 0/1449 [00:00<?, ?it/s]


NameError: name 'image_projector' is not defined

In [22]:
y

tensor([0, 1], device='cuda:0')