In [1]:
import torch
import pandas as pd
import os.path as osp
import numpy as np
from ast import literal_eval
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import pdb
import clip 
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
try:
    from torchvision.transforms import InterpolationMode
    BICUBIC = InterpolationMode.BICUBIC
except ImportError:
    BICUBIC = Image.BICUBIC
    

In [2]:
## Prepare the dataset: read the caption-set CSV and resolve the image paths.
# Benchmark to evaluate; one of: ArtEmis, Flickr30K, VizWiz, COCO.
datasetname = 'COCO'

# ArtEmis keeps its CSV and images under an extra "<name>_IdC" sub-directory;
# the other datasets sit directly under "../Dataset/<name>".
if datasetname == 'ArtEmis':
    dataset_root = f'../Dataset/{datasetname}/{datasetname}_IdC'
else:
    dataset_root = f'../Dataset/{datasetname}'
datafile = f'{dataset_root}/{datasetname}_IdCII_3ErrType.csv'
img_dir = f"{dataset_root}/Images/rawImages"

df = pd.read_csv(datafile)
if datasetname == 'ArtEmis':
    # Only the ArtEmis CSV is filtered on its `split` column; the other CSVs are used whole.
    df = df[df['split'] == 'test']
# Build a fresh frame instead of mutating the filtered slice in place, so the
# column assignments below never write into a view of the raw CSV frame.
df = df.reset_index(drop=True)
print('Number of caption sets in the test set:', len(df))

# The token lists are stored in the CSV as Python-literal strings; parse them back into lists.
df['captSet_CLIP_tokens'] = df['captSet_CLIP_tokens'].apply(literal_eval)
# Turn the bare file names into paths under the image directory.
df['img_files'] = [osp.join(img_dir, imgfile) for imgfile in df['img_files']]

Number of caption sets in the test set: 1699


In [3]:
# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

CLIP_name = 'RN50x16'

# NOTE(review): this `clip.load` returns three values (model, transform, settings),
# so it is presumably a project-specific variant of the CLIP package - confirm.
CLIPmodel, CLIPtransform, CLIPsettings = clip.load(CLIP_name, device, jit=False)

# Unpack the ten architecture settings reported alongside the model.
(
    embed_dim,
    image_resolution,
    vision_layers,
    vision_width,
    vision_patch_size,
    context_length_CLIP,
    vocab_size_CLIP,
    transformer_width,
    transformer_heads,
    transformer_layers,
) = CLIPsettings

In [4]:
# Text prepended to every caption before it is scored.
prefix = 'A photo depicts '
# `clip.tokenize` returns one row per input string; take the single row.
prefix_encoded = clip.tokenize(prefix)[0]
# Keep only the non-zero token ids (zeros are presumably padding), then drop the
# last remaining id (presumably the end-of-text marker) so captions can be appended.
prefix_encoded = prefix_encoded[prefix_encoded != 0][:-1].tolist()

def _convert_image_to_rgb(image):
    """Return `image` converted to RGB mode."""
    return image.convert("RGB")


def preprocess_dataset(df,img_dim):
    """Build the evaluation dataset from the caption-set dataframe.

    Args:
        df: dataframe providing an `img_files` column (image paths) and a
            `captSet_CLIP_tokens` column (token-id lists, one list per caption).
        img_dim: target size passed to both `Resize` and `CenterCrop`.

    Returns:
        A `Dataset` over the images and caption sets of `df`, with the image
        transform (resize, centre crop, RGB, tensor, normalise) attached.
    """
    # Per-channel mean / std used to normalise the image tensor.
    channel_mean = (0.48145466, 0.4578275, 0.40821073)
    channel_std = (0.26862954, 0.26130258, 0.27577711)

    img_transform = Compose([
        # Use the caller-supplied size rather than a notebook global.
        Resize(img_dim, interpolation=BICUBIC),
        CenterCrop(img_dim),
        _convert_image_to_rgb,
        ToTensor(),
        Normalize(channel_mean, channel_std),
    ])
    return Dataset(df['img_files'], df['captSet_CLIP_tokens'], img_transform=img_transform)

class Dataset(torch.utils.data.Dataset):
    """Pairs each image with its set of prefixed, tokenised captions.

    The base class is spelled out in full so that re-running the defining cell
    does not make the class inherit from its own previous definition.

    Args:
        image_files: indexable collection of image paths, or None to skip
            image loading (items then carry an empty list as the image).
        captSets: indexable collection; `captSets[i]` is a sequence of
            token-id lists, one per caption. The length of `captSets[0][0]`
            fixes the number of tokens kept per caption.
        img_transform: optional callable applied to the RGB PIL image.
        prefix_tokens: optional token ids placed in front of every caption.
            When None, the module-level `prefix_encoded` is looked up each
            time an item is fetched.
    """

    def __init__(self, image_files,captSets,img_transform=None,prefix_tokens=None):
        # `super(Dataset).__init__()` would only re-initialise an unbound
        # super object; the zero-argument form calls the parent initialiser.
        super().__init__()
        self.image_files = image_files
        self.captSets = captSets
        self.img_transform = img_transform
        self.prefix_tokens = prefix_tokens
        self.no_tokens = len(self.captSets[0][0])

    def __getitem__(self, index):
        """Return a dict with the transformed image, the token array and the index."""
        if self.prefix_tokens is None:
            prefix_tokens = prefix_encoded
        else:
            prefix_tokens = self.prefix_tokens

        # Replace each caption's first token with the prefix tokens, then cut
        # the result back to the fixed per-caption length.
        captSet = [
            (list(prefix_tokens) + list(capt[1:]))[:self.no_tokens]
            for capt in self.captSets[index]
        ]
        # np.int64 instead of the `np.long` alias, which newer NumPy releases dropped.
        captSet = np.asarray(captSet, dtype=np.int64)

        if self.image_files is not None:
            # `convert` loads the pixel data and returns a new image, so the
            # underlying file can be closed as soon as the block exits.
            with Image.open(self.image_files[index]) as raw_img:
                img = raw_img.convert('RGB')

            if self.img_transform is not None:
                img = self.img_transform(img)
        else:
            img = []
        item = {'image': img, 'captSet': captSet, 'index': index}
        return item

    def __len__(self):
        """Number of caption sets."""
        return len(self.captSets)


In [5]:
# Extracting CLIPscore using code from https://github.com/jmhessel/clipscore
import warnings

from packaging import version
import sklearn.preprocessing

CLIPmodel.eval()
w=2.5  # scale factor applied to the clipped image-text similarity below

dataset = preprocess_dataset(df,image_resolution)
clipscores = []
no_imgs =len(dataset)
print(no_imgs)

# as of numpy 1.21, normalize doesn't work properly for float16
# The check does not change between iterations, so decide (and warn) once up front.
use_sklearn_normalize = version.parse(np.__version__) < version.parse('1.21')
if not use_sklearn_normalize:
    warnings.warn(
        'due to a numerical instability, new numpy normalization is slightly different than paper results. '
        'to exactly replicate paper results, please use numpy version less than 1.21, e.g., 1.20.3.')

for i in range(no_imgs):
    data = dataset[i]
    # Add a leading batch axis so the single image is encoded as a batch of one.
    image_inputs = torch.unsqueeze(data['image'], 0)
    text_inputs  = torch.LongTensor(data['captSet'])
    # Calculate features
    with torch.no_grad():
        image_inputs = image_inputs.to(device)
        if device == 'cuda':
            image_inputs = image_inputs.to(torch.float16)

        image_features = CLIPmodel.encode_image(image_inputs).cpu().numpy()
        text_features = CLIPmodel.encode_text(text_inputs.to(device)).cpu().numpy()

    # L2-normalise each feature row so the dot product below is a cosine similarity.
    if use_sklearn_normalize:
        image_features = sklearn.preprocessing.normalize(image_features, axis=1)
        text_features = sklearn.preprocessing.normalize(text_features, axis=1)
    else:
        image_features = image_features / np.sqrt(np.sum(image_features**2, axis=1, keepdims=True))
        text_features = text_features / np.sqrt(np.sum(text_features**2, axis=1, keepdims=True))

    # One score per caption in the set: the single image row broadcasts against
    # every caption row; negative similarities are clipped to zero before scaling.
    per_instance_image_text = w*np.clip(np.sum(image_features * text_features, axis=1), 0, None)
    clipscores.append(per_instance_image_text.tolist())
    

1699


In [6]:
import numpy as np

# Each entry of `clipscores` holds the scores of one caption set: index 0 is
# compared against indices 1..no_errType (presumably the reference caption
# versus one corrupted caption per error type - confirm against the CSV).
no_errType = 3
cnt_corr_all = 0
cnt_incorr_all = 0
print("Dataset:",datasetname,", Number of caption sets:",len(clipscores))
for errType in range(1,no_errType+1):
    # A set counts as correct only when score 0 is strictly higher; ties count as incorrect.
    outcomes = [sim[0] > sim[errType] for sim in clipscores]
    cnt_corr = sum(outcomes)
    cnt_incorr = len(outcomes) - cnt_corr
    cnt_corr_all += cnt_corr
    cnt_incorr_all += cnt_incorr
    n_sets = cnt_corr + cnt_incorr
    print(f"Accuracy at errType={errType}:{cnt_corr}/{n_sets}=",cnt_corr/n_sets)

n_all = cnt_corr_all + cnt_incorr_all
print(f"Accuracy for all types:{cnt_corr_all}/{n_all}=",cnt_corr_all/n_all)

Dataset: COCO , Number of caption sets: 1699
Accuracy at errType=1:1645/1699= 0.9682165979988229
Accuracy at errType=2:1511/1699= 0.8893466745144203
Accuracy at errType=3:1352/1699= 0.7957622130665097
Accuracy for all types:4508/5097= 0.8844418285265843


In [None]:
# Recorded results, one block per dataset (kept as comments so this cell stays runnable).
#
# Dataset: COCO , Number of caption sets: 1699
# Accuracy at errType=1:1645/1699= 0.9682165979988229
# Accuracy at errType=2:1511/1699= 0.8893466745144203
# Accuracy at errType=3:1352/1699= 0.7957622130665097
# Accuracy for all types:4508/5097= 0.8844418285265843
#
# Dataset: VizWiz , Number of caption sets: 1160
# Accuracy at errType=1:1007/1160= 0.868103448275862
# Accuracy at errType=2:1017/1160= 0.8767241379310344
# Accuracy at errType=3:791/1160= 0.6818965517241379
# Accuracy for all types:2815/3480= 0.8089080459770115
#
# Dataset: Flickr30K , Number of caption sets: 595
# Accuracy at errType=1:575/595= 0.9663865546218487
# Accuracy at errType=2:532/595= 0.8941176470588236
# Accuracy at errType=3:383/595= 0.6436974789915967
# Accuracy for all types:1490/1785= 0.834733893557423
#
# Dataset: ArtEmis , Number of caption sets: 15884
# Accuracy at errType=1:12391/15884= 0.7800931755225384
# Accuracy at errType=2:12165/15884= 0.7658650214051876
# Accuracy at errType=3:8725/15884= 0.5492948879375472
# Accuracy for all types:33281/47652= 0.6984176949550911