In [1]:
import torch
import pandas as pd
import os.path as osp
import numpy as np
from ast import literal_eval
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import pdb
import clip 
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
try:
    from torchvision.transforms import InterpolationMode
    BICUBIC = InterpolationMode.BICUBIC
except ImportError:
    BICUBIC = Image.BICUBIC
    
import os

In [2]:
## Prepare the  dataset (merge it with the emotion-histograms.)
datasetname = 'COCO' #ArtEmis, Flickr30K,  VizWiz, COCO

if datasetname == 'ArtEmis':
    datafile = f'../Dataset/{datasetname}/{datasetname}_IdC/{datasetname}_IdCII_3ErrType.csv'
    img_dir = f"../Dataset/{datasetname}/{datasetname}_IdC/Images/rawImages"
    df = pd.read_csv(datafile)
    df = df[df.split=='test']
else:
    datafile = f'../Dataset/{datasetname}/{datasetname}_IdCII_3ErrType.csv'
    img_dir = f"../Dataset/{datasetname}/Images/rawImages"
    df = pd.read_csv(datafile)
df.reset_index(drop=True,inplace=True)
print('Number of caption sets in the test set:', len(df))
df['captSet_CLIP_tokens'] = df['captSet_CLIP_tokens'].apply(literal_eval)
df.img_files = [osp.join(img_dir,imgfile) for imgfile in df.img_files]

Number of caption sets in the test set: 1699


## Using source code of vifidel https://github.com/ImperialNLP/vifidel

In [3]:
df['captSet_text'] = df['captSet_text'].apply(literal_eval)
df['refCaptSet'] = df['refCaptSet'].apply(literal_eval)

data_path = '../Dataset/genome/1600-400-20'

# Load classes
classes = ['__background__']
with open(os.path.join(data_path, 'objects_vocab.txt')) as f:
    for object in f.readlines():
        classes.append(object.split(',')[0].lower().strip())
score_thresh = 0.3
no_errType = 3

In [4]:
from __future__ import division
import numpy as np
from pyemd import emd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import euclidean_distances
from gensim.models import KeyedVectors
from gensim.models import KeyedVectors

word_vectors = KeyedVectors.load_word2vec_format('vifidel/data/GoogleNews-vectors-negative300.bin.gz',binary=True) 
vocab = word_vectors.key_to_index.keys()

In [5]:
vifidel_scores = []
for _,row in df.iterrows():
    imgfeat_file = row['imgfeat_file']
    captSet = row['captSet_text']
    refcaptSet = row['refCaptSet']
    roi_feats = np.load(imgfeat_file, allow_pickle=True)
    info = roi_feats['info'].item()
    num_boxes = info['num_boxes']
    objects_id = info['objects_id']
    objects_conf = info['objects_conf']
    objects = objects_id
    detected_objs = []
    for i in range(num_boxes):
        if objects_conf[i] > score_thresh:
            detected_objs.append(classes[objects[i]+1])
    vifidel_score_captSet = [] 
    for i in range(no_errType+1): # natural and unnatural captions
        desc = []
        for w in captSet[i].split(' '): ## Excluding words not in vocab
            if w in vocab:
                desc.append(w)
        desc = ' '.join(desc) 
        objs = ' '.join(detected_objs) 
        vc = CountVectorizer(stop_words='english').fit([objs, desc])
        v_obj, v_desc = vc.transform([objs, desc])

        v_obj = v_obj.toarray().ravel()
        v_desc = v_desc.toarray().ravel()
        wvoc = word_vectors[[w for w in vc.get_feature_names()]]
        
        if len(refcaptSet) ==0:
            distance_matrix = euclidean_distances(wvoc)

            if np.sum(distance_matrix) == 0.0:
                score = float('inf')

            else:
                v_obj = v_obj.astype(np.double)
                v_desc = v_desc.astype(np.double)
                if v_obj.sum():
                    v_obj /= v_obj.sum()
                if v_desc.sum():
                    v_desc /= v_desc.sum()

                distance_matrix = distance_matrix.astype(np.double)
                score = np.exp(-emd(v_obj, v_desc, distance_matrix))
        else: ## Reference
            weightsn = np.zeros(len(wvoc))
            for r_ori in refcaptSet:
                r = []
                for w in r_ori.split(' '): ## Excluding words not in vocab
                    if w in vocab:
                        r.append(w)
                r = ' '.join(r) 
                vr = CountVectorizer(stop_words='english').fit([r])
                wvr = word_vectors[[w for w in vr.get_feature_names()]]
                wts = (1. - cosine_similarity(wvoc, wvr).max(axis=1))
                wts = np.array([w if np.sign(w) == 1 else 0. for w in wts]) / 2.
                weightsn += wts

            weights = weightsn / len(refcaptSet)


            distance_matrix = np.zeros((len(wvoc), len(wvoc)), dtype=np.double)
            for i, o in enumerate(vc.get_feature_names()):
                for j, c in enumerate(vc.get_feature_names()):
                    distance_matrix[i,j] = np.sqrt(np.sum(((weights[i] *
                        word_vectors[o]) - (weights[j] *
                            word_vectors[c]))**2))

            if np.sum(distance_matrix) == 0.0:
                score =  float('inf')
            else:
                v_obj = v_obj.astype(np.double)
                v_desc = v_desc.astype(np.double)
                if v_obj.sum():
                    v_obj /= v_obj.sum()
                if v_desc.sum():
                    v_desc /= v_desc.sum()

                distance_matrix = distance_matrix.astype(np.double)
                # distance_matrix /= distance_matrix.max()
                score = np.exp(-emd(v_obj, v_desc, distance_matrix))
        
        vifidel_score_captSet.append(score)
    vifidel_scores.append(vifidel_score_captSet)

In [6]:
import numpy as np
no_errType = 3
cnt_corr_all = 0
cnt_incorr_all = 0
print("Dataset:",datasetname,", Number of caption sets:",len(vifidel_scores))
for errType in range(1,no_errType+1):
    cnt_corr = 0
    cnt_incorr = 0
    for sim in vifidel_scores:
        if sim[0] > sim[errType]:
            cnt_corr +=1
            cnt_corr_all +=1
        else:
            cnt_incorr +=1
            cnt_incorr_all +=1
    print(f"Accuracy at errType={errType}:{cnt_corr}/{cnt_corr+cnt_incorr}=",cnt_corr/(cnt_corr+cnt_incorr))

print(f"Accuracy for all types:{cnt_corr_all}/{cnt_corr_all+cnt_incorr_all}=",cnt_corr_all/(cnt_corr_all+cnt_incorr_all))

Dataset: COCO , Number of caption sets: 1699
Accuracy at errType=1:1506/1699= 0.8864037669217186
Accuracy at errType=2:1202/1699= 0.707474985285462
Accuracy at errType=3:860/1699= 0.5061801059446733
Accuracy for all types:3568/5097= 0.7000196193839513


In [7]:
Dataset: COCO , Number of caption sets: 1699
Accuracy at errType=1:1506/1699= 0.8864037669217186
Accuracy at errType=2:1202/1699= 0.707474985285462
Accuracy at errType=3:860/1699= 0.5061801059446733
Accuracy for all types:3568/5097= 0.7000196193839513
    
Dataset: VizWiz , Number of caption sets: 1160
Accuracy at errType=1:760/1160= 0.6551724137931034
Accuracy at errType=2:759/1160= 0.6543103448275862
Accuracy at errType=3:414/1160= 0.3568965517241379
Accuracy for all types:1933/3480= 0.5554597701149425

Dataset: Flickr30K , Number of caption sets: 595
Accuracy at errType=1:384/595= 0.6453781512605042
Accuracy at errType=2:369/595= 0.6201680672268908
Accuracy at errType=3:225/595= 0.37815126050420167
Accuracy for all types:978/1785= 0.5478991596638656

Dataset: ArtEmis , Number of caption sets: 15884
Accuracy at errType=1:11337/15884= 0.7137370939309997
Accuracy at errType=2:10027/15884= 0.6312641651976832
Accuracy at errType=3:4151/15884= 0.2613321581465626
Accuracy for all types:25515/47652= 0.5354444724250819

SyntaxError: invalid syntax (<ipython-input-7-1bd8056545ae>, line 1)