In [1]:
import torch
import pandas as pd
import os.path as osp
import numpy as np
import clip
from ast import literal_eval

In [2]:
## Prepare the dataset: choose the corpus, derive the input/output CSV paths, load the annotations.
datasetname = 'ArtEmis' #ArtEmis, Flickr30K,  VizWiz, COCO

# ArtEmis keeps its CSV files in an extra '<name>_IdC' sub-folder; every other
# dataset stores them directly under '../Dataset/<name>'.
uses_idc_subdir = datasetname == 'ArtEmis'
csv_dir = f'../Dataset/{datasetname}/{datasetname}_IdC' if uses_idc_subdir else f'../Dataset/{datasetname}'

datafile = f'{csv_dir}/{datasetname}_IdCII_TypeII_addText.csv'  # input annotations
outfile = f'{csv_dir}/{datasetname}_IdCII_3ErrType.csv'         # output written at the end of the notebook
df_full = pd.read_csv(datafile)

if uses_idc_subdir:
    # Build the relative image path '<art_style>/<painting>.jpg' for every annotation.
    df_full['img_files'] = [
        osp.join(style_dir, title + '.jpg')
        for style_dir, title in zip(df_full['art_style'].tolist(), df_full['painting'].tolist())
    ]


In [3]:
print('Annotations loaded:', len(df_full))

# These columns come out of the CSV as text holding Python literals;
# parse each cell back into the corresponding Python object.
for literal_col in ('captSet_CLIP_tokens', 'captSet_text', 'subject', 'predicate', 'CLIP_tokens'):
    df_full[literal_col] = df_full[literal_col].apply(literal_eval)


Annotations loaded: 100393


In [4]:
## Keep a single generated caption per error type: only the first two
## entries of every caption set are retained.
# Final captSet layout built downstream: Natural caption --> Type I --> Type II --> Type III
# refCaptSet: Other captions
for capt_col in ('captSet_CLIP_tokens', 'captSet_text'):
    df_full[capt_col] = df_full[capt_col].apply(lambda entries: entries[:2])

In [5]:
## Create unnatural captions of type I and III for the training set
#
# For every annotation row this cell builds:
#   * refCaptSet / refCaptSet_CLIP_tokens: the other ground-truth captions
#     (and their CLIP tokens) of the same image, excluding the row's own caption;
#   * captSet_text / captSet_CLIP_tokens with exactly four entries:
#       [0] natural caption        (entry 0 of the incoming captSet)
#       [1] Type I error           (subject of a caption from another image + this predicate)
#       [2] Type II error          (entry 1 of the incoming captSet)
#       [3] Type III error         (this caption with trailing words removed)
# The result is collected in `df_new`, which keeps the original index labels.
import nltk
import random
random.seed(0)  # fixed seed so the sampled captions are reproducible

MIN_SUBJECT_WORDS = 3   # donor subjects must be longer than this --> avoid subject = 'this'
MAX_CAPTION_WORDS = 65  # word cap for Type I captions; longer is too long for the CLIP tokenizer

# Position-aligned lookups shared by all images, so the full frame does not
# have to be filtered, copied and re-indexed once per image.
all_subjects = df_full['subject'].tolist()
has_long_subject = np.array([len(subj) > MIN_SUBJECT_WORDS for subj in all_subjects], dtype=bool)

# Rows are gathered in plain lists and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row is quadratic.
new_rows = []
new_index = []
cnt = 0
# Group by the column label itself (not a one-element list): with a list,
# pandas >= 2.0 yields 1-tuples as keys, which breaks the comparison below.
for imgfile, g in df_full.groupby('img_files'):
    # Positions (in df_full order) of all annotations that belong to other images.
    other_positions = np.flatnonzero((df_full['img_files'] != imgfile).to_numpy())
    if not has_long_subject[other_positions].any():
        # Without this guard the sampling loop below could never terminate.
        raise ValueError(
            f'No caption of another image has a subject longer than '
            f'{MIN_SUBJECT_WORDS} words; cannot build a Type I caption for {imgfile!r}'
        )
    gtCapts = g.utterance_spelled.tolist()
    gtCapts_CLIP_tokens = g.CLIP_tokens.tolist()
    for index, row in g.iterrows():
        ## Create refCaptSet by excluding the current caption
        refCaptSet = gtCapts.copy()
        refCaptSet.remove(row['utterance_spelled'])
        refCaptSet_CLIP_tokens = gtCapts_CLIP_tokens.copy()
        refCaptSet_CLIP_tokens.remove(row['CLIP_tokens'])

        ## Add natural captions
        # `captSet` stays a top-level name so it can still be inspected after the loop.
        captSet = row['captSet_text'][0:1]
        captSet_CLIP_tokens = row['captSet_CLIP_tokens'][0:1]

        # Add Error type I: draw rows of other images uniformly until one has a
        # long enough subject (same draw order as sampling from the filtered frame).
        while True:
            donor_pos = other_positions[random.randint(0, len(other_positions) - 1)]
            if has_long_subject[donor_pos]:
                break
        unCapt_words = (all_subjects[donor_pos] + row['predicate'])[:MAX_CAPTION_WORDS]
        unCapt = ' '.join(unCapt_words)
        captSet.append(unCapt)
        captSet_CLIP_tokens.append(clip.tokenize(unCapt).squeeze().tolist())

        ## Add Error Type II
        captSet.append(row['captSet_text'][1])
        captSet_CLIP_tokens.append(row['captSet_CLIP_tokens'][1])

        # Add Error type III Incompletion: drop between 1 and half the predicate
        # length (at least 1) words from the end of the caption.
        words = row['utterance_spelled'].split(' ')
        no_remove_word = random.randint(1, max(1, len(row['predicate']) // 2))
        truncated_words = words[:-no_remove_word]
        assert len(words) > len(truncated_words)
        unCapt = ' '.join(truncated_words)
        captSet.append(unCapt)
        assert len(captSet) == 4
        captSet_CLIP_tokens.append(clip.tokenize(unCapt).squeeze().tolist())

        # Same column order as before: new columns (if any) go after the
        # existing ones, existing columns are overwritten in place.
        record = row.to_dict()
        record['refCaptSet'] = refCaptSet
        record['refCaptSet_CLIP_tokens'] = refCaptSet_CLIP_tokens
        record['captSet_text'] = captSet
        record['captSet_CLIP_tokens'] = captSet_CLIP_tokens
        new_rows.append(record)
        new_index.append(index)
        cnt += 1

if new_rows:
    df_new = pd.DataFrame(new_rows, index=new_index)
else:
    # No annotations at all: keep the input schema for the empty result.
    df_new = pd.DataFrame(columns=df_full.columns)

In [6]:
# Renumber the rows 0..n-1 and write the augmented annotations to `outfile`
# (the index itself is not written to the CSV).
df_new = df_new.reset_index(drop=True)
df_new.to_csv(outfile, index=False)

In [7]:
# Number of rows in the generated frame
print(df_new.shape[0])

100393


In [8]:
# Show the four captions (natural, Type I, Type II, Type III) built for the
# last processed annotation. Read them from the result frame rather than from
# a variable left over from the generation loop, so this cell only depends on
# `df_new`.
df_new['captSet_text'].iloc[-1]

['the patterns and assorted colors look like a time in chinese history well mix of colors',
 'the man has lost the majority of his hair so he look like a time in chinese history well mix of colors',
 'the patterns and assorted colors resemble a clown',
 'the patterns and assorted colors look like a time in chinese history well mix']