In [1]:
#!/usr/bin/env python
# coding: utf-8

"""
Code adapted from the data preprocessing code of the ArtEmis paper.
"""

"""
Combine, clean, pre-process ArtEmis annotations.
The MIT License (MIT)
Originally created by Panos Achlioptas at 6/17/20, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
"""

import nltk
import argparse
import pprint
import pathlib
import json
import numpy as np
import pandas as pd
import os.path as osp
import pickle
from ast import literal_eval
import pdb
random_seed = 2021

In [2]:
# Read the source ArtEmis annotation CSV into `df` and report its number of rows.
source_data_csv = '../Dataset/ArtEmis/OriginalArtEmis/artemis_dataset_full.csv'
df = pd.read_csv(source_data_csv).reset_index(drop=True)
print(len(df))

454684


In [3]:
# Load the list of images for which the ArtEmis paper provides M2-model input features.
# NOTE(review): pickle.load can execute arbitrary code; only use it on trusted files.
with open('../Dataset/ArtEmis/OriginalArtEmis/list_avai_imgId.pkl', 'rb') as file:
    avai_imgID = pickle.load(file)
with open('../Dataset/ArtEmis/OriginalArtEmis/wikiart_split.pkl', 'rb') as file:
    paints_ids_dict = dict(pickle.load(file))
paints_ids_dict_imgfiles = list(paints_ids_dict.keys())
paints_ids_dict_ids = list(paints_ids_dict.values())
# Every entry of avai_imgID is used as a position in the list of dictionary keys.
avai_imgfiles = [paints_ids_dict_imgfiles[imgid] for imgid in avai_imgID]
print(f"Number of images with available input features of M2 model: {len(avai_imgfiles)}")

Number of images with available input features of M2 model: 80924


In [4]:
## Create utterance_spelled
# tokenize_and_spell comes from the ArtEmis code base; its implementation is not visible here.
from artemis.language.basics import tokenize_and_spell
glove_file = '../Dataset/ArtEmis/OriginalArtEmis/glove.6B.100d.vocabulary.txt'
freq_file = '../Dataset/ArtEmis/OriginalArtEmis/symspell_frequency_dictionary_en_82_765.txt'
# NOTE(review): presumably adds `tokens`, `tokens_len` and `utterance_spelled` to `df` in place
# (later cells read these columns without assigning them) - confirm against the helper.
missed_tokens = tokenize_and_spell(df, glove_file, freq_file, nltk.word_tokenize, spell_check=True)
print('tokens not in Glove/Manual vocabulary:', len(missed_tokens))
print(len(df.utterance_spelled))

SymSpell spell-checker loaded: True
Loading glove word embeddings.
Done. 400000 words loaded.
Updating Glove vocabulary with *valid* ArtEmis words that are missing from it.
tokens not in Glove/Manual vocabulary: 662
454684


In [5]:
# Discard captions with more than 63 tokens and renumber the remaining rows.
too_long_cap = df.tokens_len > 63
df = df.loc[~too_long_cap].reset_index(drop=True)
print(len(df))

453525


In [6]:
# Delete images whose input features of M2 model are not provided
# Each caption's image path is built as '/<art_style>/<painting>' and looked up in avai_imgfiles.
img_files = '/' + df.art_style + '/' + df.painting
# A set gives constant-time lookups; testing membership in the list itself rescans it for
# every caption, which is quadratic in practice.
avai_imgfiles_lookup = set(avai_imgfiles)
sel_img_idx = [img_file in avai_imgfiles_lookup for img_file in img_files.tolist()]
df = df[sel_img_idx]
df.reset_index(drop=True, inplace=True)

In [7]:
## Detect literal captions by excluding Imagination-driven captions
# A caption that contains one of the keyword phrases below is split into a subject (tokens before
# the phrase) and a predicate (the phrase and everything after it). Captions without any
# qualifying phrase keep subject/predicate = None and are the "literal captions" (df_LC).

# List of keywords
keywords_2tokens = {'looks like','look like','look as','looks as','reminds me','remind me',
                       'is like','is likely','are like','are likely','think of','thinks of',
                       'as if','as though','feel like','feels like','shaped like', 'shapes like', 'shape like',
                       'calm like','looks likely','look likely',
                       'seems like','seem like','seems as', 'seem as',
                    }

keywords_3tokens = {'looks almost like','look almost like','is almost as','are almost as','seems to be', 'seem to be'}
keywords_1tokens = {'resemble','resembling'}
# Token-list form of every phrase, so it can be compared with a slice of a caption's token list.
keywords_1tokens_spell = [keyword.split(' ') for keyword in keywords_1tokens]
keywords_2tokens_spell = [keyword.split(' ') for keyword in keywords_2tokens]
keywords_3tokens_spell = [keyword.split(' ') for keyword in keywords_3tokens]


def _split_subject_predicate(tokens):
    """Split one tokenised caption at its first qualifying keyword phrase.

    Positions are scanned left to right. At each position ``i`` the 1-, 2- and 3-token
    phrases ending at ``i`` are tried in that order, and the first qualifying match wins.

    :param tokens: list with the tokens of one caption.
    :return: ``(subject, predicate)``. ``predicate`` starts at the matched phrase. ``subject`` is
        the list of tokens before the phrase, or ``['it']`` when the caption starts with the
        phrase. ``(None, None)`` when no phrase qualifies.
    """
    n_tokens = len(tokens)
    keyword_lists = (keywords_1tokens_spell, keywords_2tokens_spell, keywords_3tokens_spell)
    for i in range(n_tokens):
        for width, keywords in enumerate(keyword_lists, start=1):
            start = i - width + 1  # index of the first token of the candidate phrase
            if start < 0 or tokens[start:i + 1] not in keywords:
                continue
            if start == 0:
                # Nothing precedes the phrase: use a placeholder subject.
                return ['it'], tokens[start:]
            if i + 1 < n_tokens:
                # Accepted only when at least one token follows the phrase.
                return tokens[:start], tokens[start:]
            # Phrase at the very end with a non-empty prefix: not a match, keep scanning.
    return None, None


subjects = []
predicates = []
for caption_tokens in df['tokens']:
    subject, predicate = _split_subject_predicate(caption_tokens)
    subjects.append(subject)
    predicates.append(predicate)
# Longest subject / predicate over all split captions (0 when no caption was split).
subjects_maxLen = max((len(s) for s in subjects if s is not None), default=0)
predicates_maxLen = max((len(p) for p in predicates if p is not None), default=0)

df['subject']=subjects
df['predicate']=predicates
df_LC = df[[subject is None for subject in df.subject]].copy()
df = None  # release the full frame; only the literal captions are used from here on
df_LC.reset_index(drop=True, inplace=True)
# Column-wise concatenation, same '<art_style>/<painting>' key as an iterrows() loop would build.
df_LC['img_id'] = df_LC.art_style + '/' + df_LC.painting
img_ids_LC = df_LC['img_id'].unique()
print("Number of literal captions:",len(df_LC))
print("Number of images that has literal captions:",len(img_ids_LC))



Number of literal captions: 338742
Number of images that has literal captions: 79384


In [8]:
## load IdCI dataset
IdCI_data_csv = '../Dataset/ArtEmis/ArtEmis_IdC/ArtEmis_IdCI.csv'
df_IdC = pd.read_csv(IdCI_data_csv)
# literal_eval turns the text stored in these two columns back into Python objects.
df_IdC.subject = df_IdC.subject.apply(literal_eval)
df_IdC.predicate = df_IdC.predicate.apply(literal_eval)
df_IdC.reset_index(drop=True, inplace=True)


# Image key '<art_style>/<painting>', built column-wise instead of row by row with iterrows().
df_IdC['img_id'] = df_IdC.art_style + '/' + df_IdC.painting
# One entry per caption at this point, so an image id can occur several times.
img_ids_IdC_train = df_IdC[df_IdC.split=='train']['img_id'].to_list()
img_ids_IdC_test = df_IdC[df_IdC.split=='test']['img_id'].to_list()
img_ids_IdC_val = df_IdC[df_IdC.split=='val']['img_id'].to_list()
img_ids_IdC = img_ids_IdC_train + img_ids_IdC_test + img_ids_IdC_val
print("IdC dataset: Number of training captions:",len(img_ids_IdC_train))
print("IdC dataset: Number of val captions:",len(img_ids_IdC_val))
print("IdC dataset: Number of test captions:",len(img_ids_IdC_test))
print("IdC dataset: Total number of captions:",len(img_ids_IdC))

print("######")
# Collapse every list to its unique image ids.
img_ids_IdC_train = list(set(img_ids_IdC_train))
img_ids_IdC_test = list(set(img_ids_IdC_test))
img_ids_IdC_val = list(set(img_ids_IdC_val))
img_ids_IdC = list(set(img_ids_IdC))
print("IdC dataset: Number of training images:",len(img_ids_IdC_train))
print("IdC dataset: Number of val images:",len(img_ids_IdC_val))
print("IdC dataset: Number of test images:",len(img_ids_IdC_test))
print("IdC dataset: Total number of images:",len(img_ids_IdC))


IdC dataset: Number of training captions: 75509
IdC dataset: Number of val captions: 9000
IdC dataset: Number of test captions: 15884
IdC dataset: Total number of captions: 100393
######
IdC dataset: Number of training images: 51210
IdC dataset: Number of val images: 3000
IdC dataset: Number of test images: 2497
IdC dataset: Total number of images: 56707


In [9]:
# Sanity check: the inner join on the raw caption text must be empty,
# i.e. no utterance occurs in both frames.
assert len(pd.merge(df_IdC,df_LC, on="utterance",how='inner')) == 0 # No overlapping between df_IdC and df_LC

In [10]:
# Stack both caption sets and count, per image, its captions over the combined frame.
merged = pd.concat([df_IdC, df_LC]).reset_index(drop=True)
merged['repetition'] = merged.groupby('img_id')['img_id'].transform('count')
# Rows without a split value must be exactly the literal captions; take them back out
# so that df_LC carries the recomputed `repetition`.
lacks_split = merged.split.isnull()
assert len(merged[lacks_split]) == len(df_LC)
df_LC = merged[lacks_split].copy().reset_index(drop=True)

In [11]:
# Literal captions whose image does not occur in the IdC dataset.
# Series.isin hashes img_ids_IdC once; `img_id not in img_ids_IdC` rescans the whole list
# for every caption.
df_notInIdC = df_LC[~df_LC.img_id.isin(img_ids_IdC)].reset_index(drop=True)
img_ids_notInIdC = df_notInIdC.img_id.unique()
print("Number of images NOT IN IdC:",len(img_ids_notInIdC))
print("Number of captions of images NOT IN IdC:",len(df_notInIdC))

Number of images NOT IN IdC: 22818
Number of captions of images NOT IN IdC: 115009


In [12]:
## Splits to train, val, test sets
# Images with fewer than 5 captions go straight to train; all others (`rest`) are split later.
train, rest = [], []
for img_id, captions in df_notInIdC.groupby('img_id'):
    bucket = train if captions.repetition.iloc[0] < 5 else rest
    bucket.append(img_id)
print(len(rest))

19940


In [13]:
# Every image outside IdC must have landed in exactly one of the two buckets.
assert len(img_ids_notInIdC) - len(rest)- len(train) == 0

In [14]:
# Carve val and test image sets out of `rest`; whatever remains joins the training images.
from sklearn.model_selection import train_test_split
# Passed as integer test_size below (the printed split sizes are 3000 each).
val_size  =  3000
test_size =  3000
  
# Fix the order of the ids before the seeded splits.
rest.sort()
# Both splits use the same random_state (random_seed).
rest, val = train_test_split(rest, test_size=val_size, random_state=random_seed)
train_2, test = train_test_split(rest, test_size=test_size, random_state=random_seed)
train = train + train_2
# The three image sets must be pairwise disjoint.
assert len(set(test).intersection(set(train))) == 0
assert len(set(val).intersection(set(train))) == 0
assert len(set(test).intersection(set(val))) == 0
print("NotInIdC dataset: Number of training images:",len(train))
print("NotInIdC dataset: Number of val images:",len(val))
print("NotInIdC dataset: Number of test images:",len(test))

NotInIdC dataset: Number of training images: 16818
NotInIdC dataset: Number of val images: 3000
NotInIdC dataset: Number of test images: 3000


In [15]:
# Append the IdC image ids of each split to the image ids selected above.
# NOTE: not idempotent - running this cell a second time appends the IdC ids again.
train = train  + img_ids_IdC_train
val = val  + img_ids_IdC_val
test = test  + img_ids_IdC_test

print("Whole dataset: Number of training images:",len(train))
print("Whole dataset: Number of val images:",len(val))
print("Whole dataset: Number of test images:",len(test))

Whole dataset: Number of training images: 68028
Whole dataset: Number of val images: 6000
Whole dataset: Number of test images: 5497


In [16]:
# The three id lists together must be as long as the number of distinct images in `merged`.
assert len(merged.img_id.unique()) - len(train)- len(val)- len(test) == 0

In [17]:
# Label every caption with the split of its image: train is checked first, then val, and
# anything else is labelled test.
# Sets give constant-time lookups; `in` on the id lists rescans them for every caption.
train_ids = set(train)
val_ids = set(val)
merged['split'] = ['train' if uni_id in train_ids else 'val' if uni_id in val_ids else 'test'
                   for uni_id in merged.img_id]
merged.reset_index(drop=True, inplace=True)
# Number of captions per split, printed in the order train, val, test.
for split_name in ('train', 'val', 'test'):
    print(len(merged[merged.split == split_name]))

348197
32011
58927


In [18]:
# Keep only the columns listed below; everything else is dropped from `merged`.
kept_columns = ['art_style', 'painting', 'emotion','utterance', 'subject', 'predicate', 'split','img_id']
merged = merged[kept_columns].copy().reset_index(drop=True)

In [19]:
# Number of caption rows of each image, broadcast back to every row of that image.
merged['repetition'] =  merged.groupby('img_id')['img_id'].transform('count')

In [20]:
## Create utterance_spelled
# The column selection applied to `merged` earlier kept no token columns, so the captions are
# tokenised and spell-checked again, this time on `merged`.
missed_tokens = tokenize_and_spell(merged, glove_file, freq_file, nltk.word_tokenize, spell_check=True)
print('tokens not in Glove/Manual vocabulary:', len(missed_tokens))
print(len(merged.utterance_spelled))

SymSpell spell-checker loaded: True
Loading glove word embeddings.
Done. 400000 words loaded.
Updating Glove vocabulary with *valid* ArtEmis words that are missing from it.
tokens not in Glove/Manual vocabulary: 637
439135


In [21]:
# Display the merged frame as this cell's output.
merged

Unnamed: 0,art_style,painting,emotion,utterance,subject,predicate,split,img_id,repetition,tokens,tokens_len,utterance_spelled
0,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,sadness,This woman has really knotty hands which makes...,"[this, woman, has, really, knotty, hands, whic...","[look, like, she, has, arthritis]",train,Post_Impressionism/vincent-van-gogh_portrait-o...,10,"[this, woman, has, really, knotty, hands, whic...",14,this woman has really knotty hands which makes...
1,Post_Impressionism,vincent-van-gogh_portrait-of-madame-ginoux-l-a...,awe,She looks like a lady from that past that migh...,[she],"[looks, like, a, lady, from, that, past, that,...",train,Post_Impressionism/vincent-van-gogh_portrait-o...,10,"[she, looks, like, a, lady, from, that, past, ...",33,she looks like a lady from that past that migh...
2,Impressionism,willard-metcalf_havana-harbor,contentment,"The red of the flowers pop off the page, it is...","[the, red, of, the, flowers, pop, off, the, pa...","[looks, like, a, wonderful, place, to, vacatio...",train,Impressionism/willard-metcalf_havana-harbor,6,"[the, red, of, the, flowers, pop, off, the, pa...",24,the red of the flowers pop off the page it is ...
3,Northern_Renaissance,robert-campin_werl-altarpiece-st-barbara-1438,amusement,The books seems to be drawn in a confusing way...,"[the, books]","[seems, to, be, drawn, in, a, confusing, way, ...",train,Northern_Renaissance/robert-campin_werl-altarp...,7,"[the, books, seems, to, be, drawn, in, a, conf...",26,the books seems to be drawn in a confusing way...
4,Realism,theodor-severin-kittelsen_kveld-paa-soletunet-...,sadness,The two run down houses look like they have se...,"[the, two, run, down, houses]","[look, like, they, have, seen, better, days]",train,Realism/theodor-severin-kittelsen_kveld-paa-so...,7,"[the, two, run, down, houses, look, like, they...",12,the two run down houses look like they have se...
...,...,...,...,...,...,...,...,...,...,...,...,...
439130,Cubism,willi-baumeister_machine-man-with-spiral-turn-...,something else,The interlocking mechanical shapes fitting tog...,,,test,Cubism/willi-baumeister_machine-man-with-spira...,44,"[the, interlocking, mechanical, shapes, fittin...",10,the interlocking mechanical shapes fitting tog...
439131,Cubism,gino-severini_a-dancer-1,awe,the collection and collage of different colors...,,,test,Cubism/gino-severini_a-dancer-1,47,"[the, collection, and, collage, of, different,...",12,the collection and collage of different colors...
439132,Romanticism,ivan-aivazovsky_sea-at-night-1861,awe,The peaceful reflections of the moonlight on t...,,,train,Romanticism/ivan-aivazovsky_sea-at-night-1861,8,"[the, peaceful, reflections, of, the, moonligh...",14,the peaceful reflections of the moonlight on t...
439133,Romanticism,ivan-aivazovsky_sea-at-night-1861,excitement,I can imagine the sailors resting this peacefu...,,,train,Romanticism/ivan-aivazovsky_sea-at-night-1861,8,"[i, can, imagine, the, sailors, resting, this,...",13,i can imagine the sailors resting this peacefu...


In [22]:
## Check the caption counts: every test image needs at least 4 captions, every val image at least 3
test_repetition = merged.loc[merged.split == 'test', 'repetition']
val_repetition = merged.loc[merged.split == 'val', 'repetition']
assert not (test_repetition < 4).any()
assert not (val_repetition < 3).any()
temp = merged[merged.split == 'test'].copy()
# Report how many distinct test images fall into each caption-count bucket.
for message, selector in (
    ("Number of test images having only 4 captions:", temp.repetition == 4),
    ("Number of test images having <4 captions:", temp.repetition < 4),
    ("Number of test images having >=5 captions:", temp.repetition >= 5),
):
    temp2 = temp[selector]
    print(message, len(temp2.img_id.unique()))

Number of test images having only 4 captions: 45
Number of test images having <4 captions: 0
Number of test images having >=5 captions: 5452


In [23]:
# Show the columns now present in `merged`.
merged.columns

Index(['art_style', 'painting', 'emotion', 'utterance', 'subject', 'predicate',
       'split', 'img_id', 'repetition', 'tokens', 'tokens_len',
       'utterance_spelled'],
      dtype='object')

In [24]:
# Make a word-vocabulary based on training data
# build_vocab comes from the ArtEmis code base; its implementation is not visible here.
from artemis.utils.vocabulary import build_vocab
# NOTE(review): presumably the minimum number of occurrences for a word to be kept - confirm.
min_word_freq = 5
# Only the captions of the training split are handed to build_vocab.
train_tokens = merged[merged.split =='train']['tokens']
vocab = build_vocab(train_tokens, min_word_freq)
print(f'Using a vocabulary with {len(vocab)} tokens')

Using a vocabulary with 15018 tokens


In [25]:
# Encode tokens as ints
merged.reset_index(drop=True, inplace=True)
max_len = max(merged.tokens_len)


def _encode_if_list(tokens):
    """Encode a token list with `vocab`; any value that is not a list maps to None."""
    return vocab.encode(tokens, max_len) if isinstance(tokens, list) else None


# Every caption has tokens; subject/predicate are encoded only where they hold a list.
merged['tokens_encoded'] = merged.tokens.apply(lambda caption_tokens: vocab.encode(caption_tokens, max_len))
merged['subject_encoded'] = merged.subject.apply(_encode_if_list)
merged['predicate_encoded'] = merged.predicate.apply(_encode_if_list)

In [26]:
# Encode tokens using CLIP tokenizer
import clip 
# Only the first 62 space-separated words of each spelled utterance are passed to clip.tokenize.
# NOTE(review): presumably to stay within CLIP's context length - confirm.
merged['CLIP_tokens'] = [clip.tokenize(' '.join(utter.split(' ')[:62])).squeeze().tolist() for utter in merged['utterance_spelled']]   

In [27]:
# Emotion names; the position of a name in this list is its integer label.
ARTEMIS_EMOTIONS = ['amusement', 'awe', 'contentment', 'excitement',
                    'anger', 'disgust',  'fear', 'sadness','something else']

no_emo = len(ARTEMIS_EMOTIONS)
EMOTION_TO_IDX = dict(zip(ARTEMIS_EMOTIONS, range(no_emo)))
IDX_TO_EMOTION = dict(enumerate(ARTEMIS_EMOTIONS))
# Direct lookup: an emotion name missing from the table raises KeyError.
merged['emotion_label'] = merged.emotion.apply(EMOTION_TO_IDX.__getitem__)

In [28]:
# Save separately the grouped utterances of each stimulus
def group_gt_annotations(df, vocab):
    """ Collect, for every split, all captions of each artwork/stimulus into a single row.
    :param df: dataframe with (at least) the columns ``split``, ``art_style``, ``painting``,
        ``utterance_spelled`` and ``tokens_encoded``.
    :param vocab: object exposing ``decode_print``; it is called on every encoded caption.
    :return: dictionary mapping each value found in ``df['split']`` to a dataframe with the columns
        ``art_style``, ``painting``, ``references_pre_vocab`` (list of the spelled utterances) and
        ``references`` (list of the captions decoded back from their encoded tokens).
    """
    stimulus_key = ['art_style', 'painting']
    grouped_by_split = {}
    for split_name, split_frame in df.groupby('split'):
        by_stimulus = split_frame.reset_index(drop=True).groupby(stimulus_key)

        # a) captions before "vocabularization" (i.e., raw spelled utterances)
        raw_refs = by_stimulus['utterance_spelled'].apply(list).reset_index(name='references_pre_vocab')

        # b) encoded captions, which must come out in the same stimulus order
        encoded_refs = by_stimulus['tokens_encoded'].apply(list).reset_index(name='tokens_encoded')
        assert all(encoded_refs['painting'] == raw_refs['painting'])

        # decode the encoded captions back to strings and call them "references"
        encoded_refs['tokens_encoded'] = encoded_refs['tokens_encoded'].apply(
            lambda sentences: [vocab.decode_print(sentence) for sentence in sentences])
        decoded_refs = encoded_refs.rename(columns={'tokens_encoded': 'references'})

        # the two frames share only the stimulus columns, which serve as the join key
        grouped_by_split[split_name] = pd.merge(raw_refs, decoded_refs).reset_index(drop=True)
    return grouped_by_split

# Per-split grouping of all captions of each artwork.
groups = group_gt_annotations(merged, vocab)

In [29]:
from six.moves import cPickle
def pickle_data(file_name, *args):
    """Using pickle to save multiple python objects in a single file.

    The number of objects is written first, followed by each object in order, all with
    pickle protocol 2.

    :param file_name: path of the output file; it is opened in binary write mode.
    :param args: the objects to store.
    """
    # `pickle` is the module that six.moves.cPickle resolves to on Python 3.
    # The context manager closes the file even if pickling one of the items raises.
    with open(file_name, 'wb') as out_file:
        pickle.dump(len(args), out_file, protocol=2)
        for item in args:
            pickle.dump(item, out_file, protocol=2)
    

# Write the three outputs: the caption table (CSV), the vocabulary and the grouped references.
merged.reset_index(drop=True,inplace=True)
# NOTE(review): columns holding Python lists are presumably written as their string form by
# to_csv and need parsing (e.g. literal_eval) after reloading - confirm.
merged.to_csv(f'../Dataset/ArtEmis/ArtEmis/ArtEmis.csv', index=False)
vocab.save(f'../Dataset/ArtEmis/ArtEmis/ArtEmis_Vocab.pkl')
pickle_data(f'../Dataset/ArtEmis/ArtEmis/Artemis_GT.pkl', groups)
print('n-utterances kept:', len(merged))
print('vocab size:', len(vocab))
print(f'Maximum number of tokens per caption is {max_len}')
print(f'Minimum number of tokens per caption is {min(merged.tokens_len)}')

n-utterances kept: 439135
vocab size: 15018
Maximum number of tokens per caption is 63
Minimum number of tokens per caption is 1


In [30]:
### Extract number of images having the number of captions = noCap
# One row per image: its `repetition` value and its number of caption rows. Built with a single
# groupby instead of iterating over every image group once per bucket.
# GroupBy.first returns the first non-null value; `repetition` is a count and has no nulls.
captions_by_image = merged.groupby('img_id')
per_image = pd.DataFrame({
    'repetition': captions_by_image['repetition'].first(),
    'n_captions': captions_by_image.size(),
})
for noCap in range(1,5):
    bucket = per_image[per_image.repetition == noCap]
    cnt = len(bucket)
    cntexp = int(bucket.n_captions.sum())
    print(noCap," captions per image: ",cnt," images with",cntexp," captions")
# Everything with 5 or more captions is reported as one bucket.
bucket = per_image[per_image.repetition >= 5]
cnt = len(bucket)
cntexp = int(bucket.n_captions.sum())
print(">=5 captions per image: ",cnt," images with",cntexp," captions")

print('Total images:',len(merged.img_id.unique()))
print('Total captions:',len(merged))

1  captions per image:  2  images with 2  captions
2  captions per image:  30  images with 60  captions
3  captions per image:  574  images with 1722  captions
4  captions per image:  7425  images with 29700  captions
>=5 captions per image:  71494  images with 407651  captions
Total images: 79525
Total captions: 439135


In [31]:
### Extract number of images having the number of captions = noCap for only testset
# Select the test captions once, then build one row per test image (its `repetition` value and
# its number of caption rows) with a single groupby instead of one full pass per bucket.
# GroupBy.first returns the first non-null value; `repetition` is a count and has no nulls.
test_rows = merged[merged.split == 'test']
test_captions_by_image = test_rows.groupby('img_id')
per_test_image = pd.DataFrame({
    'repetition': test_captions_by_image['repetition'].first(),
    'n_captions': test_captions_by_image.size(),
})
for noCap in range(1,5):
    bucket = per_test_image[per_test_image.repetition == noCap]
    cnt = len(bucket)
    cntexp = int(bucket.n_captions.sum())
    print(noCap," captions per image: ",cnt," images with",cntexp," captions")
# Everything with 5 or more captions is reported as one bucket.
bucket = per_test_image[per_test_image.repetition >= 5]
cnt = len(bucket)
cntexp = int(bucket.n_captions.sum())
print(">=5 captions per image: ",cnt," images with",cntexp," captions")

print('Numer of test images:',len(test_rows.img_id.unique()))
print('Numer of test captions:',len(test_rows))

1  captions per image:  0  images with 0  captions
2  captions per image:  0  images with 0  captions
3  captions per image:  0  images with 0  captions
4  captions per image:  45  images with 180  captions
>=5 captions per image:  5452  images with 58747  captions
Numer of test images: 5497
Numer of test captions: 58927


In [None]:
import pandas as pd
import os.path as osp
import numpy as np
import torch
import os, sys
import base64
import csv
csv.field_size_limit(sys.maxsize)
import nltk
# Directory that holds the preprocessed ArtEmis files.
data_dir = '../Dataset/ArtEmis/ArtEmis'

## Load dataset
# NOTE(review): if this is the CSV written by the preprocessing cells, columns that held Python
# lists presumably come back as plain strings - confirm before using them as lists.
file_name = f'ArtEmis.csv'
merged = pd.read_csv(osp.join(data_dir, file_name))
print(f'Loaded {len(merged)} captions!!!')