In [1]:
#!/usr/bin/env python
# coding: utf-8

"""
Code adapted from the data preprocessing code of the ArtEmis paper.
"""

"""
Combine, clean, pre-process ArtEmis annotations.
The MIT License (MIT)
Originally created by Panos Achlioptas at 6/17/20, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
"""

import nltk
import argparse
import pprint
import pathlib
import json
import numpy as np
import pandas as pd
import os.path as osp
import pickle

import pdb
# Seed passed as random_state to train_test_split further down, so the train/val split is repeatable.
random_seed = 2021

In [2]:
## Load the source dataset
# Location of the full ArtEmis annotation CSV, relative to this notebook.
source_data_csv = '../Dataset/ArtEmis/OriginalArtEmis/artemis_dataset_full.csv'
df = pd.read_csv(filepath_or_buffer=source_data_csv)
# Report how many annotation rows were read.
print(df.shape[0])

454684


In [3]:
# Load the list of images with input features of the M2 model provided by the ArtEmis paper.
# NOTE(review): pickle.load can execute arbitrary code; only point these paths at trusted files.
with open('../Dataset/ArtEmis/OriginalArtEmis/list_avai_imgId.pkl', 'rb') as id_file:
    avai_imgID = pickle.load(id_file)
with open('../Dataset/ArtEmis/OriginalArtEmis/wikiart_split.pkl', 'rb') as split_file:
    paints_ids_dict = dict(pickle.load(split_file))
paints_ids_dict_imgfiles = list(paints_ids_dict.keys())
paints_ids_dict_ids = list(paints_ids_dict.values())
# Every entry of avai_imgID is used as a position into the list of dictionary keys.
# NOTE(review): this presumes the dictionary values equal the key positions - confirm.
avai_imgfiles = [paints_ids_dict_imgfiles[imgid] for imgid in avai_imgID]
print(f"Number of images with available input features of M2 model: {len(avai_imgfiles)}")

Number of images with available input features of M2 model: 80924


In [4]:
## Create utterance_spelled
from artemis.language.basics import tokenize_and_spell
# Vocabulary / frequency-dictionary files handed to tokenize_and_spell.
glove_file = '../Dataset/ArtEmis/OriginalArtEmis/glove.6B.100d.vocabulary.txt'
freq_file = '../Dataset/ArtEmis/OriginalArtEmis/symspell_frequency_dictionary_en_82_765.txt'
# NOTE(review): presumably this call adds the 'utterance_spelled', 'tokens' and 'tokens_len'
# columns to df, since later cells read them - confirm against the artemis package.
missed_tokens = tokenize_and_spell(
    df,
    glove_file,
    freq_file,
    nltk.word_tokenize,
    spell_check=True,
)
print('tokens not in Glove/Manual vocabulary:', len(missed_tokens))
print(len(df['utterance_spelled']))

SymSpell spell-checker loaded: True
Loading glove word embeddings.
Done. 400000 words loaded.
Updating Glove vocabulary with *valid* ArtEmis words that are missing from it.
tokens not in Glove/Manual vocabulary: 662
454684


In [5]:
# Discard captions with more than 63 tokens, then renumber the remaining rows from 0.
too_long_cap = df['tokens_len'] > 63
df = df.loc[~too_long_cap].reset_index(drop=True)
print(df.shape[0])

453525


In [6]:
## Exclude captions with emotions = 'something else'
has_named_emotion = df['emotion'] != 'something else'
df = df.loc[has_named_emotion].reset_index(drop=True)
print(df.shape[0])

400838


In [7]:
## Extract imagination-driven captions
# Keyword phrases, grouped by the number of tokens each phrase spans.
keywords_1tokens = {'resemble','resembling'}
keywords_2tokens = {'looks like','look like','look as','looks as','reminds me','remind me',
                       'is like','is likely','are like','are likely','think of','thinks of',
                       'as if','as though','feel like','feels like','shaped like', 'shapes like', 'shape like',
                       'calm like','looks likely','look likely',
                       'seems like','seem like','seems as', 'seem as',
                    }
keywords_3tokens = {'looks almost like','look almost like','is almost as','are almost as','seems to be', 'seem to be'}

# Token-list form of every phrase, so a slice of a caption's tokens can be compared directly.
keywords_1tokens_spell = [phrase.split(' ') for phrase in keywords_1tokens]
keywords_2tokens_spell = [phrase.split(' ') for phrase in keywords_2tokens]
keywords_3tokens_spell = [phrase.split(' ') for phrase in keywords_3tokens]


In [8]:
def _split_at_first_keyword(tokens, keyword_lists):
    """Split a caption's tokens into (subject, predicate) at the first keyword phrase.

    :param tokens: list of tokens of one caption.
    :param keyword_lists: sequence whose n-th entry (1-based) holds the n-token keyword
        phrases, each phrase given as a list of tokens.
    :return: (subject, predicate), or (None, None) when no phrase gives a valid split.
        The predicate starts at the keyword phrase. The subject is everything before the
        phrase, or ['it'] when the phrase opens the caption.
    """
    for i in range(len(tokens)):
        # i is the position of the LAST token of the candidate phrase; shorter phrases are tried first.
        for n_kw, phrases in enumerate(keyword_lists, start=1):
            start = i - (n_kw - 1)
            if start < 0:
                continue
            if tokens[start:i + 1] not in phrases:
                continue
            # A phrase preceded by a subject needs at least one token after it.
            if start >= 1 and len(tokens[i:]) >= 2:
                return tokens[:start], tokens[start:]
            # A phrase that opens the caption gets the placeholder subject.
            if start == 0:
                return ['it'], tokens[start:]
            # Otherwise the phrase closes the caption: no split here, keep scanning.
    return None, None


_keyword_lists = (keywords_1tokens_spell, keywords_2tokens_spell, keywords_3tokens_spell)
subjects = []
predicates = []
# Longest subject / predicate seen; later used as padding lengths when encoding.
subjects_maxLen = 0
predicates_maxLen = 0
for tokens_encoded in df['tokens']:
    subject, predicate = _split_at_first_keyword(tokens_encoded, _keyword_lists)
    if subject is not None:
        subjects_maxLen = max(subjects_maxLen, len(subject))
        predicates_maxLen = max(predicates_maxLen, len(predicate))
    # Captions without any keyword keep None in both lists.
    subjects.append(subject)
    predicates.append(predicate)
print(subjects_maxLen)
print(predicates_maxLen)

56
62


In [9]:
# Attach the extracted subject / predicate token lists as new columns.
df['subject'], df['predicate'] = subjects, predicates

In [10]:
# Keep only the captions for which a keyword split was found (subject is not None).
# An identity test is used: `!= None` goes through __ne__ and is not a reliable None check.
has_subject = [subject is not None for subject in df['subject']]
df_IdCI = df[has_subject].copy()
df_IdCI.reset_index(drop=True, inplace=True)
# Drop the reference to the full frame so its memory can be reclaimed.
df = None

In [11]:
# Delete images whose input features of M2 model are not provided
img_files = '/'+df_IdCI.art_style+'/'+df_IdCI.painting
# Membership is checked once per caption; a set makes each check constant-time
# instead of scanning the whole list of available image files.
avai_imgfiles_lookup = set(avai_imgfiles)
sel_img_idx = [img_file in avai_imgfiles_lookup for img_file in img_files.tolist()]
df_IdCI = df_IdCI[sel_img_idx]
df_IdCI.reset_index(drop=True, inplace=True)

In [12]:
## Extract the repetition of each artwork based on unique_id
# unique_id is the art style concatenated with the painting name.
df_IdCI['unique_id'] = df_IdCI['art_style'] + df_IdCI['painting']
# repetition = number of rows (captions) that share the same unique_id.
per_artwork_ids = df_IdCI.groupby('unique_id')['unique_id']
df_IdCI['repetition'] = per_artwork_ids.transform('count')

In [13]:
## Extract emotion distributions
# Emotion names; the position in this list is the integer label of the emotion.
ARTEMIS_EMOTIONS = [
    'amusement', 'awe', 'contentment', 'excitement',
    'anger', 'disgust', 'fear', 'sadness',
]
no_emo = len(ARTEMIS_EMOTIONS)
EMOTION_TO_IDX = dict(zip(ARTEMIS_EMOTIONS, range(no_emo)))
no_emo

8

In [14]:
# Inverse lookup: integer label -> emotion name.
IDX_TO_EMOTION = {idx: emotion for emotion, idx in EMOTION_TO_IDX.items()}
# Integer label of every caption's emotion (raises KeyError on an emotion missing from the table).
df_IdCI['emotion_label'] = df_IdCI['emotion'].apply(EMOTION_TO_IDX.__getitem__)
def cal_hist(x, n_classes=None):
    """Compute the normalized histogram of integer labels in a group.

    :param x: iterable of integer labels (one per caption of the group).
    :param n_classes: number of histogram bins; labels are counted for 0..n_classes-1.
        Defaults to the notebook-level ``no_emo`` when omitted.
    :return: list with ``len(x)`` entries, each being the same list of relative
        frequencies, so that ``groupby(...).transform`` gets one value per row.
    """
    if n_classes is None:
        n_classes = no_emo
    # Materialize once instead of once per class.
    labels = list(x)
    no_caps = len(labels)
    dis = np.array([labels.count(i) for i in range(n_classes)]) / no_caps
    return [list(dis)] * no_caps

# Every caption row receives the emotion distribution of its artwork.
labels_per_artwork = df_IdCI.groupby('unique_id')['emotion_label']
df_IdCI['distEmo'] = labels_per_artwork.transform(cal_hist)

In [15]:
## Split dataset
from sklearn.model_selection import train_test_split

# Number of artworks (unique ids) held out for validation.
val_size = 3000

## Splits to train, val, test sets
# Artworks with <=2 captions go to train, >=4 to test; the ones in between
# are divided between train and val below.
train, test, rest = set(), set(), set()
for unique_id, repetition in zip(df_IdCI.unique_id, df_IdCI.repetition):
    if repetition <= 2:
        train.add(unique_id)
    elif repetition >= 4:
        test.add(unique_id)
    elif repetition > 2 and repetition < 4:
        rest.add(unique_id)

# Sort before splitting so the seeded split does not depend on set iteration order.
rest = sorted(rest)
train_2, val = train_test_split(rest, test_size=val_size, random_state=random_seed)
train.update(train_2)
val = set(val)

# The three splits must not share any artwork.
assert test.isdisjoint(train)
assert val.isdisjoint(train)
assert test.isdisjoint(val)

split_labels = []
for uni_id in df_IdCI.unique_id:
    if uni_id in train:
        split_labels.append('train')
    elif uni_id in val:
        split_labels.append('val')
    else:
        split_labels.append('test')
df_IdCI['split'] = split_labels

# Number of captions per split.
for split_name in ('train', 'val', 'test'):
    print(len(df_IdCI[df_IdCI['split'] == split_name]))


75509
9000
15884


In [16]:
# Make a word-vocabulary based on training data
from artemis.utils.vocabulary import build_vocab
# Frequency threshold handed to build_vocab.
min_word_freq = 3
in_train = df_IdCI['split'] == 'train'
train_tokens = df_IdCI.loc[in_train, 'tokens']
vocab = build_vocab(train_tokens, min_word_freq)
print(f'Using a vocabulary with {len(vocab)} tokens')

Using a vocabulary with 10506 tokens


In [17]:
# Encode tokens as ints
max_len = max(df_IdCI['tokens_len'])
# (source column, length argument given to vocab.encode) for each column to encode.
encode_plan = (
    ('tokens', max_len),
    ('subject', subjects_maxLen),
    ('predicate', predicates_maxLen),
)
for source_col, target_len in encode_plan:
    df_IdCI[source_col + '_encoded'] = df_IdCI[source_col].apply(
        lambda toks, n=target_len: vocab.encode(toks, n)
    )

In [18]:
# Encode tokens using CLIP tokenizer
import clip
clip_token_ids = []
for utter in df_IdCI['utterance_spelled']:
    # tokenize -> squeeze -> plain Python list, one per caption
    clip_token_ids.append(clip.tokenize(utter).squeeze().tolist())
df_IdCI['CLIP_tokens'] = clip_token_ids

In [19]:
# Save separately the grouped utterances of each stimulus
def group_gt_annotations(df, vocab):
    """ Group the annotations according to the underlying artwork/stimulus.

    :param df: dataframe carrying the annotations; the columns 'split', 'art_style', 'painting',
        'utterance_spelled', 'tokens_encoded' and 'emotion_label' are read.
    :param vocab: object whose ``decode_print`` turns one encoded caption back into a string.
    :return: dictionary with one entry per value of the 'split' column; each entry is a dataframe
        with one row per (art_style, painting) and the list-valued columns 'emotion',
        'references_pre_vocab' (raw spelled utterances) and 'references' (decoded encoded tokens).
    """
    stimulus_keys = ['art_style', 'painting']
    results = dict()
    for split, g in df.groupby('split'): # group-by split
        # re-index without mutating the sub-frame handed out by groupby
        g = g.reset_index(drop=True).groupby(stimulus_keys) # group-by stimulus

        # group utterances / emotions
        # a) before "vocabularization" (i.e., raw)
        refs_pre_vocab_grouped = g['utterance_spelled'].apply(list).reset_index(name='references_pre_vocab')
        # b) post "vocabularization" (encoded with the vocabulary)
        tokens_grouped = g['tokens_encoded'].apply(list).reset_index(name='tokens_encoded')
        emotion_grouped = g['emotion_label'].apply(list).reset_index(name='emotion')

        # the three groupings must list the stimuli in the same order
        for key in stimulus_keys:
            assert all(tokens_grouped[key] == emotion_grouped[key])
            assert all(tokens_grouped[key] == refs_pre_vocab_grouped[key])

        # decode these tokens back to strings and name them "references"
        tokens_grouped['tokens_encoded'] =\
            tokens_grouped['tokens_encoded'].apply(lambda x: [vocab.decode_print(sent) for sent in x])
        tokens_grouped = tokens_grouped.rename(columns={'tokens_encoded': 'references'})

        # join results in a new single dataframe, on the stimulus identity
        temp = pd.merge(emotion_grouped, refs_pre_vocab_grouped, on=stimulus_keys)
        result = pd.merge(temp, tokens_grouped, on=stimulus_keys)
        result = result.reset_index(drop=True)
        results[split] = result
    return results

# Per-split grouped annotations (one row per artwork); pickled to disk later in the notebook.
groups = group_gt_annotations(df_IdCI, vocab)

In [20]:
from six.moves import cPickle
def pickle_data(file_name, *args):
    """Using (c)Pickle to save multiple python objects in a single file.

    The file holds, in order: the number of objects, then each object, every one
    written with pickle protocol 2.

    :param file_name: path of the file to (over)write.
    :param args: the objects to store.
    """
    # The standard pickle module is used directly (on Python 3, six.moves.cPickle is the same module).
    # The context manager closes the file even when one of the dumps raises.
    with open(file_name, 'wb') as out_file:
        pickle.dump(len(args), out_file, protocol=2)
        for item in args:
            pickle.dump(item, out_file, protocol=2)
    

# Renumber the rows, then write the table, the vocabulary and the grouped annotations to disk.
df_IdCI.reset_index(inplace=True, drop=True)
df_IdCI.to_csv('../Dataset/ArtEmis/ArtEmis_IdC/ArtEmis_IdCI.csv', index=False)
vocab.save('../Dataset/ArtEmis/ArtEmis_IdC/ArtEmis_IdCI_Vocab.pkl')
pickle_data('../Dataset/ArtEmis/ArtEmis_IdC/Artemis_IdCI_GT.pkl', groups)

# Summary of what was saved.
shortest_caption = min(df_IdCI.tokens_len)
print('n-utterances kept:', len(df_IdCI))
print('vocab size:', len(vocab))
print(f'Maximum number of tokens per caption is {max_len}')
print(f'Minimum number of tokens per caption is {shortest_caption}')

n-utterances kept: 100393
vocab size: 10506
Maximum number of tokens per caption is 63
Minimum number of tokens per caption is 3


In [21]:
### Extract number of images having the number of captions = noCap
# One grouped pass gives, per image, the repetition value of its first row and its row count;
# every report line below is then a cheap selection on these two aligned Series.
repetition_by_image = df_IdCI.groupby('unique_id')['repetition']
first_repetition = repetition_by_image.first()
captions_per_image = repetition_by_image.size()

for noCap in range(1,4):
    selected = first_repetition == noCap
    cnt = int(selected.sum())                          # images with exactly noCap captions
    cntexp = int(captions_per_image[selected].sum())   # captions belonging to those images
    print(noCap," captions per image: ",cnt," images with",cntexp," captions")

selected = first_repetition >= 4
cnt = int(selected.sum())
cntexp = int(captions_per_image[selected].sum())
print(">=4 captions per image: ",cnt," images with",cntexp," captions")
print('Total images:',len(df_IdCI.unique_id.unique()))
print('Total captions:',len(df_IdCI))

1  captions per image:  30155  images with 30155  captions
2  captions per image:  17811  images with 35622  captions
3  captions per image:  6244  images with 18732  captions
>=4 captions per image:  2497  images with 15884  captions
Total images: 56707
Total captions: 100393
