In [1]:
#!/usr/bin/env python
# coding: utf-8

"""
Code adapted from the data preprocessing code of the ArtEmis paper.
"""

"""
Combine, clean, pre-process ArtEmis annotations.
The MIT License (MIT)
Originally created by Panos Achlioptas at 6/17/20, for Python 3.x
Copyright (c) 2021 Panos Achlioptas (pachlioptas@gmail.com) & Stanford Geometric Computing Lab
"""

import nltk
import argparse
import pprint
import pathlib
import json
import numpy as np
import pandas as pd
import os.path as osp
import pickle
from ast import literal_eval
import pdb
from model.vocabulary import Vocabulary
random_seed = 2021

In [2]:
## load  dataset
# Paths are relative to the notebook's working directory.
data_csv = '../Dataset/ArtEmis/ArtEmis/ArtEmis.csv'
vocab = Vocabulary.load('../Dataset/ArtEmis/ArtEmis/ArtEmis_Vocab.pkl')

# Read the annotation table and give it a fresh 0..n-1 index.
df = pd.read_csv(data_csv).reset_index(drop=True)
print(len(df))

  interactivity=interactivity, compiler=compiler, result=result)


439135


In [3]:
# Replace every missing value with the string 'None' so the stringified
# columns below can be parsed uniformly: literal_eval('None') yields None.
df = df.where(pd.notnull(df), 'None')


def _parse_literal(cell):
    """Parse a stringified Python literal; return already-parsed values unchanged.

    The pass-through lets this cell be re-run after the columns have been
    parsed (literal_eval rejects non-string objects such as lists).
    """
    return literal_eval(cell) if isinstance(cell, str) else cell


df['tokens_encoded'] = df['tokens_encoded'].apply(_parse_literal)
df['subject_encoded'] = df['subject_encoded'].apply(_parse_literal)

# IdCflag is 0 for rows whose subject_encoded parsed to None and 1 otherwise.
# Identity check (`is None`) instead of `== None`: it cannot be hijacked by
# objects that overload equality.
df['IdCflag'] = [0 if subject is None else 1 for subject in df['subject_encoded'].tolist()]

In [4]:
# Report the number of captions and of distinct images for every split.
# After the loop `df_` holds the test split with a fresh 0..n-1 index.
for split_key, captions_msg, images_msg in (
        ('train', "Number of train captions:", "Number of train images:"),
        ('val', "Number of val captions:", "Number of val images:"),
        ('test', "Number of testing captions:", "Number of testing images:")):
    df_ = df[df['split'] == split_key].reset_index(drop=True)
    print(captions_msg, len(df_))
    print(images_msg, len(set(df_.img_id.tolist())))

Number of train captions: 348197
Number of train images: 68028
Number of val captions: 32011
Number of val images: 6000
Number of testing captions: 58927
Number of testing images: 5497


In [5]:
# Literal captions are the rows with IdCflag == 0.
# Test images need at least this many literal captions to be kept.
MIN_TEST_CAPTIONS = 4

df_LC_all = df[df.IdCflag == 0].reset_index(drop=True)

df_LC_train = df_LC_all[df_LC_all['split'] == 'train'].copy()
print("Number of train literal captions:", len(df_LC_train))
print("Number of train images with literal captions:", len(set(df_LC_train.img_id.tolist())))

df_LC_val = df_LC_all[df_LC_all['split'] == 'val'].copy()
print("Number of val literal captions:", len(df_LC_val))
print("Number of val images with literal captions:", len(set(df_LC_val.img_id.tolist())))

# .copy() (as for train/val) so that adding the 'repetition' column below
# writes to an independent frame instead of a slice of df_LC_all.
df_LC_test = df_LC_all[df_LC_all['split'] == 'test'].copy()
df_LC_test.reset_index(drop=True, inplace=True)
print("Number of testing literal captions:", len(df_LC_test))
print("Number of testing images with literal captions:", len(set(df_LC_test.img_id.tolist())))

# 'repetition' = number of literal test captions of the row's image.
# NOTE: only the test rows are (re)computed here; the train/val frames were
# copied above and keep whatever 'repetition' values (if any) they already had.
df_LC_test['repetition'] = df_LC_test.groupby('img_id')['img_id'].transform('count')
df_LC_test = df_LC_test[df_LC_test['repetition'] >= MIN_TEST_CAPTIONS].reset_index(drop=True)
assert len(df_LC_test[df_LC_test['repetition'] < MIN_TEST_CAPTIONS]) == 0
print("Number of testing literal captions with repetition>=4:", len(df_LC_test))
print("Number of testing images with >=4 literal captions:", len(set(df_LC_test.img_id.tolist())))

# Final literal-caption frame: filtered test rows first, then train, then val.
df_LC = pd.concat([df_LC_test, df_LC_train, df_LC_val])
df_LC.reset_index(drop=True, inplace=True)
print("Number of literal captions (excluding test captions with repetition <4):", len(df_LC))

Number of train literal captions: 272688
Number of train images with literal captions: 68014
Number of val literal captions: 23011
Number of val images with literal captions: 5996
Number of testing literal captions: 43043
Number of testing images with literal captions: 5374
Number of testing literal captions with repetition>=4: 41128
Number of testing images with >=4 literal captions: 4019
Number of literal captions (excluding test captions with repetition <4): 336827


In [6]:
# Id-captions are the rows with IdCflag == 1.
# Every test image is required to have at least this many Id-captions.
MIN_TEST_CAPTIONS = 4

df_IdC_all = df[df.IdCflag == 1].reset_index(drop=True)

df_IdC_train = df_IdC_all[df_IdC_all['split'] == 'train'].copy()
print("Number of train Id-captions:", len(df_IdC_train))
print("Number of train images with Id-captions:", len(set(df_IdC_train.img_id.tolist())))

df_IdC_val = df_IdC_all[df_IdC_all['split'] == 'val'].copy()
print("Number of val Id-captions:", len(df_IdC_val))
print("Number of val images with Id-captions:", len(set(df_IdC_val.img_id.tolist())))

# .copy() (as for train/val) so that adding the 'repetition' column below
# writes to an independent frame instead of a slice of df_IdC_all.
df_IdC_test = df_IdC_all[df_IdC_all['split'] == 'test'].copy()
df_IdC_test.reset_index(drop=True, inplace=True)
print("Number of testing Id-captions:", len(df_IdC_test))
print("Number of testing images with Id-captions:", len(set(df_IdC_test.img_id.tolist())))

# 'repetition' = number of test Id-captions of the row's image.
# NOTE: only the test rows are (re)computed here; the train/val frames were
# copied above and keep whatever 'repetition' values (if any) they already had.
df_IdC_test['repetition'] = df_IdC_test.groupby('img_id')['img_id'].transform('count')

# No rows are filtered out in this cell: the assertion instead requires that
# every test image already has at least MIN_TEST_CAPTIONS Id-captions.
too_few = df_IdC_test[df_IdC_test['repetition'] < MIN_TEST_CAPTIONS]
assert len(too_few) == 0, \
    "%d test Id-captions belong to images with fewer than %d Id-captions" % (len(too_few), MIN_TEST_CAPTIONS)
print("Number of testing Id-captions with repetition>=4:", len(df_IdC_test))
print("Number of testing images with >=4 Id-captions:", len(set(df_IdC_test.img_id.tolist())))

# Final Id-caption frame: test rows first, then train, then val.
df_IdC = pd.concat([df_IdC_test, df_IdC_train, df_IdC_val])
df_IdC.reset_index(drop=True, inplace=True)
print("Number of Id-captions (excluding test captions with repetition <4):", len(df_IdC))

Number of train Id-captions: 75509
Number of train images with Id-captions: 51210
Number of val Id-captions: 9000
Number of val images with Id-captions: 3000
Number of testing Id-captions: 15884
Number of testing images with Id-captions: 2497
Number of testing Id-captions with repetition>=4: 15884
Number of testing images with >=4 Id-captions: 2497
Number of Id-captions (excluding test captions with repetition <4): 100393


In [7]:
# Save separately the grouped utterances of each stimulus
def group_gt_annotations(df, vocab):
    """ Collect, for every split, all annotations that belong to the same artwork/stimulus.
    :param df: dataframe carrying the annotations; the columns read here are 'split', 'art_style',
        'painting', 'utterance_spelled' and 'tokens_encoded'.
    :param vocab: object whose ``decode_print`` method is applied to every entry of 'tokens_encoded'.
    :return: dictionary mapping each value of the 'split' column to a dataframe with one row per
        ('art_style', 'painting') pair and two list-valued columns: 'references_pre_vocab' (the grouped
        'utterance_spelled' values) and 'references' (the grouped tokens after decoding).
    """
    stimulus_keys = ['art_style', 'painting']
    grouped_by_split = {}
    for split_name, split_rows in df.groupby('split'):
        # one group per stimulus inside the current split
        per_stimulus = split_rows.reset_index(drop=True).groupby(stimulus_keys)

        # raw utterances, i.e. before "vocabularization"
        raw_refs = per_stimulus['utterance_spelled'].apply(list).reset_index(name='references_pre_vocab')

        # encoded tokens, grouped the same way; both tables must list the paintings in the same order
        encoded = per_stimulus['tokens_encoded'].apply(list).reset_index(name='tokens_encoded')
        assert (encoded['painting'] == raw_refs['painting']).all()

        # turn every token sequence back into a string and expose the result as "references"
        decoded = encoded.rename(columns={'tokens_encoded': 'references'})
        decoded['references'] = decoded['references'].apply(
            lambda sentences: [vocab.decode_print(sentence) for sentence in sentences])

        # join on the columns the two tables share (the stimulus keys)
        grouped_by_split[split_name] = pd.merge(raw_refs, decoded).reset_index(drop=True)
    return grouped_by_split

from six.moves import cPickle
def pickle_data(file_name, *args):
    """Using (c)Pickle to save multiple python objects in a single file.

    The file starts with the number of objects, followed by each object in
    turn; every record is written with pickle protocol 2.

    :param file_name: path of the file to create or overwrite.
    :param args: the objects to store, in the order they are given.
    """
    # The context manager closes the file even when pickling an object raises.
    with open(file_name, 'wb') as out_file:
        cPickle.dump(len(args), out_file, protocol=2)
        for item in args:
            cPickle.dump(item, out_file, protocol=2)

In [8]:
# Group the annotations per artwork and store one pickle per caption type
# (literal captions first, then Id-captions).
for caption_frame, out_path in (
        (df_LC, '../Dataset/ArtEmis/ArtEmis/Artemis_GT_LC.pkl'),
        (df_IdC, '../Dataset/ArtEmis/ArtEmis/Artemis_GT_IdC.pkl')):
    groups = group_gt_annotations(caption_frame, vocab)
    pickle_data(out_path, groups)

In [9]:
### Extract number of images having the number of captions = noCap
# Summarise every image once instead of walking the groupby for each bucket:
#   first_rep    -> 'repetition' value stored on the image's first row
#   caption_rows -> number of rows the image has in df_LC
# Images are bucketed by first_rep, while the caption totals add up
# caption_rows, so the two figures on a line need not be multiples of each other.
by_image = df_LC.groupby('img_id')
first_rep = by_image['repetition'].apply(lambda rep: rep.iloc[0])
caption_rows = by_image.size()

for noCap in range(1,5):
    in_bucket = first_rep == noCap
    cnt = int(in_bucket.sum())
    cntexp = int(caption_rows[in_bucket].sum())
    print(noCap," captions per image: ",cnt," images with",cntexp," captions")

in_bucket = first_rep >= 5
cnt = int(in_bucket.sum())
cntexp = int(caption_rows[in_bucket].sum())
print(">=5 captions per image: ",cnt," images with",cntexp," captions")

print('Total images:',len(df_LC.img_id.unique()))
print('Total captions:',len(df_LC))

1  captions per image:  2  images with 2  captions
2  captions per image:  24  images with 35  captions
3  captions per image:  562  images with 1291  captions
4  captions per image:  7428  images with 22757  captions
>=5 captions per image:  70013  images with 312742  captions
Total images: 78029
Total captions: 336827


In [10]:
### Extract number of images having the number of captions = noCap
# Summarise every image once instead of walking the groupby for each bucket:
#   first_rep    -> 'repetition' value stored on the image's first row
#   caption_rows -> number of rows the image has in df_IdC
# Images are bucketed by first_rep, while the caption totals add up
# caption_rows, so the two figures on a line need not be multiples of each other.
by_image = df_IdC.groupby('img_id')
first_rep = by_image['repetition'].apply(lambda rep: rep.iloc[0])
caption_rows = by_image.size()

for noCap in range(1,5):
    in_bucket = first_rep == noCap
    cnt = int(in_bucket.sum())
    cntexp = int(caption_rows[in_bucket].sum())
    print(noCap," captions per image: ",cnt," images with",cntexp," captions")

in_bucket = first_rep >= 5
cnt = int(in_bucket.sum())
cntexp = int(caption_rows[in_bucket].sum())
print(">=5 captions per image: ",cnt," images with",cntexp," captions")

print('Total images:',len(df_IdC.img_id.unique()))
print('Total captions:',len(df_IdC))

1  captions per image:  0  images with 0  captions
2  captions per image:  19  images with 25  captions
3  captions per image:  327  images with 431  captions
4  captions per image:  6217  images with 12775  captions
>=5 captions per image:  50144  images with 87162  captions
Total images: 56707
Total captions: 100393
