In [1]:
"""
#### Code adapted from the source code of the ArtEmis dataset paper
"""

import json
import torch
import pandas as pd
import os.path as osp
import numpy as np
import itertools
from PIL import Image
from model.func_eval import unpickle_data,torch_load_model
from artemis.utils.vocabulary import Vocabulary #Use for text2emotion metrics
from artemis.evaluation.single_caption_per_image import apply_basic_evaluations
from artemis.emotions import IDX_TO_EMOTION
%load_ext autoreload
%autoreload 2

In [2]:
# Directory that the ground-truth reference pickles are read from.
DBdir = "../Dataset/ArtEmis/ArtEmis"

# Name of the model whose sampled captions are evaluated. Listed alternatives:
#   'CLIPViTB16_full', 'CLIPViTB16_woSG',
#   'INRN34_full',     'INRN34_woSG',
#   'INViTB16_full',   'INViTB16_woSG'
modelname = 'CLIPViTB16_woSG'

# The sampled captions are stored inside the model's own output folder.
model_dir = f'output/Ours_ArtEmis/{modelname}'
sampled_captions_file = osp.join(model_dir, 'fullDB_test.pkl')

In [3]:
# Ground-truth references come from the dataset folder; `split` selects which part is scored.
split = 'test'
references_file = osp.join(DBdir, 'Artemis_GT.pkl')
gpu_id = 0

# Evaluating the longest-common-subsequence is quite slow, so the data is sub-sampled:
#   first integer  (25000) = number of training (gt) sentences to subsample from all training
#   second integer (800)   = number of sample sentences to subsample from all generations
default_lcs_sample = [25000, 800]

In [4]:
# Device string; none of the evaluation cells visible here reference it.
device = "cpu"

# Metrics handed to the evaluation routine through `methods_to_do`.
evaluation_methods = {
    'bleu',
    'meteor',
    'rouge',
    'spice',
    'cider',
}
print(evaluation_methods)

{'cider', 'spice', 'meteor', 'bleu', 'rouge'}


In [5]:
from model.count_IdC import count_IdC

In [6]:
print("Evaluating on ArtEmis...")

# The reference pickle is indexed by split name: the 'train' part provides the training
# utterances, the `split` part provides the references that are scored against.
gt_data = next(unpickle_data(references_file))

# References are grouped per artwork; flatten them into one large list.
train_utters = list(itertools.chain.from_iterable(gt_data['train']['references_pre_vocab']))
print('Training Utterances', len(train_utters))
unique_train_utters = set(train_utters)

gt_data = gt_data[split]
print('Images Captioned', len(gt_data))

saved_samples = next(unpickle_data(sampled_captions_file))

# Captions may have been sampled under several sampling configurations; score each one.
for sampling_config_details, captions in saved_samples:
    # Joining on the shared columns lines every predicted caption up with its references.
    merged = pd.merge(gt_data, captions)
    merged['caption'] = merged['captions_predicted']
    hypothesis = merged['caption']
    references = merged['references_pre_vocab']  # references that do not contain <UNK>

    metrics_eval = apply_basic_evaluations(
        hypothesis, references, None, None, None,
        nltk_bleu=False,
        lcs_sample=default_lcs_sample,
        train_utterances=unique_train_utters,
        methods_to_do=evaluation_methods,
    )
    print("Using combined types!")
    print(pd.DataFrame(metrics_eval))

    # count_IdC returns two collections whose sizes are reported below
    # (Id-captions first, literal captions second, judging by the variable names).
    merged_IdC, merged_LC_df = count_IdC(merged)
    print("Number of Id-Captions:", len(merged_IdC))
    print("Number of unique Id-Captions:", len(set(merged_IdC.caption.tolist())))
    print("Number of literal Captions:", len(merged_LC_df))
    print("Number of unique literal Captions:", len(set(merged_LC_df.caption.tolist())))
    print()
    
    

Evaluating on ArtEmis...
Training Utterances 348197
Images Captioned 5497
BLEU: done
COCO-based-metrics: done
Using combined types!
   metric      mean       std
0  BLEU-0  0.618207  0.187845
1  BLEU-1  0.380873  0.235182
2  BLEU-2  0.224721  0.218318
3  BLEU-3  0.135761  0.158792
4   CIDER  0.112374  0.140179
5   SPICE  0.064493  0.050768
6  METEOR  0.158707  0.063991
7   ROUGE  0.332289  0.110967
Number of Id-Captions: 4140
Number of unique Id-Captions: 2736
Number of literal Captions: 1357
Number of unique literal Captions: 1126



In [7]:
print("Evaluating on ArtEmis literal captions ...")

# Use a cell-specific name for the path. Rebinding the shared `references_file` would make
# the full-dataset evaluation cell load this literal-caption pickle if it were re-run later.
lc_references_file = osp.join(DBdir, 'Artemis_GT_LC.pkl')
gt_data = next(unpickle_data(lc_references_file))
train_utters = gt_data['train']['references_pre_vocab']
train_utters = list(itertools.chain(*train_utters))  # undo the grouping per artwork to a single large list
print('Training Utterances', len(train_utters))
unique_train_utters = set(train_utters)
gt_data = gt_data[split]
print('Images Captioned', len(gt_data))

# Select the captions explicitly instead of relying on the `captions` loop variable left
# behind by an earlier cell. The last sampling configuration in `saved_samples` is used,
# which is the value that leaked loop variable held.
# NOTE(review): assumes `saved_samples` is a re-iterable sequence of (config, captions) pairs.
captions = list(saved_samples)[-1][1]

merged = pd.merge(gt_data, captions)  # this ensures proper order of captions to gt (via accessing merged.caption)
merged['caption'] = merged.LC_predicted  # score the literal-caption predictions
hypothesis = merged.caption
references = merged.references_pre_vocab  # i.e., use references that do not have <UNK>

metrics_eval = apply_basic_evaluations(hypothesis, references, None, None, None,
                                       nltk_bleu=False, lcs_sample=default_lcs_sample,
                                       train_utterances=unique_train_utters,
                                       methods_to_do=evaluation_methods)
print(pd.DataFrame(metrics_eval))

# Report how many of the scored captions fall into each of the two groups returned by count_IdC.
merged_IdC, merged_LC_df = count_IdC(merged)
print("Number of Id-Captions:", len(merged_IdC))
print("Number of unique Id-Captions:", len(set(merged_IdC.caption.tolist())))
print("Number of literal Captions:", len(merged_LC_df))
print("Number of unique literal Captions:", len(set(merged_LC_df.caption.tolist())))
print()




Evaluating on ArtEmis literal captions ...
Training Utterances 272688
Images Captioned 4019
BLEU: done
COCO-based-metrics: done
   metric      mean       std
0  BLEU-0  0.612091  0.180901
1  BLEU-1  0.365989  0.216010
2  BLEU-2  0.199232  0.183790
3  BLEU-3  0.100891  0.102680
4   CIDER  0.125504  0.152294
5   SPICE  0.074605  0.054700
6  METEOR  0.153407  0.061447
7   ROUGE  0.334211  0.101582
Number of Id-Captions: 0
Number of unique Id-Captions: 0
Number of literal Captions: 4019
Number of unique literal Captions: 3180



In [8]:
print("Evaluating on ArtEmis Id-captions ...")

# Use a cell-specific name for the path. Rebinding the shared `references_file` would make
# the full-dataset evaluation cell load this Id-caption pickle if it were re-run later.
idc_references_file = osp.join(DBdir, 'Artemis_GT_IdC.pkl')
gt_data = next(unpickle_data(idc_references_file))
train_utters = gt_data['train']['references_pre_vocab']
train_utters = list(itertools.chain(*train_utters))  # undo the grouping per artwork to a single large list
print('Training Utterances', len(train_utters))
unique_train_utters = set(train_utters)
gt_data = gt_data[split]
print('Images Captioned', len(gt_data))

# Select the captions explicitly instead of relying on the `captions` loop variable left
# behind by an earlier cell. The last sampling configuration in `saved_samples` is used,
# which is the value that leaked loop variable held.
# NOTE(review): assumes `saved_samples` is a re-iterable sequence of (config, captions) pairs.
captions = list(saved_samples)[-1][1]

merged = pd.merge(gt_data, captions)  # this ensures proper order of captions to gt (via accessing merged.caption)
merged['caption'] = merged.IdC_predicted  # score the Id-caption predictions
hypothesis = merged.caption
references = merged.references_pre_vocab  # i.e., use references that do not have <UNK>
metrics_eval = apply_basic_evaluations(hypothesis, references, None, None, None,
                                       nltk_bleu=False, lcs_sample=default_lcs_sample,
                                       train_utterances=unique_train_utters,
                                       methods_to_do=evaluation_methods)
print("Using only IdC!")
print(pd.DataFrame(metrics_eval))

# Report how many of the scored captions fall into each of the two groups returned by count_IdC.
merged_IdC, merged_LC_df = count_IdC(merged)
print("Number of Id-Captions:", len(merged_IdC))
print("Number of unique Id-Captions:", len(set(merged_IdC.caption.tolist())))
print("Number of literal Captions:", len(merged_LC_df))
print("Number of unique literal Captions:", len(set(merged_LC_df.caption.tolist())))
print()

Evaluating on ArtEmis Id-captions ...
Training Utterances 75509
Images Captioned 2497
BLEU: done
COCO-based-metrics: done
Using only IdC!
   metric      mean       std
0  BLEU-0  0.626189  0.172612
1  BLEU-1  0.430129  0.190077
2  BLEU-2  0.290601  0.218220
3  BLEU-3  0.199041  0.190423
4   CIDER  0.120872  0.162590
5   SPICE  0.065644  0.056398
6  METEOR  0.190781  0.061789
7   ROUGE  0.376010  0.114564
Number of Id-Captions: 2494
Number of unique Id-Captions: 1883
Number of literal Captions: 3
Number of unique literal Captions: 3

