In [1]:
import torch
import pandas as pd
import os.path as osp
import numpy as np
from ast import literal_eval
from PIL import Image
from torch.utils.data import Dataset, DataLoader

In [2]:
## Prepare the dataset: read the caption-set CSV and resolve the image paths.
datasetname = 'COCO' #ArtEmis, Flickr30K,  VizWiz, COCO
is_artemis = datasetname == 'ArtEmis'

# ArtEmis keeps its CSV and raw images one directory deeper (<name>_IdC/)
# than the other datasets.
if is_artemis:
    datafile = f'../Dataset/{datasetname}/{datasetname}_IdC/{datasetname}_IdCII_3ErrType.csv'
    img_dir = f"../Dataset/{datasetname}/{datasetname}_IdC/Images/rawImages"
else:
    datafile = f'../Dataset/{datasetname}/{datasetname}_IdCII_3ErrType.csv'
    img_dir = f"../Dataset/{datasetname}/Images/rawImages"

df = pd.read_csv(datafile)
if is_artemis:
    # Only the ArtEmis CSV is filtered on its `split` column; the other
    # datasets' CSVs are used in full.
    df = df[df['split'] == 'test']
# Renumber rows 0..n-1 so positional and label indexing agree after filtering.
df = df.reset_index(drop=True)
print('Number of caption sets in the test set:', len(df))

# The token column is stored as the string repr of a Python literal; parse it back.
df['captSet_CLIP_tokens'] = df['captSet_CLIP_tokens'].apply(literal_eval)
# Prefix every image file name with the dataset's raw-image directory.
df['img_files'] = [osp.join(img_dir, imgfile) for imgfile in df['img_files']]


Number of caption sets in the test set: 1699


## Build the in-memory text database (similar to `make_txt_db.py` in the UMIC source code)

In [3]:
# Parse the stringified caption sets back into Python objects.
# Only string entries are parsed, so re-running this cell on an already-parsed
# column is a no-op instead of raising ValueError inside literal_eval.
df['captSet_text'] = df['captSet_text'].apply(
    lambda captSet: literal_eval(captSet) if isinstance(captSet, str) else captSet
)
# Number of error-type captions per caption set; later cells read
# no_errType + 1 captions from each set ("natural and unnatural captions").
no_errType = 3

In [4]:
import argparse
import numpy as np
import os
import pickle
import json
from copy import copy
from collections import defaultdict
from pytorch_pretrained_bert import BertTokenizer
from cytoolz import curry
from tqdm import tqdm
import shutil

@curry
def bert_tokenize(tokenizer, text):
    """Turn ``text`` into a flat list of token ids.

    The text is split on whitespace and each word is passed to
    ``tokenizer.tokenize``; the resulting pieces are mapped to ids with
    ``tokenizer.convert_tokens_to_ids``. Words for which ``tokenize`` returns
    nothing (e.g. some special characters) contribute no ids.

    The function is curried, so it can be called with the tokenizer alone to
    obtain a one-argument ``text -> ids`` callable.
    """
    token_ids = []
    for word in text.strip().split():
        pieces = tokenizer.tokenize(word)
        if pieces:
            token_ids += tokenizer.convert_tokens_to_ids(pieces)
    return token_ids
def invert_dict(d):
    """Group the keys of ``d`` by their value.

    Returns a ``defaultdict(list)`` mapping each value of ``d`` to the list of
    keys that carry it, in the iteration order of ``d``.
    """
    keys_by_value = defaultdict(list)
    for key in d:
        keys_by_value[d[key]].append(key)
    return keys_by_value

# Cased BERT tokenizer; binding it into the curried bert_tokenize gives a
# one-argument `tokenizer(text)` callable that returns token ids.
bert_tokenizer = BertTokenizer.from_pretrained(
    'bert-base-cased',
    do_lower_case=False,
)
tokenizer = bert_tokenize(bert_tokenizer)

# Metadata describing the text database. The 'annotations', 'output' and
# 'format' entries are intentionally left disabled.
# NOTE(review): UNK/CLS/SEP/MASK and v_range look like ids in the
# bert-base-cased vocabulary -- confirm against the tokenizer's vocab.
meta = {
    # 'annotations': ['./default.jsonl'],
    # 'output': './default',
    # 'format': 'lmdb',
    'task': 'caption_evaluation',
    'bert': 'bert-base-cased',
    'UNK': 100,
    'CLS': 101,
    'SEP': 102,
    'MASK': 103,
    'v_range': [106, 28996],
}


In [5]:
# Flatten the caption sets into two parallel lists: one entry per caption,
# paired with the image-feature file of the row it came from.
captions = []
img_ids = []
for _, row in df.iterrows():
    imgfeat_file = row['imgfeat_file']
    captSet = row['captSet_text']
    # natural and unnatural captions: the first no_errType + 1 entries of the set
    captions.extend(captSet[i] for i in range(no_errType + 1))
    img_ids.extend([imgfeat_file] * (no_errType + 1))

# Caption id (stringified position) -> image-feature file, and the inverse
# image-feature file -> list of caption ids.
ce_txt2img = {str(i): img_id for i, img_id in enumerate(img_ids)}
ce_img2txts = dict(invert_dict(ce_txt2img))

# Tokenize every caption and record its token count under its caption id.
sent_ids = []
ce_id2len = {}
for i, sent in enumerate(tqdm(captions)):
    input_ids = tokenizer(sent)
    sent_ids.append(input_ids)
    ce_id2len[str(i)] = len(input_ids)

# One example per caption id: its token ids, its image-feature file and a
# constant target of 1.0.
db = {}
for i, input_ids in enumerate(tqdm(sent_ids)):
    id_ = str(i)
    db[id_] = {
        'input_ids': input_ids,
        'img_fname': ce_txt2img[id_],
        'target': 1.0,
    }

# Image-feature file -> number of bounding boxes stored in that file.
# NOTE: allow_pickle=True lets np.load unpickle arbitrary objects; only use it
# on feature files from a trusted source.
name2nbb = {}
for file in set(img_ids):
    imgfeat = np.load(file, allow_pickle=True)
    name2nbb[file] = int(imgfeat['num_bbox'])
# name2nbb has one entry per distinct image-feature file, not per caption.
print("Number of unique image feature files:", len(name2nbb))


100%|██████████| 6796/6796 [00:00<00:00, 7649.46it/s]
100%|██████████| 6796/6796 [00:00<00:00, 346319.15it/s]


Number of captions: 1489


In [6]:
# Size check: first line lists the per-caption containers, second line the
# per-image-feature-file containers.
print(*(len(container) for container in (captions, img_ids, ce_txt2img, ce_id2len, db)))
print(*(len(container) for container in (ce_img2txts, name2nbb)))

6796 6796 6796 6796 6796
1489 1489


## Score the captions with UMIC (similar to `compute_metric.py` in the UMIC source code)

In [7]:
import argparse
import json
import os
from os.path import exists
import pickle
from time import time
import math
import torch
from torch.utils.data import DataLoader

from horovod import torch as hvd

from UMIC.data import (PrefetchLoader,
                  DetectFeatLmdb_Tran, TxtTokLmdb_Tran, ItmEvalDataset, itm_eval_collate,
                 CeEvalDataset, ce_eval_collate)
from UMIC.model.ce import UniterForCaptioningMetric
from UMIC.utils.distributed import all_gather_list
from UMIC.utils.const import IMG_DIM
from UMIC.utils.itm_eval import inference, itm_eval
from types import SimpleNamespace
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.nn.functional import softmax
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as scss
from scipy import stats

def sigmoid(x):
    """Return the logistic function 1 / (1 + e**-x) of a scalar ``x``.

    The computation is split by sign so that ``math.exp`` is only ever called
    on a non-positive argument: the direct formula raises ``OverflowError``
    for strongly negative ``x`` (``math.exp(-x)`` overflows), whereas this form
    saturates cleanly to 0.0 and 1.0.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # For x < 0 use the algebraically equal e**x / (1 + e**x); e**x <= 1 here.
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)


The pre-trained model you are loading is a cased model but you have not set `do_lower_case` to False. We are setting `do_lower_case=False` for you but you may want to check this behavior.


In [8]:
# Initialise Horovod and make this process's local GPU the current CUDA device.
# NOTE(review): n_gpu, device and rank are assigned here but not referenced
# again in the visible cells.
hvd.init()
n_gpu = hvd.size()
device = torch.device("cuda", hvd.local_rank())
torch.cuda.set_device(hvd.local_rank())
rank = hvd.rank()

# Evaluation options, bundled in a namespace.
opts = SimpleNamespace(
    compressed_db=False,
    max_txt_len=60,
    conf_th=0.2,
    max_bb=100,
    min_bb=10,
    num_bb=36,
    inf_minibatch_size=400,
    margin=0.2,
    valid_steps=1000,
    n_workers=4,
    fp16=True,
    model_config='./UMIC/config/uniter-base.json',
    output_dir='output/UMIC_results',
    pin_mem=True,
    batch_size=128,
    checkpoint='UMIC/ckpt/umic.pt',
)

# load DBs and image dirs
eval_img_db = DetectFeatLmdb_Tran(
    name2nbb,
    opts.conf_th,
    opts.max_bb,
    opts.min_bb,
    opts.num_bb,
    opts.compressed_db,
)
eval_txt_db = TxtTokLmdb_Tran(db, meta, ce_txt2img, ce_id2len, ce_img2txts, -1)
eval_dataset = CeEvalDataset(eval_txt_db, eval_img_db)

# Prepare model: load the checkpoint, build the metric model from it, move it
# to the GPU and switch it to evaluation mode.
load_checkpoint = torch.load(opts.checkpoint)
model = UniterForCaptioningMetric.from_pretrained(
    opts.model_config,
    load_checkpoint,
    img_dim=IMG_DIM,
)
model = model.cuda()
model.eval()

# Batches are served in dataset order (shuffle=False).
# NOTE(review): the loader hard-codes num_workers=1 and pin_memory=False rather
# than reading opts.n_workers / opts.pin_mem -- confirm this is intended.
eval_dataloader = PrefetchLoader(
    DataLoader(
        eval_dataset,
        batch_size=opts.batch_size,
        shuffle=False,
        num_workers=1,
        pin_memory=False,
        collate_fn=ce_eval_collate,
    )
)

06/10/2022 10:49:00 - INFO - UMIC.model.model -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 28996
}



In [9]:
# Run the model over every batch and collect one raw score per caption,
# together with the caption ids (qids) the batch carries.
umic_scores = []
qids = []
for _, batch in tqdm(enumerate(eval_dataloader)):
    with torch.no_grad():
        scores = model(batch, compute_loss=False)
    # reshape(-1) rather than squeeze(): squeeze() would collapse a final batch
    # holding a single caption to a 0-d tensor, and list() on the resulting
    # 0-d array raises TypeError. reshape(-1) always yields a 1-D vector.
    umic_scores += list(scores.detach().reshape(-1).cpu().numpy())
    qids += batch['qids']

# Map the raw scores through the logistic function.
umic_scores = [sigmoid(x) for x in umic_scores]
print("Number of captions:",len(umic_scores))
print("UMIC Score: %.3f"% np.average(umic_scores))

54it [00:30,  1.80it/s]

Number of captions: 6796
UMIC Score: 0.478





In [10]:
# Look scores up by caption id, then regroup them into one list per caption
# set. Caption ids are consecutive stringified integers, no_errType + 1 per row
# of df, in row order.
umic_scores_dict = dict(zip(qids, umic_scores))
umic_scores_full = []
cnt = 0
for _ in range(len(df)):
    # natural and unnatural captions of this row: ids cnt .. cnt + no_errType
    block_end = cnt + no_errType + 1
    umic_scores_full.append([umic_scores_dict[str(k)] for k in range(cnt, block_end)])
    cnt = block_end

In [11]:
import numpy as np
no_errType = 3
cnt_corr_all = 0
cnt_incorr_all = 0
print("Dataset:",datasetname,", Number of caption sets:",len(umic_scores_full))
for errType in range(1, no_errType + 1):
    # A caption set counts as correct when the score at index 0 is strictly
    # greater than the score at index errType; ties count as incorrect.
    cnt_corr = sum(1 for sim in umic_scores_full if sim[0] > sim[errType])
    cnt_incorr = len(umic_scores_full) - cnt_corr
    cnt_corr_all += cnt_corr
    cnt_incorr_all += cnt_incorr
    print(f"Accuracy at errType={errType}:{cnt_corr}/{cnt_corr+cnt_incorr}=",cnt_corr/(cnt_corr+cnt_incorr))

# Overall accuracy pooled over all error types.
print(f"Accuracy for all types:{cnt_corr_all}/{cnt_corr_all+cnt_incorr_all}=",cnt_corr_all/(cnt_corr_all+cnt_incorr_all))

Dataset: COCO , Number of caption sets: 1699
Accuracy at errType=1:1670/1699= 0.9829311359623308
Accuracy at errType=2:1544/1699= 0.9087698646262508
Accuracy at errType=3:1410/1699= 0.8298999411418482
Accuracy for all types:4624/5097= 0.9072003139101432


In [12]:
Dataset: COCO , Number of caption sets: 1699
Accuracy at errType=1:1670/1699= 0.9829311359623308
Accuracy at errType=2:1544/1699= 0.9087698646262508
Accuracy at errType=3:1410/1699= 0.8298999411418482
Accuracy for all types:4624/5097= 0.9072003139101432

Dataset: VizWiz , Number of caption sets: 1160
Accuracy at errType=1:993/1160= 0.8560344827586207
Accuracy at errType=2:939/1160= 0.8094827586206896
Accuracy at errType=3:798/1160= 0.6879310344827586
Accuracy for all types:2730/3480= 0.7844827586206896

Dataset: Flickr30K , Number of caption sets: 595
Accuracy at errType=1:579/595= 0.973109243697479
Accuracy at errType=2:531/595= 0.892436974789916
Accuracy at errType=3:462/595= 0.7764705882352941
Accuracy for all types:1572/1785= 0.880672268907563
    
Dataset: ArtEmis , Number of caption sets: 15884
Accuracy at errType=1:11514/15884= 0.7248803827751196
Accuracy at errType=2:11198/15884= 0.7049861495844876
Accuracy at errType=3:9536/15884= 0.6003525560312264
Accuracy for all types:32248/47652= 0.6767396961302778

SyntaxError: invalid syntax (<ipython-input-12-50df44db92bf>, line 1)