In [81]:
import os
import json, time
import math

import random
import pickle
import numpy as np

from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.cider.cider import Cider

from word2number import w2n
import string, re
from collections import Counter
from pprint import pprint
import spacy
# Shared spaCy pipeline (small English model) with the "ner", "textcat" and
# "parser" components disabled; normalize_text() calls it to lemmatize tokens.
nlp = spacy.load("en_core_web_sm", disable=["ner","textcat","parser"])

In [45]:
def toNum(word):
    """Convert a number word to its numeric form, if possible.

    Args:
        word: a single token.

    Returns:
        Whatever `w2n.word_to_num(word)` returns when the conversion succeeds,
        otherwise `word` itself, unchanged.
    """
    try:
        return w2n.word_to_num(word)
    except Exception:
        # Not a number word: keep the token as-is. `Exception` (rather than a
        # bare `except:`) lets KeyboardInterrupt / SystemExit propagate, so a
        # long scoring loop can still be interrupted.
        return word

def normalize_text(s):
    """Normalize an answer string before bag-of-words comparison.

    Pipeline for multi-word input: lowercase -> strip punctuation (a '.' is
    kept only when it is followed by a digit) -> drop the articles a/an/the
    -> convert number words via toNum() and collapse whitespace -> lemmatize
    with the module-level `nlp` pipeline.

    Special cases:
      * a single character: only lowercased and passed through toNum(), so an
        article or punctuation mark survives;
      * a single word: articles are not removed.
    """
    def drop_articles(txt):
        pattern = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(pattern, " ", txt)

    def words_to_digits(txt):
        # Splitting and re-joining also collapses runs of whitespace.
        converted = []
        for tok in txt.split():
            converted.append(str(toNum(tok)))
        return " ".join(converted)

    def drop_punctuation(txt):
        banned = set(string.punctuation) - set(['.'])
        kept = "".join(ch for ch in txt if ch not in banned)
        # Remove '.' unless it acts as a decimal point.
        return re.sub(r"\.(?!\d)", "", kept)

    def lemmatize(txt):
        return " ".join(tok.lemma_ for tok in nlp(txt))

    stripped = s.strip()
    lowered = s.lower()

    if len(stripped) == 1:
        return words_to_digits(lowered)

    cleaned = drop_punctuation(lowered)
    if len(stripped.split()) != 1:
        # Articles are only stripped from multi-word (or empty) input.
        cleaned = drop_articles(cleaned)
    return lemmatize(words_to_digits(cleaned))

# Language eval with Caption metrics
class Evaluate(object):
    """Run caption-style scorers (BLEU-1..4 and ROUGE-L) on candidate/reference texts."""

    def __init__(self):
        # Each entry is (scorer, name). `name` is a list when the scorer
        # returns several scores at once (BLEU) and a plain string otherwise.
        self.scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            #(Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            #(Cider(), "CIDEr"),
            #(Spice(), "Spice")
        ]

    def score(self, ref, hypo):
        """Apply every configured scorer and collect the corpus-level scores.

        Args:
            ref: dict mapping sample id -> list of reference strings.
            hypo: dict mapping sample id -> single-element list with the candidate.

        Returns:
            dict mapping metric name -> score.
        """
        final_scores = {}
        for scorer, method in self.scorers:
            # Scorers registered with a list of names are called with
            # verbose=0; the others are called without that keyword.
            if isinstance(method, list):
                score, _ = scorer.compute_score(ref, hypo, verbose=0)
            else:
                score, _ = scorer.compute_score(ref, hypo)
            if isinstance(score, list):
                final_scores.update(zip(method, score))
            else:
                final_scores[method] = score
        return final_scores

    def evaluate(self, return_scores=False, **kwargs):
        """Score a batch of candidates against their references.

        Keyword Args:
            ref: sequence where ref[i] is the list of references for sample i.
            cand: sequence where cand[i] is a list of candidates for sample i;
                only the first candidate (cand[i][0]) is scored.

        Returns:
            The dict produced by score() when `return_scores` is true,
            otherwise None.
        """
        ans = kwargs.pop('ref', {})
        cand = kwargs.pop('cand', {})

        hypo = {}
        ref = {}
        for i in range(len(cand)):
            hypo[i] = [cand[i][0]]
            ref[i] = ans[i]

        final_scores = self.score(ref, hypo)

        if return_scores:
            return final_scores
        return None

In [42]:
# Scratch check: BLEU scores for a single one-token candidate ('seef') that
# does not match its only reference ('eee'); the cell output shows the
# near-zero values the scorer returns in that case.
B = Bleu(4)
B.compute_score({1:['eee']}, {1: ['seef']}, verbose=0)

([9.99999998000001e-16,
  3.162277655424966e-11,
  9.999999986666689e-10,
  5.62341324487423e-09],
 [[9.99999998000001e-16],
  [3.162277655424966e-11],
  [9.999999986666689e-10],
  [5.62341324487423e-09]])

In [3]:
# VQA Eval (SQuAD style EM, F1)
def compute_vqa_metrics(cands, a):
    """Score candidate answers against one gold answer via bag-of-words overlap.

    Both sides are passed through normalize_text() and split on whitespace.

    Args:
        cands: sequence of candidate answer strings.
        a: gold (keyword) answer string.

    Returns:
        A 5-tuple (F1_avg, F1_max, EM, RE_avg, PR_avg):
          * F1_avg / F1_max: mean / max token-level F1 over the candidates,
          * EM: 1 if any candidate with non-zero overlap equals the gold
            token list exactly, else 0,
          * RE_avg / PR_avg: mean recall / precision over the candidates.
        All five values are 0 when `cands` is empty.
    """
    if len(cands) == 0:
        # Same arity as the normal return so callers can always unpack 5 values.
        return (0, 0, 0, 0, 0)

    bow_a = normalize_text(a).split()
    gold_counts = Counter(bow_a)
    F1 = []
    EM = 0
    RE = []
    PR = []
    for c in cands:
        bow_c = normalize_text(c).split()
        common = gold_counts & Counter(bow_c)
        num_same = sum(common.values())
        if num_same == 0:
            # This candidate shares no token with the gold answer: it scores
            # zero, but the remaining candidates must still be counted.
            F1.append(0.0)
            RE.append(0.0)
            PR.append(0.0)
            continue
        if bow_c == bow_a:
            EM = 1
        precision = 1.0 * num_same / len(bow_c)
        recall = 1.0 * num_same / len(bow_a)
        RE.append(recall)
        PR.append(precision)

        # The small epsilon is kept so scores match previously reported numbers.
        f1 = 2*precision*recall / (precision + recall + 1e-5)
        F1.append(f1)

    PR_avg = np.mean(PR)
    RE_avg = np.mean(RE)
    F1_avg = np.mean(F1)
    F1_max = np.max(F1)
    return (F1_avg, F1_max, EM, RE_avg, PR_avg)

In [16]:
# Load the text-based and image-based datasets and print a few sanity counts.
# Files are opened with `with` so the handles are closed right after parsing.
# NOTE(review): these are absolute paths on the author's machine; consider a
# configurable data directory.
with open("/home/yingshac/CYS/WebQnA/WebQnA_data_new/txt_dataset_0820_addKA.json", "r") as f:
    txt_dataset = json.load(f)
with open("/home/yingshac/CYS/WebQnA/WebQnA_data_new/img_dataset_0819_16neg.json", "r") as f:
    img_dataset = json.load(f)

# Split sizes and number of distinct Guids in the text dataset.
print(Counter([txt_dataset[k]['split'] for k in txt_dataset]))
print(len(set([txt_dataset[k]['Guid'] for k in txt_dataset])))

# Split sizes, question categories and number of distinct Guids in the image dataset.
print(Counter([img_dataset[k]['split'] for k in img_dataset]))
print(Counter([img_dataset[k]['Qcate'] for k in img_dataset]))
print(len(set([img_dataset[k]['Guid'] for k in img_dataset])))

Counter({'train': 17812, 'test': 4695, 'val': 2455})
24962
Counter({'train': 16448, 'ood_test': 3948, 'val': 2511, 'ind_test': 2485})
Counter({'YesNo': 8410, 'Others': 6689, 'choose': 5226, 'number': 2337, 'color': 2068, 'shape': 662})
25392


In [17]:
# Histogram of how many answers each text-based 'test' sample has.
x = [len(datum['A']) for datum in txt_dataset.values() if datum['split'] == 'test']
print(Counter(x))

Counter({6: 2476, 5: 1500, 4: 527, 3: 167, 2: 25})


In [5]:
# Histogram of how many answers each image-based test sample has
# (any split whose name contains 'test').
x = [len(datum['A']) for datum in img_dataset.values() if 'test' in datum['split']]
print(Counter(x))

Counter({6: 6242, 5: 139, 4: 36, 3: 15, 2: 1})


In [6]:
# Flatten the image-based test samples into parallel lists, one entry per
# held-out answer: the answer itself is the candidate (C) and the sample's
# remaining answers are its references (A_list).
Q, A_list, C, Keywords_A = [], [], [], []
for datum in img_dataset.values():
    if 'test' not in datum['split']:
        continue
    question = datum['Q'].replace('"', "")
    keywords = datum['Keywords_A'].replace('"', "")
    answers = [ans.replace('"', "") for ans in datum['A']]
    for held_out, answer in enumerate(answers):
        Q.append(question)
        C.append([answer])
        A_list.append(answers[:held_out] + answers[held_out + 1:])
        Keywords_A.append(keywords)
assert len(C) == len(Q) == len(A_list) == len(Keywords_A)
print(len(Q))

38338


In [48]:
## Score every answer of a sample separately (leave-one-out against the
## sample's other answers); avg/max per sample are taken afterwards.
eval_f = Evaluate()
bleu4 = {}
RE = {}
mul = {}
for k in img_dataset:
    datum = img_dataset[k]
    if 'test' not in datum['split']: continue
    bleu4[k] = []
    RE[k] = []
    mul[k] = []
    # Loop-local names are deliberately distinct from the notebook-level
    # Q / C / A_list / Keywords_A / scores variables, so this cell does not
    # overwrite the flat lists that the corpus-level evaluation cells consume.
    keywords = datum['Keywords_A'].replace('"', "")
    answers = [a.replace('"', "") for a in datum['A']]
    for i, answer in enumerate(answers):
        refs = answers[:i] + answers[i+1:]
        sample_scores = eval_f.evaluate(cand=[[answer]], ref=[refs], return_scores=True)
        # Index 3 of the metrics tuple is the keyword recall (RE_avg).
        recall = compute_vqa_metrics([answer], keywords)[3]
        bleu4[k].append(sample_scores['Bleu_4'])
        RE[k].append(recall)
        mul[k].append(recall * sample_scores['Bleu_4'])
    if len(RE) % 200 == 199: print(len(RE))
assert len(RE) == len(mul) == len(bleu4)
print(len(RE))

199
399
599
799
999
1199
1399
1599
1799
1999
2199
2399
2599
2799
2999
3199
3399
3599
3799
3999
4199
4399
4599
4799
4999
5199
5399
5599
5799
5999
6199
6399
6433


In [49]:
# Cache the per-sample image scores on disk; `with` closes (and flushes) the
# file handle deterministically.
img_human_scores = {'RE': RE, 'mul': mul, 'bleu4': bleu4}
with open("./img_human_scores.pkl", "wb") as f:
    pickle.dump(img_human_scores, f)

In [70]:
# Reload the cached per-sample image scores.
# NOTE: pickle.load can execute arbitrary code; only load files that this
# notebook wrote itself.
with open("./img_human_scores.pkl", "rb") as f:
    img_human_scores = pickle.load(f)
RE = img_human_scores['RE']
mul = img_human_scores['mul']
bleu4 = img_human_scores['bleu4']

In [50]:
# For each metric, reduce every sample's list of per-answer scores with
# max / mean, then average that value over all samples.
for label, reducer, table in [
    ("max RE: ", max, RE),
    ("max mul: ", max, mul),
    ("max bleu4: ", max, bleu4),
    ("mean RE: ", np.mean, RE),
    ("mean mul: ", np.mean, mul),
    ("mean bleu4: ", np.mean, bleu4),
]:
    print(label, np.mean([reducer(values) for values in table.values()]))

max RE:  0.9828157800008271
max mul:  0.8684240369906326
max bleu4:  0.9320046073960947
mean RE:  0.7364894343993178
mean mul:  0.48238871302412323
mean bleu4:  0.6526889064849104


In [102]:
# Eyeball a random ~5% of the samples that contain at least one answer whose
# keyword recall lies strictly between 0 and 0.5.
for k in RE:
    if not any(0.0 < recall < 0.5 for recall in RE[k]):
        continue
    if random.random() > 0.05:
        continue
    print(k)
    print(RE[k])
    print(bleu4[k])
    print()

7076
[1.0, 0.42857142857142855, 0.42857142857142855, 0.2857142857142857, 0.2857142857142857, 0.42857142857142855]
[0.7739321540095442, 0.2863070881368244, 0.7561289225233876, 0.6687403048963458, 5.233846518568202e-05, 0.5114432342517753]

7986
[1.0, 0, 0.2, 0.4, 0, 0]
[0.4692470063653536, 0.5410822689681074, 0.47987820661783703, 0.32466791540375595, 0.5329462626443542, 0.5372849657937071]

3774
[1.0, 0, 0.25, 0, 0.5, 0.25]
[0.7138957846600729, 0.8313539763197327, 0.7765453554362727, 0.36336981878206925, 0.7307717332985799, 4.887406509299109e-05]

4418
[0.3333333333333333, 0.3333333333333333, 0, 1.0, 0.6666666666666666, 0.6666666666666666]
[0.38141656158453924, 1.0294994182935423e-08, 0.8817122475196995, 1.5352597835010118e-12, 0.4952330115902502, 0.904431377538006]

5173
[0.6666666666666666, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.6666666666666666, 0.6666666666666666]
[0.7473021918305992, 3.137143608036445e-05, 0.7138099644123547, 0.7598356855903131, 5.40614986084

In [104]:
# Pretty-print one raw image-dataset record (key '14044') for manual inspection.
pprint(img_dataset['14044'])

{'A': ['"Military and ordinary working people are carved on the building that '
       'is part of the Quezon City Hall complex."',
       '"Figures are carved on the building."',
       '"A mural is carved on the building that is part of the Quezon City '
       'Hall complex."',
       '"Images of people are carved on the building that is part of the '
       'Quezon City Hall complex."',
       '"A group of people are gathered with a flag in the background, which '
       'is what is carved on the building that is part of the Quezon City Hall '
       'complex."',
       '"Figures of people are carved on the building."'],
 'Guid': 'dc63e242d46f11ebba07a504a20e7724',
 'Keywords_A': '"A military and ordinary working people."',
 'Q': '"What is carved on the building that is part of the Quezon City Hall '
      'complex?"',
 'Qcate': 'Others',
 'Qtype': 'img-Singlehop',
 'img_negFacts': [{'caption': 'City centre 05',
                   'image_id': '10239772',
                   'imgUrl'

In [105]:
# Per test split: distribution of how many answers of a sample have keyword
# recall > 0.5, followed by the same distribution for recall >= 0.3.
for split in ('ind_test', 'ood_test'):
    recalls_per_sample = [np.array(RE[k]) for k in RE if img_dataset[k]['split'] == split]
    print(Counter([np.sum(recalls > 0.5) for recalls in recalls_per_sample]))
    print(Counter([np.sum(recalls >= 0.3) for recalls in recalls_per_sample]))

Counter({6: 769, 5: 538, 4: 486, 3: 288, 2: 186, 1: 178, 0: 40})
Counter({6: 963, 5: 514, 4: 464, 3: 273, 2: 146, 1: 100, 0: 25})
Counter({6: 1313, 5: 759, 4: 714, 3: 458, 1: 327, 2: 308, 0: 69})
Counter({6: 1580, 5: 782, 4: 707, 3: 400, 2: 261, 1: 177, 0: 41})


In [106]:
# Keys of samples with fewer than 3 answers reaching keyword recall >= 0.3.
img_drop_k = []
for k, recalls in RE.items():
    if np.sum(np.array(recalls) >= 0.3) < 3:
        img_drop_k.append(k)
print(len(img_drop_k))

750


In [None]:
# NOTE(review): `RE_clean_img` is not defined in any visible cell (the dicts
# built in this notebook are named `RE_img_clean`, `mul_img_clean`, ...), so
# this never-executed scratch cell would raise NameError under Run All.
RE_clean_img

In [111]:
## Score every answer of a sample separately (leave-one-out), take avg/max
## afterwards. Full-sentence answers whose keyword recall is below
## RE_THRESHOLD are ignored, and samples left with fewer than MIN_KEPT
## answers are dropped entirely.
RE_THRESHOLD = 0.3
MIN_KEPT = 3
eval_f = Evaluate()
bleu4_img_clean = {}
RE_img_clean = {}
mul_img_clean = {}
drop = 0
for k in img_dataset:
    datum = img_dataset[k]
    if 'test' not in datum['split']: continue
    # Loop-local names are deliberately distinct from the notebook-level
    # Q / C / A_list / Keywords_A / scores variables used by other cells.
    keywords = datum['Keywords_A'].replace('"', "")
    answers = [a.replace('"', "") for a in datum['A']]
    kept_bleu4, kept_RE, kept_mul = [], [], []
    for i, answer in enumerate(answers):
        # Index 3 of the metrics tuple is the keyword recall (RE_avg).
        recall = compute_vqa_metrics([answer], keywords)[3]
        if recall < RE_THRESHOLD: continue
        # BLEU is only computed for answers that pass the recall filter.
        refs = answers[:i] + answers[i+1:]
        sample_scores = eval_f.evaluate(cand=[[answer]], ref=[refs], return_scores=True)
        kept_bleu4.append(sample_scores['Bleu_4'])
        kept_RE.append(recall)
        kept_mul.append(recall * sample_scores['Bleu_4'])
    if len(kept_RE) < MIN_KEPT:
        drop += 1
        continue
    bleu4_img_clean[k] = kept_bleu4
    RE_img_clean[k] = kept_RE
    mul_img_clean[k] = kept_mul
    # Progress is reported only right after a sample is kept, so the same
    # count is never printed twice.
    if len(RE_img_clean) % 500 == 499: print(len(RE_img_clean))
assert len(RE_img_clean) == len(mul_img_clean) == len(bleu4_img_clean)
print(len(RE_img_clean))
print(drop)

499
999
1499
1999
2499
2999
3499
3999
4499
4999
5499
5683
750


In [112]:
# 0.3 threshold
# Number of kept answers per sample, then each metric reduced per sample
# (max / mean) and averaged over the kept samples.
print(Counter(len(kept) for kept in RE_img_clean.values()))
for label, reducer, table in [
    ("max RE: ", max, RE_img_clean),
    ("max mul: ", max, mul_img_clean),
    ("max bleu4: ", max, bleu4_img_clean),
    ("mean RE: ", np.mean, RE_img_clean),
    ("mean mul: ", np.mean, mul_img_clean),
    ("mean bleu4: ", np.mean, bleu4_img_clean),
]:
    print(label, np.mean([reducer(values) for values in table.values()]))

Counter({6: 2543, 5: 1296, 4: 1171, 3: 673})
max RE:  0.9940494712742778
max mul:  0.9029879466059659
max bleu4:  0.9231728016666029
mean RE:  0.9533923859689158
mean mul:  0.6240687179944258
mean bleu4:  0.6504880563907047


In [114]:
## Score every answer of a sample separately (leave-one-out), take avg/max
## afterwards. Full-sentence answers whose keyword recall is below
## RE_THRESHOLD are ignored, and samples left with fewer than MIN_KEPT
## answers are dropped entirely.
RE_THRESHOLD = 0.5
MIN_KEPT = 3
eval_f = Evaluate()
bleu4_img_clean = {}
RE_img_clean = {}
mul_img_clean = {}
drop = 0
for k in img_dataset:
    datum = img_dataset[k]
    if 'test' not in datum['split']: continue
    # Loop-local names are deliberately distinct from the notebook-level
    # Q / C / A_list / Keywords_A / scores variables used by other cells.
    keywords = datum['Keywords_A'].replace('"', "")
    answers = [a.replace('"', "") for a in datum['A']]
    kept_bleu4, kept_RE, kept_mul = [], [], []
    for i, answer in enumerate(answers):
        # Index 3 of the metrics tuple is the keyword recall (RE_avg).
        recall = compute_vqa_metrics([answer], keywords)[3]
        if recall < RE_THRESHOLD: continue
        # BLEU is only computed for answers that pass the recall filter.
        refs = answers[:i] + answers[i+1:]
        sample_scores = eval_f.evaluate(cand=[[answer]], ref=[refs], return_scores=True)
        kept_bleu4.append(sample_scores['Bleu_4'])
        kept_RE.append(recall)
        kept_mul.append(recall * sample_scores['Bleu_4'])
    if len(kept_RE) < MIN_KEPT:
        drop += 1
        continue
    bleu4_img_clean[k] = kept_bleu4
    RE_img_clean[k] = kept_RE
    mul_img_clean[k] = kept_mul
    # Progress is reported only right after a sample is kept, so the same
    # count is never printed twice.
    if len(RE_img_clean) % 500 == 499: print(len(RE_img_clean))
assert len(RE_img_clean) == len(mul_img_clean) == len(bleu4_img_clean)
print(len(RE_img_clean))
print(drop)

499
999
1499
1999
2499
2999
3499
3999
4499
4499
4999
5499
5567
866


In [115]:
# 0.5 threshold
# Number of kept answers per sample, then each metric reduced per sample
# (max / mean) and averaged over the kept samples.
print(Counter(len(kept) for kept in RE_img_clean.values()))
for label, reducer, table in [
    ("max RE: ", max, RE_img_clean),
    ("max mul: ", max, mul_img_clean),
    ("max bleu4: ", max, bleu4_img_clean),
    ("mean RE: ", np.mean, RE_img_clean),
    ("mean mul: ", np.mean, mul_img_clean),
    ("mean bleu4: ", np.mean, bleu4_img_clean),
]:
    print(label, np.mean([reducer(values) for values in table.values()]))

Counter({6: 2377, 5: 1302, 4: 1184, 3: 704})
max RE:  0.9955401807341809
max mul:  0.9109116325133675
max bleu4:  0.9252774135697667
mean RE:  0.9667703676481375
mean mul:  0.6345987014313298
mean bleu4:  0.6541056214571918


In [130]:
# Mean of the per-sample best keyword recall, first for the 'ind_test' split
# and then for the 'ood_test' split.
for split in ('ind_test', 'ood_test'):
    best_recalls = [max(RE_img_clean[k]) for k in RE_img_clean if img_dataset[k]['split'] == split]
    print("max RE: ", np.mean(best_recalls))

max RE:  0.9968079922027291
max RE:  0.9947327477318657


In [127]:
## Text dataset: score every answer of a sample separately (leave-one-out),
## take avg/max afterwards. Full-sentence answers whose keyword recall is
## below RE_THRESHOLD are ignored, and samples left with fewer than MIN_KEPT
## answers are dropped entirely.
RE_THRESHOLD = 0.5
MIN_KEPT = 3
eval_f = Evaluate()
bleu4_txt_clean = {}
RE_txt_clean = {}
mul_txt_clean = {}
drop = 0
for k in txt_dataset:
    datum = txt_dataset[k]
    if 'test' not in datum['split']: continue
    # Loop-local names are deliberately distinct from the notebook-level
    # Q / C / A_list / Keywords_A / scores variables used by other cells.
    keywords = datum['Keywords_A'].replace('"', "")
    answers = [a.replace('"', "") for a in datum['A']]
    kept_bleu4, kept_RE, kept_mul = [], [], []
    for i, answer in enumerate(answers):
        # Index 3 of the metrics tuple is the keyword recall (RE_avg).
        recall = compute_vqa_metrics([answer], keywords)[3]
        if recall < RE_THRESHOLD: continue
        # BLEU is only computed for answers that pass the recall filter.
        refs = answers[:i] + answers[i+1:]
        sample_scores = eval_f.evaluate(cand=[[answer]], ref=[refs], return_scores=True)
        kept_bleu4.append(sample_scores['Bleu_4'])
        kept_RE.append(recall)
        kept_mul.append(recall * sample_scores['Bleu_4'])
    if len(kept_RE) < MIN_KEPT:
        drop += 1
        continue
    bleu4_txt_clean[k] = kept_bleu4
    RE_txt_clean[k] = kept_RE
    mul_txt_clean[k] = kept_mul
    # Progress is reported only right after a sample is kept, so the same
    # count is never printed twice.
    if len(RE_txt_clean) % 500 == 499: print(len(RE_txt_clean))
assert len(RE_txt_clean) == len(mul_txt_clean) == len(bleu4_txt_clean)
print(len(RE_txt_clean))
print(drop)

499
999
999
1499
1999
2499
2999
2999
3499
3499
3999
4076
619


In [128]:
# 0.5 threshold
# Number of kept answers per sample, then each metric reduced per sample
# (max / mean) and averaged over the kept samples.
print(Counter(len(kept) for kept in RE_txt_clean.values()))
for label, reducer, table in [
    ("max RE: ", max, RE_txt_clean),
    ("max mul: ", max, mul_txt_clean),
    ("max bleu4: ", max, bleu4_txt_clean),
    ("mean RE: ", np.mean, RE_txt_clean),
    ("mean mul: ", np.mean, mul_txt_clean),
    ("mean bleu4: ", np.mean, bleu4_txt_clean),
]:
    print(label, np.mean([reducer(values) for values in table.values()]))

Counter({6: 1366, 5: 1343, 4: 874, 3: 493})
max RE:  0.9828222069856472
max mul:  0.8212597143292607
max bleu4:  0.8480550016967062
mean RE:  0.9458907476823312
mean mul:  0.5068453831403538
mean bleu4:  0.5348509314403606


In [121]:
# 0.3 threshold
# NOTE(review): the saved output of this cell presumably comes from an earlier
# run in which RE_txt_clean was built with a 0.3 recall cut-off - confirm
# before re-running, since it reads whatever RE_txt_clean currently holds.
print(Counter(len(kept) for kept in RE_txt_clean.values()))
for label, reducer, table in [
    ("max RE: ", max, RE_txt_clean),
    ("max mul: ", max, mul_txt_clean),
    ("max bleu4: ", max, bleu4_txt_clean),
    ("mean RE: ", np.mean, RE_txt_clean),
    ("mean mul: ", np.mean, mul_txt_clean),
    ("mean bleu4: ", np.mean, bleu4_txt_clean),
]:
    print(label, np.mean([reducer(values) for values in table.values()]))

Counter({6: 1483, 5: 1386, 4: 865, 3: 449})
max RE:  0.9762826107052714
max mul:  0.8114693233465995
max bleu4:  0.8479138449716856
mean RE:  0.92778444095049
mean mul:  0.4953681230624369
mean bleu4:  0.5316556978374075


In [71]:
# Cache the per-sample text scores on disk; `with` closes (and flushes) the
# file handle deterministically.
# NOTE(review): RE_txt / mul_txt / bleu4_txt are not assigned in any visible
# cell of this notebook - the cell that computed them appears to be missing.
txt_human_scores = {'RE': RE_txt, 'mul': mul_txt, 'bleu4': bleu4_txt}
with open("./txt_human_scores.pkl", "wb") as f:
    pickle.dump(txt_human_scores, f)

In [77]:
# For each metric, reduce every text sample's list of per-answer scores with
# max / mean, then average that value over all samples.
for label, reducer, table in [
    ("max RE: ", max, RE_txt),
    ("max mul: ", max, mul_txt),
    ("max bleu4: ", max, bleu4_txt),
    ("mean RE: ", np.mean, RE_txt),
    ("mean mul: ", np.mean, mul_txt),
    ("mean bleu4: ", np.mean, bleu4_txt),
]:
    print(label, np.mean([reducer(values) for values in table.values()]))

max RE:  0.9419907587205651
max mul:  0.7568339273336601
max bleu4:  0.8467212422473528
mean RE:  0.7943510636147798
mean mul:  0.4195641452808087
mean bleu4:  0.5223123738189273


In [125]:
# Eyeball a random ~2% of the text samples that contain at least one answer
# whose keyword recall lies strictly between 0 and 0.3.
for k in RE_txt:
    if not any(0.0 < recall < 0.3 for recall in RE_txt[k]):
        continue
    if random.random() > 0.02:
        continue
    print(k)
    print(RE_txt[k])
    print(bleu4_txt[k])
    print()

3888
[0.36363636363636365, 0.2727272727272727, 0.5454545454545454, 0.36363636363636365, 0.45454545454545453, 0.36363636363636365]
[2.1249835507091168e-05, 3.006454568869563e-05, 0.25802943580872184, 4.397856524709696e-09, 0.8282477530793885, 0.7577395671888485]

1656
[0.14285714285714285, 0.8571428571428571, 0.8571428571428571, 1.0, 0.8571428571428571, 0.8571428571428571]
[1.3939047821026512e-11, 0.9999999998101192, 0.8515139815352575, 0.7307717332579815, 0.9255653651854665, 0.6774689750586048]

4598
[0, 0.09090909090909091, 1.0, 0, 0.18181818181818182]
[3.455144512702623e-14, 0.8648454149611199, 0.3082627645885311, 0.4503303523424431, 0.4785543920365968]



In [126]:
# Pretty-print one raw text-dataset record (key '4598') for manual inspection.
pprint(txt_dataset['4598'])

{'A': ['Descents.',
       '"Anyone directly descended from original tribal enrollees can be part '
       'of a lineage organization."',
       '"Descendants of a particular person or group of people of historical '
       'importance can be part of a lineage organization."',
       '"Anyone directly descended from original tribal enrollees could be '
       'eligible for tribal enrollment."',
       '"People descended from the organization\'s past members can be part of '
       'a lineage organization."'],
 'BucketId': 1705667512,
 'CuratedAnswer': [],
 'DistractorImageIds': ['https://media.gettyimages.com/photos/american-flag-waving-with-the-capitol-hill-picture-id1154438278?b=1&k=6&m=1154438278&s=612x612&w=0&h=QG27Vppr-nBdx9F5YP_iPpElJqmd4quyHSMUeZ2CMVU=',
                        'https://media.gettyimages.com/photos/aerial-view-of-beirut-lebanon-city-of-beirut-picture-id635844142?b=1&k=6&m=635844142&s=612x612&w=0&h=KGQjPtUk8TY3THB6VrNmjlNW8MFwWZOWyhCRjoRV_Ck=',
                  

In [78]:
# For the text 'test' split: distribution of how many answers of a sample
# exceed a keyword recall of 0.75, then of 0.5.
for cutoff in (0.75, 0.5):
    print(Counter([np.sum(np.array(RE_txt[k]) > cutoff) for k in RE_txt if txt_dataset[k]['split'] == 'test']))

Counter({5: 1167, 6: 962, 4: 937, 3: 582, 0: 408, 2: 362, 1: 277})
Counter({5: 1229, 6: 1099, 4: 941, 3: 575, 2: 330, 0: 285, 1: 236})


In [14]:
# Corpus-level BLEU / ROUGE-L over all held-out answers at once.
# Expects C to hold one single-candidate list per held-out answer and A_list
# the matching list of reference answers (Evaluate.evaluate reads cand[i][0]
# and ref[i]); `scores` is reused by the "RE * BLEU4" print further down.
eval_f = Evaluate()
scores = eval_f.evaluate(cand=C, ref=A_list, return_scores=True)

{'testlen': 548703, 'reflen': 553081, 'guess': [548703, 510365, 472027, 433689], 'correct': [482229, 397449, 332196, 279545]}
ratio: 0.9920843420764751
Bleu_1
Bleu_2
Bleu_3
Bleu_4
ROUGE_L
Bleu_1:	 0.8718681923526563
Bleu_2:	 0.8207163809672817
Bleu_3:	 0.7776476582958962
Bleu_4:	 0.7405240931886949
ROUGE_L: 0.775898078614296


In [15]:
# SQuAD style vqa eval: EM, F1
# Every held-out answer (exactly one candidate per entry of C) is compared
# with its keyword answer; each metric is then averaged over all entries.
F1_avg_scores, F1_max_scores, EM_scores, RE_scores, PR_scores = [], [], [], [], []
metric_columns = (F1_avg_scores, F1_max_scores, EM_scores, RE_scores, PR_scores)
for cands, a in zip(C, Keywords_A):
    assert len(cands)==1
    # compute_vqa_metrics yields (F1_avg, F1_max, EM, RE_avg, PR_avg), in the
    # same order as metric_columns.
    for column, value in zip(metric_columns, compute_vqa_metrics([cands[0]], a)):
        column.append(value)

F1_avg, F1_max, EM, RE_avg, PR_avg = [np.mean(column) for column in metric_columns]

for template, value in (
    ("F1_avg = {}", F1_avg),
    ("F1_max = {}", F1_max),
    ("EM = {}", EM),
    ("RE_avg = {}", RE_avg),
    ("PR_avg = {}", PR_avg),
):
    print(template.format(value))

# Combined score: mean keyword recall times the corpus-level BLEU-4 held in `scores`.
print("RE * BLEU4 = {}".format(RE_avg * scores['Bleu_4']))

F1_avg = 0.18892881294338235
F1_max = 0.18892881294338235
EM = 0.00013041890552454484
RE_avg = 0.7355976452156661
PR_avg = 0.11738792355336282
RE * BLEU4 = 0.5447277791750704


In [19]:
# Flatten the text-based test samples into parallel lists, one entry per
# held-out answer: the answer itself is the candidate (C) and the sample's
# remaining answers are its references (A_list).
Q, A_list, C, Keywords_A = [], [], [], []
for datum in txt_dataset.values():
    if 'test' not in datum['split']:
        continue
    question = datum['Q'].replace('"', "")
    keywords = datum['Keywords_A'].replace('"', "")
    answers = [ans.replace('"', "") for ans in datum['A']]
    for held_out, answer in enumerate(answers):
        Q.append(question)
        C.append([answer])
        A_list.append(answers[:held_out] + answers[held_out + 1:])
        Keywords_A.append(keywords)
assert len(C) == len(Q) == len(A_list) == len(Keywords_A)
print(len(Q))

25015


In [20]:
# Corpus-level BLEU / ROUGE-L over all held-out answers at once.
# Expects C to hold one single-candidate list per held-out answer and A_list
# the matching list of reference answers (Evaluate.evaluate reads cand[i][0]
# and ref[i]); `scores` is reused by the "RE * BLEU4" print further down.
eval_f = Evaluate()
scores = eval_f.evaluate(cand=C, ref=A_list, return_scores=True)

{'testlen': 365753, 'reflen': 368388, 'guess': [365753, 340738, 317201, 294375], 'correct': [302169, 244036, 202037, 168574]}
ratio: 0.9928472154358965
Bleu_1
Bleu_2
Bleu_3
Bleu_4
ROUGE_L
Bleu_1:	 0.820225403511018
Bleu_2:	 0.7636931139008422
Bleu_3:	 0.7171364099147381
Bleu_4:	 0.6766927878708912
ROUGE_L: 0.6734202548341871


In [21]:
# SQuAD style vqa eval: EM, F1
# Every held-out answer (exactly one candidate per entry of C) is compared
# with its keyword answer; each metric is then averaged over all entries.
F1_avg_scores, F1_max_scores, EM_scores, RE_scores, PR_scores = [], [], [], [], []
metric_columns = (F1_avg_scores, F1_max_scores, EM_scores, RE_scores, PR_scores)
for cands, a in zip(C, Keywords_A):
    assert len(cands)==1
    # compute_vqa_metrics yields (F1_avg, F1_max, EM, RE_avg, PR_avg), in the
    # same order as metric_columns.
    for column, value in zip(metric_columns, compute_vqa_metrics([cands[0]], a)):
        column.append(value)

F1_avg, F1_max, EM, RE_avg, PR_avg = [np.mean(column) for column in metric_columns]

for template, value in (
    ("F1_avg = {}", F1_avg),
    ("F1_max = {}", F1_max),
    ("EM = {}", EM),
    ("RE_avg = {}", RE_avg),
    ("PR_avg = {}", PR_avg),
):
    print(template.format(value))

# Combined score: mean keyword recall times the corpus-level BLEU-4 held in `scores`.
print("RE * BLEU4 = {}".format(RE_avg * scores['Bleu_4']))

F1_avg = 0.2933705406243417
F1_max = 0.2933705406243417
EM = 0.07567459524285429
RE_avg = 0.7901031055052431
PR_avg = 0.22620708177082618
RE * BLEU4 = 0.5346570731697918
