In [1]:
import os
import json, time, copy
import math

import random
import pickle
import numpy as np

from pycocoevalcap.spice.spice import Spice
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.rouge.rouge import Rouge
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.cider.cider import Cider

import nltk
from nltk.corpus import stopwords

from word2number import w2n
import string, re
from collections import Counter, defaultdict
from pprint import pprint
import spacy
nlp = spacy.load("en_core_web_sm", disable=["ner","textcat","parser"])

In [342]:
#pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*|\(|\)|-')

In [2]:
def detectNum(l):
    """Return the integer-like items of ``l`` as canonical decimal strings.

    Every item is passed through ``int()``; items that convert are kept
    (re-rendered with ``str``, so ``"007"`` becomes ``"7"``) and items that
    do not convert are dropped. The relative order of kept items is preserved.

    Args:
        l: iterable of tokens (typically strings).

    Returns:
        list[str]: the integer tokens, in input order.
    """
    result = []
    for w in l:
        try:
            result.append(str(int(w)))
        except (ValueError, TypeError, OverflowError):
            # Not an integer token (e.g. "red", "3.5", None): skip it.
            # Only conversion failures are swallowed, so KeyboardInterrupt and
            # genuine programming errors still surface.
            continue
    return result

In [130]:
def toNum(word):
    """Convert a number word to its numeric value, leaving other words untouched.

    Args:
        word: a single token, e.g. ``"three"``.

    Returns:
        Whatever ``w2n.word_to_num`` returns for ``word`` when the conversion
        succeeds; otherwise ``word`` itself, unchanged.
    """
    # 'point' is returned as-is and never handed to w2n
    # (presumably because w2n treats it as a decimal separator -- TODO confirm).
    if word == 'point':
        return word
    try:
        return w2n.word_to_num(word)
    except Exception:
        # Best effort: any conversion failure means "not a number word".
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return word

def normalize_text(s):
    """Normalize an answer string for token-level comparison.

    Pipeline: lowercase -> strip punctuation (a '.' survives only when it is
    directly followed by a digit) -> drop the articles a/an/the -> collapse
    whitespace while mapping each token through ``toNum`` -> lemmatize with
    the module-level spaCy pipeline ``nlp``.

    Two short-input exceptions:
      * a single character (after stripping) is only lowercased and passed
        through the whitespace/number step, so a lone article or punctuation
        mark is kept;
      * a single word keeps its article (no article removal).
    """
    stripped = s.strip()
    lowered = s.lower()

    def to_digit_tokens(text):
        # Collapses runs of whitespace and converts number words to digits.
        return " ".join([str(toNum(tok)) for tok in text.split()])

    if len(stripped) == 1:
        # Single character: accept it even if it is an article or punctuation.
        return to_digit_tokens(lowered)

    # Remove every punctuation character except '.', then remove each '.'
    # that is not a decimal point.
    banned = set(string.punctuation) - set(['.'])
    kept_chars = "".join(ch for ch in lowered if ch not in banned)
    cleaned = re.sub(r"\.(?!\d)", "", kept_chars)

    if len(stripped.split()) != 1:
        # Articles are only stripped when the input is not a single word.
        article_re = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        cleaned = re.sub(article_re, " ", cleaned)

    doc = nlp(to_digit_tokens(cleaned))
    return " ".join([token.lemma_ for token in doc])

# Language eval with Caption metrics
class Evaluate(object):
    """Run a fixed set of caption-style scorers over references and candidates.

    ``self.scorers`` is a list of ``(scorer, metric)`` pairs where ``metric``
    is either a single metric name or a list of names (for scorers that return
    one score per name, such as BLEU-1..4).
    """

    def __init__(self):
        self.scorers = [
            (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
            #(Meteor(), "METEOR"),
            (Rouge(), "ROUGE_L"),
            #(Cider(), "CIDEr"),
            #(Spice(), "Spice")
        ]

    def score(self, ref, hypo):
        """Compute every configured metric.

        Args:
            ref: dict mapping sample id -> list of reference strings.
            hypo: dict mapping sample id -> list of candidate strings.

        Returns:
            dict mapping metric name -> corpus-level score.
        """
        final_scores = {}
        for scorer, method in self.scorers:
            # Scorers registered with a list of metric names are called with
            # verbose=0; the others are called without that keyword.
            if isinstance(method, list):
                score, _ = scorer.compute_score(ref, hypo, verbose=0)
            else:
                score, _ = scorer.compute_score(ref, hypo)

            if isinstance(score, list):
                # One score per metric name, in order.
                for name, value in zip(method, score):
                    final_scores[name] = value
            else:
                final_scores[method] = score
        return final_scores

    def evaluate(self, return_scores=False, **kwargs):
        """Score candidates against references.

        Keyword Args:
            ref: sequence where ``ref[i]`` is the list of references of sample i.
            cand: sequence where ``cand[i]`` is the candidate list of sample i.
                Only the first ``len(cand)`` entries of ``ref`` are used.
            return_scores: when True, return the metric dict.

        Returns:
            The dict produced by :meth:`score` if ``return_scores`` is True,
            otherwise ``None``.
        """
        ans = kwargs.pop('ref', {})  # support a list of references
        cand = kwargs.pop('cand', {})  # only support one cand per sample, but the input cand has size batch_size x K

        hypo = {}
        ref = {}
        # Re-key both inputs by sample index, the format `score` passes on to the scorers.
        for i in range(len(cand)):
            hypo[i] = cand[i]
            ref[i] = ans[i]

        final_scores = self.score(ref, hypo)

        if return_scores:
            return final_scores
        return None

In [4]:
# VQA Eval (SQuAD style EM, F1)
def compute_vqa_metrics(cands, a, exclude="", domain=None):
    """Token-overlap metrics between candidate answers and a keyword answer.

    Args:
        cands: list of candidate answer strings.
        a: reference (keyword) answer string.
        exclude: string whose normalized tokens are removed from each candidate.
        domain: optional set of allowed tokens.
            * ``None``: all tokens are compared.
            * ``{"NUMBER"}``: only the candidate's integer tokens (via
              ``detectNum``) are kept; the reference tokens are not filtered.
            * any other set: candidate and reference are both reduced to the
              tokens they share with ``domain`` (duplicates collapse).

    Returns:
        tuple ``(F1_avg, F1_max, EM, RE_avg, PR_avg)``. ``EM`` is 1 if at least
        one candidate's token list equals the reference's token list, else 0.
        An empty ``cands`` yields ``(0, 0, 0, 0, 0)``.
    """
    # Always return a 5-tuple: callers unpack five values.
    if len(cands) == 0: return (0, 0, 0, 0, 0)

    number_domain = domain == {"NUMBER"}
    token_domain = domain is not None and not number_domain

    bow_a = normalize_text(a).split()
    if token_domain:
        # Filter the reference once; sorting makes the exact-match comparison
        # below independent of set iteration order.
        bow_a = sorted(domain.intersection(bow_a))
    count_a = Counter(bow_a)

    excluded = set(normalize_text(exclude).split())

    F1 = []
    EM = 0
    RE = []
    PR = []
    for c in cands:
        bow_c = [w for w in normalize_text(c).split() if w not in excluded]
        if number_domain:
            bow_c = detectNum(bow_c)
        elif token_domain:
            bow_c = sorted(domain.intersection(bow_c))

        common = count_a & Counter(bow_c)
        num_same = sum(common.values())
        if num_same == 0:
            # No overlap: this candidate scores zero, but the remaining
            # candidates must still be evaluated (a single zero-overlap
            # candidate must not wipe out the whole result).
            F1.append(0.0)
            RE.append(0.0)
            PR.append(0.0)
            continue

        if bow_c == bow_a:
            EM = 1
        precision = 1.0 * num_same / len(bow_c)
        recall = 1.0 * num_same / len(bow_a)
        RE.append(recall)
        PR.append(precision)

        # The small epsilon keeps the denominator strictly positive.
        f1 = 2*precision*recall / (precision + recall + 1e-5)
        F1.append(f1)

    PR_avg = np.mean(PR)
    RE_avg = np.mean(RE)
    F1_avg = np.mean(F1)
    F1_max = np.max(F1)
    return (F1_avg, F1_max, EM, RE_avg, PR_avg)

In [65]:
# Load the text-based and image-based datasets, then print split / question-category statistics.
# `with` closes each file handle as soon as the JSON has been parsed.
with open("/home/yingshac/CYS/WebQnA/WebQnA_data_new/txt_dataset_0820_addKA.json", "r") as f:
    txt_dataset = json.load(f)
with open("/home/yingshac/CYS/WebQnA/WebQnA_data_new/img_dataset_0819_16neg.json", "r") as f:
    img_dataset = json.load(f)

print(Counter([txt_dataset[k]['split'] for k in txt_dataset]))
print(len(set([txt_dataset[k]['Guid'] for k in txt_dataset])))

print(Counter([img_dataset[k]['split'] for k in img_dataset]))
print(Counter([img_dataset[k]['Qcate'] for k in img_dataset]))
print(Counter([img_dataset[k]['Qcate'] for k in img_dataset if img_dataset[k]['split'] == 'ood_test']))
print(len(set([img_dataset[k]['Guid'] for k in img_dataset])))

Counter({'train': 17812, 'test': 4695, 'val': 2455})
24962
Counter({'train': 16448, 'ood_test': 3948, 'val': 2511, 'ind_test': 2485})
Counter({'YesNo': 8410, 'Others': 6689, 'choose': 5226, 'number': 2337, 'color': 2068, 'shape': 662})
Counter({'Others': 1284, 'YesNo': 1098, 'choose': 1010, 'color': 239, 'number': 220, 'shape': 97})
25392


In [66]:
# Histogram of the number of answers per test sample: text dataset first, then image dataset
# (the image dataset counts every split whose name contains 'test').
x = [len(txt_dataset[k]['A']) for k in txt_dataset if txt_dataset[k]['split'] == 'test']
print(Counter(x))
x = [len(img_dataset[k]['A']) for k in img_dataset if 'test' in img_dataset[k]['split']]
print(Counter(x))

Counter({6: 2476, 5: 1500, 4: 527, 3: 167, 2: 25})


Counter({6: 6242, 5: 139, 4: 36, 3: 15, 2: 1})


In [6]:
'''
Q = []
A_list = []
C = []
Keywords_A = []
for k in img_dataset:
    if not 'test' in img_dataset[k]['split']: continue
    datum = img_dataset[k]
    all_A = [a.replace('"', "") for a in datum['A']]
    for i in range(len(all_A)):
        Q.append(datum['Q'].replace('"', ""))
        C.append([all_A[i]])
        A_list.append(all_A[:i] + all_A[i+1:])
        Keywords_A.append(datum['Keywords_A'].replace('"', ""))
assert len(C) == len(Q) == len(A_list) == len(Keywords_A)
print(len(Q))
'''

38338


In [102]:
# Print a random ~5% of the samples that contain at least one answer with recall strictly
# between 0 and 0.5, together with the per-answer BLEU-4 scores.
# NOTE(review): `RE` and `bleu4` are not defined in any cell visible here -- this cell
# relies on kernel state from an earlier session.
for k in RE:
    has_low_recall = any(0.0 < i < 0.5 for i in RE[k])
    # random.random() is only drawn for samples that have such an answer.
    if has_low_recall and random.random() <= 0.05:
        print(k)
        print(RE[k])
        #print(sorted(mul[k], reverse=True))
        print(bleu4[k])
        print()

7076
[1.0, 0.42857142857142855, 0.42857142857142855, 0.2857142857142857, 0.2857142857142857, 0.42857142857142855]
[0.7739321540095442, 0.2863070881368244, 0.7561289225233876, 0.6687403048963458, 5.233846518568202e-05, 0.5114432342517753]

7986
[1.0, 0, 0.2, 0.4, 0, 0]
[0.4692470063653536, 0.5410822689681074, 0.47987820661783703, 0.32466791540375595, 0.5329462626443542, 0.5372849657937071]

3774
[1.0, 0, 0.25, 0, 0.5, 0.25]
[0.7138957846600729, 0.8313539763197327, 0.7765453554362727, 0.36336981878206925, 0.7307717332985799, 4.887406509299109e-05]

4418
[0.3333333333333333, 0.3333333333333333, 0, 1.0, 0.6666666666666666, 0.6666666666666666]
[0.38141656158453924, 1.0294994182935423e-08, 0.8817122475196995, 1.5352597835010118e-12, 0.4952330115902502, 0.904431377538006]

5173
[0.6666666666666666, 0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.6666666666666666, 0.6666666666666666]
[0.7473021918305992, 3.137143608036445e-05, 0.7138099644123547, 0.7598356855903131, 5.40614986084

In [68]:
# Load the same two files as `txt_dataset` / `img_dataset` under version-specific names.
# `with` closes each file handle as soon as the JSON has been parsed.
with open("/home/yingshac/CYS/WebQnA/WebQnA_data_new/txt_dataset_0820_addKA.json", "r") as f:
    txt_dataset_0820_addKA = json.load(f)
with open("/home/yingshac/CYS/WebQnA/WebQnA_data_new/img_dataset_0819_16neg.json", "r") as f:
    img_dataset_0819_16neg = json.load(f)

In [69]:
# Build the closed vocabulary of colors: every single-word normalized keyword answer
# of a 'color' question.
color_counter = Counter(
    normalize_text(datum['Keywords_A'].replace('"', ''))
    for datum in img_dataset_0819_16neg.values()
    if datum['Qcate'] == 'color'
)
d = defaultdict(list)
for a in color_counter:
    if len(a.split()) == 1:
        d[a].append(a)
'''
for a in color_counter.keys():
    if len(a.split()) > 1:
        assigned = False
        for w in a.split():
            for single_color in d:
                if single_color in w:
                    d[single_color].append(a)
                    assigned = True
        if not assigned:
            print(a)
for single_color in d:
    print(single_color, len(d[single_color]))
#pprint(d)
'''
color_set = set(d)
print(color_set)

{'orangebrown', 'spot', 'yellow', 'blue', 'rainbow', 'ivory', 'brown', 'gray', 'teal', 'bluewhite', 'orangepurple', 'black', 'white', 'gold', 'redorange', 'pink', 'blonde', 'tan', 'turquoise', 'grey', 'beige', 'golden', 'orange', 'bronze', 'maroon', 'purple', 'bluere', 'red', 'rust', 'violet', 'transparent', 'yes', 'silver', 'chrome', 'green', 'aqua'}


In [70]:
# Build the closed vocabulary of shapes: every single-word normalized keyword answer
# of a 'shape' question, normalized once more and split into tokens.
shape_counter = Counter(
    normalize_text(datum['Keywords_A'].replace('"', ''))
    for datum in img_dataset_0819_16neg.values()
    if datum['Qcate'] == 'shape'
)
d = defaultdict(list)
for a in shape_counter:
    if len(a.split()) == 1:
        d[a].append(a)

shape_set = set(d)
shape_set = {tok for c in shape_set for tok in normalize_text(c).split()}
print(shape_set)

{'globular', 'octogon', 'ring', 'hoop', 'octagon', 'concave', 'flat', 'wavy', 'shamrock', 'cross', 'cylinder', 'cylindrical', 'pentagon', 'point', 'pyramidal', 'crescent', 'rectangular', 'hook', 'tube', 'cone', 'bell', 'spiral', 'ball', 'convex', 'square', 'arch', 'h', 'cuboid', 'step', 'rectangle', 'dot', 'oval', 'circle', 'star', 'crosse', 'crest', 'octagonal', 'cube', 'triangle', 'semicircle', 'domeshape', 'obelisk', 'corkscrew', 'curve', 'circular', 'xs', 'slope', 'pyramid', 'round', 'bow', 'straight', 'triangular', 'heart', 'fork', 'teardrop', 'fold', 'curl', 'spherical', 'diamond', 'keyhole', 'conical', 'dome', 'sphere', 'bellshaped', 'rounded', 'hexagon', 'flower', 'globe', 'torus'}


In [223]:
# Collect every single-word normalized keyword answer of a 'number' question.
number_counter = Counter(
    normalize_text(datum['Keywords_A'].replace('"', ''))
    for datum in img_dataset.values()
    if datum['Qcate'] == 'number'
)
d = defaultdict(list)
for a in number_counter:
    if len(a.split()) == 1:
        d[a].append(a)

number_set = set(d)
print(number_set)

{'19', '20', 'twice', '25', '1', '4', '50', '13', '6', '11', '16', '5', '17', 'oneway', '28', '10', '15', '2', '30', 'twentysix', '26', 'twentytwo', '8', '36', '12', '3', '14', '0', '22', '9', 'none', 'once', '7', '18', 'thirtynine', 'thirtythree'}


In [10]:
# Closed answer vocabulary for 'YesNo' questions.
yesno_set = {'yes', 'no'}

In [139]:
# Sanity check: call the Bleu(4) scorer directly with one short candidate and three
# references for sample id 1. The cell's value is the (score, scores) pair returned
# by compute_score.
scorer = Bleu(4)
scorer.compute_score({1: ['She was the goddess of fortune, luck, and fate, and could have bestowed either good or bad luck onto them.', 'Fortuna, a Roman god, would have influenced the lives of Romulus and Remus as they argued over where the exact position of Rome should be by bestowing good or bad luck onto people.', 'Fortuna, a Roman god, would have influenced the lives of Romulus and Remus as they argued over where the exact position of Rome should be by bestowing good or bad luck onto them.']},
                    {1: ['she could bestow good or bad luck onto people .']},
                     verbose=0)

([0.22072766465871999,
  0.18997212650349526,
  0.17075451058950142,
  0.1512476051989217],
 [[0.22072766465871999],
  [0.18997212650349526],
  [0.17075451058950142],
  [0.1512476051989217]])

In [107]:
# Sanity check of Evaluate.evaluate on two samples, each with several references.
# NOTE(review): in the visible notebook `eval_f` is only created by `eval_f = Evaluate()`
# in later cells, so on a fresh kernel this cell raises NameError unless one of them ran first.
eval_f.evaluate(cand=[["She could bestow good or bad luck onto people ."], ['The Secret Series']], 
                ref=[['She was the goddess of fortune, luck, and fate, and could have bestowed either good or bad luck onto them.', 'Fortuna, a Roman god, would have influenced the lives of Romulus and Remus as they argued over where the exact position of Rome should be by bestowing good or bad luck onto people.', 'Fortuna, a Roman god, would have influenced the lives of Romulus and Remus as they argued over where the exact position of Rome should be by bestowing good or bad luck onto them.'],
                     ["This Isn't What It Looks Like", "This Isn't What It Looks Like is the fourth book in The Secret Series, and Bosh was four years old when he started learning to dribble a basketball.", "This Isn't What It Looks Like is the same number position in the pentalogy  The Secret Series  as Bosh was years old when he started learning to dribble a basketball.", "This Isn't What It Looks Like is the same number position in the pentalogy The Secret Series as the age of Bosh when he started learning to dribble a basketball.", "This Isn't What It Looks Like is the same number in the pentalogy  The Secret Series  as Bosh's age when he started learning to dribble a basketball.", "This isn't What it Looks Like is the fourth book in the pentalogy The Secret Series which is the amount of years old that Bosh was when he started to learn to dribble a basketball."]], 
                return_scores=True)

{'Bleu_1': 0.28298418547295817,
 'Bleu_2': 0.23829400937956186,
 'Bleu_3': 0.21017598089731462,
 'Bleu_4': 0.17674481437878237,
 'ROUGE_L': 0.3018725297808242}

In [131]:
# Compare scores for the same three references when (1) the sample's candidate list holds a
# short and a long candidate, and (2) it holds only the long candidate.
# NOTE(review): the recorded BLEU values of call (1) equal those of the direct Bleu call on the
# short candidate alone, which suggests only the first candidate string is used for BLEU -- verify.
# NOTE(review): `eval_f` must already exist in the kernel (it is created in later cells).
print(eval_f.evaluate(cand=[['she could bestow good or bad luck onto people .', 'she was the goddess of fortune so she could bestow good or bad luck onto people .']], 
                ref=[['She was the goddess of fortune, luck, and fate, and could have bestowed either good or bad luck onto them.', 'Fortuna, a Roman god, would have influenced the lives of Romulus and Remus as they argued over where the exact position of Rome should be by bestowing good or bad luck onto people.', 'Fortuna, a Roman god, would have influenced the lives of Romulus and Remus as they argued over where the exact position of Rome should be by bestowing good or bad luck onto them.']],
                return_scores=True))
print(eval_f.evaluate(cand=[['she was the goddess of fortune so she could bestow good or bad luck onto people .']], 
                ref=[['She was the goddess of fortune, luck, and fate, and could have bestowed either good or bad luck onto them.', 'Fortuna, a Roman god, would have influenced the lives of Romulus and Remus as they argued over where the exact position of Rome should be by bestowing good or bad luck onto people.', 'Fortuna, a Roman god, would have influenced the lives of Romulus and Remus as they argued over where the exact position of Rome should be by bestowing good or bad luck onto them.']],
                return_scores=True))
#print(eval_f.evaluate(cand=[['The Secret Series']],
                #ref=[["This Isn't What It Looks Like", "The Secret Series basketball.", "This Isn't What It Looks Like is the same number position in the pentalogy  The Secret Series  as Bosh was years old when he started learning to dribble a basketball.", "This Isn't What It Looks Like is the same number position in the pentalogy The Secret Series as the age of Bosh when he started learning to dribble a basketball.", "This Isn't What It Looks Like is the same number in the pentalogy  The Secret Series  as Bosh's age when he started learning to dribble a basketball.", "This isn't What it Looks Like is the fourth book in the pentalogy The Secret Series which is the amount of years old that Bosh was when he started to learn to dribble a basketball."]], 
                #return_scores=True))

{'Bleu_1': 0.22072766465871999, 'Bleu_2': 0.18997212650349526, 'Bleu_3': 0.17075451058950142, 'Bleu_4': 0.1512476051989217, 'ROUGE_L': 0.37731958762886597}
{'Bleu_1': 0.49307260724963853, 'Bleu_2': 0.4252304590327199, 'Bleu_3': 0.36968266900993085, 'Bleu_4': 0.3086455873605777, 'ROUGE_L': 0.5327510917030567}


In [45]:
## Score each answer of a sample separately (leave-one-out against the sample's other answers);
## avg/max are taken afterwards. Full-sentence answers with RE < 0.3 are ignored.
eval_f = Evaluate()
# Per-sample lists, keyed by dataset key k, with one entry per KEPT answer.
bleu4_img_clean = {}
RE_img_clean = {}
F1_img_clean = {}
mul_img_clean = {}
keep_A_list = {}  # kept answers in their original (quoted) form
drop = defaultdict(int)  # number of dropped samples per question category
for k in img_dataset:
    # Only the out-of-distribution test split is cleaned here.
    if not 'ood_test' in img_dataset[k]['split']: continue
    bleu4_img_clean[k] = []
    RE_img_clean[k] = []
    mul_img_clean[k] = []
    F1_img_clean[k] = []
    keep_A_list[k] = []
    datum = img_dataset[k]
    # Double quotes are stripped before scoring.
    Keywords_A = datum['Keywords_A'].replace('"', "")
    all_A = [a.replace('"', "") for a in datum['A']]
    Qcate = img_dataset[k]['Qcate']
    for i in range(len(all_A)):
        Q = datum['Q'].replace('"', "")  # not used further in this cell
        C = [all_A[i]]  # the answer under test, as a one-element candidate list
        A_list = all_A[:i] + all_A[i+1:]  # all other answers act as references
        scores = eval_f.evaluate(cand=[C], ref=[A_list], return_scores=True)
        
        # Closed-domain categories restrict the token comparison to the category vocabulary;
        # the remaining categories compare all tokens against the keyword answer.
        if Qcate == 'color': F1_avg, F1_max, EM, RE_avg, PR_avg = compute_vqa_metrics(C, Keywords_A, "", color_set)
        elif Qcate == 'shape': F1_avg, F1_max, EM, RE_avg, PR_avg = compute_vqa_metrics(C, Keywords_A, "", shape_set)
        elif Qcate == 'YesNo': F1_avg, F1_max, EM, RE_avg, PR_avg = compute_vqa_metrics(C, Keywords_A, "", yesno_set)
        elif Qcate == 'number': F1_avg, F1_max, EM, RE_avg, PR_avg = compute_vqa_metrics(C, Keywords_A, "", {"NUMBER"})
        else: F1_avg, F1_max, EM, RE_avg, PR_avg = compute_vqa_metrics(C, Keywords_A)
        # Discard answers that recall less than 30% of the keyword answer ...
        if RE_avg<0.3: continue
        # ... and, for every category other than 'choose'/'Others', answers whose F1 is below 0.3.
        if not Qcate in ['choose', 'Others'] and F1_avg<0.3: continue
        keep_A_list[k].append(datum['A'][i])
        bleu4_img_clean[k].append(scores['Bleu_4'])
        RE_img_clean[k].append(RE_avg)
        F1_img_clean[k].append(F1_avg)
        # Combined score: recall * BLEU-4 for 'choose'/'Others', F1 * BLEU-4 otherwise.
        if Qcate in ['choose', 'Others']: mul_img_clean[k].append(RE_avg * scores['Bleu_4'])
        else: mul_img_clean[k].append(F1_avg * scores['Bleu_4'])
    # A sample with fewer than 3 surviving answers is dropped from the metric dicts.
    # keep_A_list[k] is left in place for dropped samples.
    if len(RE_img_clean[k]) < 3: 
        drop[Qcate] += 1
        del RE_img_clean[k]
        del mul_img_clean[k]
        del bleu4_img_clean[k]
        del F1_img_clean[k]
        
    # Progress indicator; the same count can be printed more than once when consecutive
    # samples are dropped while the number of kept samples is at ...499.
    if len(RE_img_clean) % 500 == 499: print(len(RE_img_clean))
assert len(RE_img_clean) == len(mul_img_clean) == len(bleu4_img_clean) == len(F1_img_clean)
print(len(RE_img_clean))
print(drop)
print(np.sum(list(drop.values())))

499
999
1499
1999
2499
2999
3464
defaultdict(<class 'int'>, {'Others': 226, 'YesNo': 163, 'shape': 35, 'color': 11, 'choose': 29, 'number': 20})
484


In [None]:
# Reference values pasted as a literal: these counts match the printed Qcate distribution of the
# 'ood_test' split from the dataset-loading cell, for comparison with the per-category drop counts.
Counter({'Others': 1284, 'YesNo': 1098, 'choose': 1010, 'color': 239, 'number': 220, 'shape': 97})

In [47]:
# Summary of the cleaned image test set: how many answers survive per sample, then the mean of
# the per-sample means of each metric -- first over all categories, then per category.
print(Counter([len(RE_img_clean[k]) for k in RE_img_clean]))
Qcate = ['choose', 'YesNo', 'Others', 'color', 'shape', 'number']


def _mean_of_sample_means(per_sample, keep_category):
    """Average the per-sample means of `per_sample` over samples whose Qcate passes `keep_category`."""
    return np.mean([
        np.mean(per_sample[k])
        for k in per_sample
        if keep_category(img_dataset[k]['Qcate'])
    ])


_metrics = [
    ("mean RE: ", RE_img_clean),
    ("mean F1: ", F1_img_clean),
    ("mean bleu4: ", bleu4_img_clean),
    ("mean mul: ", mul_img_clean),
]

for label, per_sample in _metrics:
    print(label, _mean_of_sample_means(per_sample, lambda c: c in Qcate))

for cate in Qcate:
    print("\n", cate)
    for label, per_sample in _metrics:
        print(label, _mean_of_sample_means(per_sample, lambda c: c == cate))

Counter({6: 1579, 5: 780, 4: 703, 3: 402})
mean RE:  0.9544662625724938
mean F1:  0.5658698487995043
mean bleu4:  0.6479121336748872
mean mul:  0.6188026913526662

 choose
mean RE:  0.9746595876568294
mean F1:  0.3082771627038374
mean bleu4:  0.672719762202811
mean mul:  0.6598229305976057

 YesNo
mean RE:  1.0
mean F1:  0.9996384943803229
mean bleu4:  0.6230182064208344
mean mul:  0.622805048916689

 Others
mean RE:  0.8799223590146941
mean F1:  0.24272256117956434
mean bleu4:  0.6551779675671641
mean mul:  0.5859068924615922

 color
mean RE:  0.9836866471734893
mean F1:  0.958286424484597
mean bleu4:  0.6194411349556342
mean mul:  0.5973031286096661

 shape
mean RE:  0.9821236559139784
mean F1:  0.9421993814777573
mean bleu4:  0.625958507539576
mean mul:  0.5892321497132155

 number
mean RE:  0.995
mean F1:  0.9469256561066419
mean bleu4:  0.6434361270098679
mean mul:  0.6065825418134638


In [50]:
### Save img_dataset with a cleaner testing set
# Work on a deep copy so `img_dataset` itself stays untouched:
#   - 'train' / 'val' samples are kept as they are,
#   - 'ind_test' samples are removed,
#   - remaining ('ood_test') samples are removed unless they survived cleaning; survivors keep
#     only their retained answers and are relabelled as split 'test'.
img_dataset_0823_clean_te = copy.deepcopy(img_dataset)
print(len(img_dataset_0823_clean_te))
print("before cleaning, #test = ", len([k for k in img_dataset_0823_clean_te if img_dataset_0823_clean_te[k]['split'] == 'ood_test']))
print("after cleaning, #test = ", len(RE_img_clean))
# Iterate over the original dict so that deleting from the copy is safe.
for k in img_dataset:
    if img_dataset_0823_clean_te[k]['split'] in ['val', 'train']: continue
    elif img_dataset_0823_clean_te[k]['split'] == 'ind_test':
        del img_dataset_0823_clean_te[k]
    elif not k in RE_img_clean:
        del img_dataset_0823_clean_te[k]
    else:
        img_dataset_0823_clean_te[k]['A'] = keep_A_list[k]
        img_dataset_0823_clean_te[k]['split'] = 'test'
print(len(img_dataset_0823_clean_te))
print("after cleaning, #test = ", len([k for k in img_dataset_0823_clean_te if img_dataset_0823_clean_te[k]['split'] == 'test']))
print(Counter([img_dataset_0823_clean_te[k]['split'] for k in img_dataset_0823_clean_te]))
# `with` guarantees the output is flushed and the handle closed before the file is read back.
with open("/home/yingshac/CYS/WebQnA/WebQnA_data_new/img_dataset_0823_clean_te.json", "w") as f:
    json.dump(img_dataset_0823_clean_te, f, indent=4)

25392
before cleaning, #test =  3948
after cleaning, #test =  3464
22423
after cleaning, #test =  3464
Counter({'train': 16448, 'test': 3464, 'val': 2511})


In [30]:
# Inspect kept 'YesNo' samples whose mean F1 is at most 0.95 (about 1% are randomly skipped):
# print the per-answer metrics followed by the question, answers and keyword answer.
for k in bleu4_img_clean:
    if img_dataset[k]['Qcate'] not in ['YesNo']:
        continue
    # random.random() is only drawn when the F1 test did not already skip the sample.
    if np.mean(F1_img_clean[k]) > 0.95 or random.random() > 0.99:
        continue
    print()
    for per_answer in (RE_img_clean[k], F1_img_clean[k], bleu4_img_clean[k], mul_img_clean[k]):
        print(per_answer)

    for field in ('Q', 'A', 'Keywords_A'):
        pprint(img_dataset[k][field])


[1.0, 1.0, 1.0]
[0.6666622222518517, 0.9999950000249999, 0.6666622222518517]
[0.7753470828296695, 0.17388520871679017, 0.9920942415543661]
[0.516894609255718, 0.1738843392950937, 0.661391751757899]
('"Is the Champion logo on the side of both the Bobby Isaac\'s No. 71 Dodge at '
 "the NASCAR Hall of Fame and the replica of Wendell Scott's No. 34 1962 "
 'Chevrolet?"')
['"Yes both the Bobby Isaac\'s No. 71 Dodge at the NASCAR Hall of Fame and the '
 "replica of Wendell Scott's No. 34 1962 Chevrolet has the Champion logo on "
 'its side."',
 '"Yes, the Champion logo is on the side of both vehicles."',
 '"The Champion logo on is not on the side of both the Bobby Isaac\'s No. 71 '
 "Dodge at the NASCAR Hall of Fame and the replica of Wendell Scott's No. 34 "
 '1962 Chevrolet"',
 '"Yes, the Champion logo is on the side of both the Bobby Isaac\'s No. 71 '
 "Dodge at the NASCAR Hall of Fame and the replica of Wendell Scott's No. 34 "
 '1962 Chevrolet."',
 '"The Champion logo is not on the sid

In [71]:
## Score each answer of a sample separately (leave-one-out against the sample's other answers);
## avg/max are taken afterwards. Full-sentence answers with low RE are ignored
## (the threshold used in this cell is 0.5).
eval_f = Evaluate()
# Per-sample lists, keyed by dataset key k, with one entry per KEPT answer.
bleu4_txt_clean = {}
RE_txt_clean = {}
mul_txt_clean = {}
keep_txt_A_list = {}  # kept answers in their original (quoted) form
drop = 0  # number of dropped samples
for k in txt_dataset:
    if not 'test' in txt_dataset[k]['split']: continue
    #if k in img_drop_k:
        #drop += 1
        #continue
    bleu4_txt_clean[k] = []
    RE_txt_clean[k] = []
    mul_txt_clean[k] = []
    keep_txt_A_list[k] = []
    datum = txt_dataset[k]
    # Double quotes are stripped before scoring.
    Keywords_A = datum['Keywords_A'].replace('"', "")
    all_A = [a.replace('"', "") for a in datum['A']]
    for i in range(len(all_A)):
        Q = datum['Q'].replace('"', "")  # not used further in this cell
        C = [all_A[i]]  # the answer under test, as a one-element candidate list
        A_list = all_A[:i] + all_A[i+1:]  # all other answers act as references
        scores = eval_f.evaluate(cand=[C], ref=[A_list], return_scores=True)
        #print(scores)
        
        F1_avg, F1_max, EM, RE_avg, PR_avg = compute_vqa_metrics(C, Keywords_A)
        # Discard answers that recall less than half of the keyword answer.
        if RE_avg<0.5: continue
        keep_txt_A_list[k].append(datum['A'][i])
        bleu4_txt_clean[k].append(scores['Bleu_4'])
        RE_txt_clean[k].append(RE_avg)
        mul_txt_clean[k].append(RE_avg * scores['Bleu_4'])
    # A sample with fewer than 3 surviving answers is dropped from the metric dicts.
    # keep_txt_A_list[k] is left in place for dropped samples.
    if len(RE_txt_clean[k]) < 3: 
        drop += 1
        del RE_txt_clean[k]
        del mul_txt_clean[k]
        del bleu4_txt_clean[k]
    # Progress indicator; the same count is printed again when a sample is dropped while the
    # number of kept samples is at ...499.
    if len(RE_txt_clean) % 500 == 499: print(len(RE_txt_clean))
assert len(RE_txt_clean) == len(mul_txt_clean) == len(bleu4_txt_clean)
print(len(RE_txt_clean))
print(drop)

499
999
999
1499
1999
2499
2999
2999
3499
3499
3999
4076
619


In [72]:
# 0.5 threshold
# Number of surviving answers per sample, then the mean of the per-sample means of each metric.
print(Counter(len(RE_txt_clean[k]) for k in RE_txt_clean))
for label, per_sample in (
    ("mean RE: ", RE_txt_clean),
    ("mean mul: ", mul_txt_clean),
    ("mean bleu4: ", bleu4_txt_clean),
):
    print(label, np.mean([np.mean(per_sample[k]) for k in per_sample]))

Counter({6: 1366, 5: 1343, 4: 874, 3: 493})
mean RE:  0.9458907476823312
mean mul:  0.5068453831403538
mean bleu4:  0.5348509314403606


In [363]:
# Inspect a random ~10% of the kept text samples whose mean recall is at most 0.9:
# print the per-answer metrics followed by the question, answers and keyword answer.
for k in bleu4_txt_clean:
    # random.random() is only drawn when the recall test did not already skip the sample.
    if np.mean(RE_txt_clean[k]) > 0.9 or random.random() > 0.1:
        continue
    print()
    for per_answer in (RE_txt_clean[k], bleu4_txt_clean[k], mul_txt_clean[k]):
        print(per_answer)

    for field in ('Q', 'A', 'Keywords_A'):
        pprint(txt_dataset[k][field])


[0.5, 1.0, 1.0, 0.5]
[0.3259481888646563, 0.8938651487393561, 0.6257642589497454, 0.828247753030668]
[0.16297409443232816, 0.8938651487393561, 0.6257642589497454, 0.414123876515334]
('Does the British basketball league system have less levels does the Spanish '
 'basketball league system?')
['No',
 '"The British basketball league system has levels 2 to 4 while the Spanish '
 'basketball league system has four levels."',
 '"No, the British basketball league system does not have fewer levels than '
 'the Spanish basketball league system."',
 '"The British basketball league system fewer levels than the Spanish '
 'basketball league system."',
 '"No, the British basketball league system does not have less levels than the '
 'Spanish basketball league system."']
'"fewer levels"'

[0.75, 0.75, 0.75, 0.75, 0.75]
[0.3640930238335333, 0.6803749332091918, 0.9457416089045797, 0.9457416089045797, 0.2180019395508693]
[0.27306976787514997, 0.5102811999068939, 0.7093062066784348, 0.7093062066784348,

 'augmented by software-powered display panels conveying information on '
 'display panels"',
 '"Electronic instrument clusters augment vehicle instruments via '
 'software-powered display panels, and an odometer is an example of such a '
 'vehicle instrument."',
 '"It\'s an example of an instrument."',
 '"An odometer or odograph is an example of vehicle instruments which have '
 'been augmented by software-powered display panels conveying information on '
 'display panels."',
 '"An odometer or odograph is an example of vehicle instruments"']
'"example of vehicle instruments"'

[1.0, 0.9230769230769231, 0.5384615384615384]
[0.5623413251690909, 0.8094889502425031, 0.4557357795787848]
[0.5623413251690909, 0.7472205694546183, 0.24539618900396104]
'Beira barb can be found feeding at what lake depths?'
['The redeye barb lives and feeds on the bottom as well as in the middle of '
 'the water column and at the surface - so, at all depths.',
 '"Beira barb can be found feeding at the bottom as 

In [73]:
### Save txt_dataset with a cleaner testing set
# Work on a deep copy so `txt_dataset` itself stays untouched: 'test' samples that did not
# survive cleaning are removed, survivors keep only their retained answers; every other split
# is left as it is.
txt_dataset_0823_clean_te = copy.deepcopy(txt_dataset)
print(len(txt_dataset_0823_clean_te))
print("before cleaning, #test = ", len([k for k in txt_dataset_0823_clean_te if txt_dataset_0823_clean_te[k]['split'] == 'test']))
print("after cleaning, #test = ", len(RE_txt_clean))
# Iterate over the original dict so that deleting from the copy is safe.
for k in txt_dataset:
    if not txt_dataset_0823_clean_te[k]['split'] == 'test': continue
    if not k in RE_txt_clean:
        del txt_dataset_0823_clean_te[k]
    else:
        txt_dataset_0823_clean_te[k]['A'] = keep_txt_A_list[k]
print(len(txt_dataset_0823_clean_te))
print("after cleaning, #test = ", len([k for k in txt_dataset_0823_clean_te if txt_dataset_0823_clean_te[k]['split'] == 'test']))
# `with` guarantees the output is flushed and the handle closed before the file is read back.
with open("/home/yingshac/CYS/WebQnA/WebQnA_data_new/txt_dataset_0823_clean_te.json", "w") as f:
    json.dump(txt_dataset_0823_clean_te, f, indent=4)
#json.dump(txt_dataset_0823_clean_te, open("/home/yingshac/CYS/WebQnA/WebQnA_data_new/txt_dataset_0823_clean_te_5.json", "w"), indent=4)

24962
before cleaning, #test =  4695
after cleaning, #test =  4076
24343
after cleaning, #test =  4076


### Double check clean_te version

In [81]:
# Reload both datasets from the "clean_te" JSON files. Note that this rebinds the
# notebook-level names `txt_dataset` and `img_dataset` to the loaded content.
# Context managers close each file right after parsing instead of leaking the handle.
with open("/home/yingshac/CYS/WebQnA/WebQnA_data_new/txt_dataset_0823_clean_te.json", "r") as in_f:
    txt_dataset = json.load(in_f)
with open("/home/yingshac/CYS/WebQnA/WebQnA_data_new/img_dataset_0823_clean_te.json", "r") as in_f:
    img_dataset = json.load(in_f)

In [82]:
# Histogram of how many answers ('A') each test-split sample of img_dataset has.
x = [len(datum['A']) for datum in img_dataset.values() if datum['split'] == 'test']
print(Counter(x))

Counter({6: 1579, 5: 780, 4: 703, 3: 402})


In [58]:
## Each answer of a sample is scored on its own; avg/max is taken afterwards.
## NOTE(review): the original note also says full sentences with RE<0.3 are ignored,
## but nothing in this cell filters on RE -- confirm whether that happened upstream.
### Double check on the clean_te version
eval_f = Evaluate()
bleu4_img_clean, RE_img_clean, F1_img_clean, mul_img_clean = {}, {}, {}, {}

# `drop` is only created and printed here; no line in this cell ever increments it.
drop = defaultdict(int)
for k in img_dataset:
    if 'test' not in img_dataset[k]['split']: continue
    datum = img_dataset[k]
    # One list per sample; each gets one entry per held-out answer below.
    for per_sample in (bleu4_img_clean, RE_img_clean, mul_img_clean, F1_img_clean):
        per_sample[k] = []
    Keywords_A = datum['Keywords_A'].replace('"', "")
    all_A = [a.replace('"', "") for a in datum['A']]
    Qcate = datum['Qcate']
    for i, held_out in enumerate(all_A):
        # Q is not consumed by the scoring below; it is still assigned as before.
        Q = datum['Q'].replace('"', "")
        # Leave-one-out: answer i is the candidate, the remaining answers are references.
        C = [held_out]
        A_list = all_A[:i] + all_A[i+1:]
        scores = eval_f.evaluate(cand=[C], ref=[A_list], return_scores=True)

        # Closed-domain categories pass two extra positional arguments ("" and the
        # category's answer domain); any other category uses the two-argument call.
        domain_args = {
            'color': ("", color_set),
            'shape': ("", shape_set),
            'YesNo': ("", yesno_set),
            'number': ("", {"NUMBER"}),
        }.get(Qcate, ())
        F1_avg, F1_max, EM, RE_avg, PR_avg = compute_vqa_metrics(C, Keywords_A, *domain_args)

        bleu4_img_clean[k].append(scores['Bleu_4'])
        RE_img_clean[k].append(RE_avg)
        F1_img_clean[k].append(F1_avg)
        # 'choose' / 'Others' weight BLEU-4 by RE; every other category weights it by F1.
        if Qcate in ['choose', 'Others']:
            mul_img_clean[k].append(RE_avg * scores['Bleu_4'])
        else:
            mul_img_clean[k].append(F1_avg * scores['Bleu_4'])

    # Progress report each time the number of processed samples reaches 499, 999, ...
    if len(RE_img_clean) % 500 == 499: print(len(RE_img_clean))
assert len(RE_img_clean) == len(mul_img_clean) == len(bleu4_img_clean) == len(F1_img_clean)
print(len(RE_img_clean))
print(drop)
print(np.sum(list(drop.values())))

499
999
1499
1999
2499
2999
3464
defaultdict(<class 'int'>, {})
0.0


In [59]:
# Number of per-answer scores stored for each sample.
print(Counter([len(RE_img_clean[k]) for k in RE_img_clean]))

# (label, per-sample score table) pairs, in the order they are reported.
metric_tables = (
    ("mean RE: ", RE_img_clean),
    ("mean F1: ", F1_img_clean),
    ("mean bleu4: ", bleu4_img_clean),
    ("mean mul: ", mul_img_clean),
)

# Overall numbers: mean over samples of each sample's mean score, restricted to
# the question categories listed in Qcate.
Qcate = ['choose', 'YesNo', 'Others', 'color', 'shape', 'number']
for label, table in metric_tables:
    print(label, np.mean([np.mean(table[k]) for k in table if img_dataset[k]['Qcate'] in Qcate]))

# Same statistics, broken down by question category.
Qcate = ['choose', 'YesNo', 'Others', 'color', 'shape', 'number']
for cate in Qcate:
    print("\n", cate)
    for label, table in metric_tables:
        print(label, np.mean([np.mean(table[k]) for k in table if img_dataset[k]['Qcate'] == cate]))

Counter({6: 1579, 5: 780, 4: 703, 3: 402})
mean RE:  0.9544662625724938
mean F1:  0.5658698487995043
mean bleu4:  0.6172481514538837
mean mul:  0.5891873785394158

 choose
mean RE:  0.9746595876568294
mean F1:  0.3082771627038374
mean bleu4:  0.6640435153229656
mean mul:  0.651500361127975

 YesNo
mean RE:  1.0
mean F1:  0.9996384943803229
mean bleu4:  0.5561091179381232
mean mul:  0.5559100812476657

 Others
mean RE:  0.8799223590146941
mean F1:  0.24272256117956434
mean bleu4:  0.6320506209199894
mean mul:  0.5653609569231084

 color
mean RE:  0.9836866471734893
mean F1:  0.958286424484597
mean bleu4:  0.6043261385509513
mean mul:  0.5827412047857489

 shape
mean RE:  0.9821236559139784
mean F1:  0.9421993814777573
mean bleu4:  0.5660267821267729
mean mul:  0.5332073612663406

 number
mean RE:  0.995
mean F1:  0.9469256561066419
mean bleu4:  0.6258465290872643
mean mul:  0.5898577775655651


In [78]:
# Histogram of how many answers ('A') each test-split sample of txt_dataset has.
x = [len(datum['A']) for datum in txt_dataset.values() if datum['split'] == 'test']
print(Counter(x))

Counter({6: 1366, 5: 1343, 4: 874, 3: 493})


In [79]:
## Each answer of a sample is scored on its own; avg/max is taken afterwards.
## NOTE(review): the original note also says full sentences with RE<0.5 are ignored,
## but nothing in this cell filters on RE -- confirm whether that happened upstream.
eval_f = Evaluate()
bleu4_txt_clean, RE_txt_clean, mul_txt_clean = {}, {}, {}
drop = 0  # only initialised and printed here; never updated in this cell
for k in txt_dataset:
    if 'test' not in txt_dataset[k]['split']: continue
    datum = txt_dataset[k]
    # One list per sample; each gets one entry per held-out answer below.
    for per_sample in (bleu4_txt_clean, RE_txt_clean, mul_txt_clean):
        per_sample[k] = []
    Keywords_A = datum['Keywords_A'].replace('"', "")
    all_A = [a.replace('"', "") for a in datum['A']]
    for i, held_out in enumerate(all_A):
        # Q is not consumed by the scoring below; it is still assigned as before.
        Q = datum['Q'].replace('"', "")
        # Leave-one-out: answer i is the candidate, the remaining answers are references.
        C = [held_out]
        A_list = all_A[:i] + all_A[i+1:]
        scores = eval_f.evaluate(cand=[C], ref=[A_list], return_scores=True)

        F1_avg, F1_max, EM, RE_avg, PR_avg = compute_vqa_metrics(C, Keywords_A)
        bleu4_txt_clean[k].append(scores['Bleu_4'])
        RE_txt_clean[k].append(RE_avg)
        # Combined score: RE weighted by BLEU-4.
        mul_txt_clean[k].append(RE_avg * scores['Bleu_4'])

    # Progress report each time the number of processed samples reaches 499, 999, ...
    if len(RE_txt_clean) % 500 == 499: print(len(RE_txt_clean))
assert len(RE_txt_clean) == len(mul_txt_clean) == len(bleu4_txt_clean)
print(len(RE_txt_clean))
print(drop)

499
999
1499
1999
2499
2999
3499
3999
4076
0


In [80]:
# 0.3 threshold
# NOTE(review): presumably the RE cut-off used when the cleaned test set was built -- confirm.
# Number of per-answer scores stored for each sample.
print(Counter([len(RE_txt_clean[k]) for k in RE_txt_clean]))
# Mean over samples of each sample's mean score, one line per metric table.
for label, table in (
    ("mean RE: ", RE_txt_clean),
    ("mean mul: ", mul_txt_clean),
    ("mean bleu4: ", bleu4_txt_clean),
):
    print(label, np.mean([np.mean(table[k]) for k in table]))

Counter({6: 1366, 5: 1343, 4: 874, 3: 493})
mean RE:  0.9458907476823312
mean mul:  0.4871889696253697
mean bleu4:  0.5139965782866891


### Draft zone

In [19]:
# Flatten the test split of txt_dataset into four parallel lists with one row per
# (sample, answer) pair: the question, the held-out answer wrapped in a list, the
# sample's remaining answers, and the sample's keyword answer.
Q, A_list, C, Keywords_A = [], [], [], []
for k in txt_dataset:
    if 'test' not in txt_dataset[k]['split']: continue
    datum = txt_dataset[k]
    all_A = [a.replace('"', "") for a in datum['A']]
    n_answers = len(all_A)
    # The question and keyword strings are identical for every answer of a sample.
    Q.extend(datum['Q'].replace('"', "") for _ in range(n_answers))
    # Leave-one-out: answer i is the candidate, all other answers are its references.
    C.extend([answer] for answer in all_A)
    A_list.extend(all_A[:i] + all_A[i+1:] for i in range(n_answers))
    Keywords_A.extend(datum['Keywords_A'].replace('"', "") for _ in range(n_answers))
assert len(C) == len(Q) == len(A_list) == len(Keywords_A)
print(len(Q))

25015


In [20]:
# Run the caption-metric evaluator once over all flattened rows: every entry of C
# is scored against the entry of A_list at the same position.
eval_f = Evaluate()
scores = eval_f.evaluate(
    cand=C,
    ref=A_list,
    return_scores=True,
)

{'testlen': 365753, 'reflen': 368388, 'guess': [365753, 340738, 317201, 294375], 'correct': [302169, 244036, 202037, 168574]}
ratio: 0.9928472154358965
Bleu_1
Bleu_2
Bleu_3
Bleu_4
ROUGE_L
Bleu_1:	 0.820225403511018
Bleu_2:	 0.7636931139008422
Bleu_3:	 0.7171364099147381
Bleu_4:	 0.6766927878708912
ROUGE_L: 0.6734202548341871


In [21]:
# SQuAD style vqa eval: EM, F1
F1_avg_scores, F1_max_scores, EM_scores, RE_scores, PR_scores = [], [], [], [], []
# Same order as the tuple returned by compute_vqa_metrics.
score_columns = (F1_avg_scores, F1_max_scores, EM_scores, RE_scores, PR_scores)
# (The BERTScore variants via compute_bertscore -- F1_avg_bertscore / F1_max_bertscore --
#  were already disabled in this cell and stay disabled.)
for cands, a in zip(C, Keywords_A):
    # Every row of C holds exactly one candidate answer.
    assert len(cands)==1
    F1_avg, F1_max, EM, RE_avg, PR_avg = compute_vqa_metrics([cands[0]], a)
    for column, value in zip(score_columns, (F1_avg, F1_max, EM, RE_avg, PR_avg)):
        column.append(value)

# Replace the per-row values with their means over all rows.
F1_avg, F1_max, EM, RE_avg, PR_avg = [np.mean(column) for column in score_columns]

for label, value in (
    ("F1_avg", F1_avg),
    ("F1_max", F1_max),
    ("EM", EM),
    ("RE_avg", RE_avg),
    ("PR_avg", PR_avg),
):
    print("{} = {}".format(label, value))

# Combined score: mean RE times the BLEU-4 held in the notebook-level `scores`.
print("RE * BLEU4 = {}".format(RE_avg * scores['Bleu_4']))

F1_avg = 0.2933705406243417
F1_max = 0.2933705406243417
EM = 0.07567459524285429
RE_avg = 0.7901031055052431
PR_avg = 0.22620708177082618
RE * BLEU4 = 0.5346570731697918
