In [522]:
import json, random, os
import numpy as np
from pprint import pprint
from collections import Counter, defaultdict
from bs4 import BeautifulSoup
import requests
import urllib.request
import re
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import pickle
import copy
import spacy
from spacy import displacy
from itertools import tee
import wikipedia
import pylcs
import string
np.set_printoptions(precision=4)

In [81]:
# Set of every ASCII punctuation character; subtracted from token sets below.
PUNCTUATIONS = set(string.punctuation)

# Alternation of NLTK's English stopwords, kept separate so the final regex is readable.
_stopword_alternation = r'|'.join(stopwords.words('english'))
# Matches a whole-word stopword (with any trailing whitespace), a parenthesis, or a hyphen.
pattern = re.compile(r'\b(' + _stopword_alternation + r')\b\s*|\(|\)|-')

In [4]:
# Image metadata, looked up later by stringified image id (`img_meta[str(int(gid))]`).
# Opened with a context manager so the file handle is closed right after parsing.
with open("/home/yingshac/CYS/WebQnA/WebQnA_data/img_metadata-Copy1.json", "r") as fin:
    img_meta = json.load(fin)
print(len(img_meta))

362367


In [504]:
# Image QA samples keyed by stringified index; the cells below read the
# 'Q', 'A', 'GoldIds' and 'Qcate' fields of each sample.
# Opened with a context manager so the file handle is closed right after parsing.
with open("/home/yingshac/CYS/WebQnA/WebQnA_data_new/dataset_J_split_update0721.json", "r") as fin:
    img_dataset = json.load(fin)
print(len(img_dataset))

25467


In [76]:
# Coarse spaCy part-of-speech tags (compared against `token.pos_`) that count as
# content words when sentences and answers are turned into word sets.
pos_list = ['NUM', 'NOUN', 'ADJ', 'PROPN']

In [6]:
# Shared spaCy English pipeline; the cells below rely on its POS tags, sentence
# boundaries, noun chunks and dependency labels.
nlp = spacy.load('en_core_web_sm')

In [113]:
def IoU(A, B):
    """Return the intersection-over-union of two sets, rounded to 2 decimals.

    A tiny epsilon is added to the denominator so that two empty sets yield
    0.0 instead of raising ZeroDivisionError.
    """
    shared = A.intersection(B)
    combined = A.union(B)
    ratio = len(shared) / (len(combined) + 1e-7)
    return round(ratio, 2)

In [110]:
def find_sentences_from_page_for_img_data(title, page, keywords, answerwords):
    """Mine candidate distractor sentences from one Wikipedia article.

    A sentence (or a pair of adjacent sentences) is kept when its content
    words overlap the question keywords clearly more than the answer words.

    Args:
        title: Wikipedia page title, fetched with `wikipedia.page`.
        page: URL stored verbatim under 'link' in every returned record.
        keywords: set of question words to overlap with.
        answerwords: set of answer words whose overlap is penalised.

    Returns:
        dict mapping sentence text to
        {'scores': (IoU_Q, IoU_A, IoU_Q - IoU_A), 'link': page, 'title': title}.
        Empty when the page cannot be fetched.
    """
    try:
        content = wikipedia.page(title, auto_suggest=False, redirect=True).content
    except Exception:
        # Best-effort mining: a page that cannot be fetched is skipped.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop a long run.
        return {}

    # Cut everything from the References heading onwards. str.find returns -1
    # when the heading is absent; slicing with -1 would drop the last character.
    references_start = content.find('== References ==')
    if references_start != -1:
        content = content[:references_start]
    paragraphs = content.split('\n')

    def _content_words(tokens):
        # Words with a content POS tag, or capitalised words not at a sentence start.
        return set(
            t.text for t in tokens
            if (t.pos_ in pos_list or ((not t.is_sent_start) and t.text[0].isupper()))
        )

    def _record(words):
        IoU_Q = IoU(words, keywords)
        IoU_A = IoU(words, answerwords)
        return IoU_Q, IoU_A, {'scores': (IoU_Q, IoU_A, IoU_Q - IoU_A), 'link': page, 'title': title}

    # Parse each sufficiently long paragraph once and reuse the sentence spans
    # in both passes (the parse is the expensive step).
    sentence_lists = [list(nlp(p).sents) for p in paragraphs if len(p.split()) >= 10]

    sen2score = {}

    # Pass 1: single sentences of at least 10 tokens.
    for sents in sentence_lists:
        for s in sents:
            if len(s) < 10:
                continue
            IoU_Q, IoU_A, record = _record(_content_words(s))
            # NOTE(review): strict `>` here but `>=` in the pair pass below;
            # kept as in the original, confirm which one is intended.
            if IoU_Q - IoU_A > 0.06:
                sen2score[s.text] = record

    # Pass 2: pairs of adjacent sentences, inserted after all single sentences
    # so the resulting dict keeps the original insertion order.
    for sents in sentence_lists:
        for s1, s2 in zip(sents, sents[1:]):
            pair_len = len(s1) + len(s2)
            if len(s1) < 5 or len(s2) < 5 or pair_len > 70 or pair_len < 10:
                continue
            IoU_Q, IoU_A, record = _record(_content_words([t for s in (s1, s2) for t in s]))
            if IoU_Q - IoU_A >= 0.06:
                sen2score[" ".join([s1.text, s2.text])] = record
    return sen2score

In [511]:
def get_keywords_from_img_sample(k):
    """Extract question keywords, answer words and noun chunks for sample `k`.

    Returns:
        (keywords, answerwords, Q, A, chunks) where Q and A are the cleaned
        question/answer strings, keywords/answerwords are sets holding the
        capitalised and lower-cased form of every selected word, and chunks
        is a set of noun-chunk strings taken from the question.
    """
    def _capital_inside_sentence(tok):
        # Capitalised token that is not the first token of its sentence.
        return (not tok.is_sent_start) and tok.text[0].isupper()

    def _with_both_casings(words):
        # Each word contributes its str.capitalize() and str.lower() form.
        expanded = set()
        for w in words:
            expanded.add(w.capitalize())
            expanded.add(w.lower())
        return expanded

    Q = img_dataset[str(k)]['Q'].replace('"', '').replace('_', ' ')
    question_doc = nlp(Q)
    question_tokens = [t for s in question_doc.sents for t in s]

    raw_keywords = {
        t.text for t in question_tokens
        if t.pos_ in ['NUM', 'PROPN', 'ADJ', 'NOUN'] or _capital_inside_sentence(t)
    }
    keywords = _with_both_casings(raw_keywords - PUNCTUATIONS)

    ### Extract noun chunks
    # Keep only chunks containing a number, proper noun, adjective or
    # mid-sentence capitalised word.
    proper_words = [
        t.text for t in question_tokens
        if t.pos_ in ['NUM', 'PROPN', 'ADJ'] or _capital_inside_sentence(t)
    ]
    chunks = {
        chunk.text for chunk in question_doc.noun_chunks
        if any(n in proper_words for n in chunk.text.split())
    }
    if not chunks:
        # Fallback: every noun chunk plus every proper / capitalised token.
        chunks = chunks.union(c.text for c in question_doc.noun_chunks)
        chunks = chunks.union(
            t.text for t in question_tokens
            if t.pos_ == 'PROPN' or _capital_inside_sentence(t)
        )

    A = img_dataset[str(k)]['A'].replace('"', '')
    answer_doc = nlp(A)
    raw_answerwords = {
        t.text for t in answer_doc
        if t.pos_ in pos_list or _capital_inside_sentence(t)
    }
    # Words already counted as question keywords are not answer words.
    raw_answerwords = raw_answerwords - keywords
    answerwords = _with_both_casings(raw_answerwords - PUNCTUATIONS)

    return keywords, answerwords, Q, A, chunks

In [464]:
# Scratch check: inspect spaCy's noun chunks and dependency parse for one question.
# `doc` is reused by the following cell.
doc = nlp("What colors are the Cheverny and Dicentra cucullaria flowers?")
for c in doc.noun_chunks:
    print(c.text)
# Draw the dependency tree inline.
displacy.render(doc, style='dep')

What colors
the Cheverny and Dicentra cucullaria flowers


In [465]:
# For every adjectival-modifier / compound token, print its index and its
# head's index, then the text spanning from the earlier to the later of the two.
for token in doc:
    if token.dep_ in ('amod', 'compound'):
        print(token.i, token.head.i)
        first, last = sorted((token.i, token.head.i))
        print(doc[first:last + 1].text)

7 8
cucullaria flowers


In [518]:
# Given an img_dataset index and its noun chunks, collect sentences from the
# candidate Wikipedia pages that overlap with the question.
def find_sentences_from_indx_for_img(k, keywords, answerwords, chunks):
    """Mine and rank distractor sentences for sample `k`.

    Returns:
        (sen2score, candidate_pages, updated_chunks): sen2score is ordered by
        the last entry of each record's 'scores' tuple, highest first; the
        other two values come straight from `noun_chunk2candidate_page`.
    """
    candidate_pages, updated_chunks = noun_chunk2candidate_page(chunks, k)
    #print("num of candidate pages = {}\n".format(len(candidate_pages)))

    mined = {}
    for title in candidate_pages:
        # Page URL: the title with whitespace runs replaced by underscores.
        page = "https://en.wikipedia.org/wiki/" + "_".join(title.split())
        mined.update(find_sentences_from_page_for_img_data(title, page, keywords, answerwords))

    def _margin(item):
        # item is (sentence, record); rank by the final score component.
        return item[1]['scores'][-1]

    ranked = sorted(mined.items(), key=_margin, reverse=True)
    return dict(ranked), candidate_pages, updated_chunks

In [490]:
def add_html_row_x_distractor_for_img(k, sen2score, word_lists, chunks, pages, colors=["(205, 245, 252)", "(255, 214, 222)"]):
    """Render one demo-table row (plus a separator row) for sample `k`.

    The left cell shows the question, its gold images with title and
    description, the answer and the candidate page list; the right cell lists
    the mined sentences. The first 10 sentences are visible, the rest are
    wrapped in hidden "hid" spans. Non-ASCII characters are emitted as XML
    character references.
    """
    sample = img_dataset[str(k)]

    def _mark(text):
        # Word highlighting only (no chunk underlining).
        return highlight_words(word_lists, [], colors, text)

    parts = ['<tr><td>{}.</td>'.format(k)]
    question = sample['Q'].replace('"', '')
    parts.append('<td>Q: {}<br><br>'.format(highlight_words(word_lists, chunks, colors, question)))
    answer = sample['A'].replace('"', '')

    for gid in sample['GoldIds']:
        img = img_meta[str(int(gid))]
        parts.append('<a href="{}" target="_blank"><img style="display:block; max-height:300px; max-width:100%;" src = "{}"></a>'.format(img['page'], img['src']))
        parts.append('<br>Title = {}<br>Description = {}<br><br>'.format(_mark(img['name'].replace("_", " ")), _mark(img['description'].replace("_", " "))))

    parts.append('A: {}<br><br>'.format(_mark(answer)))
    parts.append('<span class="hid" style="display: none"><b>Relevant Wikipedia Pages: </b>{}</span>'.format(', '.join(pages)))
    parts.append('<br><button onclick="btn_click($(this));">Toggle details</button></td><td>')

    # (sentence template, link template) for visible and hidden entries.
    visible_templates = (
        '{} --- {} ',
        '<a href="{}"  target="_blank"> {}</a><br><br>',
    )
    hidden_templates = (
        '<span class="hid" style="display: none">{} --- {} ',
        '<a href="{}"  target="_blank"> {}</a><br><br></span>',
    )
    for rank, (sentence, info) in enumerate(sen2score.items()):
        sentence_tpl, link_tpl = visible_templates if rank < 10 else hidden_templates
        parts.append(sentence_tpl.format(_mark(sentence), str(info['scores'])))
        parts.append(link_tpl.format(info['link'], info['title']))

    parts.append('</td></tr>')
    parts.append('<tr><td colspan=3><hr></td></tr>')
    return ''.join(parts).encode('ascii', 'xmlcharrefreplace').decode("utf-8")

In [354]:
def highlight_words(word_lists, chunks, colors, sentence):
    """Return `sentence` with chunks underlined and words colour-highlighted.

    Args:
        word_lists: iterable of word collections; the i-th collection is
            highlighted with the i-th entry of `colors`.
        chunks: collection of phrases wrapped in `<span class="chunk">`
            (together with their surrounding whitespace).
        colors: strings such as "(205, 245, 252)", inserted after `rgb`.
        sentence: plain text to decorate.

    Words and chunks are matched literally (regex metacharacters are escaped),
    empty strings are ignored, and longer alternatives are tried first so the
    result does not depend on set iteration order. Words are never matched
    inside `<span ...>` tags inserted by an earlier pass.
    """
    def _alternation(terms):
        # Longest first so "Six Flags" wins over "Six"; ties are alphabetical.
        ordered = sorted({t for t in terms if t}, key=lambda t: (-len(t), t))
        return '|'.join(re.escape(t) for t in ordered)

    def _sub_outside_markup(regex, repl, text):
        # re.split with a capturing group puts plain text at even indices
        # and the captured span tags at odd indices.
        pieces = re.split(r'(</?span[^>]*>)', text)
        for i in range(0, len(pieces), 2):
            pieces[i] = regex.sub(repl, pieces[i])
        return ''.join(pieces)

    s = sentence  # strings are immutable, no copy needed
    chunk_alternation = _alternation(chunks)
    if chunk_alternation:
        s = re.sub(
            r'\s*(' + chunk_alternation + r')\s*',
            lambda m: '<span class="chunk">{}</span>'.format(m.group()),
            s,
        )
    for word_list, color in zip(word_lists, colors):
        word_alternation = _alternation(word_list)
        if not word_alternation:
            continue
        word_regex = re.compile(r'\b(' + word_alternation + r')\b')
        s = _sub_outside_markup(
            word_regex,
            lambda m, color=color: '<span style="background-color:rgb{}">{}</span>'.format(color, m.group()),
            s,
        )
    return s

In [488]:
def noun_chunk2candidate_page(chunks, k):
    """Search Wikipedia for every chunk and return the candidate page titles.

    When fewer than 5 pages are found, the question of sample `k` is
    re-parsed for extra search phrases (proper / capitalised tokens and
    modifier-head spans) and their search results are added.

    Returns:
        (pages, chunks): the set of page titles and the (possibly enlarged)
        set of chunks that were searched.
    """
    pages = set()
    for chunk in chunks:
        pages |= set(wikipedia.search(chunk))

    if len(pages) >= 5:
        return pages, chunks

    # Too few candidates: derive additional search phrases from the question.
    print(k)
    Q = img_dataset[str(k)]['Q'].replace('"', '').replace('_', ' ')
    doc = nlp(Q)
    #more_chunks = set([c.text for c in doc.noun_chunks])
    more_chunks = {
        t.text for s in doc.sents for t in s
        if t.pos_ == 'PROPN' or ((not t.is_sent_start) and t.text[0].isupper())
    }
    for token in doc:
        if token.dep_ in ('amod', 'compound'):
            # Text span covering the modifier and its head, whichever comes first.
            first, last = sorted((token.i, token.head.i))
            more_chunks.add(doc[first:last + 1].text)
    more_chunks = more_chunks - chunks
    print(Q)
    print("More chunks: ", more_chunks)

    more_pages = set()
    for chunk in more_chunks:
        more_pages.update(wikipedia.search(chunk))
    print('add {} more chunks, {} more pages'.format(len(more_chunks), len(more_pages)))

    return pages.union(more_pages), chunks.union(more_chunks)

In [519]:
### Mining + Save as json
upd_img_data_path = "/home/yingshac/CYS/WebQnA/WebQnA_data_new/upd_img_data/upd_img_data_{}.json".format(1000)

# Resume from an earlier run when its output file exists and parses;
# otherwise start from scratch. Only I/O and JSON errors are treated as
# "no previous results", so Ctrl-C and programming errors still surface.
try:
    with open(upd_img_data_path, "r") as fin:
        upd_img_data = json.load(fin)
except (OSError, ValueError):
    upd_img_data = {}

for k in [678, 679, 680]: #random.sample(range(25467), 2):
    if str(k) in upd_img_data: continue  # already mined
    #if k%1 == 0: json.dump(upd_img_data, open("/home/yingshac/CYS/WebQnA/WebQnA_data_new/upd_img_data/upd_img_data.json", "w"), indent=4)
    upd_img_data[str(k)] = copy.deepcopy(img_dataset[str(k)])
    upd_img_data[str(k)]['DistractorFacts'] = []
    keywords, answerwords, Q, A, chunks = get_keywords_from_img_sample(k)
    d, pages, chunks = find_sentences_from_indx_for_img(k, keywords, answerwords, chunks)
    print(k)
    print("Q = ", Q)
    print("Keywords = {}".format(keywords))
    print("A = ", A)
    print("answerwords = {}".format(answerwords))
    print("Noun chunks: ", chunks)
    print(' ')
    word_lists = [keywords, answerwords]
    upd_img_data[str(k)]['word_lists'] = {
        'keywords': " || ".join(word_lists[0]),
        'answerwords': " || ".join(word_lists[1]),
        'noun_chunks': " || ".join(chunks)
    }

    # Keep at most 40 sentences of 22-59 whitespace-separated words, in the
    # order `d` provides them.
    DistractorFacts_count = 0
    for s in d:
        if DistractorFacts_count >= 40: break
        if 22 <= len(s.split()) < 60:
            upd_img_data[str(k)]['DistractorFacts'].append({
                'title': d[s]['title'],
                'scores': str(d[s]['scores']),
                'fact': s,
                'url': d[s]['link']
            })
            DistractorFacts_count += 1

# The context manager guarantees the JSON is flushed and the handle closed.
with open(upd_img_data_path, "w") as fout:
    json.dump(upd_img_data, fout, indent=4)



  lis = BeautifulSoup(html).find_all('li')


679
Q =  Does Nathan's at Six Flags Great Adventure have its name displayed in more than one spot on the outside of its building?
Keywords = {'six', 'More', 'Great', 'great', 'Six', 'building', 'spot', 'one', 'Nathan', 'Name', 'name', 'Flags', 'adventure', 'nathan', 'One', 'Building', 'more', 'flags', 'Spot', 'Adventure', 'outside', 'Outside'}
A =  Yes
answerwords = set()
Noun chunks:  {'Nathan', 'Six Flags', 'Great Adventure', 'more than one spot'}
 
680
Q =  Does Hot Doug’s in Chicago have something attached to the building?
Keywords = {'Doug', 'hot', 'Hot', 'Chicago', 'chicago', 'Building', 'doug', 'building'}
A =  Yes
answerwords = set()
Noun chunks:  {'Hot Doug', 'Chicago'}
 


In [580]:
### check progress
path = "/home/yingshac/CYS/WebQnA/WebQnA_data_new/upd_img_data/"

# Initialised before branching so the single-file branch can extend it too
# (it used to be defined only in the directory branch -> NameError).
finished_samples = []
if os.path.isdir(path):
    # Directory: gather the sample keys of every JSON file inside it.
    files = os.listdir(path)
    print("{} files found".format(len(files)))
    for f in files:
        if not '.json' in f: continue
        with open(os.path.join(path, f), "r") as fin:
            finished_samples.extend(json.load(fin).keys())
else:
    # Otherwise `path` is read as one JSON file.
    with open(path, "r") as fin:
        finished_samples.extend(json.load(fin).keys())
print("{} samples found".format(len(finished_samples)))

14 files found
24861 samples found


In [577]:
path = "/home/yingshac/CYS/WebQnA/WebQnA_data_new/upd_img_data/"

# Number of DistractorFacts per sample entry, over all result files.
snippets_count = []

# Keys of entries that ended up with no DistractorFacts at all.
no_snippet_k = []

files = os.listdir(path)
data = {}
for f in files:
    if not '.json' in f: continue
    # Context manager closes each result file as soon as it is parsed.
    with open(os.path.join(path, f), "r") as fin:
        x = json.load(fin)
    for k in x:
        num_snippets = len(x[k]['DistractorFacts'])
        if num_snippets == 0:
            no_snippet_k.append(k)
        snippets_count.append(num_snippets)
    # Merged view; a key present in several files keeps the last one read.
    data.update(x)
print(sorted(Counter(snippets_count).items())[:5])
# Samples (after merging) with fewer than 5 DistractorFacts.
print(sum(1 for k in data if len(data[k]['DistractorFacts']) < 5))

[(0, 143), (1, 128), (2, 114), (3, 105), (4, 100)]
590


In [578]:
# Full histogram as (number of DistractorFacts, number of entries) pairs, ascending.
print(sorted(Counter(snippets_count).items()))

[(0, 143), (1, 128), (2, 114), (3, 105), (4, 100), (5, 115), (6, 126), (7, 111), (8, 127), (9, 110), (10, 124), (11, 124), (12, 139), (13, 134), (14, 129), (15, 130), (16, 140), (17, 138), (18, 142), (19, 137), (20, 142), (21, 116), (22, 141), (23, 137), (24, 137), (25, 116), (26, 130), (27, 141), (28, 130), (29, 124), (30, 142), (31, 123), (32, 111), (33, 104), (34, 138), (35, 125), (36, 129), (37, 120), (38, 128), (39, 113), (40, 19457), (41, 11)]


In [520]:
### Mining + Create demo
html = '<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">'
html += '<script src="https://code.jquery.com/jquery-3.2.1.min.js" integrity="sha256-hwg4gsxgFZhOsEEamdOYGBf13FyQuiTwlAQgxVSNgt4=" crossorigin="anonymous"></script>'
html += '<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>'
html += '<!DOCTYPE html><html><head><meta http-equiv="content-type" content="text/html; chatset="UTF-8"><body>'
html += '<script>$("img").on("error", function(){console.log($(this).attr("src"));});'
html += 'function btn_click(btn){$(btn).parent().parent().find(".hid").toggle();}</script>'
html += '<style>table {border-collapse: separate;border-spacing: 10px;}\n'
html += '.chunk {text-decoration: underline solid rgb(227, 123, 253) 3px;}\n'
html += 'button {background-color:white; border: 2px solid #4CAF50; color: black; padding: 0px 8px; text-align: center; display: inline-block; font-size: 14px; margin: 4px 2px; transition-duration: 0.4s; cursor: pointer; }'
html += 'button:hover {background-color: #4CAF50;color: white;}'
html += 'th {position: sticky; top: 0;background: FloralWhite;}</style>'
html += '<table border="0" style="table-layout: fixed; width: 100%; word-break:break-word">'
html += '<tr bgcolor=lightblue style="text-align: center;"><th width=5%>Index</th><th width=35%>Q & Pos Facts</th><th width=60%>Neg Facts</th></tr>'
x = []
for k in random.sample(list(img_dataset.keys()), 3):
    if img_dataset[k]['Qcate'] == 'YesNo': continue
    print(k)
    keywords, answerwords, Q, A, chunks = get_keywords_from_img_sample(k)
    print("Q = ", Q)
    print("Keywords = {}".format(keywords))
    print("A = ", A)
    print("answerwords = {}".format(answerwords))
    print("Noun chunks: ", chunks)
    print(' ')
    d, pages, chunks = find_sentences_from_indx_for_img(k, keywords, answerwords, chunks)
    x.append(len(d))
    
    word_lists = [keywords, answerwords]
    html += add_html_row_x_distractor_for_img(k, d, word_lists, chunks, pages, colors=["(193, 239, 253)", "(255, 214, 222)"])
    o = open('x_distractor_for_img_demo2.html', 'wt')
    o.write(html)
    o.close()
html += '</table></body></html>'
o = open('x_distractor_for_img_demo2.html', 'wt')
o.write(html)
o.close()

20260
Q =  Which monument has flowers around it: A monument in the Sea Garden of the town of Tsarevo, Bulgaria or Soldier statue. - Orczy garden, Budapest District VIII?
Keywords = {'Orczy', 'Bulgaria', 'monument', 'Sea', 'Garden', 'statue', 'bulgaria', 'district', 'Town', 'Flowers', 'Soldier', 'budapest', 'town', 'sea', 'tsarevo', 'Statue', 'viii', 'Budapest', 'orczy', 'garden', 'flowers', 'soldier', 'District', 'Monument', 'Tsarevo', 'Viii'}
A =  A monument in the Sea Garden of the town of Tsarevo, Bulgaria
answerwords = set()
Noun chunks:  {'the Sea Garden', 'Bulgaria', 'Tsarevo', 'Budapest District VIII', 'Soldier', 'Orczy garden'}
 




  lis = BeautifulSoup(html).find_all('li')


24539
Q =  What color is the rope lining the Isles at the The Old Library, Trinity College in Dublin?
Keywords = {'The', 'library', 'Old', 'college', 'color', 'trinity', 'rope', 'dublin', 'the', 'Library', 'College', 'Isles', 'Color', 'old', 'Dublin', 'isles', 'Rope', 'Trinity'}
A =  Green.
answerwords = {'green', 'Green'}
Noun chunks:  {'Trinity College', 'Dublin', 'the The Old Library', 'the Isles'}
 
