In [2]:
import json
import jsonlines
import networkx as nx
import hypernetx as hnx
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
# from refined.inference.processor import Refined
from collections import defaultdict


In [None]:
refined = Refined.from_pretrained(model_name='wikipedia_model_with_numbers',
                                  entity_set="wikipedia")

In [3]:
def save_json(data, filepath=r'new_data.json'):
    with open(filepath, 'w') as fp:
        json.dump(data, fp, indent=4)

In [5]:
def transform_vispub(articles):
    res = []
    for article in articles:
        print(article['keywords'])
        res.append({
            "id": "vis_" + str(article['id']),
            "summary": article['Abstract'],
            "content": article['Abstract'],
            "title": article['Title'],
            "date": article['Year'],
            "publication": article['Conference'],
            "event": {
                "title": article['Title'],
                "type": "publication",
                "participants": [
                    {
                        "entity_id": keyword,
                        "entity_title": keyword,
                        "raw_mention": keyword,
                        "entity_type": None
                    }
                    for keyword in article['keywords']
                ]
            }
        })
    return res
articles = json.load(open('data/raw/VisPub/articles_w_keywords.json'))
print(len(articles))
transformed = transform_vispub(articles)
save_json(transformed, 'data/result/VisPub/linked/linked.json')

2632
['accessibility', 'photosensitive epilepsy', 'photosensitivity', 'interactive', 'information visualization and exploration']
['federated learning', 'data heterogeneity', 'cluster analysis', 'visual analysis']
['data transformation', 'self-service data transformation', 'programming by example', 'declarative specification']
['aesthetics in visualization', 'aesthetic pleasure', 'validated scale', 'scale development', 'visual representations']
['deep learning', 'neural network architecture search', 'visual anaytics', 'explainability']
['visual anaytics', 'theory', 'qualitative study', 'design study', 'visualization applications', 'theoretical and empirical research']
['progressive visualization', 'uncertainty', 'bar chart', 'confidence intervals']
['data transformation', 'tabular data', 'hierarchical tabular data', 'tabular visualization']
['time-series visualization', 'ensemble learning', 'music mood classification']
['information visualization and exploration', 'data physicalization

In [None]:
import json
articles = json.load(open('data/raw/VisPub/articles.json'))
keyword_disambiguation = json.load(open('data/raw/VisPub/1123/disambiguated_keywords.json'))
for article in articles:
    keywords = [keyword.lower().strip() for keyword in article['AuthorKeywords'].split(",")]
    disambiguated = [keyword_disambiguation[keyword] for keyword in keywords]
    article['keywords'] = disambiguated
save_json(articles, 'data/raw/VisPub/1123/articles_w_keywords.json')

In [None]:
def merge_sentences(sentences):
    sentence_list = [" ".join(sentence_word_list) for sentence_word_list in sentences] # merge the words into sentences
    paragraph = " ".join(sentence_list)
    return paragraph

def prepare_events(datum):
    return datum['events']
    # words_flattened = [word for sentence in datum['sentences'] for word in sentence]
    for event in datum['events']:
        # trigger = event['trigger']
        arguments = event['arguments']
        arguments_obj = [
            { 
                'argument_id': argument, 
                'argument_word': argument,
            }
            for argument in arguments
            ]
        event['arguments'] = arguments_obj
    return datum['events']

def link_entities(event, paragraph):
    # spans = refined.process_text(paragraph)
    res = []
    for participant in event['participants']:
        spans = refined.process_text(participant)
        if spans == []: 
            participant = {
                "entity_id": participant,
                "entity_title": participant,
                "raw_mention": participant,
                "entity_type": "None"
            }
            res.append(participant)
            continue
        span = [span for span in spans][0]
        entity_word = span.text
        print(span)
        if span.predicted_entity != None and span.predicted_entity.wikidata_entity_id != None:
            participant = {
                "entity_id": span.predicted_entity.wikidata_entity_id,
                "entity_title": span.predicted_entity.wikipedia_entity_title,
                "raw_mention": participant,
                "entity_type": span.coarse_mention_type or "None",
            }
        else:
            participant = {
                "entity_id": participant,
                "entity_title": participant,
                "raw_mention": participant,
                "entity_type": span.coarse_mention_type or "None",
            }
        res.append(participant)
    event['participants'] = res
    return event

    for span in spans:
        entity_word = span.text
        for event in events:
            for argument in event['arguments']:
                if argument['argument_word'] == entity_word:
                    if span.predicted_entity != None and span.predicted_entity.wikidata_entity_id != None:
                        entity_id = span.predicted_entity.wikidata_entity_id
                        entity_title = span.predicted_entity.wikipedia_entity_title
                        argument['argument_id'] = entity_id
                        argument['entity_title'] = entity_title
                    argument['entity_type'] = span.coarse_mention_type 
    return events

def transform_dataset(dataset):
    transformed_dataset = {}
    for index, datum in enumerate(dataset):
        print("{}/{}".format(index, len(dataset)))
        paragraph = datum['summary']
        event = prepare_events(datum)
        event = link_entities(event, paragraph)
        doc_key = datum['id']
        source_url = datum['url']
        if doc_key not in transformed_dataset.keys():
            datum['events'] = event
            transformed_dataset[doc_key] = datum
    return list(transformed_dataset.values())

def remove_duplicates(dataset):
    reverse_index_url = defaultdict(list)
    kept_dataset = []
    for index, datum in enumerate(dataset):
        if datum['source_url'] in reverse_index_url.keys():
            previous_data = reverse_index_url[datum['source_url']]
            duplicate = False
            for previous_datum in previous_data:
                if " ".join(datum['content'][0][0]) == " ".join(previous_datum['content'][0][0]):
                    duplicate = True
                    break
            if duplicate: continue
        reverse_index_url[datum['source_url']].append(datum)
        kept_dataset.append(datum)
    return kept_dataset


In [None]:
import re
def find_participant_mentions(participants, paragraph):
    spans = refined.process_text(paragraph)
    print(spans)
    # participant_ids = [participant['entity_id'] for participant in participants]
    mentioned_participants = []
    mention_texts = []
    for participant in participants:
        if participant['entity_id'] == participant['entity_title']:
            mentioned_participants.append({
                'text': participant['raw_mention'],
                'entity_id': participant['entity_id'],
                'entity_title': participant['entity_title'],
            })
        else:
            find = False
            for span in spans:
                if span.predicted_entity != None and span.predicted_entity.wikidata_entity_id != None:
                    entity_id = span.predicted_entity.wikidata_entity_id
                    entity_title = span.predicted_entity.wikipedia_entity_title
                    mention = span.text
                    if entity_id == participant['entity_id'] and mention not in mention_texts:
                        find = True
                        mention_texts.append(mention)
                        mentioned_participants.append({
                            'text': mention,
                            'entity_id': entity_id,
                            'entity_title': entity_title,
                        })
            if not find:
                mentioned_participants.append({
                    'text': participant['raw_mention'],
                    'entity_id': participant['entity_id'],
                    'entity_title': participant['entity_title'],
                }) 
    # find mention spans
    mentioned_participants = find_keyword_spans(paragraph, mentioned_participants)
    return mentioned_participants
def find_keyword_spans(paragraph, participants):
    for participant in participants:
        keyword_spans = []
        keyword = participant['text']
        if keyword.startswith(' ['):
            keyword = keyword[2:]
        pattern = re.compile(re.escape(keyword), re.IGNORECASE)
        matches = pattern.finditer(paragraph)
        for match in matches:
            start_span = match.start()
            end_span = match.end()
            keyword_spans.append((start_span, end_span, keyword))
        participant['spans'] = keyword_spans
    return participants

In [None]:
article_dict = json.load(open('data/result/AllTheNews/network/articles.json'))

In [None]:
# paragraph = """
# The article discussed the brief interaction between President Obama and President Rodrigo Duterte at a summit meeting in Laos. After Duterte's profane outburst, Obama canceled their first meeting, but they spoke briefly at a gala dinner. The conversation was not substantive, and they were seated far apart during the dinner.
# """
# find_participant_mentions(["Q76", "Q457786"], paragraph)
# article_participant_span_dict = json.load(open('data/result/AllTheNews/network/server/article_participant_spans.json'))
article_participant_span_dict = {}
count = 0
for article_id, article_data in article_dict.items():
    print("{}/{}".format(count, len(article_dict)))
    count += 1
    # if count > 50: break
    paragraph = article_data['summary']
    participants = [entity for entity in article_data['event']['participants']]
    participant_spans = find_participant_mentions(participants, paragraph)
    article_participant_span_dict[article_id] = participant_spans
    # save_json(article_participant_span_dict, 'data/result/AllTheNews/network/article_participant_spans.json')
save_json(article_participant_span_dict, 'data/result/AllTheNews/network/article_participant_spans.json')

In [None]:
article_participant_span_dict = json.load(open('data/result/AllTheNews/network/article_participant_spans.json'))
article_dict = json.load(open('data/result/AllTheNews/network/articles.json'))

In [None]:
for article_id, entities in article_participant_span_dict.items():
    for entity in entities:
        if len(entity['spans']) == 0:
            print(article_id, entity)



In [None]:
linked_data = json.load(open("data/result/AllTheNews/linked/2016_10p_0819_2.json"))
linked_ids = [article['id'] for article in linked_data]
old_linked_data = json.load(open("data/result/AllTheNews/linked/2016_10p.json"))
old_linked_ids = [article['doc_id'] for article in old_linked_data]
target_data_ids = [old_article_id for old_article_id in old_linked_ids if old_article_id not in linked_ids]
print(len(target_data_ids))

In [None]:
All_articles = json.load(open("data/raw/AllTheNews/events/2016_10p_0819.json"))
All_articles_dict = { article['id']: article for article in All_articles}
target_data = [All_articles_dict[target_data_id] for target_data_id in target_data_ids]

In [None]:
new_transformed_data = transform_dataset(target_data)

In [None]:
for new_data in new_transformed_data:
    linked_data.append(new_data)
save_json(linked_data, 'data/result/AllTheNews/linked/2016_10p_0819_2.json')

In [None]:
# All the News
# AllTheNews = json.load(open(r'data/result/AllTheNews/preprocessed/2016_10p.json'))
AllTheNews = json.load(open(r'data/raw/AllTheNews/events/2016_10p_0819.json'))
transformed_dataset = transform_dataset(AllTheNews)
print(len(AllTheNews), len(transformed_dataset))
# save_json(transformed_dataset, r'data/result/AllTheNews/linked/2016_10p_0819.json')

In [None]:
save_json(transformed_dataset, r'data/result/AllTheNews/linked/2016_10p_0819.json')

In [None]:
prev_dataset = json.load(open(r'data/result/AllTheNews/linked/2016_10p_0819.json'))
for datum in prev_dataset:
    if 'events' in datum.keys():
        event = datum['events']
        datum['event'] = datum['events']
        del datum['events']
save_json(prev_dataset, r'data/result/AllTheNews/linked/2016_10p_0819.json')

In [None]:
def second_round_linking(events):
    for event in events:
        for argument in event['arguments']:
            if argument['argument_id'] == argument['argument_word']:
                if 'entity_type' in argument.keys(): del argument['entity_type']
                spans = refined.process_text(argument['argument_word'])
                for span in spans:
                    if span.predicted_entity != None and span.predicted_entity.wikidata_entity_id != None:
                        entity_id = span.predicted_entity.wikidata_entity_id
                        entity_title = span.predicted_entity.wikipedia_entity_title
                        argument['argument_id'] = entity_id
                        argument['entity_title'] = entity_title
                        argument['entity_type'] = span.coarse_mention_type
    return events

In [8]:
from collections import defaultdict
# create node link graph
def construct_network(docs):
    entity_dict = {}
    article_dict = {}
    links = []
    for doc in docs:
        doc_id = doc['id']
        article_dict[doc_id] = doc
        event = doc['event']
        article_id = str(doc_id)
        participants = event['participants']
        # create an entity node for each participant
        for participant in participants:
            participant_id = participant['entity_id']
            participant_word = participant['raw_mention']
            participant_title = participant['entity_title'] 
            participant_entity_type = participant['entity_type'] 

            if participant_id not in entity_dict.keys():
                entity_dict[participant_id] = {
                    "id": participant_id, 
                    "title": participant_title,
                    "entity_type": participant_entity_type,
                    "type": "entity",
                    "mentions": [
                        {
                            "doc_id": doc_id,
                            "mention": participant_word,
                        }
                    ]
                }
            else:
                entity_dict[participant_id]['mentions'].append(
                    {
                        "doc_id": doc_id,
                        "mention": participant_word,
                    }
                )
            participant_ids = [participant['entity_id'] for participant in participants]
            for participant in participants:
                participant_id = participant['entity_id']
                links.append((article_id, participant_id))

    return entity_dict, article_dict, links

def merge_network(dataset):
    entity_dict, article_dict, links = construct_network(dataset)
    print(len(entity_dict), len(article_dict), len(links))
    B = nx.Graph()
    B.add_nodes_from(list(article_dict.keys()), bipartite=0)
    B.add_nodes_from(list(entity_dict.keys()), bipartite=1)
    B.add_edges_from(links)

    return B, entity_dict, article_dict, links


In [None]:
transformed_dataset = json.load(open('data/result/AllTheNews/linked/2016_10p_0819.json'))
for datum in transformed_dataset:
    if "events" in datum.keys():
        datum['event'] = datum['events']
        del datum['events']
save_json(transformed_dataset, 'data/result/AllTheNews/linked/2016_10p_0819.json')

In [None]:
old_linked_data = json.load(open("data/result/AllTheNews/linked/2016_10p.json"))
old_linked_ids = [article['doc_id'] for article in old_linked_data]
res = []
for datum in transformed_dataset:
    if datum["id"] in old_linked_ids:
        res.append(datum)
save_json(res, 'data/result/AllTheNews/linked/2016_10p_0819_2.json')


In [9]:
# transformed_dataset = json.load(open('data/result/RAMS/gpt_events_dev_linked.json'))
# transformed_dataset = json.load(open('data/result/AllTheNews/linked/2016_10p.json'))
# transformed_dataset = json.load(open('data/result/AllTheNews/linked/2016_10p_0819.json'))
transformed_dataset = json.load(open('data/result/VisPub/linked/linked.json'))
B, entity_dict, article_dict, links = merge_network(transformed_dataset)



5101 2632 60867


In [8]:
print(article_dict.keys())
print(len(B.nodes), len(entity_dict), len(article_dict))
for link in links:
    if (int(link[0]) not in article_dict and int(link[0]) not in entity_dict) or (link[1] not in article_dict and link[1] not in entity_dict):
        print(link, article_dict[int(link[0])]['id'], entity_dict[link[1]]['id'])

    

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,

In [16]:
for n, d in B.nodes(data=True):
    try:
        bipartite = d['bipartite']
    except:
        print(n, d)


In [10]:
H = hnx.Hypergraph.from_bipartite(B)
list(H.shape)

[5101, 2632]

In [11]:
event_hgraph_data = nx.node_link_data(B)
save_json(event_hgraph_data, r'data/result/VisPub/network/hgraph.json')
save_json(entity_dict, r'data/result/VisPub/network/entities.json')
save_json(article_dict, r'data/result/VisPub/network/articles.json')

# save_json(sub_event_hyperedges, r'data/result/AllTheNews/sub_network/sub_event_hyperedges.json')
# save_json(sub_event_links_dict, r'data/result/AllTheNews/sub_network/sub_event_links.json')

In [12]:
def transform_frontend(nodes, links, entity_dict, article_dict):
    res_nodes = []
    res_links = []
    for node in nodes:
        if node in entity_dict:
            entity_dict[node]['type'] = 'entity'
            res_nodes.append(entity_dict[node])
        else:
            # node = int(node)
            date = article_dict[node]['date'].replace("-", "/")
            date.replace("-", "/")
            article_dict[node]['date'] = date
            article_dict[node]['type'] = 'article'

            res_nodes.append(article_dict[node])
    for link in links:
        source = link[0]
        target = link[1]
        res_links.append({
            "source": source,
            "target": target,
        })
    print(len(res_nodes))
    return {
        "nodes": res_nodes, 
        "links": res_links
    }

In [None]:
print(entity_dict.keys())
print(article_dict.keys())
print(list(B.nodes))

In [13]:
BH = H.bipartite()
# BH = B
network = transform_frontend(list(BH.nodes), list(BH.edges), entity_dict, article_dict)
# save_json(network, 'data/result/AllTheNews/network/server/frontend.json')
save_json(network, 'data/result/VisPub/network/server/frontend.json')

7733


In [None]:
network = json.load(open('data/result/AllTheNews/network/server/frontend.json'))
hyperedge_nodes = list(filter(lambda node: node['type'] == 'hyper_edge', network['nodes']))
entity_nodes = list(filter(lambda node: node['type'] == 'entity' and node['id'] != node['title'], network['nodes']))

entity_node_ids = list(map(lambda node: node['id'], entity_nodes))
hyperedge_node_ids = list(map(lambda node: node['id'], hyperedge_nodes))
entity_links = list(filter(lambda link: link['source'] in entity_node_ids or link['target'] in entity_node_ids, network['links']))

In [None]:
partitions = json.load(open('data/result/AllTheNews/network/server/ravasz_partitions_article.json'))
frontend_data = json.load(open('data/result/AllTheNews/network/server/frontend.json'))
nodes = frontend_data['nodes']
article_nodes = [node for node in nodes if node['type'] == 'hyper_edge']

In [None]:
old_trigger_doc_id_dict = {}
for article_node in article_nodes:
    old_trigger_doc_id_dict[article_node['id']] = article_node['doc_id']

In [None]:
old_frontend_data = json.load(open('data/result/AllTheNews/network/server/old/frontend.json'))
old_article_nodes = [node for node in old_frontend_data['nodes'] if node['type'] == 'hyper_edge']
old_article_doc_ids = [node['doc_id'] for node in old_article_nodes]
print(len(old_article_nodes))

In [None]:
frontend_data = json.load(open('data/result/AllTheNews/network/server/frontend_2.json'))
article_nodes = [node for node in frontend_data['nodes'] if node['type'] == 'article']
entity_nodes = [node for node in frontend_data['nodes'] if node['type'] == 'entity']
print(len(article_nodes), len(entity_nodes))

In [None]:
articles = { article['id']: article for article in article_nodes }
entities = { entity['id']: entity for entity in entity_nodes }
save_json(articles, 'data/result/AllTheNews/network/articles.json')
save_json(entities, 'data/result/AllTheNews/network/entities.json')

In [None]:
links = frontend_data['links']
filtered_links = []
for link in links:
    source = link['source']
    target = link['target']
    if (source in articles or source in entities) and (target in articles or target in entities):
        filtered_links.append((source, target))
print(len(filtered_links), len(links))

In [None]:
B = nx.Graph()
B.add_nodes_from(list(articles.keys()), bipartite=0)
B.add_nodes_from(list(entities.keys()), bipartite=1)
B.add_edges_from(filtered_links)
event_hgraph_data = nx.node_link_data(B)
save_json(event_hgraph_data, r'data/result/AllTheNews/network/hgraph.json')


In [None]:
B.number_of_nodes(), len(articles), len(entities), len(links)

In [None]:
for v in B.nodes():
    if v not in articles and v not in entities:
        print(v)

In [17]:
articles_1 = json.load(open('data/result/VisPub/network/old_1123/articles.json'))
articles_2 = json.load(open('data/result/VisPub/network/articles.json'))
print(len(articles_1), len(articles_2))
count = 0
for article in articles_1.values():
    if article['summary'] != "":
        count += 1
print(count)

3620 2632
3549


In [18]:
article_embeddings = json.load(open('data/result/VisPub/network/server/article_embeddings.json'))
print(len(article_embeddings))

3620


In [23]:
article_embeddings = json.load(open('data/result/VisPub/article_embeddings.json'))
embeddings_dict = { article['doc_id']: article for article in article_embeddings}
print(len(article_embeddings))
articles = json.load(open('data/result/VisPub/network/articles.json'))

3620


In [28]:
filtered_article_embeddings ={}
for article in articles.values():
    article_id = article['id']
    embedding = embeddings_dict[article_id]['embedding']
    filtered_article_embeddings[article_id] = {
        "doc_id": article_id,
        "title": article['title'],
        "content": article['content'],
        "summary": article['summary'],
        "embedding": embedding,
    }
save_json(list(filtered_article_embeddings.values()), 'data/result/VisPub/network/server/article_embeddings.json')

In [27]:
article_partitions = json.load(open('data/result/VisPub/network/server/ravasz_partitions_article.json'))
first_level = article_partitions[0]
print(len(first_level))

3620
