In [1]:
import json
import jsonlines
import networkx as nx
import hypernetx as hnx
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from refined.inference.processor import Refined
from collections import defaultdict


 No module named 'celluloid'. If you need to use hypernetx.algorithms.contagion, please install additional packages by running the following command: pip install .['all']


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load the pretrained ReFinED model once. The linking helpers defined in later
# cells read this module-level `refined` object via `refined.process_text`.
refined = Refined.from_pretrained(model_name='wikipedia_model_with_numbers',
                                  entity_set="wikipedia")

In [2]:
def save_json(data, filepath=r'new_data.json'):
    """Write ``data`` to ``filepath`` as JSON with a 4-space indent.

    The file is opened in write mode, so any existing content is replaced.

    Args:
        data: Object to serialize; must be accepted by ``json.dump``.
        filepath: Destination path. Defaults to ``new_data.json``.
    """
    with open(filepath, 'w') as out_file:
        json.dump(obj=data, fp=out_file, indent=4)

In [None]:
def merge_sentences(sentences):
    """Flatten tokenized sentences into one space-separated paragraph string.

    Args:
        sentences: Iterable of sentences, each an iterable of word strings.

    Returns:
        The words of every sentence joined by single spaces, sentences
        themselves also separated by a single space.
    """
    joined_sentences = []
    for words in sentences:
        joined_sentences.append(" ".join(words))
    return " ".join(joined_sentences)

def prepare_events(datum):
    """Wrap every raw event argument in an ``argument_id``/``argument_word`` record.

    Each event in ``datum['events']`` is modified in place: its ``arguments``
    list is replaced by a list of dicts whose ``argument_id`` and
    ``argument_word`` both start out as the raw argument value. The id is a
    placeholder that the linking steps overwrite once an entity is resolved.

    The function is idempotent: an argument that is already a dict carrying
    both keys is kept as-is, so re-running the cell on the same in-memory
    data neither double-wraps arguments nor discards earlier linking results.

    Args:
        datum: Mapping with an ``events`` list; every event has an
            ``arguments`` list.

    Returns:
        The same ``datum['events']`` list, after in-place modification.
    """
    for event in datum['events']:
        prepared_arguments = []
        for argument in event['arguments']:
            already_prepared = (
                isinstance(argument, dict)
                and 'argument_id' in argument
                and 'argument_word' in argument
            )
            if already_prepared:
                prepared_arguments.append(argument)
            else:
                prepared_arguments.append({
                    'argument_id': argument,
                    'argument_word': argument,
                })
        event['arguments'] = prepared_arguments
    return datum['events']

def link_entities(events, paragraph):
    """Link event arguments to entities found by ReFinED in ``paragraph``.

    ``refined.process_text`` is run once over the paragraph. For every
    returned span, each argument whose ``argument_word`` equals the span text
    is updated in place:

    * if the span has a predicted entity with a Wikidata id, ``argument_id``
      becomes that id and ``entity_title`` the Wikipedia title;
    * ``entity_type`` is always set to the span's coarse mention type, even
      when no entity was resolved.

    When several spans share the same text, the later span overwrites the
    values written by the earlier one.

    Args:
        events: List of events whose ``arguments`` are dicts with at least
            ``argument_word``.
        paragraph: Text passed to the entity linker.

    Returns:
        The same ``events`` list, after in-place modification.
    """
    for span in refined.process_text(paragraph):
        mention_text = span.text
        entity = span.predicted_entity
        # Identity checks (not `!= None`) so a custom __eq__ cannot interfere.
        is_linked = entity is not None and entity.wikidata_entity_id is not None
        for event in events:
            for argument in event['arguments']:
                if argument['argument_word'] != mention_text:
                    continue
                if is_linked:
                    argument['argument_id'] = entity.wikidata_entity_id
                    argument['entity_title'] = entity.wikipedia_entity_title
                # The mention type is recorded regardless of linking success.
                argument['entity_type'] = span.coarse_mention_type
    return events

def transform_dataset(dataset):
    """Prepare and entity-link every document, returning one record per doc id.

    For each datum the events are wrapped by ``prepare_events`` and linked by
    ``link_entities`` against the datum's ``summary``. Documents whose event
    list is empty are skipped. Progress is printed as ``index/total``.

    Each output record carries ``doc_id``, ``source_url``, ``content``,
    ``summary`` and ``events``. When the input datum has a ``date`` it is
    carried over as well, because ``construct_network`` reads ``doc['date']``
    from these records.

    If the same ``id`` occurs more than once, the first occurrence provides
    the metadata and the last occurrence's events replace earlier ones.

    Args:
        dataset: Sequence of dicts with ``id``, ``url``, ``content``,
            ``summary``, ``events`` and optionally ``date``.

    Returns:
        List of transformed records in first-seen order of their doc id.
    """
    transformed_dataset = {}
    total = len(dataset)
    for index, datum in enumerate(dataset):
        print("{}/{}".format(index, total))
        paragraph = datum['summary']
        events = link_entities(prepare_events(datum), paragraph)
        if events == []:
            continue
        doc_key = datum['id']
        source_url = datum['url']
        if doc_key not in transformed_dataset:
            record = {
                "doc_id": doc_key,
                "source_url": source_url,
                "content": datum['content'],
                "summary": paragraph,
                "events": [],
            }
            # Keep the publication date so downstream network construction
            # (which reads doc['date']) does not fail on a missing key.
            if 'date' in datum:
                record['date'] = datum['date']
            transformed_dataset[doc_key] = record
        transformed_dataset[doc_key]['events'] = events
    return list(transformed_dataset.values())

def remove_duplicates(dataset):
    """Drop records that repeat an earlier record with the same URL and lead text.

    Two records count as duplicates when they share ``source_url`` and the
    space-joined ``content[0][0]`` of both is equal (presumably the first
    sentence's tokens; verify against the data schema). The first occurrence
    is kept; order is preserved.

    Args:
        dataset: Iterable of dicts with ``source_url`` and ``content``.

    Returns:
        List of the retained records.
    """
    seen_by_url = defaultdict(list)
    unique_records = []
    for record in dataset:
        same_url_records = seen_by_url[record['source_url']]
        is_duplicate = any(
            " ".join(record['content'][0][0]) == " ".join(earlier['content'][0][0])
            for earlier in same_url_records
        )
        if not is_duplicate:
            same_url_records.append(record)
            unique_records.append(record)
    return unique_records


In [None]:
# All the News: first-round entity linking over the preprocessed articles.
# The input file is read inside a `with` block so the handle is closed promptly.
with open(r'data/result/AllTheNews/preprocessed/2016_10p.json') as preprocessed_file:
    AllTheNews = json.load(preprocessed_file)
transformed_dataset = transform_dataset(AllTheNews)
save_json(transformed_dataset, r'data/result/AllTheNews/linked/2016_10p.json')

In [None]:
# Second-round linking: retry every still-unlinked argument on its own text.
# NOTE: `second_round_linking` is defined in a later cell of this notebook;
# that cell has to be executed before this one.
# RAMS
# dataset = json.load(open('data/result/RAMS/gpt_events_dev.json'))
# AllTheNews
with open('data/result/AllTheNews/linked/2016_10p.json') as linked_file:
    dataset = json.load(linked_file)
# transformed_dataset = transform_dataset(dataset)
# transformed_dataset = remove_duplicates(dataset)
# second round linking
for index, datum in enumerate(dataset):
    print("{}/{}".format(index, len(dataset)))
    datum['events'] = second_round_linking(datum['events'])
# save_json(dataset, r'data/result/RAMS/gpt_events_dev_linked.json')
# Save `dataset`, the object that was just re-linked. The previous code saved
# `transformed_dataset`, a stale variable from an earlier cell, so the
# second-round results never reached disk.
save_json(dataset, r'data/result/AllTheNews/linked/2016_10p2.json')

In [None]:
def second_round_linking(events):
    """Retry entity linking for arguments the first round left unlinked.

    An argument counts as unlinked when its ``argument_id`` still equals its
    ``argument_word``. For each such argument any existing ``entity_type`` is
    dropped, the argument word alone is passed to ``refined.process_text``,
    and every returned span that has a predicted entity with a Wikidata id
    overwrites ``argument_id``, ``entity_title`` and ``entity_type``. If
    several spans qualify, the last one wins. Arguments are modified in place.

    Args:
        events: List of events whose ``arguments`` are dicts with
            ``argument_id`` and ``argument_word``.

    Returns:
        The same ``events`` list, after in-place modification.
    """
    for event in events:
        for argument in event['arguments']:
            if argument['argument_id'] != argument['argument_word']:
                continue  # already linked in the first round
            # Discard the type from the failed first attempt; it is only
            # re-added below if this round resolves an entity.
            argument.pop('entity_type', None)
            for span in refined.process_text(argument['argument_word']):
                entity = span.predicted_entity
                if entity is not None and entity.wikidata_entity_id is not None:
                    argument['argument_id'] = entity.wikidata_entity_id
                    argument['entity_title'] = entity.wikipedia_entity_title
                    argument['entity_type'] = span.coarse_mention_type
    return events

In [None]:
# RAMS: apply second-round linking and write the result back to the same file.
# The input is fully read and closed before save_json reopens it for writing.
with open('data/result/RAMS/gpt_events_dev_linked.json') as rams_file:
    dataset = json.load(rams_file)
for datum in dataset:
    datum['events'] = second_round_linking(datum['events'])
save_json(dataset, r'data/result/RAMS/gpt_events_dev_linked.json')

In [None]:
from collections import defaultdict
# create node link graph
def construct_network(docs):
    """Build entity nodes, event hyperedges and their links from linked documents.

    One event hyperedge is created per document; its id is the doc id followed
    by the document's sorted, de-duplicated triggers joined with ``-``. Each
    sub-event additionally yields a sub-event hyperedge with id
    ``<trigger>-<doc_id>``. Every argument becomes (or extends) an entity node
    keyed by its ``argument_id``; node metadata comes from the first mention
    seen, later mentions are only appended to ``mentions``.

    A document is skipped entirely when it has exactly one distinct trigger
    and that trigger starts with one of the known GPT failure phrases.

    Event-level ``arguments`` and ``links`` are emitted in sorted argument-id
    order so that repeated runs produce identical output (set iteration order
    for strings is not stable across interpreter runs).

    NOTE(review): two sub-events of one document that share a trigger get the
    same sub-event hyperedge id; confirm whether that is acceptable.

    Args:
        docs: Iterable of dicts with ``doc_id``, ``events``, ``summary``,
            ``content`` and ``date``. Each event has ``trigger`` and
            ``arguments``; each argument has ``argument_id`` and
            ``argument_word``, optionally ``entity_title`` and ``entity_type``.

    Returns:
        Tuple ``(nodes_dict, event_hyperedges_dict, links,
        sub_event_hyperedges_dict, sub_event_links)`` where the last two are
        ``defaultdict(list)`` keyed by event hyperedge id.

    Raises:
        KeyError: If a document or argument lacks one of the required keys.
    """
    # Openings of GPT replies that signal "no usable events" rather than a trigger.
    failed_response_prefixes = (
        "I'm sorry",
        "No event",
        "There are no",
        "There is no",
        "I'm unable to ",
    )
    nodes_dict = {}
    event_hyperedges_dict = {}
    links = []
    sub_event_links = defaultdict(list)
    sub_event_hyperedges_dict = defaultdict(list)
    for doc in docs:
        doc_id = doc['doc_id']
        event_triggers = sorted({sub_event['trigger'] for sub_event in doc['events']})
        # check gpt response for error
        if len(event_triggers) == 1 and event_triggers[0].startswith(failed_response_prefixes):
            continue

        event_hyperedge_id = str(doc_id) + "-" + "-".join(event_triggers)
        event_arguments = set()
        for sub_event in doc['events']:
            arguments = sub_event['arguments']
            # create (or extend) an entity node for each argument
            for argument in arguments:
                argument_id = argument['argument_id']
                argument_word = argument['argument_word']
                if argument_id not in nodes_dict:
                    nodes_dict[argument_id] = {
                        "id": argument_id,
                        # Unlinked arguments have no title/type; fall back to
                        # the surface word and the literal string "None".
                        "title": argument.get('entity_title', argument_word),
                        "entity_type": argument.get('entity_type', "None"),
                        "type": "entity",
                        "mentions": [],
                    }
                nodes_dict[argument_id]['mentions'].append({
                    "doc_id": doc_id,
                    "mention": argument_word,
                })
            argument_ids = [argument['argument_id'] for argument in arguments]
            # create sub-event hyperedge
            trigger_id = sub_event['trigger']
            sub_event_hyper_edge_id = trigger_id + "-" + str(doc_id)
            sub_event_hyperedges_dict[event_hyperedge_id].append({
                'id': sub_event_hyper_edge_id,
                'type': "subevent_hyper_edge",
                "trigger": trigger_id,
                "arguments": sorted(argument_ids),
                "doc_id": doc_id,
            })
            for argument_id in argument_ids:
                sub_event_links[event_hyperedge_id].append((sub_event_hyper_edge_id, argument_id))
            event_arguments.update(argument_ids)

        # Sort once so the hyperedge's argument list and the links agree and
        # are reproducible.
        sorted_event_arguments = sorted(event_arguments)
        event_hyperedges_dict[event_hyperedge_id] = {
            'id': event_hyperedge_id,
            'type': "hyper_edge",
            "trigger": event_hyperedge_id,
            "arguments": sorted_event_arguments,
            "doc_id": doc_id,
            "summary": doc['summary'],
            "content": doc['content'],
            "date": doc['date'],
        }
        links.extend((event_hyperedge_id, argument_id) for argument_id in sorted_event_arguments)

    return nodes_dict, event_hyperedges_dict, links, sub_event_hyperedges_dict, sub_event_links

def merge_network(dataset):
    """Build the network parts and assemble them into a bipartite graph.

    Event hyperedge ids are added as nodes with ``bipartite=0``, entity ids
    as nodes with ``bipartite=1``, and the hyperedge-to-entity links as edges.

    Args:
        dataset: Documents accepted by ``construct_network``.

    Returns:
        Tuple ``(graph, nodes_dict, hyper_edges_dict, links,
        sub_event_hyperedges, sub_event_links_dict)``.
    """
    (entity_nodes,
     event_hyperedges,
     event_links,
     sub_hyperedges,
     sub_links) = construct_network(dataset)

    bipartite_graph = nx.Graph()
    hyperedge_ids = list(event_hyperedges.keys())
    entity_ids = list(entity_nodes.keys())
    bipartite_graph.add_nodes_from(hyperedge_ids, bipartite=0)
    bipartite_graph.add_nodes_from(entity_ids, bipartite=1)
    bipartite_graph.add_edges_from(event_links)

    return (
        bipartite_graph,
        entity_nodes,
        event_hyperedges,
        event_links,
        sub_hyperedges,
        sub_links,
    )


In [None]:
# transformed_dataset = json.load(open('data/result/RAMS/gpt_events_dev_linked.json'))
# NOTE(review): this reads the first-round file (2016_10p.json), not the
# second-round output (2016_10p2.json) - confirm which one is intended.
with open('data/result/AllTheNews/linked/2016_10p.json') as linked_file:
    transformed_dataset = json.load(linked_file)
B, nodes_dict, hyper_edges_dict, links, sub_event_hyperedges, sub_event_links_dict = merge_network(transformed_dataset)



In [None]:
# Convert the bipartite event/entity graph into a HyperNetX hypergraph and
# display its shape as the cell output.
H = hnx.Hypergraph.from_bipartite(B)
list(H.shape)

In [None]:
# Persist the bipartite graph (node-link form) together with the node,
# hyperedge and sub-event lookup tables.
event_hgraph_data = nx.node_link_data(B)
for payload, destination in (
    (event_hgraph_data, r'data/result/AllTheNews/network/hgraph.json'),
    (nodes_dict, r'data/result/AllTheNews/network/nodes.json'),
    (hyper_edges_dict, r'data/result/AllTheNews/network/hyperedges.json'),
    (sub_event_hyperedges, r'data/result/AllTheNews/sub_network/sub_event_hyperedges.json'),
    (sub_event_links_dict, r'data/result/AllTheNews/sub_network/sub_event_links.json'),
):
    save_json(payload, destination)

In [None]:
def transform_frontend(nodes, links, nodes_dict, hyper_edges_dict):
    """Assemble the ``{"nodes": [...], "links": [...]}`` payload for the frontend.

    Each id in ``nodes`` is resolved to its entity record from ``nodes_dict``
    or, failing that, to its hyperedge record from ``hyper_edges_dict``.
    Hyperedge dates have ``-`` replaced by ``/``; this rewrites the record in
    ``hyper_edges_dict`` in place, and the very same dict object is placed in
    the result. The number of resulting nodes is printed.

    Args:
        nodes: Iterable of node ids.
        links: Iterable of pairs; element 0 is used as ``source`` and
            element 1 as ``target``.
        nodes_dict: Entity records keyed by id.
        hyper_edges_dict: Hyperedge records keyed by id, each with a string
            ``date``.

    Returns:
        Dict with ``nodes`` (list of records) and ``links`` (list of
        ``{"source", "target"}`` dicts).

    Raises:
        KeyError: If a node id is in neither lookup table.
    """
    res_nodes = []
    for node in nodes:
        if node in nodes_dict:
            res_nodes.append(nodes_dict[node])
            continue
        hyper_edge = hyper_edges_dict[node]
        # str.replace returns a new string, so the result must be stored.
        hyper_edge['date'] = hyper_edge['date'].replace("-", "/")
        res_nodes.append(hyper_edge)
    res_links = [{"source": link[0], "target": link[1]} for link in links]
    print(len(res_nodes))
    return {
        "nodes": res_nodes,
        "links": res_links
    }

In [None]:
# Turn the hypergraph back into its bipartite graph, reshape its nodes and
# edges into the nodes/links payload, and save that payload for the server.
BH = H.bipartite()
network = transform_frontend(list(BH.nodes), list(BH.edges), nodes_dict, hyper_edges_dict)
save_json(network, 'data/result/AllTheNews/network/server/frontend.json')

In [None]:
# Reload the frontend payload and split it into hyperedge nodes and entity nodes.
with open('data/result/AllTheNews/network/server/frontend.json') as frontend_file:
    network = json.load(frontend_file)
hyperedge_nodes = [node for node in network['nodes'] if node['type'] == 'hyper_edge']
# Keep only entities whose id differs from their title (presumably the ones
# that were resolved to a Wikidata id - verify against construct_network).
entity_nodes = [
    node for node in network['nodes']
    if node['type'] == 'entity' and node['id'] != node['title']
]

entity_node_ids = [node['id'] for node in entity_nodes]
hyperedge_node_ids = [node['id'] for node in hyperedge_nodes]
# A set makes the per-link membership test O(1) instead of scanning the id list.
entity_node_id_set = set(entity_node_ids)
entity_links = [
    link for link in network['links']
    if link['source'] in entity_node_id_set or link['target'] in entity_node_id_set
]

In [None]:
# Load the precomputed Ravasz partitions file and pick one entry from it.
with open('data/result/AllTheNews/network/server/ravasz_partitions.json') as partitions_file:
    partitions = json.load(partitions_file)
# NOTE(review): index 5 is hard-coded; presumably it selects one level of the
# clustering hierarchy - confirm and consider naming it as a constant.
partition = partitions[5]

In [None]:
# Bipartite graph restricted to the filtered entity nodes, used for statistics.
# Note that this rebinds `B`, replacing the full graph built earlier.
nx_entity_links = [(link['source'], link['target']) for link in entity_links]
B = nx.Graph()
for node_ids, side in ((hyperedge_node_ids, 0), (entity_node_ids, 1)):
    B.add_nodes_from(node_ids, bipartite=side)
B.add_edges_from(nx_entity_links)

In [None]:
# Degree of every entity node in the statistics graph, saved as
# {entity_id: {"degree": n}}.
entity_node_degrees = B.degree(entity_node_ids)
entity_node_statistics = {
    node_id: {"degree": node_degree}
    for node_id, node_degree in entity_node_degrees
}
save_json(entity_node_statistics, 'data/result/AllTheNews/network/server/entity_node_statistics.json')

In [None]:
# For each entity, count how many of its links lead into each cluster.
cluster_connectivity = defaultdict(lambda: defaultdict(int))
for link in entity_links:
    source, target = link['source'], link['target']
    # An id starting with "Q" is treated as the entity endpoint; the other
    # endpoint is looked up in `partition` to find its cluster.
    if source[0] == "Q":
        entity_id, other_id = source, target
    else:
        entity_id, other_id = target, source
    cluster_connectivity[entity_id][partition[other_id]] += 1

['20856-Address-Call-Cancel-Express-Identify-Interrupt-Kill-Wound', '55853-Address-Attack-Conclude-Emphasize-Expose-Injure-Kill-Mention-Shoot', '55859-Attack-Express-Highlight-Kill-Mention-Reminiscent', '201155-Emphasize-Highlight-Make-Protest-Show', '171051-Avoid-Emphasize-Explain-Highlight-Include', '41142-Involve-Mention', '201129-Call out-Commit-Emphasize-Expressed-Kill-Shame-Speak up-Unity', '40918-Blame-Call-Criticize-Restore', '201196-Care-Dedicate-Do good-Include-Kill-Love-Open fire-Protect-Serve-Understand', '109196--Explored-Highlighted-Lost-Reported-Took on', '139428-Capture-Collapse-Hide behind-Shoot-Show-Sneak up on', '156388-Highlight-Incident-Show', '170724-Release-Show-Spark', '170915-Come together-Concern-Express-Involve-Reflect-Tragic', '171866-Associated with-Find associations-Looked at-Not meant to replace-Predicted-Showed promise', '213455-Emphasize-Invite-Pray-Turn-Urge', '42468--Investigation ongoing;-Mentioned-Release without charges-Result of-Road rage incident