In [44]:
import networkx as nx
import json
from pybrat.parser import BratParser, Entity, Event, Example, Relation
import matplotlib.pyplot as plt
from pprint import pprint
import numpy as np
from collections import defaultdict
import hypernetx as hnx
from networkx.algorithms import bipartite


In [33]:
def remove_keys(dict, kept_keys):
    """Return a new dict holding only the entries whose key is in ``kept_keys``.

    Args:
        dict: mapping to filter; it is not modified.
        kept_keys: container supporting ``in`` with the keys to retain.

    Returns:
        A new dict with the retained entries, in the iteration order of ``dict``.
    """
    filtered = {}
    for name, item in dict.items():
        if name in kept_keys:
            filtered[name] = item
    return filtered

In [5]:
def read_brat_data(data_path):
    """Parse the BRAT annotations found at ``data_path``.

    A ``BratParser`` created with ``error="ignore"`` is used, and whatever its
    ``parse`` call returns is handed back unchanged.

    Args:
        data_path: location passed straight to ``BratParser.parse``.

    Returns:
        The parser's result for ``data_path``.
    """
    return BratParser(error="ignore").parse(data_path)

In [55]:
def disambiguate(brat_data, EL_data_path):
    """Re-key entities across documents and turn every event into a hyper-edge.

    For each document, the entity-linking results are loaded from
    ``EL_data_path + doc.id + '.json'``. Every entity is re-identified by the
    ``ui`` of its first linking candidate, or by ``"<doc.id>-<entity.id>"`` when
    it has no candidate (an entity absent from the JSON is treated the same
    way). Entities that end up with the same id are merged into one node whose
    ``mentions`` list records each occurrence.

    Each event becomes a hyper-edge whose id is the trigger's new id followed by
    the sorted new ids of its arguments, all joined with ``"-"``. Events that
    produce the same id are merged in the same way.

    Assumes no nested events: an argument that is itself an event only resolves
    if that event appeared earlier in ``doc.events``.

    Args:
        brat_data: iterable of documents exposing ``id``, ``entities`` and
            ``events``.
        EL_data_path: string prefix of the per-document JSON files. It is
            concatenated as-is, so a directory must include its trailing
            separator.

    Returns:
        A tuple ``(nodes_dict, hyper_edges_dict, links)``:
        ``nodes_dict`` maps node id -> node record, restricted to ids that occur
        as an argument of at least one event; ``hyper_edges_dict`` maps
        hyper-edge id -> hyper-edge record; ``links`` is a list of
        ``(hyper_edge_id, argument_id)`` pairs.

    Raises:
        OSError: if a document's entity-linking JSON file cannot be opened.
        KeyError: if an event references a trigger or argument id that has not
            been seen in the document.
    """
    # assuming no nested events
    nodes_dict = {}
    hyper_edges_dict = {}
    links = []
    connected_nodes = set()
    for doc in brat_data:
        origin_id_to_new_id_dict = {}
        # Context manager so the file handle is closed as soon as it is parsed
        # instead of leaking one open handle per document.
        with open(EL_data_path + doc.id + r'.json') as el_file:
            entity_umls = json.load(el_file)

        # entity disambiguation
        for entity in doc.entities:
            # TODO: add candidate selection. For now just choosing the first one.
            # An entity that is missing from the linking output is handled like
            # one with an empty candidate list.
            candidates = entity_umls.get(entity.id) or []
            if len(candidates) != 0:
                entity_ui = candidates[0]['ui']
            else:
                # TODO: consider partial matching to disambiguate entities without CUIs
                # Prefix with the document id so unlinked entities from
                # different documents are never merged by accident.
                entity_ui = doc.id + "-" + entity.id
            origin_id_to_new_id_dict[entity.id] = entity_ui

            # Create the node on first sight, then record this mention on it.
            node = nodes_dict.setdefault(
                entity_ui,
                {"id": entity_ui, "type": "entity", "mentions": []},
            )
            node["mentions"].append(
                {
                    "doc_id": doc.id,
                    "mention": entity.mention,
                    # Only the first span of the entity is recorded.
                    "span": {'start': entity.spans[0].start, 'end': entity.spans[0].end},
                }
            )

        # each event is treated as a hyper-edge.
        # The hyperedge id needs to be independent, but the node type is trigger id
        for event in doc.events:
            trigger_id = origin_id_to_new_id_dict[event.trigger.id]
            # Let later events that take this event as an argument resolve it
            # to its trigger.
            origin_id_to_new_id_dict[event.id] = trigger_id
            argument_ids = [origin_id_to_new_id_dict[argument.id] for argument in event.arguments]
            connected_nodes.update(argument_ids)

            # Sorting makes the hyper-edge id independent of argument order.
            sorted_argument_ids = sorted(argument_ids)
            hyper_edge_node_id = trigger_id + "-" + "-".join(sorted_argument_ids)

            hyper_edge = hyper_edges_dict.setdefault(
                hyper_edge_node_id,
                {
                    "id": hyper_edge_node_id,
                    "type": "hyper_edge",
                    "trigger": trigger_id,
                    "arguments": sorted_argument_ids,
                    "mentions": [],
                },
            )
            # TODO: add sentence span
            hyper_edge["mentions"].append({"doc_id": doc.id})

            # add links between hyper edge node and arguments
            for argument_id in argument_ids:
                links.append((hyper_edge_node_id, argument_id))

    # clean up unconnected nodes (sizes are printed before and after the cleanup)
    print(len(nodes_dict), len(hyper_edges_dict), len(links))
    nodes_dict = remove_keys(nodes_dict, kept_keys=connected_nodes)
    print(len(nodes_dict), len(hyper_edges_dict), len(links))
    return nodes_dict, hyper_edges_dict, links

In [59]:
def merge_brat_data(brat_data, EL_data_path):
    """Merge a list of BRAT documents into a single hypergraph.

    The documents are first run through ``disambiguate``. The resulting entity
    node ids and hyper-edge ids are then loaded into a two-coloured networkx
    graph (entities tagged ``bipartite=0``, hyper-edges tagged ``bipartite=1``),
    connected by the returned links, and converted with
    ``hnx.Hypergraph.from_bipartite``.

    NOTE(review): the s-component output recorded later in this notebook lists
    bare CUI ids as "edge sets", which suggests the installed hypernetx treats
    the ``bipartite=0`` partition (the entities) as hyperedges. Confirm this
    orientation is the intended one.

    Args:
        brat_data: documents to merge, forwarded to ``disambiguate``.
        EL_data_path: entity-linking path prefix, forwarded to ``disambiguate``.

    Returns:
        The hypergraph built by ``hnx.Hypergraph.from_bipartite``.
    """
    entity_nodes, hyper_edge_nodes, incidence_pairs = disambiguate(brat_data, EL_data_path)

    bipartite_graph = nx.Graph()
    # Iterating a dict yields its keys, i.e. the node ids.
    bipartite_graph.add_nodes_from(list(entity_nodes), bipartite=0)
    bipartite_graph.add_nodes_from(list(hyper_edge_nodes), bipartite=1)
    bipartite_graph.add_edges_from(incidence_pairs)

    hypergraph = hnx.Hypergraph.from_bipartite(bipartite_graph)
    return hypergraph

In [12]:
def check_nested_events(brat_data):
    """Print a report for every event that takes another event as an argument.

    An argument counts as a nested event when its id equals the trigger id or
    the event id of any event in the same document. Both are checked because
    the visible code does not show which of the two ids a nested argument
    carries.

    For each offending event, the document id, the event's argument ids and the
    document's trigger ids are printed.

    Args:
        brat_data: iterable of documents exposing ``id`` and ``events``; every
            event exposes ``id``, ``trigger`` (with an ``id``) and
            ``arguments`` (each with an ``id``).

    Returns:
        None; findings are only printed.
    """
    for doc in brat_data:
        # Compare ids with ids: the trigger objects themselves never equal an
        # argument id string, which made the original check always False.
        trigger_ids = [event.trigger.id for event in doc.events]
        event_marker_ids = set(trigger_ids)
        event_marker_ids.update(event.id for event in doc.events)
        for event in doc.events:
            argument_ids = [argument.id for argument in event.arguments]
            if any(argument_id in event_marker_ids for argument_id in argument_ids):
                print("Nested event found in document: " + doc.id)
                print(argument_ids)
                print(trigger_ids)

In [62]:
# Build the merged hypergraph for the ID2011 result directory. The same
# directory is used for the BRAT annotations and the entity-linking JSON files.
data_path = '../../preprocess/data/result/ID2011/'
brat_data = read_brat_data(data_path=data_path)
H = merge_brat_data(brat_data=brat_data, EL_data_path=data_path)

4998 2036 3563
1232 2036 3563


In [74]:
# Print every set yielded by H.s_components for s=3.
# The s=1 and s=2 variants used during exploration are the same loop with a
# different `s` value and the matching header text.
print('\n3-component edge sets:')
for component in H.s_components(s=3):
    print(component)


3-component edge sets:
{'C0969563', 'C0004482'}
{'C1701700', 'C0031715'}
{'C1451978', 'C0038568', 'C1327616', 'C0967543', 'C1433342', 'C0768768', 'C0657653', 'C0961725', 'C1739675', 'C1431905', 'C1429451', 'C5672738', 'C0916739', 'C0042765', 'C0531454', 'C0255025', 'C1441654', 'C0185117'}
{'C3001658', 'C0086860'}
{'C1429861', 'C1569547', 'C1449385'}
{'C2286466', 'C1437563'}
{'C0003537'}
{'C0004597'}
{'C0004599'}
{'C0006303'}
{'C0006304'}
{'C0007018'}
{'C0008039'}
{'C0010749'}
{'C0011522'}
{'C0012890'}
{'C0014834'}
{'C0016228'}
{'C0017243'}
{'C0017262'}
{'C0017861'}
{'C0019472'}
{'C0019494'}
{'C0020835'}
{'C0020852'}
{'C0020855'}
{'C0020859'}
{'C0020860'}
{'C0021745'}
{'C0021756'}
{'C0022959'}
{'C0024467'}
{'C0025646'}
{'C0026914'}
{'C0026917'}
{'C0026926'}
{'C0027303'}
{'C0030054'}
{'C0030894'}
{'C0032604'}
{'C0033268'}
{'C0033373'}
{'C0033809'}
{'C0033811'}
{'C0034579'}
{'C0034805'}
{'C0035644'}
{'C0036106'}
{'C0036117'}
{'C0036126'}
{'C0036766'}
{'C0037231'}
{'C0037420'}
{'C0038172'