In [1]:
import json
import os
from collections import defaultdict
from pprint import pprint

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from community import community_louvain
from pybrat.parser import BratParser, Entity, Event, Example, Relation

In [2]:
def read_brat_data(data_path):
    """Parse the brat annotations found under ``data_path``.

    The parser is created with ``error="ignore"`` (presumably so malformed
    annotations are skipped rather than raised -- confirm against pybrat).

    Returns whatever ``BratParser.parse`` yields for ``data_path``.
    """
    parser = BratParser(error="ignore")
    return parser.parse(data_path)

In [57]:
def brat_data_to_network(data_path, brat_data=None):
    """Build an entity / hyper-edge graph from a parsed brat corpus.

    Every entity becomes a node of type ``"entity"``. Its id is the ``ui`` of
    the first candidate listed for it in the document's ``<doc.id>.json`` file
    (presumably UMLS linking output -- confirm), or ``"<doc.id>-<entity.id>"``
    when that candidate list is empty. Every event becomes a node of type
    ``"hyper_edge"`` that is linked to each of its arguments; events with the
    same trigger id and the same sorted argument ids share one node. Nodes
    that end up without any link are removed before the graph is returned.

    Parameters
    ----------
    data_path : str or path-like
        Directory that holds one ``<doc.id>.json`` file per document.
    brat_data : iterable, optional
        Parsed documents exposing ``id``, ``entities`` and ``events``. When
        omitted, the corpus is parsed from ``data_path`` via ``read_brat_data``
        so the function no longer depends on a module-level variable.

    Returns
    -------
    networkx.Graph
        Undirected graph whose node attributes are the dicts built below.

    Side effects
    ------------
    Pretty-prints how many events have each argument count and prints the
    final node and edge counts.
    """
    if brat_data is None:
        brat_data = read_brat_data(data_path)

    nodes_dict = {}
    # hyper-edge node id -> {argument node id: 1}; the inner dict de-duplicates
    # repeated links while keeping first-seen order.
    links = defaultdict(dict)
    argument_num_dict = defaultdict(int)

    for doc in brat_data:
        with open(os.path.join(data_path, doc.id + r'.json')) as umls_file:
            entity_umls = json.load(umls_file)

        # per-document map from brat ids (entities and events) to graph node ids
        origin_id_to_new_id_dict = {}

        for entity in doc.entities:
            # TODO: add candidate selection. For now just choosing the first one.
            candidates = entity_umls[entity.id]
            if len(candidates) != 0:
                entity_ui = candidates[0]['ui']
            else:
                # TODO: consider partial matching to disambiguate entities without CUIs
                entity_ui = doc.id + "-" + entity.id
            origin_id_to_new_id_dict[entity.id] = entity_ui

            node = nodes_dict.setdefault(
                entity_ui,
                {"id": entity_ui, "type": "entity", "mentions": []},
            )
            node["mentions"].append(
                {
                    "doc_id": doc.id,
                    "mention": entity.mention,
                    # only the first span of the entity is recorded
                    "span": {'start': entity.spans[0].start, 'end': entity.spans[0].end},
                }
            )

        # Register every event under its trigger's node id first, so an event
        # used as an argument resolves even if it is listed after its parent.
        for event in doc.events:
            origin_id_to_new_id_dict[event.id] = origin_id_to_new_id_dict[event.trigger.id]

        # Each event is treated as a hyper-edge: create (or reuse) a hyper-edge
        # node, then connect it to every argument.
        for event in doc.events:
            trigger_id = origin_id_to_new_id_dict[event.trigger.id]
            argument_ids = [origin_id_to_new_id_dict[argument.id] for argument in event.arguments]
            sorted_argument_ids = sorted(argument_ids)
            hyper_edge_node_id = trigger_id + "-" + "-".join(sorted_argument_ids)

            hyper_edge_node = nodes_dict.setdefault(
                hyper_edge_node_id,
                {
                    "id": hyper_edge_node_id,
                    "type": "hyper_edge",
                    "trigger": trigger_id,
                    "arguments": sorted_argument_ids,
                    "mentions": [],
                },
            )
            # TODO: add sentence span
            hyper_edge_node["mentions"].append({"doc_id": doc.id})

            for argument_id in argument_ids:
                links[hyper_edge_node_id][argument_id] = 1

            argument_num_dict[len(argument_ids)] += 1

    pprint(argument_num_dict)

    # flatten the de-duplicated link table into an edge list
    links_as_list = [
        (hyper_edge_node_id, argument_id)
        for hyper_edge_node_id, argument_ids in links.items()
        for argument_id in argument_ids
    ]

    G = nx.Graph()
    G.add_nodes_from(nodes_dict.items())
    G.add_edges_from(links_as_list)

    # remove nodes that do not have links
    G.remove_nodes_from([n for n in G.nodes() if G.degree(n) == 0])

    print(G.number_of_nodes(), G.number_of_edges())

    return G


In [58]:
# Directory containing the brat annotation files and the per-document JSON files.
data_path = (r'brat-1.3p1/data/all-brat/')
# Parse the corpus once; `brat_data` is kept as a module-level name.
brat_data = read_brat_data(data_path)
# Build the entity / hyper-edge graph for the same directory.
event_network = brat_data_to_network(data_path)

defaultdict(<class 'int'>, {1: 2277, 0: 846, 2: 602, 3: 26, 4: 1})
3234 2520


In [5]:
def draw_network(G):
    """Draw ``G`` with a spring layout.

    Nodes whose ``type`` attribute is ``'entity'`` are blue, all others black;
    node size is 100 plus the node's degree. Labels are not drawn.
    """
    node_colors = [
        'blue' if entry[1]['type'] == 'entity' else 'black'
        for entry in G.nodes().data()
    ]
    sizes = [100 + G.degree(n) for n in G.nodes()]
    layout = nx.spring_layout(G, k=0.15)

    plt.figure(1, figsize=(12, 12), dpi=60)
    nx.draw(
        G,
        node_color=node_colors,
        node_size=sizes,
        width=0.5,
        with_labels=False,
        pos=layout,
    )
    plt.show()


In [6]:
def plot_degree_distribution(G, fit_line=True, filter_degree=15):
    """Scatter-plot how many nodes have each degree, on log-log axes.

    Parameters
    ----------
    G : graph exposing ``nodes()`` and ``degree(node)``
    fit_line : bool, default True
        When true, fit a straight line in log-log space to the points whose
        degree is positive and below ``filter_degree``, print its slope and
        intercept, and draw it as a dashed line.
    filter_degree : int, default 15
        Exclusive upper bound on the degrees used for the fit.

    Nothing is plotted for a graph without nodes, and the fit is skipped when
    fewer than two distinct degrees qualify for it.
    """
    degree_sequence = [G.degree(node) for node in G.nodes()]
    if not degree_sequence:
        print("Graph has no nodes; nothing to plot.")
        return

    # one pass tally instead of calling list.count once per distinct degree
    degrees, counts = np.unique(degree_sequence, return_counts=True)

    if fit_line:
        # degree 0 must be excluded: log10(0) is -inf and breaks the fit
        fit_mask = (degrees > 0) & (degrees < filter_degree)
        if np.count_nonzero(fit_mask) >= 2:
            fit_x = degrees[fit_mask]
            fit_y = counts[fit_mask]
            slope, intercept = np.polyfit(np.log10(fit_x), np.log10(fit_y), 1)
            print("slope:", slope, "intercept:", intercept)
            x_vals = np.array([fit_x.min(), fit_x.max()])
            y_vals = 10**(intercept + slope*np.log10(x_vals))
            plt.plot(x_vals, y_vals, '--')
        else:
            print("Fewer than two distinct degrees available for the fit; skipping fit line.")

    plt.scatter(degrees, counts)
    plt.xscale("log")
    plt.yscale("log")
    plt.xlabel('Degree')
    # the y values are raw node counts, not normalised probabilities
    plt.ylabel('Count')
    plt.show()

In [7]:
def get_k_highest_degree_node(G, k=0):
    """Print the node at rank ``k`` when nodes are sorted by descending degree.

    Parameters
    ----------
    G : graph whose ``degree()`` yields ``(node, degree)`` pairs
    k : int, default 0
        Zero-based rank; 0 is the highest-degree node. Negative values index
        from the low-degree end, as with list indexing.

    Raises
    ------
    IndexError
        If ``k`` is outside the range of available nodes (including an empty
        graph).
    """
    degrees = dict(G.degree())
    largest_degrees = sorted(degrees, key=degrees.get, reverse=True)

    node_count = len(largest_degrees)
    if not -node_count <= k < node_count:
        raise IndexError(f"k={k} is out of range for a graph with {node_count} nodes")

    node = largest_degrees[k]
    print(f"The node with the {k} highest degree is {node}, with degree {degrees[node]}")

In [8]:
def plot_degree_list_bar(G, k=50):
    """Print and bar-plot the ``k`` highest-degree nodes, coloured by node type.

    Parameters
    ----------
    G : graph whose nodes carry ``id`` and ``type`` attributes
    k : int, default 50
        Number of top-degree nodes to show.

    Nodes are coloured red for ``'entity'`` and blue for ``'hyper_edge'``; any
    other type falls back to gray instead of raising ``KeyError``.
    """
    # keys must match the node "type" values stored on the graph
    color_map = {'entity': 'r', 'hyper_edge': 'b'}
    fallback_color = 'gray'
    degrees = dict(G.degree())

    # x: the k nodes with the largest degree
    node_list = sorted(degrees, key=degrees.get, reverse=True)[:k]
    for node in node_list:
        node_data = G.nodes[node]
        print(node_data['id'], node_data['type'], degrees[node])
    # y
    degree_list = [degrees[node] for node in node_list]
    # color
    color_list = [
        color_map.get(G.nodes[node]['type'], fallback_color)
        for node in node_list
    ]

    fig, ax = plt.subplots()
    ax.bar(node_list, degree_list, color=color_list, edgecolor='white', linewidth=1)
    # node ids are too long to be readable as tick labels
    ax.xaxis.set_visible(False)
    ax.set_ylabel('Degree')

    # add a legend for the color map
    legend_list = [plt.Rectangle((0, 0), 1, 1, color=color) for color in color_map.values()]
    ax.legend(legend_list, list(color_map.keys()))

    plt.show()

In [9]:
def run_community_detection(G, alg='louvain'):
    """Partition ``G`` into communities.

    Parameters
    ----------
    G : graph passed to the community-detection routine
    alg : str, default 'louvain'
        Algorithm name. Only ``'louvain'`` is supported.

    Returns
    -------
    The result of ``community_louvain.best_partition(G, weight='strength')``.

    Raises
    ------
    ValueError
        If ``alg`` names an unsupported algorithm (previously this returned
        ``None`` silently).
    """
    if alg == 'louvain':
        return community_louvain.best_partition(G, weight='strength')
    raise ValueError(f"Unsupported community detection algorithm: {alg!r}")

In [10]:
def community_layout(g, partition):
    """
    Lay out a graph so that nodes of the same community sit together.

    Arguments:
    ----------
    g -- graph to plot

    partition -- dict mapping node -> community

    Returns:
    --------
    pos -- dict mapping node -> position, the sum of the node's community
        position (laid out with scale=3.) and its position inside the
        community (laid out with scale=1.)

    """
    community_offsets = _position_communities(g, partition, scale=3.)
    local_offsets = _position_nodes(g, partition, scale=1.)

    # final position = community centre + offset within the community
    return {
        node: community_offsets[node] + local_offsets[node]
        for node in g.nodes()
    }

def _position_communities(g, partition, **kwargs):
    """
    Assign every node the layout position of its community.

    Builds a directed graph with one node per community and one weighted edge
    per ordered community pair (weight = number of edges of ``g`` between
    them), lays it out with ``nx.spring_layout(**kwargs)`` and maps each node
    in ``partition`` to its community's position.
    """
    inter_community = _find_between_community_edges(g, partition)

    hypergraph = nx.DiGraph()
    hypergraph.add_nodes_from(set(partition.values()))
    for pair, member_edges in inter_community.items():
        source_community, target_community = pair
        hypergraph.add_edge(source_community, target_community, weight=len(member_edges))

    # one position per community
    community_pos = nx.spring_layout(hypergraph, **kwargs)

    return {node: community_pos[community] for node, community in partition.items()}

def _find_between_community_edges(g, partition):

    edges = dict()

    for (ni, nj) in g.edges():
        ci = partition[ni]
        cj = partition[nj]

        if ci != cj:
            try:
                edges[(ci, cj)] += [(ni, nj)]
            except KeyError:
                edges[(ci, cj)] = [(ni, nj)]

    return edges

def _position_nodes(g, partition, **kwargs):
    """
    Lay out the nodes of each community independently.

    For every community in ``partition``, the induced subgraph of ``g`` is
    positioned with ``nx.spring_layout(**kwargs)``; the per-community results
    are merged into one node -> position dict.
    """
    members_by_community = {}
    for node, community in partition.items():
        members_by_community.setdefault(community, []).append(node)

    pos = {}
    for members in members_by_community.values():
        pos.update(nx.spring_layout(g.subgraph(members), **kwargs))

    return pos

def visualize_community(G, partition):
    """Draw ``G`` with nodes grouped and coloured by community.

    Parameters
    ----------
    G : graph to draw
    partition : dict mapping node -> community, or None
        The partition to visualise. It is now actually used (it was previously
        discarded and recomputed); pass ``None`` to have it computed with
        ``community_louvain.best_partition(G)``.

    Note: to install a networkx 2.0 compatible version of python-louvain use
    ``pip install -U git+https://github.com/taynaud/python-louvain.git@networkx2``.
    """
    if partition is None:
        partition = community_louvain.best_partition(G)

    pos = community_layout(G, partition)

    # nx.draw colours nodes in the graph's node order, so look colours up per
    # node instead of relying on the partition dict's own ordering
    node_colors = [partition[node] for node in G.nodes()]

    plt.figure(1, figsize=(12, 12), dpi=60)
    nx.draw(G, pos, node_color=node_colors, node_size=40)

    plt.show()

In [59]:
# Rebuild the network from scratch so the filtering below starts from the full graph.
data_path = (r'brat-1.3p1/data/all-brat/')
brat_data = read_brat_data(data_path)
event_network = brat_data_to_network(data_path)
# Node and edge counts of the unfiltered network.
print(event_network.number_of_nodes(), event_network.number_of_edges())
# pprint(communities)


defaultdict(<class 'int'>, {1: 2277, 0: 846, 2: 602, 3: 26, 4: 1})
3234 2520
3234 2520


In [70]:
# Drop hyper-edge nodes that are linked to at most one node, then drop every
# node that is left without links.
# NOTE: this mutates `event_network` in place.
# filtered_nodes = [n for n in event_network.nodes() if event_network.nodes[n]['type']=='entity']
# filtered_nodes = [event_network.nodes[n]['type'] for n in event_network.nodes()]


# hyper-edge nodes with degree <= 1
filtered_nodes = [n for n in event_network.nodes() if event_network.nodes[n]['type']=='hyper_edge' and event_network.degree(n) <= 1]
# pprint(filtered_nodes)
# filtered_nodes = [n for n in event_network.nodes() if event_network.degree(n) <= 1]
event_network.remove_nodes_from(filtered_nodes)
# removing hyper-edges can leave nodes isolated; remove those as well
event_network.remove_nodes_from([n for n in event_network.nodes() if event_network.degree(n) == 0])

# node and edge counts after filtering
print(event_network.number_of_nodes(), event_network.number_of_edges())


# draw_network(event_network)
# plot_degree_distribution(event_network)
# plot_degree_list_bar(event_network)
# communities = run_community_detection(event_network)

# visualize_community(event_network, communities)
# get_k_highest_degree_node(event_network, 5)

868 1013


In [71]:
def save_json(data, filepath=r'new_data.json'):
    """Write ``data`` to ``filepath`` as JSON indented by four spaces.

    An existing file at ``filepath`` is overwritten.
    """
    with open(filepath, 'w') as out_file:
        json.dump(data, out_file, indent=4)
# Convert the filtered graph to node-link form so it can be serialised as JSON.
event_network_data = nx.node_link_data(event_network)
# NOTE(review): this prints the entire structure, which produces very large output.
pprint(event_network_data)
save_json(event_network_data, r'event_network_data.json')

{'directed': False,
 'graph': {},
 'links': [{'source': 'C0718617', 'target': 'C1880177-C0042765-C0718617'},
           {'source': 'C0718617', 'target': 'C4296939-C0042765-C0718617'},
           {'source': 'C0718617', 'target': 'C0851285-C0718617-C1430105'},
           {'source': 'C0718617',
            'target': 'C0851285-C0718617-PMC1475658-03-Discussion-03-T18'},
           {'source': 'C3714514', 'target': 'C3245501-C1449385-C3714514'},
           {'source': 'C3714514', 'target': 'C5240713-C0175697-C3714514'},
           {'source': 'C3714514', 'target': 'C5240713-C0034579-C3714514'},
           {'source': 'C3714514', 'target': 'C3245501-C0175697-C3714514'},
           {'source': 'C3714514',
            'target': 'C0035820-C3714514-PMC2358977-03-Discussion-T14'},
           {'source': 'C3714514', 'target': 'C0455242-C0205339-C3714514'},
           {'source': 'C3714514',
            'target': 'PMC2593050-00-TIAB-T38-C0033268-C3714514'},
           {'source': 'C3714514',
            't

            'target': 'PMC2885601-04-DISCUSSION-T83-C1419948-C5419223'},
           {'source': 'C5419223',
            'target': 'PMC2885601-04-DISCUSSION-T83-C0453947-C5419223'},
           {'source': 'C5419223', 'target': 'C5419223-C1419948-C5419223'}],
 'multigraph': False,
 'nodes': [{'id': 'C0718617',
            'mentions': [{'doc_id': 'PMC1475658-00-TIAB',
                          'mention': 'AprA',
                          'span': {'end': 906, 'start': 902}},
                         {'doc_id': 'PMC1475658-00-TIAB',
                          'mention': 'aprA',
                          'span': {'end': 1027, 'start': 1023}},
                         {'doc_id': 'PMC1475658-00-TIAB',
                          'mention': 'aprA',
                          'span': {'end': 1118, 'start': 1114}},
                         {'doc_id': 'PMC1475658-01-Introduction',
                          'mention': 'AprA',
                          'span': {'end': 4756, 'start': 4752}},
              

                          'mention': 'infection',
                          'span': {'end': 625, 'start': 616}},
                         {'doc_id': 'PMC1913099-02-Results-Discussion-14',
                          'mention': 'infection',
                          'span': {'end': 868, 'start': 859}},
                         {'doc_id': 'PMC1974823-01-Introduction',
                          'mention': 'infection',
                          'span': {'end': 371, 'start': 362}},
                         {'doc_id': 'PMC1976333-02-Results-Discussion-04',
                          'mention': 'infection',
                          'span': {'end': 250, 'start': 241}},
                         {'doc_id': 'PMC1976333-02-Results-Discussion-04',
                          'mention': 'infection',
                          'span': {'end': 865, 'start': 856}},
                         {'doc_id': 'PMC1976333-02-Results-Discussion-04',
                          'mention': 'infection',
                   

                          'mention': 'infection',
                          'span': {'end': 1427, 'start': 1418}},
                         {'doc_id': 'PMC2829017-02-Results_and_discussion-03',
                          'mention': 'infection',
                          'span': {'end': 796, 'start': 787}},
                         {'doc_id': 'PMC2829017-02-Results_and_discussion-05-01',
                          'mention': 'infection',
                          'span': {'end': 551, 'start': 542}},
                         {'doc_id': 'PMC2829017-02-Results_and_discussion-05-01',
                          'mention': 'infection',
                          'span': {'end': 897, 'start': 888}},
                         {'doc_id': 'PMC2829017-02-Results_and_discussion-05-01',
                          'mention': 'infection',
                          'span': {'end': 1234, 'start': 1225}},
                         {'doc_id': 'PMC2829017-02-Results_and_discussion-05-01',
                        

                          'mention': 'virulence',
                          'span': {'end': 557, 'start': 548}},
                         {'doc_id': 'PMC2651894-02-Results-03',
                          'mention': 'virulence',
                          'span': {'end': 721, 'start': 712}},
                         {'doc_id': 'PMC2651894-03-Discussion',
                          'mention': 'virulence',
                          'span': {'end': 158, 'start': 149}},
                         {'doc_id': 'PMC2651894-03-Discussion',
                          'mention': 'virulence',
                          'span': {'end': 585, 'start': 576}},
                         {'doc_id': 'PMC2651894-03-Discussion',
                          'mention': 'virulence',
                          'span': {'end': 732, 'start': 723}},
                         {'doc_id': 'PMC2651894-03-Discussion',
                          'mention': 'virulence',
                          'span': {'end': 1931, 'start': 1922}},


                         {'doc_id': 'PMC1475658-02-Results-01',
                          'mention': 'regulation',
                          'span': {'end': 3229, 'start': 3219}},
                         {'doc_id': 'PMC1475658-03-Discussion-03',
                          'mention': 'regulation',
                          'span': {'end': 644, 'start': 634}},
                         {'doc_id': 'PMC1475658-03-Discussion-03',
                          'mention': 'regulation',
                          'span': {'end': 880, 'start': 870}},
                         {'doc_id': 'PMC1804205-03-Discussion',
                          'mention': 'regulation',
                          'span': {'end': 2694, 'start': 2684}},
                         {'doc_id': 'PMC1874608-00-TIAB',
                          'mention': 'regulation',
                          'span': {'end': 14, 'start': 4}},
                         {'doc_id': 'PMC1874608-03-RESULTS-06',
                          'mention': 'regulat

                          'span': {'end': 5270, 'start': 5261}},
                         {'doc_id': 'PMC1874608-04-DISCUSSION',
                          'mention': 'increased',
                          'span': {'end': 5654, 'start': 5645}},
                         {'doc_id': 'PMC1913099-02-Results-Discussion-05',
                          'mention': 'Increased',
                          'span': {'end': 38, 'start': 29}},
                         {'doc_id': 'PMC2242835-00-TIAB',
                          'mention': 'increased',
                          'span': {'end': 1054, 'start': 1045}},
                         {'doc_id': 'PMC2242835-02-Results-04',
                          'mention': 'increased',
                          'span': {'end': 299, 'start': 290}},
                         {'doc_id': 'PMC2430206-02-Results-01',
                          'mention': 'increased',
                          'span': {'end': 1361, 'start': 1352}},
                         {'doc_id': 'PMC2

                         {'doc_id': 'PMC1804205-02-Results-05',
                          'mention': 'PmrA',
                          'span': {'end': 135, 'start': 131}},
                         {'doc_id': 'PMC1804205-02-Results-05',
                          'mention': 'PmrA',
                          'span': {'end': 200, 'start': 196}},
                         {'doc_id': 'PMC1804205-02-Results-05',
                          'mention': 'PmrA',
                          'span': {'end': 556, 'start': 552}},
                         {'doc_id': 'PMC1804205-02-Results-05',
                          'mention': 'pmrA',
                          'span': {'end': 605, 'start': 601}},
                         {'doc_id': 'PMC1804205-03-Discussion',
                          'mention': 'PmrA',
                          'span': {'end': 105, 'start': 101}},
                         {'doc_id': 'PMC1804205-03-Discussion',
                          'mention': 'PmrA',
                          'span

                         {'doc_id': 'PMC2639726-02-Results-01',
                          'mention': 'phoP',
                          'span': {'end': 3544, 'start': 3540}},
                         {'doc_id': 'PMC2639726-02-Results-01',
                          'mention': 'phoP',
                          'span': {'end': 4145, 'start': 4141}},
                         {'doc_id': 'PMC2639726-02-Results-04',
                          'mention': 'phoP',
                          'span': {'end': 1717, 'start': 1713}},
                         {'doc_id': 'PMC2639726-02-Results-07',
                          'mention': 'PhoP',
                          'span': {'end': 393, 'start': 389}},
                         {'doc_id': 'PMC2639726-02-Results-08',
                          'mention': 'PhoP',
                          'span': {'end': 416, 'start': 412}},
                         {'doc_id': 'PMC2639726-02-Results-08',
                          'mention': 'phoP',
                         

                         {'doc_id': 'PMC2639726-02-Results-08',
                          'mention': 'slyA',
                          'span': {'end': 1836, 'start': 1832}},
                         {'doc_id': 'PMC2639726-03-Discussion-01',
                          'mention': 'slyA',
                          'span': {'end': 843, 'start': 839}},
                         {'doc_id': 'PMC2639726-03-Discussion-01',
                          'mention': 'slyA',
                          'span': {'end': 4951, 'start': 4947}},
                         {'doc_id': 'PMC2639726-03-Discussion-01',
                          'mention': 'slyA',
                          'span': {'end': 5451, 'start': 5447}},
                         {'doc_id': 'PMC2639726-03-Discussion-03',
                          'mention': 'SlyA',
                          'span': {'end': 4, 'start': 0}},
                         {'doc_id': 'PMC2639726-03-Discussion-03',
                          'mention': 'SlyA',
              

                          'span': {'end': 2092, 'start': 2088}},
                         {'doc_id': 'PMC1874608-04-DISCUSSION',
                          'mention': 'InvF',
                          'span': {'end': 389, 'start': 385}},
                         {'doc_id': 'PMC1874608-04-DISCUSSION',
                          'mention': 'InvF',
                          'span': {'end': 724, 'start': 720}},
                         {'doc_id': 'PMC1874608-04-DISCUSSION',
                          'mention': 'invF',
                          'span': {'end': 882, 'start': 878}},
                         {'doc_id': 'PMC1874608-04-DISCUSSION',
                          'mention': 'invF',
                          'span': {'end': 1805, 'start': 1801}},
                         {'doc_id': 'PMC1874608-04-DISCUSSION',
                          'mention': 'invF',
                          'span': {'end': 2968, 'start': 2964}},
                         {'doc_id': 'PMC2266911-02-Results_and_Discussi

            'mentions': [{'doc_id': 'PMC1913099-02-Results-Discussion-06',
                          'mention': 'ATP',
                          'span': {'end': 706, 'start': 703}},
                         {'doc_id': 'PMC2682197-00-TIAB',
                          'mention': 'ATP',
                          'span': {'end': 92, 'start': 89}},
                         {'doc_id': 'PMC2682197-00-TIAB',
                          'mention': 'ATP',
                          'span': {'end': 1276, 'start': 1273}},
                         {'doc_id': 'PMC2682197-00-TIAB',
                          'mention': 'ATP',
                          'span': {'end': 1353, 'start': 1350}},
                         {'doc_id': 'PMC2682197-00-TIAB',
                          'mention': 'ATP',
                          'span': {'end': 1460, 'start': 1457}},
                         {'doc_id': 'PMC2682197-00-TIAB',
                          'mention': 'ATP',
                          'span': {'end': 1636, 'sta

                         {'doc_id': 'PMC1974823-04-Discussion',
                          'mention': 'mfa1',
                          'span': {'end': 1227, 'start': 1223}},
                         {'doc_id': 'PMC1974823-04-Discussion',
                          'mention': 'mfa1',
                          'span': {'end': 1288, 'start': 1284}},
                         {'doc_id': 'PMC1974823-04-Discussion',
                          'mention': 'mfa1',
                          'span': {'end': 1695, 'start': 1691}},
                         {'doc_id': 'PMC1974823-04-Discussion',
                          'mention': 'mfa1',
                          'span': {'end': 2706, 'start': 2702}},
                         {'doc_id': 'PMC1974823-04-Discussion',
                          'mention': 'mfa1',
                          'span': {'end': 2993, 'start': 2989}},
                         {'doc_id': 'PMC1974823-04-Discussion',
                          'mention': 'mfa1',
                     

            'id': 'C1159366-C1429451-PMC2242835-03-Discussion-T56',
            'mentions': [{'doc_id': 'PMC2242835-03-Discussion'}],
            'trigger': 'C1159366',
            'type': 'hyper_edge'},
           {'arguments': ['C1429451', 'PMC2242835-03-Discussion-T57'],
            'id': 'C1159366-C1429451-PMC2242835-03-Discussion-T57',
            'mentions': [{'doc_id': 'PMC2242835-03-Discussion'}],
            'trigger': 'C1159366',
            'type': 'hyper_edge'},
           {'arguments': ['C0042765', 'PMC2242835-03-Discussion-T28'],
            'id': 'C0726639-C0042765-PMC2242835-03-Discussion-T28',
            'mentions': [{'doc_id': 'PMC2242835-03-Discussion'}],
            'trigger': 'C0726639',
            'type': 'hyper_edge'},
           {'arguments': ['C1327616', 'PMC2242835-03-Discussion-T76'],
            'id': 'C0205224-C1327616-PMC2242835-03-Discussion-T76',
            'mentions': [{'doc_id': 'PMC2242835-03-Discussion'},
                         {'doc_id': 'PMC22

                          'mention': 'degrading',
                          'span': {'end': 506, 'start': 497}}],
            'type': 'entity'},
           {'arguments': ['C0962697', 'C1550641'],
            'id': 'C0962697-C0962697-C1550641',
            'mentions': [{'doc_id': 'PMC2565068-02-Results-05'}],
            'trigger': 'C0962697',
            'type': 'hyper_edge'},
           {'arguments': ['C0026914', 'C0962697'],
            'id': 'C0962697-C0026914-C0962697',
            'mentions': [{'doc_id': 'PMC2565068-02-Results-05'}],
            'trigger': 'C0962697',
            'type': 'hyper_edge'},
           {'id': 'C1827066',
            'mentions': [{'doc_id': 'PMC2565068-02-Results-07',
                          'mention': 'enhanced',
                          'span': {'end': 128, 'start': 120}},
                         {'doc_id': 'PMC2565068-02-Results-07',
                          'mention': 'enhanced',
                          'span': {'end': 616, 'start': 608}},
   

                          'span': {'end': 323, 'start': 319}},
                         {'doc_id': 'PMC2816692-02-Results-01',
                          'mention': 'SsrB',
                          'span': {'end': 945, 'start': 941}},
                         {'doc_id': 'PMC2816692-02-Results-01',
                          'mention': 'SsrB',
                          'span': {'end': 988, 'start': 984}},
                         {'doc_id': 'PMC2816692-02-Results-01',
                          'mention': 'SsrB',
                          'span': {'end': 1139, 'start': 1135}},
                         {'doc_id': 'PMC2816692-02-Results-01',
                          'mention': 'SsrB',
                          'span': {'end': 1324, 'start': 1320}},
                         {'doc_id': 'PMC2816692-02-Results-01',
                          'mention': 'ssrB',
                          'span': {'end': 1482, 'start': 1478}},
                         {'doc_id': 'PMC2816692-02-Results-01',
       

                          'span': {'end': 3523, 'start': 3519}},
                         {'doc_id': 'PMC2837388-03-Discussion',
                          'mention': 'sseA',
                          'span': {'end': 4637, 'start': 4633}}],
            'type': 'entity'},
           {'arguments': ['C0768768', 'C5226623'],
            'id': 'C1879547-C0768768-C5226623',
            'mentions': [{'doc_id': 'PMC2639726-02-Results-08'}],
            'trigger': 'C1879547',
            'type': 'hyper_edge'},
           {'arguments': ['C1158770', 'C1429451'],
            'id': 'C1879547-C1158770-C1429451',
            'mentions': [{'doc_id': 'PMC2639726-02-Results-08'}],
            'trigger': 'C1879547',
            'type': 'hyper_edge'},
           {'arguments': ['C1437584', 'C1741620'],
            'id': 'C1553423-C1437584-C1741620',
            'mentions': [{'doc_id': 'PMC2639726-03-Discussion-02'}],
            'trigger': 'C1553423',
            'type': 'hyper_edge'},
           {'argument

            'mentions': [{'doc_id': 'PMC2652828-02-Results_and_Discussion-07',
                          'mention': 'MdoH',
                          'span': {'end': 1150, 'start': 1146}}],
            'type': 'entity'},
           {'arguments': ['C0255025', 'C1137089'],
            'id': 'PMC2652828-02-Results_and_Discussion-07-T42-C0255025-C1137089',
            'mentions': [{'doc_id': 'PMC2652828-02-Results_and_Discussion-07'}],
            'trigger': 'PMC2652828-02-Results_and_Discussion-07-T42',
            'type': 'hyper_edge'},
           {'arguments': ['C0027303', 'C3252346'],
            'id': 'C0851827-C0027303-C3252346',
            'mentions': [{'doc_id': 'PMC2652828-02-Results_and_Discussion-07'}],
            'trigger': 'C0851827',
            'type': 'hyper_edge'},
           {'arguments': ['C0027303', 'C2933417'],
            'id': 'C0851827-C0027303-C2933417',
            'mentions': [{'doc_id': 'PMC2652828-02-Results_and_Discussion-07'}],
            'trigger': 'C0851

                          'span': {'end': 95, 'start': 91}},
                         {'doc_id': 'PMC2816692-03-Discussion-03',
                          'mention': 'srcA',
                          'span': {'end': 344, 'start': 340}},
                         {'doc_id': 'PMC2816692-03-Discussion-03',
                          'mention': 'srcA',
                          'span': {'end': 529, 'start': 525}},
                         {'doc_id': 'PMC2816692-03-Discussion-03',
                          'mention': 'srcA',
                          'span': {'end': 891, 'start': 887}},
                         {'doc_id': 'PMC2816692-03-Discussion-03',
                          'mention': 'srcA',
                          'span': {'end': 1093, 'start': 1089}},
                         {'doc_id': 'PMC2816692-03-Discussion-03',
                          'mention': 'SrcA',
                          'span': {'end': 1704, 'start': 1700}},
                         {'doc_id': 'PMC2816692-03-Discussio

                          'mention': 'BvrR',
                          'span': {'end': 3526, 'start': 3522}},
                         {'doc_id': 'PMC2858072-02-Results_and_Discussion-02',
                          'mention': 'bvrR',
                          'span': {'end': 3697, 'start': 3693}},
                         {'doc_id': 'PMC2858072-02-Results_and_Discussion-02',
                          'mention': 'bvrR',
                          'span': {'end': 3855, 'start': 3851}},
                         {'doc_id': 'PMC2858072-02-Results_and_Discussion-02',
                          'mention': 'bvrR',
                          'span': {'end': 3965, 'start': 3961}},
                         {'doc_id': 'PMC2858072-02-Results_and_Discussion-02',
                          'mention': 'BvrR',
                          'span': {'end': 4809, 'start': 4805}},
                         {'doc_id': 'PMC2858072-02-Results_and_Discussion-02',
                          'mention': 'bvrR',
          

            'type': 'hyper_edge'},
           {'arguments': ['C0205242', 'C0453947'],
            'id': 'C0205242-C0205242-C0453947',
            'mentions': [{'doc_id': 'PMC2885601-03-RESULTS-03'},
                         {'doc_id': 'PMC2885601-03-RESULTS-03'},
                         {'doc_id': 'PMC2885601-03-RESULTS-03'}],
            'trigger': 'C0205242',
            'type': 'hyper_edge'},
           {'arguments': ['C1302275', 'C1419948'],
            'id': 'C1302275-C1302275-C1419948',
            'mentions': [{'doc_id': 'PMC2885601-03-RESULTS-03'},
                         {'doc_id': 'PMC2885601-03-RESULTS-04'},
                         {'doc_id': 'PMC2885601-04-DISCUSSION'}],
            'trigger': 'C1302275',
            'type': 'hyper_edge'},
           {'id': 'C1621243',
            'mentions': [{'doc_id': 'PMC2885601-04-DISCUSSION',
                          'mention': 'GAS Mac',
                          'span': {'end': 467, 'start': 460}}],
            'type': 'entity'}

In [75]:
def save_hyper_edges(G, filepath=r'hyper_edges.txt', index_filepath=r'node_to_index.json'):
    """Write the hyper-edges of ``G`` as lines of comma-separated entity indices.

    Entity nodes are numbered from 1 in the order ``G.nodes()`` yields them and
    the node -> index mapping is saved as JSON. Each hyper-edge node then
    produces one line listing the indices of its entity neighbours (an empty
    line if it has none).

    Parameters
    ----------
    G : graph whose nodes carry a ``type`` attribute of ``'entity'`` or
        ``'hyper_edge'``. It is now the graph actually read; the module-level
        ``event_network`` is no longer consulted.
    filepath : str, default 'hyper_edges.txt'
        Destination of the hyper-edge list (this argument was previously
        ignored).
    index_filepath : str, default 'node_to_index.json'
        Destination of the node -> index mapping.

    Side effects: prints the size of every hyper-edge with more than two
    entity members.
    """
    hyper_edges = [n for n in G.nodes() if G.nodes[n]['type'] == 'hyper_edge']
    entities = [n for n in G.nodes() if G.nodes[n]['type'] == 'entity']
    # assuming no event-event connection
    node_to_index = {node: i+1 for i, node in enumerate(entities)}

    save_json(node_to_index, index_filepath)
    # the with-block closes the file; no explicit close() is needed
    with open(filepath, 'w', encoding='utf-8') as f:
        for hyper_edge in hyper_edges:
            # assuming no event-event connection
            edge_nodes = [
                str(node_to_index[n])
                for n in G.neighbors(hyper_edge)
                if G.nodes[n]['type'] == 'entity'
            ]

            if len(edge_nodes) > 2:
                print(len(edge_nodes))
            f.write(','.join(edge_nodes))
            f.write('\n')
        

In [76]:
# Export the hyper-edges of the filtered network; the printed numbers are the sizes of hyper-edges with more than two entity members.
save_hyper_edges(event_network)

3
3
3
3
3
3
3
3
3
3
3
3
3
4
3
3
3
3
3
3
3
3
