In [124]:
import json
import jsonlines
import networkx as nx
import hypernetx as hnx
import numpy as np
import matplotlib.pyplot as plt
from refined.inference.processor import Refined


  from .autonotebook import tqdm as notebook_tqdm


In [125]:
refined = Refined.from_pretrained(model_name='wikipedia_model_with_numbers',
                                  entity_set="wikipedia")

Downloading /Users/samytlee/.cache/refined/wikipedia_model_with_numbers/model.pt: 100%|██████████| 724M/724M [02:16<00:00, 5.30MB/s] 
Downloading /Users/samytlee/.cache/refined/wikipedia_model_with_numbers/config.json: 100%|██████████| 658/658 [00:00<00:00, 2.54kB/s]
Downloading /Users/samytlee/.cache/refined/wikipedia_model_with_numbers/precomputed_entity_descriptions_emb_wikipedia_6269457-300.np: 100%|██████████| 3.76G/3.76G [09:58<00:00, 6.29MB/s] 
Downloading /Users/samytlee/.cache/refined/roberta-base/pytorch_model.bin: 100%|██████████| 501M/501M [01:04<00:00, 7.76MB/s] 
Downloading /Users/samytlee/.cache/refined/roberta-base/config.json: 100%|██████████| 481/481 [00:00<00:00, 2.33kB/s]
Downloading /Users/samytlee/.cache/refined/roberta-base/vocab.json: 100%|██████████| 899k/899k [00:01<00:00, 603kB/s]
Downloading /Users/samytlee/.cache/refined/roberta-base/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 612kB/s]
Downloading /Users/samytlee/.cache/refined/wikipedia_data/pem.l

In [29]:
def save_json(data, filepath=r'new_data.json'):
    with open(filepath, 'w') as fp:
        json.dump(data, fp, indent=4)

In [192]:
dev_reader = jsonlines.open(r'data/raw/RAMS/dev.jsonlines')
test_reader = jsonlines.open(r'data/raw/RAMS/test.jsonlines')
train_reader = jsonlines.open(r'data/raw/RAMS/train.jsonlines')


In [205]:
def merge_sentences(datum):
    sentence_list = [" ".join(sentence_word_list) for sentence_word_list in datum['sentences']] # merge the words into sentences
    paragraph = " ".join(sentence_list)
    return paragraph

def merge_events(datum):
    words_flattened = [word for sentence in datum['sentences'] for word in sentence]
    triggers = datum['evt_triggers']
    trigger_type_dict = {}
    for trigger_datum in triggers:
        trigger_span = trigger_datum[:2]
        trigger_word = " ".join(words_flattened[trigger_span[0]:trigger_span[1]+1])
        trigger_type = trigger_datum[2][0][0]
        trigger_type_dict[trigger_word] = trigger_type
    links = datum['gold_evt_links']
    events = {}
    for link in links:
        trigger_span = link[0] # a list of [start, end]
        trigger_word = " ".join(words_flattened[trigger_span[0]:trigger_span[1]+1]) # a string
        argument_span = link[1] # a list of [start, end]
        argument_word = " ".join(words_flattened[argument_span[0]:argument_span[1]+1]) # a string
        argument_role = link[2] # a string
        trigger_type = trigger_type_dict[trigger_word]
        if trigger_word not in events.keys():
            events[trigger_word] = {
                "trigger": trigger_word,
                "trigger_span": trigger_span,
                "trigger_type": trigger_type,
                "arguments": [
                    {
                        "argument_id": argument_word,
                        "argument_word": argument_word,
                        "argument_role": argument_role,
                        "argument_span": argument_span
                    }
                ],
            }
        else:
            events[trigger_word]['arguments'].append({
                "argument_id": argument_word,
                "argument_word": argument_word,
                "argument_role": argument_role,
                "argument_span": argument_span
            })
    return list(events.values())

def link_entities(events, paragraph):
    spans = refined.process_text(paragraph)
    for span in spans:
        entity_word = span.text
        for event in events:
            for argument in event['arguments']:
                if argument['argument_word'] == entity_word:
                    if span.predicted_entity != None and span.predicted_entity.wikidata_entity_id != None:
                        entity_id = span.predicted_entity.wikidata_entity_id
                        entity_title = span.predicted_entity.wikipedia_entity_title
                        argument['argument_id'] = entity_id
                        argument['entity_title'] = entity_title
                    argument['entity_type'] = span.coarse_mention_type
    return events

def transform_dataset(dataset):
    transformed_dataset = {}
    for index, datum in enumerate(dataset):
        print("{}/{}".format(index, len(dataset)))
        paragraph = merge_sentences(datum)
        events = merge_events(datum)
        if events == []: continue
        events = link_entities(events, paragraph)
        doc_key = datum['doc_key']
        source_url = datum['source_url']
        if doc_key not in transformed_dataset.keys():
            transformed_dataset[doc_key] = {
                "doc_id": doc_key,
                "source_url": source_url,
                "events": []
            }
        transformed_dataset[doc_key]['events'] += events
    return list(transformed_dataset.values())

In [206]:
dev_reader = jsonlines.open(r'data/raw/RAMS/dev.jsonlines')
for datum in dev_reader:
    doc_id = datum['doc_key']
    if doc_id == "nw_RC0da9ca01673da1e2a47f6ccf9d239cbde98f30122f50c5ced8fa4743":
        paragraph = merge_sentences(datum)
        events = merge_events(datum)
        events = link_entities(events, paragraph)
        print(events)
        break




[{'trigger': 'confiscated', 'trigger_span': [58, 58], 'trigger_type': 'transaction.transaction.n/a', 'arguments': [{'argument_id': 'Tahir Javed', 'argument_word': 'Tahir Javed', 'argument_role': 'evt130arg01participant', 'argument_span': [48, 49], 'entity_type': 'PERSON'}, {'argument_id': 'Q173', 'argument_word': 'Alabama', 'argument_role': 'evt130arg04place', 'argument_span': [44, 44], 'entity_title': 'Alabama', 'entity_type': 'GPE'}]}]


In [207]:
# dataset = [datum for datum in dev_reader] + [datum for datum in test_reader] + [datum for datum in train_reader]
dataset = [datum for datum in dev_reader]
transformed_dataset = transform_dataset(dataset)
save_json(transformed_dataset, r'data/result/RAMS/events.json')

0/878
1/878
2/878
3/878
4/878
5/878
6/878
7/878
8/878
9/878
10/878
11/878
12/878
13/878
14/878
15/878
16/878
17/878
18/878
19/878
20/878
21/878
22/878
23/878
24/878
25/878
26/878
27/878
28/878
29/878
30/878
31/878
32/878
33/878
34/878
35/878
36/878
37/878
38/878
39/878
40/878
41/878
42/878
43/878
44/878
45/878
46/878
47/878
48/878
49/878
50/878
51/878
52/878
53/878
54/878
55/878
56/878
57/878
58/878
59/878
60/878
61/878
62/878
63/878
64/878
65/878
66/878
67/878
68/878
69/878
70/878
71/878
72/878
73/878
74/878
75/878
76/878
77/878
78/878
79/878
80/878
81/878
82/878
83/878
84/878
85/878
86/878
87/878
88/878
89/878
90/878
91/878
92/878
93/878
94/878
95/878
96/878
97/878
98/878
99/878
100/878
101/878
102/878
103/878
104/878
105/878
106/878
107/878
108/878
109/878
110/878
111/878
112/878
113/878
114/878
115/878
116/878
117/878
118/878
119/878
120/878
121/878
122/878
123/878
124/878
125/878
126/878
127/878
128/878
129/878
130/878
131/878
132/878
133/878
134/878
135/878
136/878
137/878
138/87

In [226]:
def disambiguate(docs):
    nodes_dict = {}
    hyper_edges_dict = {}
    links = []
    for doc in docs:
        doc_id = doc['doc_id']
        doc_url = doc['source_url']
        for event in doc['events']:
            arguments = event['arguments']
            # create an entity node for each argument
            for argument in arguments:
                argument_id = argument['argument_id']
                argument_word = argument['argument_word']
                argument_title = argument['entity_title'] if 'entity_title' in argument else argument_word
                argument_entity_type = argument['entity_type'] if 'entity_type' in argument else "None"
                argument_span = argument['argument_span']
                argument_role = argument['argument_role']
                if argument_id not in nodes_dict.keys():
                    nodes_dict[argument_id] = {
                        "id": argument_id, 
                        "title": argument_title,
                        "entity_type": argument_entity_type,
                        "type": "entity",
                        "argument_role": argument_role,
                        "mentions": [
                            {
                                "doc_id": doc_id,
                                "mention": argument_word,
                                "span": {'start': argument_span[0], 'end': argument_span[1]}
                            }
                        ]
                    }
                else:
                    nodes_dict[argument_id]['mentions'].append(
                        {
                            "doc_id": doc_id,
                            "mention": argument_word,
                            "span": {'start': argument_span[0], 'end': argument_span[1]}
                        }
                    )
            argument_ids = [argument['argument_id'] for argument in arguments]
            if any([argument_id == None for argument_id in argument_ids]):
                print(doc_id, argument_ids)
            sorted_argument_ids = sorted(argument_ids)
            # create hyperedge 
            trigger_id = event['trigger'] # TODO: add disambiguation
            trigger_type = event['trigger_type']
            hyper_edge_id = trigger_id  + "-" + "-".join(sorted_argument_ids)
            if hyper_edge_id not in hyper_edges_dict.keys():
                hyper_edges_dict[hyper_edge_id] = {
                    'id': hyper_edge_id,
                    'type': "hyper_edge",
                    "trigger": trigger_id,
                    "trigger_type": trigger_type,
                    "arguments": sorted_argument_ids,
                    "mentions": [
                        {
                            "doc_id": doc_id,
                        }
                    ]
                }
            else:
                hyper_edges_dict[hyper_edge_id]['mentions'].append(
                    {
                        "doc_id": doc_id
                    }
                )
            for argument_id in argument_ids:
                links.append((hyper_edge_id, argument_id))
    return nodes_dict, hyper_edges_dict, links

def merge_RAMS(dataset):
    nodes_dict, hyper_edges_dict, links = disambiguate(dataset)
    B = nx.Graph()
    B.add_nodes_from(list(hyper_edges_dict.keys()), bipartite=0)
    B.add_nodes_from(list(nodes_dict.keys()), bipartite=1)
    B.add_edges_from(links)
    return hnx.Hypergraph.from_bipartite(B), nodes_dict, hyper_edges_dict, links


In [227]:
H, nodes_dict, hyper_edges_dict, links = merge_RAMS(transformed_dataset)
list(H.shape)

[1239, 834]

In [117]:
def plot_degree_distribution(HG, fit_line=False):
    degree_sequence = [HG.degree(node) for node in HG.nodes]
    degree_counts = [(degree, degree_sequence.count(degree)) for degree in set(degree_sequence)]
    x, y = zip(*degree_counts)
        
    # fit line
    if fit_line:
        filter_degree = 15
        filtered_degree_sequence = list(filter(lambda degree: degree < filter_degree, degree_sequence))
        filtered_degree_counts = [(degree, degree_sequence.count(degree)) for degree in set(filtered_degree_sequence)]
        filtered_x, filtered_y = zip(*filtered_degree_counts)
        log_x = np.log10(filtered_x)
        log_y = np.log10(filtered_y)
        slope, intercept = np.polyfit(log_x, log_y, 1)
        print("slope:", slope, "intercept:", intercept)
        x_vals = np.array([min(filtered_x), max(filtered_x)])
        y_vals = 10**(intercept + slope*np.log10(x_vals))
        plt.plot(x_vals, y_vals, '--')
    
        
    plt.scatter(x, y)
    # plt.xscale("log")
    # plt.yscale("log")
    plt.xlabel('Degree')
    plt.ylabel('Probability')
    plt.show()

In [120]:
print(list(H.nodes))
removed_nodes = [node for node in H.nodes if H.degree(node) == 1]
SH = H.remove_nodes(removed_nodes)
print(SH.shape)

['food or clothes', 'World War II', 'one', 'a coffin', 'these ties', 'nuclear weapons', 'Sunni groups', 'thousands of criminal aliens', '$ 1.7 billion', 'demolition', 'real estate empire', 'government agencies', 'Iraqis', 'Little Rock hotel room', 'wooden shack', 'Ottoman Turks', 'Syrian opposition', 'viewers and listeners', 'Trump', 'the rebels', 'you', 'CrowdStrike', 'Mrs Clinton', 'nominee', 'Italy and Germany', 'buildings', 'U.S. warplanes', 'Christians', 'Levitan', 'Obama DHS chief', 'Reuters', '25,000 Chinese', 'Al Qaeda', 'Mrs Clinton or her aides', 'commercial activities', 'countries', 'the public', 'demolition contractor', 'New Democrats', 'loans', "the ' babies '", 'outside groups', 'Ted', '@WIStateFair', 'political parties', 'Two undercover videos', 'Bill Clinton', 'Kenya', 'an illegal', 'donations', '$ 2,700 to his campaign and $ 25,000', 'Political Vindication Radio', 'secret ledgers', 'Jeh Johnson', 'Saudi Arabia', 'Cornel West', 'mother', 'America', 'Mrs. Clinton', 'Shaa

In [92]:
def transform_frontend(nodes, links, nodes_dict, hyper_edges_dict):
    res_nodes = []
    res_links = []
    for node in nodes:
        if node in nodes_dict:
            res_nodes.append(nodes_dict[node])
        else:
            res_nodes.append(hyper_edges_dict[node])
    for link in links:
        source = link[0]
        target = link[1]
        res_links.append({
            "source": source,
            "target": target,
        })
    print(len(res_nodes))
    return {
        "nodes": res_nodes, 
        "links": res_links
    }

In [228]:
BH = H.bipartite()
BSH = SH.bipartite()
network = transform_frontend(list(BH.nodes), list(BH.edges), nodes_dict, hyper_edges_dict)
save_json(network, 'data/result/RAMS/dev_subgraph.json')

2073


: 