In [24]:
from collections import defaultdict
import json
import pickle
from pprint import pprint

import amrlib
from amrlib.graph_processing.annotator import add_lemmas
from amrlib.alignments.rbw_aligner import RBWAligner
import penman
from penman.surface import Alignment
from transformers import AutoTokenizer

In [7]:
# Location of the pickled AMR parses; change it here to load a different copy.
AMR_PICKLE_PATH = "/projects/flow_graphs/data/risec/amr.pkl"

# SECURITY: pickle.load can run arbitrary code while deserialising, so this
# file must come from a trusted source.
# The cells below index the result as graphs[i][j]["graph"] / graphs[i][j]["tokens"].
with open(AMR_PICKLE_PATH, "rb") as f:
    graphs = pickle.load(f)

In [30]:
# Use the first sentence record of the first instance as the running example.
first_sentence = graphs[0][0]
graph = first_sentence["graph"]
tokens = first_sentence["tokens"]

In [16]:
def viz(graph):
    """Print ``graph`` in the text form returned by ``penman.encode``."""
    encoded = penman.encode(graph)
    print(encoded)

In [17]:
# Show the example graph, including its metadata header lines.
viz(graph)

# ::snt 1) In a saucepan over low heat, stir together the half-and-half and sugar.
# ::tokens ["1", ")", "In", "a", "saucepan", "over", "low", "heat", ",", "stir", "together", "the", "half", "-", "and", "-", "half", "and", "sugar", "."]
# ::lemmas ["1", ")", "in", "a", "saucepan", "over", "low", "heat", ",", "stir", "together", "the", "half", "-", "and", "-", "half", "and", "sugar", "."]
# ::alignments 0-1.1 4-1.5 6-1.5.1.1 7-1.5.1 9-1 10-1.4 12-1.3.1 14-1.3 18-1.3.2
(z1 / stir-01~e.9
    :li 1~e.0
    :ARG0 (z2 / you)
    :ARG1 (z3 / and~e.14
              :op1 (z4 / half-and-half~e.12)
              :op2 (z5 / sugar~e.18))
    :mod (z6 / together~e.10)
    :location (z7 / saucepan~e.4
                  :location-of (z8 / heat~e.7
                                   :ARG1-of (z9 / low-04~e.6))))


In [18]:
# Serialise the graph with penman.encode and hand the string to amrlib's add_lemmas.
# NOTE(review): snt_key="snt" presumably names the metadata field that holds the
# sentence, and the call presumably (re)writes the ::tokens / ::lemmas metadata —
# confirm against the amrlib annotator documentation.
lemma_graph = add_lemmas(penman.encode(graph), snt_key="snt")

In [19]:
# Show the graph returned by add_lemmas.
viz(lemma_graph)

# ::snt 1) In a saucepan over low heat, stir together the half-and-half and sugar.
# ::tokens ["1", ")", "In", "a", "saucepan", "over", "low", "heat", ",", "stir", "together", "the", "half", "-", "and", "-", "half", "and", "sugar", "."]
# ::lemmas ["1", ")", "in", "a", "saucepan", "over", "low", "heat", ",", "stir", "together", "the", "half", "-", "and", "-", "half", "and", "sugar", "."]
# ::alignments 0-1.1 4-1.5 6-1.5.1.1 7-1.5.1 9-1 10-1.4 12-1.3.1 14-1.3 18-1.3.2
(z1 / stir-01~e.9
    :li 1~e.0
    :ARG0 (z2 / you)
    :ARG1 (z3 / and~e.14
              :op1 (z4 / half-and-half~e.12)
              :op2 (z5 / sugar~e.18))
    :mod (z6 / together~e.10)
    :location (z7 / saucepan~e.4
                  :location-of (z8 / heat~e.7
                                   :ARG1-of (z9 / low-04~e.6))))


In [20]:
# Align graph nodes to sentence tokens with amrlib's RBWAligner, then keep both
# the string form and the penman graph object of the result.
# NOTE(review): the printed result carries every alignment marker twice
# (e.g. `stir-01~e.9~e.9`) and the input graph already had `~e.N` markers —
# presumably the aligner appended to existing alignments; confirm whether this
# step is needed for graphs that are already aligned.
aligner = RBWAligner.from_penman_w_json(lemma_graph)    # use this with an annotated penman graph object
graph_string  = aligner.get_graph_string()               # get the aligned graph string
aligned_graph = aligner.get_penman_graph()
print(graph_string)

# ::snt 1) In a saucepan over low heat, stir together the half-and-half and sugar.
# ::tokens ["1", ")", "In", "a", "saucepan", "over", "low", "heat", ",", "stir", "together", "the", "half", "-", "and", "-", "half", "and", "sugar", "."]
# ::lemmas ["1", ")", "in", "a", "saucepan", "over", "low", "heat", ",", "stir", "together", "the", "half", "-", "and", "-", "half", "and", "sugar", "."]
# ::alignments 0-1.1 4-1.5 6-1.5.1.1 7-1.5.1 9-1 10-1.4 12-1.3.1 14-1.3 18-1.3.2
(z1 / stir-01~e.9~e.9
      :li 1~e.0~e.0
      :ARG0 (z2 / you)
      :ARG1 (z3 / and~e.14~e.14
            :op1 (z4 / half-and-half~e.12~e.12)
            :op2 (z5 / sugar~e.18~e.18))
      :mod (z6 / together~e.10~e.10)
      :location (z7 / saucepan~e.4~e.4
            :location-of (z8 / heat~e.7~e.7
                  :ARG1-of (z9 / low-04~e.6~e.6))))


In [21]:
# Inspect the alignments: a dict from (source, role, target) triple to an
# Alignment object holding the aligned token indices.
penman.surface.alignments(aligned_graph)

{('z1', ':instance', 'stir-01'): Alignment((9,), prefix='e.'),
 ('z1', ':li', '1'): Alignment((0,), prefix='e.'),
 ('z3', ':instance', 'and'): Alignment((14,), prefix='e.'),
 ('z4', ':instance', 'half-and-half'): Alignment((12,), prefix='e.'),
 ('z5', ':instance', 'sugar'): Alignment((18,), prefix='e.'),
 ('z6', ':instance', 'together'): Alignment((10,), prefix='e.'),
 ('z7', ':instance', 'saucepan'): Alignment((4,), prefix='e.'),
 ('z8', ':instance', 'heat'): Alignment((7,), prefix='e.'),
 ('z9', ':instance', 'low-04'): Alignment((6,), prefix='e.')}

In [23]:
# Inspect the graph as a flat list of (source, role, target) triples.
aligned_graph.triples

[('z1', ':instance', 'stir-01'),
 ('z1', ':li', '1'),
 ('z1', ':ARG0', 'z2'),
 ('z2', ':instance', 'you'),
 ('z1', ':ARG1', 'z3'),
 ('z3', ':instance', 'and'),
 ('z3', ':op1', 'z4'),
 ('z4', ':instance', 'half-and-half'),
 ('z3', ':op2', 'z5'),
 ('z5', ':instance', 'sugar'),
 ('z1', ':mod', 'z6'),
 ('z6', ':instance', 'together'),
 ('z1', ':location', 'z7'),
 ('z7', ':instance', 'saucepan'),
 ('z7', ':location-of', 'z8'),
 ('z8', ':instance', 'heat'),
 ('z8', ':ARG1-of', 'z9'),
 ('z9', ':instance', 'low-04')]

In [42]:
# Lookup from relation label to integer edge type. The default factory always
# returns 1, so every relation that is looked up gets edge type 1.
edge_to_type = defaultdict(lambda : 1)
# Tokenizer used below to split each sentence token into word pieces.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

## Single Graph

In [91]:

# Step 1: give every distinct triple endpoint (source or target label) a
# consecutive integer id, in order of first appearance.
node_to_idx = {}
for source, role, target in aligned_graph.triples:
    for endpoint in (source, target):
        node_to_idx.setdefault(endpoint, len(node_to_idx))

idx_to_node = {idx: node for node, idx in node_to_idx.items()}

# Step 2: walk the triples again to collect
#   * token index -> ids of the nodes of every triple aligned to that token,
#   * the edge list as (source id, target id) pairs (COO style),
#   * the integer type of each edge.
token_to_node = defaultdict(list)
edges = []
edge_types = []

alignments = penman.surface.alignments(aligned_graph)
for triple in aligned_graph.triples:
    source, role, target = triple
    source_id, target_id = node_to_idx[source], node_to_idx[target]
    # Triples without an alignment contribute an edge but no token mapping.
    aligned_tokens = alignments[triple].indices if triple in alignments else ()
    for token_idx in aligned_tokens:
        token_to_node[token_idx].extend((source_id, target_id))
    edges.append((source_id, target_id))
    edge_types.append(edge_to_type[role])


# Step 3: tokenize token by token so that each node can be mapped to the
# positions its aligned token occupies in the final input-id sequence.
# 101 and 102 are the ids that decode to '[CLS]' and '[SEP]' in the output below.
tokenized_input_ids = [101]
node_to_token = defaultdict(list)
for token_pos, word in enumerate(tokens):
    piece_ids = tokenizer(word, add_special_tokens=False)["input_ids"]
    start = len(tokenized_input_ids)
    tokenized_input_ids += piece_ids
    piece_positions = range(start, start + len(piece_ids))
    # .get avoids inserting empty entries into the defaultdict for unaligned tokens.
    for node_id in token_to_node.get(token_pos, ()):
        node_to_token[node_id].extend(piece_positions)
tokenized_input_ids.append(102)

In [92]:
# One "(position, decoded piece)" line per entry of the final input-id sequence.
print(
    *(str((pos, tokenizer.decode(piece_id))) for pos, piece_id in enumerate(tokenized_input_ids)),
    sep="\n",
)

(0, '[CLS]')
(1, '1')
(2, ')')
(3, 'in')
(4, 'a')
(5, 'sauce')
(6, '##pan')
(7, 'over')
(8, 'low')
(9, 'heat')
(10, ',')
(11, 'stir')
(12, 'together')
(13, 'the')
(14, 'half')
(15, '-')
(16, 'and')
(17, '-')
(18, 'half')
(19, 'and')
(20, 'sugar')
(21, '.')
(22, '[SEP]')


In [93]:
# Node label -> positions in `tokenized_input_ids` recorded for that node.
{
    idx_to_node[node_id]: positions
    for node_id, positions in node_to_token.items()
}

{'z1': [1, 11],
 '1': [1],
 'z7': [5, 6],
 'saucepan': [5, 6],
 'z9': [8],
 'low-04': [8],
 'z8': [9],
 'heat': [9],
 'stir-01': [11],
 'z6': [12],
 'together': [12],
 'z4': [14],
 'half-and-half': [14],
 'z3': [16],
 'and': [16],
 'z5': [20],
 'sugar': [20]}

In [90]:
# Show the aligned graph again for comparison with the mappings above.
viz(aligned_graph)

# ::snt 1) In a saucepan over low heat, stir together the half-and-half and sugar.
# ::tokens ["1", ")", "In", "a", "saucepan", "over", "low", "heat", ",", "stir", "together", "the", "half", "-", "and", "-", "half", "and", "sugar", "."]
# ::lemmas ["1", ")", "in", "a", "saucepan", "over", "low", "heat", ",", "stir", "together", "the", "half", "-", "and", "-", "half", "and", "sugar", "."]
# ::alignments 0-1.1 4-1.5 6-1.5.1.1 7-1.5.1 9-1 10-1.4 12-1.3.1 14-1.3 18-1.3.2
(z1 / stir-01~e.9~e.9
    :li 1~e.0~e.0
    :ARG0 (z2 / you)
    :ARG1 (z3 / and~e.14~e.14
              :op1 (z4 / half-and-half~e.12~e.12)
              :op2 (z5 / sugar~e.18~e.18))
    :mod (z6 / together~e.10~e.10)
    :location (z7 / saucepan~e.4~e.4
                  :location-of (z8 / heat~e.7~e.7
                                   :ARG1-of (z9 / low-04~e.6~e.6))))


In [94]:
# Inspect the edge list: one (source id, target id) pair per triple.
edges

[(0, 1),
 (0, 2),
 (0, 3),
 (3, 4),
 (0, 5),
 (5, 6),
 (5, 7),
 (7, 8),
 (5, 9),
 (9, 10),
 (0, 11),
 (11, 12),
 (0, 13),
 (13, 14),
 (13, 15),
 (15, 16),
 (15, 17),
 (17, 18)]

## Multiple Graphs

In [95]:
# Take the first entry of `graphs`; the loop below iterates over it one
# sentence record ("graph" / "tokens") at a time.
instance = graphs[0]

In [98]:
# Inspect the label of the graph's top node.
aligned_graph.top

'z1'

In [100]:
# Build one combined graph over every sentence of `instance`. An artificial
# "head_node" (index 0) is linked to the top node of each sentence graph.
# NOTE(review): node ids are keyed by the raw triple label, so any label that
# occurs in more than one sentence shares a single index — confirm that this
# merging across sentences is intended.
node_to_idx = {"head_node": 0}
# 101 / 102 are the ids that decode to '[CLS]' / '[SEP]' in the single-graph output above.
tokenized_input_ids = [101]
edges = []
edge_types = []
node_to_token = defaultdict(list)

for sentence in instance:
    aligned_graph = sentence["graph"]
    tokens = sentence["tokens"]

    # Token indices are relative to the current sentence, so the token -> node
    # lookup is rebuilt every iteration. Creating it outside the graph check
    # keeps a sentence without a graph from reusing stale alignments.
    token_to_node = defaultdict(list)

    # Only sentences that actually have a graph contribute nodes and edges.
    # (The check was previously inverted, which skipped every real graph.)
    if aligned_graph is not None:
        alignments = penman.surface.alignments(aligned_graph)
        for triple in aligned_graph.triples:
            s, r, t = triple
            # Assign ids in order of first appearance, source before target.
            if s not in node_to_idx:
                node_to_idx[s] = len(node_to_idx)
            if t not in node_to_idx:
                node_to_idx[t] = len(node_to_idx)

            if triple in alignments:
                for token_idx in alignments[triple].indices:
                    token_to_node[token_idx].append(node_to_idx[s])
                    token_to_node[token_idx].append(node_to_idx[t])
            edges.append((node_to_idx[s], node_to_idx[t]))
            edge_types.append(edge_to_type[r])

        # add an edge linking to the top node across sentences; it also gets an
        # edge type so that `edges` and `edge_types` stay the same length.
        edges.append((0, node_to_idx[aligned_graph.top]))
        edge_types.append(edge_to_type[":head"])

    # Tokens are always appended to the input ids, graph or not.
    for i, token in enumerate(tokens):
        tokenized = tokenizer(token, add_special_tokens=False)["input_ids"]
        current_idx = len(tokenized_input_ids)
        tokenized_input_ids.extend(tokenized)
        # .get avoids inserting empty entries for tokens without aligned nodes.
        for node_idx in token_to_node.get(i, ()):
            node_to_token[node_idx].extend(range(current_idx, current_idx + len(tokenized)))

tokenized_input_ids.append(102)
idx_to_node = {v: k for k, v in node_to_idx.items()}
# Freeze into a plain dict so later lookups of unknown ids raise instead of
# silently creating empty lists.
node_to_token = dict(node_to_token)