In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from collections import defaultdict
import dill
import json
import pickle
from pprint import pprint

import amrlib
from amrlib import load_stog_model
from amrlib.graph_processing.amr_plot import AMRPlot
from amrlib.graph_processing.amr_loading import load_amr_entries
from amrlib.graph_processing.annotator import add_lemmas
from amrlib.alignments.rbw_aligner import RBWAligner
from amrlib.alignments.faa_aligner import FAA_Aligner
import penman
from penman.surface import Alignment
from transformers import AutoTokenizer

from annotate_datasets import *

In [3]:
# Load amrlib's sentence-to-graph (stog) parsing model onto the first CUDA
# device. No model directory is passed here, so amrlib's own default is used.
amr_model = load_stog_model(device="cuda:0")

In [4]:
# Read the training split from disk into `train_data`.
# The encoding is pinned to UTF-8: without it, open() decodes with the
# platform's locale default, which can corrupt or reject non-ASCII text
# in the JSON file on machines whose default is not UTF-8.
with open("/projects/flow_graphs/data/risec/train.json", encoding="utf-8") as f:
    train_data = json.load(f)

In [5]:
# Inspect the "rels" annotations of the second entry (index 1). Each record in
# the output pairs two character spans (arg1_*/arg2_*) under an `arg_label`.
train_data[1]["rels"]

[{'_id': 'R1',
  'arg1_start': 3,
  'arg1_end': 10,
  'arg1_word': 'Preheat',
  'arg1_label': 'AC',
  'arg2_start': 11,
  'arg2_end': 15,
  'arg2_word': 'oven',
  'arg2_label': 'TOOL',
  'arg_label': 'Arg_PPT'},
 {'_id': 'R2',
  'arg1_start': 3,
  'arg1_end': 10,
  'arg1_word': 'Preheat',
  'arg1_label': 'AC',
  'arg2_start': 19,
  'arg2_end': 32,
  'arg2_word': '350 degrees F',
  'arg2_label': 'TEMPERATURE',
  'arg_label': 'ArgM_MNR'},
 {'_id': 'R3',
  'arg1_start': 3,
  'arg1_end': 10,
  'arg1_word': 'Preheat',
  'arg1_label': 'AC',
  'arg2_start': 34,
  'arg2_end': 47,
  'arg2_word': '175 degrees C',
  'arg2_label': 'TEMPERATURE',
  'arg_label': 'ArgM_MNR'},
 {'_id': 'R4',
  'arg1_start': 53,
  'arg1_end': 59,
  'arg1_word': 'Grease',
  'arg1_label': 'AC',
  'arg2_start': 60,
  'arg2_end': 81,
  'arg2_word': 'one 9x5 inch loaf pan',
  'arg2_label': 'TOOL',
  'arg_label': 'Arg_PPT'},
 {'_id': 'R5',
  'arg1_start': 86,
  'arg1_end': 93,
  'arg1_word': 'Measure',
  'arg1_label': 'AC',


In [12]:
# Split the first entry's text with `sentencizer`.
# NOTE(review): `sentencizer` is not imported by name; presumably it comes from
# the `annotate_datasets` star import — confirm. The result is not used by any
# cell visible in this export.
split_sentences = sentencizer(train_data[0]["text"])

In [13]:
# Annotate the first entry's text with the AMR model. The three results are
# indexed per sentence in the cells below: a graph string, the sentence text,
# and a list of tokens with character spans.
# NOTE(review): the meaning of the third argument (None) is not visible here —
# check annotate_sentences' signature.
graphs, sentences, tokens = annotate_sentences(train_data[0]["text"], amr_model, None)

100%|█████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.20it/s]


In [14]:
# Raw text of the first sentence.
sentences[0]

'1) In a saucepan over low heat, stir together the half-and-half and sugar.'

In [20]:
# Decode the first graph string with penman and list its
# (source, role, target) triples.
penman.decode(graphs[0]).triples

[('z1', ':instance', 'stir-01'),
 ('z1', ':li', '1'),
 ('z1', ':arg0', 'z2'),
 ('z2', ':instance', 'you'),
 ('z1', ':arg1', 'z3'),
 ('z3', ':instance', 'and'),
 ('z3', ':op1', 'z4'),
 ('z4', ':instance', 'half-and-half'),
 ('z3', ':op2', 'z5'),
 ('z5', ':instance', 'sugar'),
 ('z1', ':mod', 'z6'),
 ('z6', ':instance', 'together'),
 ('z1', ':location', 'z7'),
 ('z7', ':instance', 'saucepan'),
 ('z8', ':location', 'z7'),
 ('z8', ':instance', 'heat'),
 ('z9', ':arg1', 'z8'),
 ('z9', ':instance', 'low-04')]

In [15]:
# Tokens of the first sentence, each with its start/end character index.
tokens[0]

[PenmanToken(token_str='1)', start_idx=0, end_idx=2),
 PenmanToken(token_str='In', start_idx=3, end_idx=5),
 PenmanToken(token_str='a', start_idx=6, end_idx=7),
 PenmanToken(token_str='saucepan', start_idx=8, end_idx=16),
 PenmanToken(token_str='over', start_idx=17, end_idx=21),
 PenmanToken(token_str='low', start_idx=22, end_idx=25),
 PenmanToken(token_str='heat,', start_idx=26, end_idx=31),
 PenmanToken(token_str='stir', start_idx=32, end_idx=36),
 PenmanToken(token_str='together', start_idx=37, end_idx=45),
 PenmanToken(token_str='the', start_idx=46, end_idx=49),
 PenmanToken(token_str='half-and-half', start_idx=50, end_idx=63),
 PenmanToken(token_str='and', start_idx=64, end_idx=67),
 PenmanToken(token_str='sugar.', start_idx=68, end_idx=74)]

In [16]:
# Collect the surface alignments stored in the first graph: a mapping from
# triple to the penman Alignment attached to it.
alignments = penman.surface.alignments(penman.decode(graphs[0]))

In [27]:
# Look up the alignment of the 'stir-01' instance triple and take its first
# token index. The recorded value 7 corresponds to tokens[0][7], i.e. 'stir'.
a = alignments[('z1', ':instance', 'stir-01')]
a.indices[0]

7

In [11]:
# Display the token lists for every sentence (one inner list per sentence).
# NOTE(review): this dumps the whole structure; the recorded output is cut off.
tokens

[[PenmanToken(token_str='1)', start_idx=0, end_idx=2),
  PenmanToken(token_str='In', start_idx=3, end_idx=5),
  PenmanToken(token_str='a', start_idx=6, end_idx=7),
  PenmanToken(token_str='saucepan', start_idx=8, end_idx=16),
  PenmanToken(token_str='over', start_idx=17, end_idx=21),
  PenmanToken(token_str='low', start_idx=22, end_idx=25),
  PenmanToken(token_str='heat,', start_idx=26, end_idx=31),
  PenmanToken(token_str='stir', start_idx=32, end_idx=36),
  PenmanToken(token_str='together', start_idx=37, end_idx=45),
  PenmanToken(token_str='the', start_idx=46, end_idx=49),
  PenmanToken(token_str='half-and-half', start_idx=50, end_idx=63),
  PenmanToken(token_str='and', start_idx=64, end_idx=67),
  PenmanToken(token_str='sugar.', start_idx=68, end_idx=74)],
 [PenmanToken(token_str='', start_idx=0, end_idx=0),
  PenmanToken(token_str='2)', start_idx=1, end_idx=3),
  PenmanToken(token_str='Whisk', start_idx=4, end_idx=9),
  PenmanToken(token_str='in', start_idx=10, end_idx=12),
  Penm

In [28]:
# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [33]:
# Character (start, end) offsets of each tokenizer piece within sentences[0].
# `max_length` only takes effect when truncation is enabled. If the strategy is
# left unspecified, transformers picks one implicitly and emits a warning, so
# request truncation explicitly to make the 512-token cap intentional.
offset_mapping = tokenizer(
    sentences[0],
    return_offsets_mapping=True,
    truncation=True,
    max_length=512,
)["offset_mapping"]

In [34]:
# Drop the first and last offset entries — presumably the [CLS]/[SEP] special
# tokens the tokenizer adds (TODO confirm) — leaving only spans that point into
# the sentence text.
offset_mapping[1:-1]

[(0, 1),
 (1, 2),
 (3, 5),
 (6, 7),
 (8, 13),
 (13, 16),
 (17, 21),
 (22, 25),
 (26, 30),
 (30, 31),
 (32, 36),
 (37, 45),
 (46, 49),
 (50, 54),
 (54, 55),
 (55, 58),
 (58, 59),
 (59, 63),
 (64, 67),
 (68, 73),
 (73, 74)]