In [1]:
%load_ext autoreload
%autoreload 2

In [19]:
from collections import Counter, defaultdict
import pickle
from pprint import pprint
import random
import re

import amrlib
import dill
from nltk.stem import WordNetLemmatizer
import pandas as pd
import penman
import stanza

from annotate_datasets import align_tokens_to_sentence

# Module-level English stanza pipeline.
stanza_nlp = stanza.Pipeline(lang="en")

## Constants

In [3]:
# Dataset names; each one is interpolated into the path of its amr_train.pkl file when loading.
datasets = ["risec", "japflow", "chemu", "mscorpus"]

## EDA: what is unaligned?

In [5]:
# Per-dataset lists of (triple, sentence) pairs that carry no alignment.
unaligned_instances = {}
unaligned_triples = {}

# Concepts of unaligned instance triples: if one occurs often enough,
# it is probably an AMR-model fill-in rather than a word from the text.
unaligned_words = []

# pbf_pattern: one or more "word-" groups followed by two digits (e.g. "mix-01").
# node_pattern: graph variable names such as "z1".
pbf_pattern = re.compile(r"(\w+-)+(\d){2}")
node_pattern = re.compile(r"z\d+")

# One entry per line of the amrlib resource file (split on "\n" exactly).
with open("/usr0/home/sgururaj/miniconda3/envs/amr/lib/python3.7/site-packages/amrlib/models/parse_spring/resources/additions.txt") as f:
    additions = f.read().split("\n")

# Keyword list: blank lines and lines starting with "#" are dropped, the rest stripped.
with open("code/amr/amr_keywords.txt") as f:
    amr_keywords = [line.strip() for line in f if line.strip() and not line.startswith("#")]

def is_unit_instance(triple, graph):
    """Tell whether `triple` is the ``:instance`` triple of a unit/scale node.

    Args:
        triple: A (source, role, target) 3-tuple.
        graph: Object exposing ``triples``, an iterable of such 3-tuples.

    Returns:
        True when the role is ``:instance`` and some triple in the graph
        points at the source through a ``:unit`` or ``:scale`` role.
    """
    source, role, _ = triple
    unit_roles = {":unit", ":scale"}

    # Walk every triple (no early exit) looking for a :unit/:scale edge into `source`.
    has_unit_parent = False
    for parent in graph.triples:
        if parent[1] in unit_roles and parent[2] == source:
            has_unit_parent = True

    return role == ":instance" and has_unit_parent

def should_be_unaligned(triple, graph):
    """Decide whether a triple is of a kind that is expected to lack an alignment.

    Args:
        triple: A (source, role, target) triple from `graph`.
        graph: The graph the triple belongs to; handed to ``is_unit_instance``.

    Returns:
        True if any of the structural checks below applies, otherwise False.
    """
    source, role, target = triple[0], triple[1], triple[2]
    source_is_node = node_pattern.fullmatch(source) is not None

    # Node-to-node edge such as z1 --> z2 (the target only needs to *start* like a node name).
    if source_is_node and node_pattern.match(target) is not None:
        return True

    # AMR specifier: an :instance whose concept is in `additions`, or any target in `amr_keywords`.
    # NOTE(review): `and` binds tighter than `or`, so the keyword check applies to every role,
    # not only :instance -- confirm this grouping is the intended one.
    if (role == ":instance" and target in additions) or target in amr_keywords:
        return True

    # Intervening "name" node triple.
    if source_is_node and target == "name":
        return True

    # A "you" triple, or the imperative mode marker.
    if target == "you" or (role == ":mode" and target == "imperative"):
        return True

    # Finally, the :instance triple of a node used as a :unit / :scale.
    return is_unit_instance(triple, graph)


def is_propbank_frame(node_str):
    """Return True when the whole of `node_str` matches ``pbf_pattern``."""
    return pbf_pattern.fullmatch(node_str) is not None


# For every dataset, gather the triples that have no alignment although
# should_be_unaligned() does not excuse them.
for dataset in datasets:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f"/scratch/sgururaj/flow_graphs/{dataset}/amr_train.pkl", "rb") as f:
        amr_data = pickle.load(f)

    unaligned_instances[dataset] = []
    unaligned_triples[dataset] = []

    for doc in amr_data:
        for sentence in doc:
            amr_graph = sentence["graph"]
            text = sentence["text"]
            # Skip sentences with no graph or whose first triple has no source.
            if amr_graph is None or amr_graph.triples[0][0] is None:
                continue

            alignments = penman.surface.alignments(amr_graph)
            for triple in amr_graph.triples:
                if triple in alignments or should_be_unaligned(triple, amr_graph):
                    continue

                record = (triple, sentence)
                unaligned_triples[dataset].append(record)
                if triple[1] != ":instance":
                    continue
                # Instance triples are additionally tracked on their own, with their concept.
                unaligned_instances[dataset].append(record)
                unaligned_words.append(triple[2])

In [6]:
# Frequency of each concept among the unaligned instance triples; show the ten most common.
word_counter = Counter()
word_counter.update(unaligned_words)
word_counter.most_common(10)

[('and', 1555),
 ('mean-01', 781),
 ('between', 589),
 ('dry-02', 491),
 ('after', 386),
 ('add-02', 294),
 ('example', 232),
 ('equal-01', 225),
 ('slash', 175),
 ('heat-01', 169)]

In [7]:
# dataset -> (category counts of its unaligned instances, total number of unaligned instances)
error_counts = {}

# Concepts kept aside for inspection.
hyphens = []  # contain "-" but did not match pbf_pattern
mystery = []  # fell into none of the categories (not counted in err_counts)

for dataset in datasets:
    unaligned = unaligned_instances[dataset]
    err_counts = defaultdict(lambda: 0)

    for (s, r, t), _ in unaligned:
        # First matching category wins; the order of the checks matters.
        if t in additions or t in amr_keywords:
            category = "keywords"
        elif pbf_pattern.match(t):
            category = "pbf"
        elif "-" in t:
            category = "hyphenated"
            hyphens.append(t)
        elif t == "you":
            category = "you"
        else:
            mystery.append(t)
            continue
        err_counts[category] += 1

    error_counts[dataset] = (err_counts, len(unaligned))

error_counts

{'risec': (defaultdict(<function __main__.<lambda>()>,
              {'pbf': 765, 'hyphenated': 6}),
  1139),
 'japflow': (defaultdict(<function __main__.<lambda>()>,
              {'pbf': 2280, 'hyphenated': 18}),
  3629),
 'chemu': (defaultdict(<function __main__.<lambda>()>,
              {'pbf': 2302, 'hyphenated': 275}),
  5601),
 'mscorpus': (defaultdict(<function __main__.<lambda>()>,
              {'pbf': 1068, 'hyphenated': 129}),
  2462)}

Takeaway: some hyphenated concepts do correspond to words in the text; these can be matched by replacing the hyphens and looking for a string match. Other hyphenated concepts are odd and do not appear in the AMR spec, so it is unclear whether they are valid to use or should be added there. The vast majority of unaligned concepts appear to be exactly that: ordinary words that simply were not aligned.

In [8]:
# Per dataset: the 20 most common unaligned concepts that are neither matched by
# pbf_pattern nor listed in additions / amr_keywords.
for dataset in datasets:
    unaligned = unaligned_instances[dataset]
    other_concepts = Counter()
    for (_, _, t), _ in unaligned:
        if pbf_pattern.match(t) or t in additions or t in amr_keywords:
            continue
        other_concepts[t] += 1
    print(other_concepts.most_common(20))

[('and', 100), ('between', 57), ('juice', 25), ('cinnamon', 20), ('bubbly', 5), ('fryer', 4), ('cayenne', 4), ('chill', 4), ('parmesan', 4), ('sprinkle', 4), ('more', 4), ('we', 4), ('ratio-of', 3), ('oat', 3), ('aluminum', 3), ('worcestershire', 3), ('cilantro', 3), ('gradual', 3), ('layer', 3), ('cumin', 3)]
[('and', 304), ('between', 151), ('gas', 130), ('juice', 81), ('or', 75), ('meanwhile', 27), ('slash', 26), ('ice', 24), ('lid', 21), ('cumin', 18), ('thyme', 18), ('cinnamon', 16), ('zest', 15), ('wok', 15), ('after', 13), ('more', 13), ('cheese', 12), ('then', 11), ('pan', 10), ('i', 10)]
[('and', 944), ('between', 305), ('after', 277), ('example', 232), ('compound', 121), ('slash', 110), ('step', 105), ('ratio-of', 72), ('water', 62), ('or', 61), ('then', 59), ('this', 48), ('intermediate', 38), ('tetrahydrofuran', 36), ('hexane', 35), ('small-methanol', 32), ('method', 29), ('angle-quantity', 29), ('string-entity', 28), ('column', 25)]
[('and', 207), ('after', 94), ('between'

In [9]:
# NLTK WordNet lemmatizer instance, passed to search_for_aligning_token in later cells.
wnl = WordNetLemmatizer()

In [10]:
# Quick check of the lemmatizer on a plural form.
wnl.lemmatize("leaves")

'leaf'

In [11]:
## visual inspection cell

# Draw one unaligned instance from mscorpus at random (unseeded, so it differs per run)
# and show its sentence, the unaligned triple, and the full graph.
instance = random.choice(unaligned_instances["mscorpus"])
sentence = instance[1]["text"]
graph = instance[1]["graph"]

print(sentence)
print(instance[0])
print(penman.encode(graph))

The powders were mixed in ethanol and ball-milled with high-purity corundum balls in Al2O3 jars for 10 h with 0.08 wt% MgO powder (99.998%, Alfa Aesar) and 0.8 wt% tetraethoxysilane (TEOS, >99.999%, Alfa Aesar) as sintering aids.
('z9', ':instance', 'purity')
(z1 / and~e.6
    :op1 (z2 / mix-01~e.3
             :arg1 (z3 / powder~e.1)
             :arg2 (z4 / ethanol~e.5))
    :op2 (z5 / mill-01
             :arg1 (z6 / ball~e.7)
             :arg2 (z7 / ball~e.7,11
                       :mod (z8 / corundum~e.10)
                       :mod (z9 / purity
                                :arg1-of (z10 / high-02~e.9)))
             :location~e.12 (z11 / jar~e.14
                                 :mod (z12 / small-molecule
                                           :name (z13 / name
                                                      :op1 "al2o3"))
                                 :accompanier~e.18 (z14 / and
                                                        :op1 (z15 / powder~e.22


In [12]:
# Alignments that penman extracts from the graph chosen in the inspection cell.
penman.surface.alignments(graph)

{('z1', ':instance', 'and'): Alignment((6,), prefix='e.'),
 ('z2', ':instance', 'mix-01'): Alignment((3,), prefix='e.'),
 ('z3', ':instance', 'powder'): Alignment((1,), prefix='e.'),
 ('z4', ':instance', 'ethanol'): Alignment((5,), prefix='e.'),
 ('z6', ':instance', 'ball'): Alignment((7,), prefix='e.'),
 ('z7', ':instance', 'ball'): Alignment((7, 11), prefix='e.'),
 ('z8', ':instance', 'corundum'): Alignment((10,), prefix='e.'),
 ('z10', ':instance', 'high-02'): Alignment((9,), prefix='e.'),
 ('z11', ':instance', 'jar'): Alignment((14,), prefix='e.'),
 ('z15', ':instance', 'powder'): Alignment((22,), prefix='e.'),
 ('z21', ':value', '"0.8"'): Alignment((27,), prefix='e.'),
 ('z23', ':instance', 'and'): Alignment((26,), prefix='e.'),
 ('z30', ':quant', '10'): Alignment((16,), prefix='e.'),
 ('z32', ':instance', 'aid'): Alignment((36,), prefix='e.'),
 ('z33', ':instance', 'sinter-01'): Alignment((35,), prefix='e.')}

In [13]:
def get_unaligned_tokens(aligned_tokens, graph):
    """Return the tokens whose position is not referenced by any alignment in `graph`.

    Args:
        aligned_tokens: Sequence of tokens; positions are compared with alignment indices.
        graph: Graph passed to ``penman.surface.alignments``.

    Returns:
        List of the tokens (in original order) whose index appears in no alignment.
    """
    # One flag per token, flipped to True once an alignment mentions that index.
    covered = [False for _ in range(len(aligned_tokens))]
    for alignment in penman.surface.alignments(graph).values():
        for position in alignment.indices:
            covered[position] = True

    leftover = []
    for token, token_covered in zip(aligned_tokens, covered):
        if not token_covered:
            leftover.append(token)
    return leftover

In [14]:
def search_for_aligning_token(sentence, word, graph, lemmatizer):
    """Look for a not-yet-aligned token of `sentence` whose lemma equals that of `word`.

    Args:
        sentence: Sentence text; it is tokenized on runs of whitespace.
        word: Target of an AMR triple. If it matches ``pbf_pattern`` and has
            exactly one word part (e.g. ``"mix-01"``), only that word part is
            used. Any other hyphenated value is not searched for.
        graph: Graph handed to ``get_unaligned_tokens`` to determine which
            tokens are still unaligned.
        lemmatizer: Object providing ``lemmatize(str)``.

    Returns:
        The first unaligned token (an element produced by
        ``align_tokens_to_sentence``) whose lower-cased text lemmatizes to the
        same lemma as `word`, or ``None`` when there is no such token or the
        word is skipped.
    """
    if pbf_pattern.match(word):
        segments = word.split("-")
        if len(segments) > 2:
            # Multi-part frames cannot be mapped to a single surface word here.
            return None
        word = segments[0]
    elif "-" in word:
        return None
    word_lemma = lemmatizer.lemmatize(word)

    # Raw string: "\s" in an ordinary literal is an invalid escape sequence.
    whitespace_tokens = [token for token in re.split(r"\s+", sentence) if token.strip()]
    char_aligned_tokens = align_tokens_to_sentence(whitespace_tokens, sentence)
    unaligned_tokens = get_unaligned_tokens(char_aligned_tokens, graph)

    for unaligned_token in unaligned_tokens:
        unaligned_lemma = lemmatizer.lemmatize(unaligned_token.token_str.lower())
        if unaligned_lemma == word_lemma:
            return unaligned_token
    return None

In [15]:
# Re-display the sampled instance together with the token (if any) that the
# lemma search proposes for its unaligned concept.
sentence = instance[1]["text"]
graph = instance[1]["graph"]

print(sentence)
print(instance[0])
print(search_for_aligning_token(sentence, instance[0][2], graph, wnl))
print()

print(penman.encode(graph))

The powders were mixed in ethanol and ball-milled with high-purity corundum balls in Al2O3 jars for 10 h with 0.08 wt% MgO powder (99.998%, Alfa Aesar) and 0.8 wt% tetraethoxysilane (TEOS, >99.999%, Alfa Aesar) as sintering aids.
('z9', ':instance', 'purity')
None

(z1 / and~e.6
    :op1 (z2 / mix-01~e.3
             :arg1 (z3 / powder~e.1)
             :arg2 (z4 / ethanol~e.5))
    :op2 (z5 / mill-01
             :arg1 (z6 / ball~e.7)
             :arg2 (z7 / ball~e.7,11
                       :mod (z8 / corundum~e.10)
                       :mod (z9 / purity
                                :arg1-of (z10 / high-02~e.9)))
             :location~e.12 (z11 / jar~e.14
                                 :mod (z12 / small-molecule
                                           :name (z13 / name
                                                      :op1 "al2o3"))
                                 :accompanier~e.18 (z14 / and
                                                        :op1 (z15 / powder

In [23]:
# dataset -> counters: "triples" (all triples), "unaligned_fine" (unaligned and excused by
# should_be_unaligned), "unaligned_concerning" (unaligned, not excused), and for the latter
# "concerning_fixed" / "concerning_unfixed" depending on whether the lemma search found a token.
alignment_counts = {}

for dataset in datasets:
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(f"/scratch/sgururaj/flow_graphs/{dataset}/amr_train.pkl", "rb") as f:
        amr_data = pickle.load(f)
    alignment_counts[dataset] = defaultdict(lambda: 0)
    counts = alignment_counts[dataset]

    for doc in amr_data:
        for sentence in doc:
            amr_graph = sentence["graph"]
            text = sentence["text"]
            # Skip sentences with no graph or whose first triple has no source.
            if amr_graph is None or amr_graph.triples[0][0] is None:
                continue
            alignments = penman.surface.alignments(amr_graph)
            for triple in amr_graph.triples:
                counts["triples"] += 1
                if triple in alignments:
                    continue

                # Evaluated once per unaligned triple: the check walks the whole graph.
                if should_be_unaligned(triple, amr_graph):
                    counts["unaligned_fine"] += 1
                    continue

                counts["unaligned_concerning"] += 1
                new_alignment = search_for_aligning_token(text, triple[2], amr_graph, wnl)
                if new_alignment is None:
                    counts["concerning_unfixed"] += 1
                else:
                    counts["concerning_fixed"] += 1

(z1 / and~e.34
    :op1 (z2 / cool-01~e.46
             :arg1 (z3 / mixture~e.44
                       :arg0-of (z4 / react-01~e.43))
             :arg2 (z5 / temperature-quantity
                       :quant 78
                       :scale (z6 / celsius))
             :direction~e.47 (z7 / back~e.47))
    :op2 (z8 / treat-04~e.25
             :arg1 z3
             :arg2 (z9 / trifluoride~e.28
                       :mod (z10 / small-molecule
                                 :name (z11 / name
                                            :op1 "tert-butyl"~e.5)
                                 :mod (z12 / small-molecule
                                           :name (z13 / name
                                                      :op1 "3s,6r-6-(f3000 / fluoromethyl)tetrahydro-2h-pyran-3-yl)_)_)"
                                                      :location~e.17 (z14 / dichloromethane~e.18
                                                                          :mod (z15 / small-m

In [20]:
# Tabulate the counters: one column per dataset, one row per counter name.
pd.DataFrame(alignment_counts)

Unnamed: 0,risec,japflow,chemu,mscorpus
triples,15750,54239,129382,47350
unaligned_fine,7826,31171,79960,28751
unaligned_concerning,2039,3936,17332,6131
concerning_unfixed,1325,1672,15836,5552
concerning_fixed,714,2264,1496,579
