In [None]:
# Fix PATH to include OPAM binaries where grewpy_backend is located
import os

# Name of the opam switch whose bin/ directory is put on PATH.
OPAM_SWITCH = "5.2.0"


def prepend_to_path(directory: str) -> None:
    """Make ``directory`` the first entry of the ``PATH`` environment variable.

    The call is idempotent: when ``directory`` is already the first entry,
    ``PATH`` is left untouched, so re-running the cell does not pile up
    duplicates. A missing or empty ``PATH`` is handled instead of raising
    ``KeyError``.

    Args:
        directory: directory to place at the front of ``PATH``.
    """
    current = os.environ.get("PATH", "")
    if not current:
        os.environ["PATH"] = directory
        return
    if current.split(os.pathsep)[0] == directory:
        return
    os.environ["PATH"] = directory + os.pathsep + current


# expanduser("~") uses $HOME when it is set and falls back to the platform's
# user database otherwise, so an unset HOME no longer raises KeyError.
prepend_to_path(os.path.join(os.path.expanduser("~"), ".opam", OPAM_SWITCH, "bin"))

In [None]:
from pathlib import Path

import udapi
from udapi.block.ud.fixchain import FixChain
from udapi.block.ud.fixleaf import FixLeaf
from udapi.block.ud.fixmultisubjects import FixMultiSubjects
from udapi.block.ud.fixpunct import FixPunct
from udapi.block.ud.fixrightheaded import FixRightheaded
from udapi.block.ud.setspaceafterfromtext import SetSpaceAfterFromText

In [None]:
# Treebank to process, given relative to the directory that contains the
# repository checkout. Alternative input: "UD_output.conllu"
filename = "UD_Norwegian-NynorskLIA/no_nynorsklia-ud-dev.conllu"

# Assumes the notebook is started from a directory directly below the
# repository root -- TODO confirm if the notebook is ever moved.
repo_dir = Path.cwd().parent
UDFILE = repo_dir.parent / filename

# Fail early and show the resolved path, so a wrong working directory is
# obvious from the error message.
assert UDFILE.is_file(), f"Treebank file not found: {UDFILE}"
UDFILE = str(UDFILE)

In [None]:
# Build a udapi Document from the file at UDFILE; the cells below modify `doc` in place.
doc = udapi.Document(filename=UDFILE)

In [None]:
# Processing full document: build every udapi repair block, run them over
# `doc` one after another, then write the result to out.conllu.

spaceafter = SetSpaceAfterFromText()
fixpunct = FixPunct(check_paired_punct_upos=True)
fix_chain = FixChain()
fix_multisubj = FixMultiSubjects()
fix_right = FixRightheaded()
fix_leaf = FixLeaf(deprels="aux,cop,case,mark,cc,det")

# The blocks run in this exact order; each one operates on the document as
# left by the previous one.
for block in (spaceafter, fixpunct, fix_chain, fix_multisubj, fix_right, fix_leaf):
    block.run(document=doc)

doc.store_conllu("out.conllu")

In [None]:
# Run validation script at the end
validation_script = repo_dir / "tools/validate.py"

!pdm run python $validation_script --max-err 0 --lang no out.conllu

In [None]:
# Processing individual nodes: draw the first two trees and print every VERB
# whose VerbForm feature is empty.

for bundle in doc.bundles[:2]:
    root = bundle.get_tree()
    root.draw(layout="align", attributes="ord,form,feats")
    for node in root.descendants:
        if node.upos != "VERB" or node.feats["VerbForm"] != "":
            continue
        # Disabled candidate repair: node.feats["VerbForm"] = "Fin"
        print(node.form, node.feats)

# Disabled: doc.store_conllu("out.conllu")

In [None]:
import grewpy
from grewpy import Corpus, CorpusDraft, Request

# Select the grewpy configuration for this session; the options noted here are "ud" or "basic".
grewpy.set_config("ud")

# This code block is a work in progress


def fix_punct(graph: grewpy.Graph):
    """Unfinished grewpy port of udapi's FixPunct block.

    Reference implementation:
    https://udapi.readthedocs.io/en/latest/udapi.block.ud.fixpunct.html#udapi.block.ud.fixpunct.FixPunct

    State of the port, as visible in this body:
      * ``_is_punct``, ``has_children``, ``parent_lookup`` and the first loop
        read the ``graph`` argument (``graph[node_id]``, ``graph.sucs``).
      * From the "Second" step onwards the code reads ``self`` and ``root`` and
        uses attribute access (``node.form``, ``node.ord``, ``node.parent``).
        ``self`` is neither a parameter nor a local of this function, and
        ``root`` is not defined inside it.
      * The ``_fix_*`` helpers are nested functions that are defined after the
        statements that call them through ``self``.

    Args:
        graph: the graph whose punctuation attachment should be repaired.

    Returns:
        None -- no statement at this function's own level returns a value.
    """
    # Opening mark -> matching closing mark.
    PAIRED_PUNCT = {
        "(": ")",
        "[": "]",
        "{": "}",
        '"': '"',  # ASCII double quotes
        "'": "'",  # ASCII single quotes
        "“": "”",  # quotation marks used in English, ...
        "„": "“",  # Czech, German, Russian, ...
        "«": "»",  # French, Russian, Spanish, ...
        "‹": "›",  # ditto
        "《": "》",  # Korean, Chinese
        "「": "」",  # Chinese, Japanese
        "『": "』",  # ditto
        "¿": "?",  # Spanish paired question marks
        "¡": "!",  # Spanish paired exclamation marks
    }

    # Forms that _fix_subord_punct never attaches to a following node.
    FINAL_PUNCT = ".?!"

    def _is_punct(node):
        """Tell whether ``node`` (accessed dict-style) counts as punctuation.

        True when upos is "PUNCT"; otherwise False when the form is found in
        the string of the two ASCII quote characters; otherwise True when the
        form is an opening or closing mark of PAIRED_PUNCT; False in all other
        cases.
        """
        if node["upos"] == "PUNCT":
            return True
        # NOTE(review): `in` on a str is a substring test, so an empty form
        # also ends up in this branch.
        if node["form"] in "'\"":
            return False
        if node["form"] in PAIRED_PUNCT or node["form"] in PAIRED_PUNCT.values():
            return True
        return False

    def has_children(node_id: str):
        """Return True when ``graph.sucs`` lists at least one entry for ``node_id``."""
        deps = graph.sucs.get(node_id, [])
        return len(deps) > 0

    # Reverse index built from graph.sucs: first element of each listed entry
    # -> the key it is listed under.
    # Presumably each entry is a (child_id, edge_label) pair -- TODO confirm.
    parent_lookup = {
        child_node[0]: parent_id
        for parent_id in graph.sucs
        for child_node in graph.sucs[parent_id]
    }

    # First, make sure no PUNCT has children.
    # This may introduce multiple subroots, which will be fixed later on
    # (preventing to temporarily create multiple subroots here would prevent fixing some errors).
    for node_id in graph:
        node = graph[node_id]
        if _is_punct(node) and has_children(node_id):
            # NOTE(review): `parent` is looked up but never used, so nothing is
            # re-attached yet. parent_lookup[node_id] raises KeyError when the
            # node has no entry in parent_lookup.
            parent = parent_lookup[node_id]

    # NOTE(review): everything below this point reads `self` and `root` and
    # uses attribute-style node access; `self` is not defined in this function,
    # so this part cannot run as written. Presumably it is still the udapi
    # original awaiting the port -- confirm against the reference implementation.

    # Second, fix paired punctuations: quotes and brackets, marking them in _punct_type.
    # This should be done before handling the subordinate punctuation,
    # in order to prevent non-projectivities e.g. in dot-before-closing-quote style sentences:
    #   I call him "Bob."
    # Here both quotes and the sentence-final dot should be attached to "Bob".
    # (As you can see on the previous line, I don't like this American typographic rule.)
    self._punct_type = [None] * (1 + len(root.descendants))
    for node in root.descendants:
        if self._punct_type[node.ord] != "closing":
            closing_punct = PAIRED_PUNCT.get(node.form)
            if closing_punct is not None:
                self._fix_paired_punct(root, node, closing_punct)

    # Third, fix subordinate punctuation (i.e. any punctuation not marked in _punct_type).
    for node in root.descendants:
        if node.upos == "PUNCT" and not self._punct_type[node.ord]:
            self._fix_subord_punct(node)

    # UD requires "exactly one word is the head of the sentence, dependent on a notional ROOT", i.e. a single "subroot".
    # This seems to be a stronger rule than no-PUNCT-children because it is checked by the validator.
    # So let's prevent multiple subroots (at the cost of possibly re-introducing PUNCT-children).
    if len(root.children) > 1:
        selected_subroot = next(
            (n for n in root.children if n.udeprel == "root"), root.children[0]
        )
        for a_subroot in root.children:
            if a_subroot != selected_subroot:
                a_subroot.parent = selected_subroot

    # Check if the subroot is still marked with deprel=root.
    # This may not hold if the original subroot was a paired punctuation, which was rehanged.
    if root.children[0].udeprel != "root":
        root.children[0].udeprel = "root"
        if self.copy_to_enhanced:
            root.children[0].deps = [{"parent": root, "deprel": "root"}]
        for another_node in root.children[0].descendants:
            if another_node.udeprel == "root":
                another_node.udeprel = "punct"

    # TODO: This block changes parents not only for PUNCT nodes. These should be reflected into enhanced deps as well.
    if self.copy_to_enhanced:
        for node in root.descendants:
            if node.upos == "PUNCT":
                node.deps = [{"parent": node.parent, "deprel": node.deprel}]

    # NOTE(review): the helpers below are nested inside fix_punct, take `self`
    # as their first parameter and are only bound after the statements above
    # have already tried to call them through `self`.

    def _fix_subord_punct(self, node):
        """Choose a parent for one punctuation node that is not part of a marked pair.

        Returns without changes for a dot attached to the immediately preceding
        node, for a projective span delimited by two PUNCT nodes that share the
        node's parent, and when no left or right candidate remains. Otherwise
        the parent is replaced only if the current parent is not on the
        accepted path, and deprel is set to "punct".
        """
        # Dot used as the ordinal-number marker (in some languages) or abbreviation marker.
        # TODO: detect these cases somehow
        # Numbers can be detected with `node.parent.form.isdigit()`,
        # but abbreviations are more tricky because the Abbr=Yes feature is not always used.
        if node.form == "." and node.parent == node.prev_node:
            return

        # Even non-paired punctuation like commas and dashes may work as paired.
        # Detect such cases and try to preserve, but only if projective.
        p_desc = node.parent.descendants(add_self=1)
        if (
            node in (p_desc[0], p_desc[-1])
            and len(p_desc) == p_desc[-1].ord - p_desc[0].ord + 1
        ):
            if (
                p_desc[0].upos == "PUNCT"
                and p_desc[-1].upos == "PUNCT"
                and p_desc[0].parent == node.parent
                and p_desc[-1].parent == node.parent
            ):
                return

        # Initialize the candidates (left and right) with the nearest nodes excluding punctuation.
        # Final punctuation should not be attached to any following, so exclude r_cand there.
        l_cand, r_cand = node.prev_node, node.next_node
        if node.form in FINAL_PUNCT:
            r_cand = None
        while l_cand.ord > 0 and l_cand.upos == "PUNCT":
            if self._punct_type[l_cand.ord] == "opening" and l_cand.parent != node:
                l_cand = None
                break
            l_cand = l_cand.prev_node
        while r_cand is not None and r_cand.upos == "PUNCT":
            if self._punct_type[r_cand.ord] == "closing" and r_cand.parent != node:
                r_cand = None
                break
            r_cand = r_cand.next_node

        # Climb up from the candidates, until we would reach the root or "cross" the punctuation.
        # If the candidates' descendants span across the punctuation, we also stop
        # because climbing higher would cause a non-projectivity (the punct would be the gap).
        l_path, r_path = [l_cand], [r_cand]
        if l_cand is None or l_cand.is_root():
            l_cand, l_path = None, []
        else:
            while (
                not l_cand.parent.is_root()
                and l_cand.parent < node
                and not node < l_cand.descendants(add_self=1)[-1]
            ):
                l_cand = l_cand.parent
                l_path.append(l_cand)
        if r_cand is not None:
            while (
                not r_cand.parent.is_root()
                and node < r_cand.parent
                and not r_cand.descendants(add_self=1)[0] < node
            ):
                r_cand = r_cand.parent
                r_path.append(r_cand)

        # Filter out candidates which would lead to non-projectivities, i.e. bugs
        # punct-nonproj and punct-nonproj-gap as checked by the UD validator and ud.MarkBugs.
        # _will_be_projective re-attaches `node` to each candidate as a side
        # effect, hence the original parent is saved here and restored below.
        orig_parent = node.parent
        l_path = [n for n in l_path if n and self._will_be_projective(node, n)]
        r_path = [n for n in r_path if n and self._will_be_projective(node, n)]
        l_cand = l_path[-1] if l_path else None
        r_cand = r_path[-1] if r_path else None
        node.parent = orig_parent

        # Now select between l_cand and r_cand -- which will be the new parent?
        # The lower one. Note that if neither is descendant of the other and neither is None
        # (which can happen in rare non-projective cases), we arbitrarily prefer l_cand,
        # but if the original parent is either on l_path or r_path, we keep it as acceptable.
        if l_cand is not None and l_cand.is_descendant_of(r_cand):
            cand, path = l_cand, l_path
        elif r_cand is not None and r_cand.is_descendant_of(l_cand):
            cand, path = r_cand, r_path
        elif l_cand is not None:
            cand, path = l_cand, l_path + r_path
        elif r_cand is not None:
            cand, path = r_cand, l_path + r_path
        else:
            return

        # The guidelines say:
        #    Within the relevant unit, a punctuation mark is attached
        #    at the highest possible node that preserves projectivity.
        # However, sometimes it is difficult to detect the unit (and its head).
        # E.g. in "Der Mann, den Sie gestern kennengelernt haben, kam wieder."
        # the second comma should depend on "kennengelernt", not on "Mann"
        # because the unit is just the relative clause.
        # We try to be conservative and keep the parent, unless we are sure it is wrong.
        if node.parent not in path:
            node.parent = cand
        node.deprel = "punct"

    def _will_be_projective(self, node, cand):
        """Attach ``node`` to ``cand`` and report whether the result is acceptable.

        The re-attachment is not undone here. Returns True when ``node`` is then
        neither non-projective nor gap-causing according to ``_causes_gap``.
        """
        node.parent = cand
        return not node.is_nonprojective() and not self._causes_gap(node)

    def _causes_gap(self, node):
        """Return True when ``node`` is a non-projective gap while its parent is not."""
        return node.is_nonprojective_gap() and not node.parent.is_nonprojective_gap()

    def _fix_paired_punct(self, root, opening_node, closing_punct):
        """Find the closing partner of ``opening_node`` and hand the pair to ``_fix_pair``.

        Does nothing when the opening node is not tagged PUNCT and either
        ``self.check_paired_punct_upos`` is set or its form is found in the
        string of ASCII quotes. Otherwise scans the nodes after the opening one,
        counting nested repetitions of the opening form, and calls ``_fix_pair``
        on the first closing form reached at nesting level zero.
        """
        if (
            self.check_paired_punct_upos or opening_node.form in "'\""
        ) and opening_node.upos != "PUNCT":
            return
        nested_level = 0
        for node in root.descendants[opening_node.ord :]:
            if node.form == closing_punct:
                if nested_level > 0:
                    nested_level -= 1
                else:
                    self._fix_pair(root, opening_node, node)
                    return
            elif node.form == opening_node.form:
                nested_level += 1

    def _fix_pair(self, root, opening_node, closing_node):
        """Attach an opening/closing pair to head candidates found between them.

        Collects head candidates, returns without changes when there are none,
        otherwise attaches the opening mark to the first and the closing mark
        to the last candidate (by ``ord``), records both marks in
        ``self._punct_type`` and finally re-attaches either mark if it causes a
        non-projective gap.
        """
        # Ideally, paired punctuation symbols should be attached to the single
        # head of the subtree inside. Provided the inside segment is a single
        # subtree.
        heads = []
        punct_heads = []
        for node in root.descendants:
            if node == opening_node or node == closing_node:
                continue
            # If this is a node inside of the pair, is its parent outside?
            if node > opening_node and node < closing_node:
                if node.parent < opening_node or node.parent > closing_node:
                    if node.upos == "PUNCT":
                        punct_heads.append(node)
                    else:
                        heads.append(node)
            # Not only the punctuation symbols must not be attached non-projectively,
            # they also must not cause non-projectivity of other relations. This could
            # happen if an outside node is attached to an inside node. To account for
            # this, mark the inside parent as a head, too.
            elif node.parent > opening_node and node.parent < closing_node:
                if node.parent.upos == "PUNCT":
                    punct_heads.append(node.parent)
                else:
                    heads.append(node.parent)

        # Punctuation should not have children, but if there is no other head candidate,
        # let's break this rule.
        if len(heads) == 0:
            heads = punct_heads
        # If there are no nodes between the opening and closing mark (),
        # let's treat the marks as any other (non-pair) punctuation.
        if len(heads) == 0:
            return
        else:
            # Ideally, there should be only a single head.
            # If not, we could try e.g. to choose the "widest-span head":
            #  opening_node.parent = sorted(heads, key=lambda n: n.descendants(add_self=1)[0].ord)[0]
            #  closing_node.parent = sorted(heads, key=lambda n: -n.descendants(add_self=1)[-1].ord)[0]
            # which often leads to selecting the same head for the opening and closing punctuation
            # ignoring single words inside the paired punct which are non-projectively attached outside.
            # However, this means that the paired punctuation will be attached non-projectively,
            # which is forbidden by the UD guidelines.
            # Thus, we will choose the nearest head, which is the only way how to prevent non-projectivities.
            # Sort the heads by their ords (this is not guaranteed because we were adding a mixture of
            # inside heads and inside parents of outside nodes).
            heads.sort(key=lambda x: x.ord)
            opening_node.parent = heads[0]
            closing_node.parent = heads[-1]

        self._punct_type[opening_node.ord] = "opening"
        self._punct_type[closing_node.ord] = "closing"

        # In rare cases, non-projective gaps may remain. Let's dirty fix these!
        # E.g. in "the (lack of) reproducibility", the closing parenthesis
        # should be attached to "of" rather than to "lack"
        # -- breaking the paired-marks-have-same-parent rule
        # in order to prevent the punct-nonproj-gap bug (recently checked by validator.py).
        if self._causes_gap(opening_node):
            opening_node.parent = opening_node.next_node
            while opening_node.parent.ord < closing_node.ord - 1 and (
                opening_node.parent.upos == "PUNCT"
                or opening_node.is_nonprojective()
                or self._causes_gap(opening_node)
            ):
                opening_node.parent = opening_node.parent.next_node
        if self._causes_gap(closing_node):
            closing_node.parent = closing_node.prev_node
            while closing_node.parent.ord > opening_node.ord + 1 and (
                closing_node.parent.upos == "PUNCT"
                or closing_node.is_nonprojective()
                or self._causes_gap(closing_node)
            ):
                closing_node.parent = closing_node.parent.prev_node

In [None]:
# Location of the treebank files handed to CorpusDraft.
# NOTE(review): the value contains a `*` wildcard -- confirm that CorpusDraft
# accepts a glob pattern rather than a single file or directory.
UD_treebank_file = "../data/UD_output/*.conll"

# Turn the immutable Corpus into a mutable CorpusDraft
draft = CorpusDraft(UD_treebank_file)

# Fix punctuation
# NOTE(review): the comment announces a punctuation fix, but the function that
# is mapped is add_SyntacticLevel_misc_annotation, which is not defined in the
# visible part of this notebook -- confirm whether fix_punct was intended.
draft.map(add_SyntacticLevel_misc_annotation, in_place=True)

# Write the changes back to a Corpus object
corpus = Corpus(draft)

# Scratch exploration of the graph API on the first graph of the corpus,
# using the node with id "3" as the probe.
graph = corpus[0]
node_id = "3"
node = graph[node_id]


deps = graph.sucs.get(node_id, [])
try:
    # deps[0] raises IndexError when the node has no entry in graph.sucs.
    dep_id = deps[0][0]

    # The results of the two calls below are discarded.
    graph.edge(node_id, dep_id)

    graph.lower(node_id, dep_id)
finally:
    # Runs whether or not the lookup above raised; an exception still propagates afterwards.
    print(graph.meta["text"])
#

# NOTE(review): a node id is passed here -- confirm which kind of argument edge_diff_up_to expects.
graph.edge_diff_up_to(node_id)

In [None]:
graph.sucs

# Invert the successor table: for every entry listed under a head, map the
# entry's first element (the dependent's id) back to that head.
parent_lookup = {
    successor[0]: head_id
    for head_id, successors in graph.sucs.items()
    for successor in successors
}