Add first version of v2 conversion script.

UniversalDependencies · Jan 13, 2017 · f69ffb2 · f69ffb2
1 parent 1b4ada7
commit f69ffb2
Show file tree

Hide file tree

Showing 5 changed files with 482 additions and 0 deletions.
diff --git a/v2-conversion/README.md b/v2-conversion/README.md
@@ -0,0 +1,38 @@
+# UD v2 conversion script
+Author: Sebastian Schuster (sebschu@stanford.edu) 
+
+
+This script performs the following automatic updates to make a treebank compliant with the v2 guidelines:
+
+* Rename UPOS tag `CONJ` to `CCONJ`.
+* Rename the `mwe` relation to `fixed`.
+* Rename the `name` relation to `flat`.
+* Rename the `dobj` relation to `obj`.
+* Rename the `(nsubj|csubj|aux)pass` relations to `(nsubj|csubj|aux):pass`.
+* Change some `nmod` relations to `obl` (wherever appropriate). Note that in some cases, 
+  it is ambiguous whether an nmod relation should be nmod or obl. If this is the case, the script  
+  adds the property `ManualCheck=Yes` to the `MISC` column of the relation.
+* Reattach coordinating conjunctions and commas to the succeeding conjunct.
+* (Designed only for English!) Change `neg` relations to `advmod` or `det`. 
+
+Note that this script does NOT perform all required changes. In particular, it does NOT perform the following changes, which either have to be performed manually or using custom scripts.
+
+* New treatment of gapped constructions (using `orphan` relations instead of `remnant` relations).
+* Changes to morphological features.
+* Addition of enhanced dependencies.
+* Changes to tokenization.
+* Changes to copular constructions.
+* Changes to POS tags beyond renaming `CONJ` to `CCONJ`.
+
+IMPORTANT: I only tested this script on the English treebank. It should also work for other languages, but if you run this script on any other treebank, make sure to do thorough manual checks.
+
+
+## Usage
+
+The script requires python3. You can run the script with the following command.
+
+```
+python convert.py PATH_TO_CONLLU_FILE > OUTPUT_PATH
+```
+
+The script will write the converted trees to stdout (which is piped to `OUTPUT_PATH` in the above command). Warnings (including the corresponding tree) are printed to stderr.
diff --git a/v2-conversion/convert.py b/v2-conversion/convert.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+
+
+######################################################################################
+#                                                                                    #
+# This script converts a CoNLL-U formatted treebank which was annotated according to #
+# the v1 guidelines to be compatible with the v2 guidelines.                         #
+#                                                                                    #
+# Limitations:                                                                       #
+#                                                                                    #
+# * This script does NOT update gapped constructions with remnant relations. This    #
+#   has to be done manually.                                                         #
+# * This script does NOT add enhanced dependencies and it does not update the        #
+#   contents of the DEPS field.                                                      #
+# * In some cases, it is ambiguous whether an nmod relation should be nmod or obl in #
+#   v2. If this is the case, the script adds the property ManualCheck=Yes to the     #
+#   MISC column of the relation.                                                     #
+# * The script does NOT rename, add, or remove any morphological features.           #
+#                                                                                    #
+# Author: Sebastian Schuster (sebschu@stanford.edu)                                  #
+#                                                                                    #
+######################################################################################
+
+
+import sys
+import argparse
+
+from depgraph_utils import *
+from processors_universal import *
+from processors_en import *
+
+
+######################################################################################
+# Processors                                                                         #
+#                                                                                    #
+# Each processor in this list is applied to each UD graph in turn.                   #
+# Processors are defined in processors_universal.py and processors_<lang>.py         #
+#                                                                                    #
+# If you implement treebank-specific processors, make sure to add them to this list. #
+######################################################################################
+
+# Conversion pipeline: main() applies these processors to every graph in list order.
+processors = [
+    UPosRenameUpdateProcessor("CONJ", "CCONJ"),
+    RelnRenameUpdateProcessor("mwe", "fixed"),
+    RelnRenameUpdateProcessor("dobj", "obj"),
+    RelnRenameUpdateProcessor("nsubjpass", "nsubj:pass"),
+    RelnRenameUpdateProcessor("csubjpass", "csubj:pass"),
+    RelnRenameUpdateProcessor("auxpass", "aux:pass"),
+    RelnRenameUpdateProcessor("name", "flat"),
+    NmodUpdateProcessor(),
+    CoordinationReattachmentProcessor(),
+    NegationProcessor(),
+]
+
+
+def main():
+    '''
+        Command-line entry point.
+
+        Reads the CoNLL-U file given as the only positional argument, splits it into
+        sentences at blank lines, runs every processor in `processors` on each sentence
+        graph and prints the converted graph to stdout.
+    '''
+
+    parser = argparse.ArgumentParser(description='Convert a CoNLL-U formatted UD treebank from v1 to v2.')
+    parser.add_argument('filename', metavar='FILENAME', type=str, help='Path to CoNLL-U file.')
+    args = parser.parse_args()
+
+    def convert_sentence(sentence_lines):
+        # Build the graph for one sentence, apply the pipeline and emit the result.
+        graph = DependencyGraph(lines=sentence_lines)
+        for processor in processors:
+            processor.process(graph)
+        graph.print_conllu()
+
+    lines = []
+    # CoNLL-U files are UTF-8 encoded; the context manager guarantees the file is closed.
+    with open(args.filename, "r", encoding="utf-8") as f:
+        for line in f:
+            if line.strip() == "":
+                # A blank line terminates the current sentence.
+                if lines:
+                    convert_sentence(lines)
+                lines = []
+            else:
+                lines.append(line)
+
+    # Do not drop the last sentence if the file lacks a trailing blank line.
+    if lines:
+        convert_sentence(lines)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/v2-conversion/depgraph_utils.py b/v2-conversion/depgraph_utils.py
@@ -0,0 +1,175 @@
+#!/usr/bin/env python3
+
+from collections import defaultdict
+import sys
+
+
+'''
+    Utils for reading in UD trees, working with them, and outputting them in CoNLL-U format.
+'''
+
+
+COMMENT_START_CHAR = "#"
+
+class DependencyGraph(object):
+    '''
+        Dependency graph of a single sentence.
+
+        Attributes:
+            nodes:         dict mapping a node index to its DependencyGraphNode; index 0
+                           is an artificial "ROOT" node.
+            edges:         set of DependencyGraphEdge objects.
+            outgoingedges: dict mapping a governor index to a set of (dep, reln) tuples.
+            incomingedges: dict mapping a dependent index to a set of (gov, reln) tuples.
+            comments:      comment lines of the sentence, in input order.
+    '''
+
+    def __init__(self, lines=None):
+        '''
+            Creates a graph that contains only the ROOT node and, if lines is given,
+            fills it from these CoNLL-U lines (see _parse_conllu).
+        '''
+        root_node = DependencyGraphNode(0, "ROOT")
+
+        self.nodes = {0: root_node}
+        self.edges = set()
+        self.outgoingedges = defaultdict(set)
+        self.incomingedges = defaultdict(set)
+        self.comments = []
+
+        if lines is not None:
+            self._parse_conllu(lines)
+
+    def _parse_conllu(self, lines):
+        '''
+            Adds the nodes, basic dependency edges and comments found in lines.
+
+            Every non-comment line has to consist of exactly ten tab-separated columns
+            with integer ID and HEAD values; otherwise a ValueError is raised. This means
+            that IDs such as "1-2" or "1.1" are not supported.
+        '''
+        for line in lines:
+            line = line.strip()
+            if line.startswith(COMMENT_START_CHAR):
+                self.comments.append(line)
+                continue
+
+            #TODO: support enhanced dependencies
+            idx, form, lemma, upos, pos, feats, gov, reln, deps, misc = line.split("\t")
+            idx = int(idx)
+            self.nodes[idx] = DependencyGraphNode(idx, form, lemma=lemma, upos=upos, pos=pos,
+                                                  features=feats, misc=misc, enhanced=deps)
+            self.add_edge(int(gov), idx, reln)
+
+    def add_edge(self, gov, dep, reln):
+        '''
+            Adds an edge with relation reln from node index gov to node index dep.
+        '''
+        edge = DependencyGraphEdge(gov, dep, reln)
+        self.edges.add(edge)
+        self.outgoingedges[gov].add((dep, reln))
+        self.incomingedges[dep].add((gov, reln))
+
+    def remove_edge(self, gov, dep, reln=None):
+        '''
+            Removes the edge from gov to dep with relation reln. If reln is None, all
+            edges from gov to dep are removed (possibly none).
+
+            Raises a KeyError if reln is given and the graph has no such edge.
+        '''
+        if reln is None:
+            to_remove = {edge for edge in self.edges
+                         if edge.gov == gov and edge.dep == dep}
+            # Only the matching edges may be dropped from the adjacency indices.
+            for edge in to_remove:
+                self.outgoingedges[gov].discard((dep, edge.relation))
+                self.incomingedges[dep].discard((gov, edge.relation))
+            self.edges.difference_update(to_remove)
+        else:
+            edge = DependencyGraphEdge(gov, dep, reln)
+            self.edges.remove(edge)
+            self.outgoingedges[gov].remove((dep, reln))
+            self.incomingedges[dep].remove((gov, reln))
+
+    def has_edge(self, gov, dep, reln=None):
+        '''
+            Returns True if there is an edge from gov to dep with relation reln, or with
+            any relation if reln is None; returns False otherwise.
+        '''
+        if reln is None:
+            return any(edge.gov == gov and edge.dep == dep for edge in self.edges)
+        return DependencyGraphEdge(gov, dep, reln) in self.edges
+
+    def dependendents_with_reln(self, gov, reln):
+        '''
+            Returns a list of node indices which are attached to gov via reln.
+        '''
+        return [dep for (dep, reln2) in self.outgoingedges[gov] if reln == reln2]
+
+    def print_conllu(self, f=None):
+        '''
+            Writes the comments, one ten-column line per node (the ROOT node is skipped)
+            and a terminating blank line to f (default: the current sys.stdout).
+
+            A node without an incoming edge is written with head -1 and relation "null".
+            If a node has several incoming edges, an arbitrary one of them is written.
+        '''
+        if f is None:
+            # Resolved at call time so that a redirected sys.stdout is honored.
+            f = sys.stdout
+
+        for comment in self.comments:
+            print(comment, file=f)
+
+        for idx in sorted(self.nodes):
+            if idx > 0:
+                node = self.nodes[idx]
+                parents = self.incomingedges[node.index]
+                gov, reln = next(iter(parents)) if len(parents) > 0 else (-1, "null")
+                columns = (node.index, node.form, node.lemma, node.upos, node.pos,
+                           node.features, gov, reln, node.enhanced, node.misc)
+                print("\t".join(str(column) for column in columns), file=f)
+
+        print(file=f)
+
+
+
+class DependencyGraphNode(object):
+    '''
+        A single token of a dependency graph together with its annotations.
+
+        Equality and hashing take the index, form, lemma, upos, pos, features and misc
+        attributes into account; the enhanced attribute is ignored by both.
+    '''
+
+    def __init__(self, index, form, lemma=None, upos=None, pos=None, features=None, enhanced=None, misc=None):
+        self.index = index
+        self.form = form
+        self.lemma = lemma
+        self.upos = upos
+        self.pos = pos
+        self.features = features
+        self.misc = misc
+        self.enhanced = enhanced
+
+    def _key(self):
+        # Attributes that define the identity of a node (shared by __eq__ and __hash__).
+        return (self.index, self.form, self.lemma, self.upos, self.pos,
+                self.features, self.misc)
+
+    def __hash__(self):
+        # A tuple hash is order-sensitive, unlike a plain sum of the field hashes.
+        return hash(self._key())
+
+    def __eq__(self, other):
+        if not isinstance(other, DependencyGraphNode):
+            # Let Python fall back to its default handling instead of raising an
+            # AttributeError for objects which are not nodes.
+            return NotImplemented
+        return self._key() == other._key()
+
+    def __str__(self):
+        return self.form + "-" + str(self.index)
+
+
+
+class DependencyGraphEdge(object):
+    '''
+        A labeled edge from the node with index gov to the node with index dep.
+
+        Two edges are equal if their gov, dep and relation attributes are equal.
+    '''
+
+    def __init__(self, gov, dep, relation):
+        self.gov = gov
+        self.dep = dep
+        self.relation = relation
+
+    def __hash__(self):
+        # A tuple hash is order-sensitive, so (1, 2) and (2, 1) no longer collide.
+        return hash((self.gov, self.dep, self.relation))
+
+    def __eq__(self, other):
+        if not isinstance(other, DependencyGraphEdge):
+            # Let Python fall back to its default handling instead of raising an
+            # AttributeError for objects which are not edges.
+            return NotImplemented
+        return (self.gov, self.dep, self.relation) == (other.gov, other.dep, other.relation)
+
diff --git a/v2-conversion/processors_en.py b/v2-conversion/processors_en.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+import sys
+
+from processors_universal import *
+
+class NegationProcessor(UpdateProcessor):
+    '''
+        Turns deprecated "neg" relation into "det" or "advmod" relation, depending on its syntactic
+        function (as determined by the POS tag of the negating token).
+    '''
+
+    def process(self, graph):
+        '''
+            Relabels the "neg" edges of graph in place: "advmod" if the dependent's UPOS
+            tag is ADV or PART, "det" if it is DET. For any other tag the edge is left
+            unchanged and a warning followed by the graph is printed to stderr.
+        '''
+
+        # Stores changes in the form (old_gov, old_dep, old_reln, new_reln).
+        # (new_gov = old_gov and new_dep = old_dep, so we don't have to store them.)
+        # The changes are applied after the loop because graph.edges must not be
+        # modified while it is being iterated over.
+        reln_changes = []
+
+        for edge in graph.edges:
+            if edge.relation != "neg":
+                continue
+
+            dep = graph.nodes[edge.dep]
+            if dep.upos in ['ADV', 'PART']:
+                reln_changes.append((edge.gov, edge.dep, "neg", "advmod"))
+            elif dep.upos == "DET":
+                reln_changes.append((edge.gov, edge.dep, "neg", "det"))
+            else:
+                print("WARNING: Dependent of neg relation is neither ADV, PART, nor DET. " +
+                      "You'll have to manually update the relation.", file=sys.stderr)
+                graph.print_conllu(f=sys.stderr)
+
+        for (gov, dep, old_reln, new_reln) in reln_changes:
+            graph.remove_edge(gov, dep, old_reln)
+            graph.add_edge(gov, dep, new_reln)
+
+