Skip to content

Commit

Permalink
Add first version of v2 conversion script.
Browse files Browse the repository at this point in the history
  • Loading branch information
sebschu committed Jan 13, 2017
1 parent 1b4ada7 commit f69ffb2
Show file tree
Hide file tree
Showing 5 changed files with 482 additions and 0 deletions.
38 changes: 38 additions & 0 deletions v2-conversion/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# UD v2 conversion script
Author: Sebastian Schuster (sebschu@stanford.edu)


This script performs the following automatic updates to a treebank to make it compliant with the v2 guidelines:

* Rename UPOS tag `CONJ` to `CCONJ`.
* Rename the `mwe` relation to `fixed`.
* Rename the `name` relation to `flat`.
* Rename the `dobj` relation to `obj`.
* Rename the `(nsubj|csubj|aux)pass` relations to `(nsubj|csubj|aux):pass`.
* Change some `nmod` relations to `obl` (wherever appropriate). Note that in some cases,
it is ambiguous whether an nmod relation should be nmod or obl. If this is the case, the script
adds the property `ManualCheck=Yes` to the `MISC` column of the relation.
* Reattach coordinating conjunctions and commas to the succeeding conjunct.
* (Designed only for English!) Change `neg` relations to `advmod` or `det`.

Note that this script does NOT perform all required changes. In particular, it does NOT perform the following changes, which either have to be performed manually or using custom scripts.

* New treatment of gapped constructions (using `orphan` relations instead of `remnant` relations).
* Changes to morphological features.
* Addition of enhanced dependencies.
* Changes to tokenization.
* Changes to copular constructions.
* Changes to POS tags beyond renaming `CONJ` to `CCONJ`.

IMPORTANT: I only tested this script on the English treebank. It should also work for other languages, but if you run this script on any other treebank, make sure to do thorough manual checks.


## Usage

The script requires python3. You can run the script with the following command.

```
python3 convert.py PATH_TO_CONLLU_FILE > OUTPUT_PATH
```

The script will write the converted trees to stdout (which is redirected to `OUTPUT_PATH` in the above command). Warnings (including the corresponding tree) are printed to stderr.
78 changes: 78 additions & 0 deletions v2-conversion/convert.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env python3


######################################################################################
# #
# This script converts a CoNLL-U formatted treebank which was annotated according to #
# the v1 guidelines to be compatible with the v2 guidelines. #
# #
# Limitations: #
# #
# * This script does NOT update gapped constructions with remnant relations. This #
# has to be done manually. #
# * This script does NOT add enhanced dependencies and it does not update the #
# contents of the DEPS field. #
# * In some cases, it is ambiguous whether an nmod relation should be nmod or obl in #
# v2. If this is the case, the script adds the property ManualCheck=Yes to the #
# MISC column of the relation. #
# * The script does NOT rename, add, or remove any morphological features. #
# #
# Author: Sebastian Schuster (sebschu@stanford.edu) #
# #
######################################################################################


import sys
import argparse

from depgraph_utils import *
from processors_universal import *
from processors_en import *


######################################################################################
# Processors #
# #
# Each processor in this list is applied to each UD graph in turn. #
# Processors are defined in processors_universal.py and processors_lang #
# #
# If you implement treebank-specific processors, make sure to add them to this list. #
######################################################################################

# Conversion pipeline: main() calls process(graph) on every entry for each
# sentence graph, in the order listed here.
processors = [UPosRenameUpdateProcessor("CONJ", "CCONJ"),
RelnRenameUpdateProcessor("mwe", "fixed"),
RelnRenameUpdateProcessor("dobj", "obj"),
RelnRenameUpdateProcessor("nsubjpass", "nsubj:pass"),
RelnRenameUpdateProcessor("csubjpass", "csubj:pass"),
RelnRenameUpdateProcessor("auxpass", "aux:pass"),
RelnRenameUpdateProcessor("name", "flat"),
NmodUpdateProcessor(),
CoordinationReattachmentProcessor(),
# NegationProcessor is defined in processors_en.py; the README describes the
# neg conversion as designed for English only.
NegationProcessor()
]


def _convert_sentence(sentence_lines):
    """Parse one sentence, run every processor on it, and print it to stdout."""
    graph = DependencyGraph(lines=sentence_lines)
    for processor in processors:
        processor.process(graph)
    graph.print_conllu()


def main():
    """Convert the CoNLL-U file named on the command line from UD v1 to v2.

    The input is read sentence by sentence; sentences are delimited by blank
    lines. Each sentence is parsed into a DependencyGraph, passed through all
    entries of ``processors`` in order, and written to stdout in CoNLL-U
    format.
    """
    parser = argparse.ArgumentParser(description='Convert a CoNLL-U formatted UD treebank from v1 to v2.')
    parser.add_argument('filename', metavar='FILENAME', type=str, help='Path to CoNLL-U file.')
    args = parser.parse_args()

    lines = []
    # CoNLL-U files are UTF-8 by specification, so do not rely on the locale's
    # default encoding. The context manager guarantees the file is closed.
    with open(args.filename, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                lines.append(line)
            elif lines:
                # A blank line terminates the current sentence.
                _convert_sentence(lines)
                lines = []

    # Do not lose the last sentence when the file lacks a trailing blank line.
    if lines:
        _convert_sentence(lines)


if __name__ == '__main__':
    main()
175 changes: 175 additions & 0 deletions v2-conversion/depgraph_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,175 @@
#!/usr/bin/env python3

from collections import defaultdict
import sys


'''
Utils for reading in UD trees, working with them, and outputting them in CoNLL-U format.
'''


COMMENT_START_CHAR = "#"

class DependencyGraph(object):
    """Basic dependency graph of a single sentence read from CoNLL-U lines.

    Attributes:
        nodes: dict mapping token index to DependencyGraphNode. Index 0 is an
            artificial ROOT node that is never printed.
        edges: set of DependencyGraphEdge objects.
        outgoingedges: governor index -> set of (dependent index, relation).
        incomingedges: dependent index -> set of (governor index, relation).
        comments: comment lines of the sentence, in input order.
    """

    def __init__(self, lines=None):
        """Create an empty graph, or parse ``lines`` (CoNLL-U lines of one sentence)."""
        root_node = DependencyGraphNode(0, "ROOT")

        self.nodes = {0: root_node}
        self.edges = set()
        self.outgoingedges = defaultdict(set)
        self.incomingedges = defaultdict(set)
        self.comments = []

        if lines is not None:
            self._parse_conllu(lines)

    def _parse_conllu(self, lines):
        """Fill the graph with the nodes, edges and comments found in ``lines``.

        Raises:
            ValueError: if a token line does not have exactly ten tab-separated
                columns, or if its ID or HEAD column is not an integer.
        """
        for line in lines:
            line = line.strip()
            if not line:
                # Tolerate stray blank lines instead of failing on the split.
                continue
            if line.startswith(COMMENT_START_CHAR):
                self.comments.append(line)
                continue

            # NOTE(review): multiword-token IDs such as "1-2" are not integers,
            # so int() below raises ValueError for them. They are not
            # supported here.
            # TODO: support enhanced dependencies (the DEPS column is only
            # carried through unchanged).
            idx, form, lemma, upos, pos, feats, gov, reln, deps, misc = line.split("\t")
            idx = int(idx)
            self.nodes[idx] = DependencyGraphNode(idx, form, lemma=lemma, upos=upos, pos=pos,
                                                  features=feats, misc=misc, enhanced=deps)
            self.add_edge(int(gov), idx, reln)

    def add_edge(self, gov, dep, reln):
        """Add the edge gov --reln--> dep to the edge set and both adjacency maps."""
        edge = DependencyGraphEdge(gov, dep, reln)
        self.edges.add(edge)
        self.outgoingedges[gov].add((dep, reln))
        self.incomingedges[dep].add((gov, reln))

    def remove_edge(self, gov, dep, reln=None):
        """Remove edges from ``gov`` to ``dep``.

        With ``reln=None`` every edge between the two nodes is removed (it is
        not an error if there is none). Otherwise exactly the edge with that
        relation is removed.

        Raises:
            KeyError: if ``reln`` is given and no such edge exists.
        """
        if reln is None:
            # Collect first: self.edges must not change while it is iterated.
            to_remove = {edge for edge in self.edges
                         if edge.gov == gov and edge.dep == dep}
            for edge in to_remove:
                self.outgoingedges[gov].remove((dep, edge.relation))
                self.incomingedges[dep].remove((gov, edge.relation))
            self.edges.difference_update(to_remove)
        else:
            edge = DependencyGraphEdge(gov, dep, reln)
            self.edges.remove(edge)
            self.outgoingedges[gov].remove((dep, reln))
            self.incomingedges[dep].remove((gov, reln))

    def has_edge(self, gov, dep, reln=None):
        """Return True if an edge gov -> dep exists, False otherwise.

        With ``reln=None`` any relation matches; otherwise the relation has to
        match as well.
        """
        if reln is None:
            # any() yields a real bool; falling off the loop returned None.
            return any(edge.gov == gov and edge.dep == dep for edge in self.edges)
        return DependencyGraphEdge(gov, dep, reln) in self.edges

    def dependendents_with_reln(self, gov, reln):
        """Return a list of node indices which are attached to gov via reln."""
        # .get() avoids inserting an empty entry into the defaultdict.
        return [dep for (dep, reln2) in self.outgoingedges.get(gov, ())
                if reln == reln2]

    def print_conllu(self, f=None):
        """Write the sentence in CoNLL-U format, followed by a blank line.

        Args:
            f: writable text stream. ``None`` (the default) means the current
                ``sys.stdout``, looked up by print() at call time.

        A node without an incoming edge is printed with head -1 and relation
        "null". If a node has several incoming edges, an arbitrary one of them
        is printed.
        """
        for comment in self.comments:
            print(comment, file=f)

        for idx in sorted(self.nodes):
            if idx <= 0:
                # Skip the artificial ROOT node.
                continue
            node = self.nodes[idx]
            parents = self.incomingedges.get(node.index, ())
            gov, reln = next(iter(parents)) if parents else (-1, "null")
            columns = (node.index, node.form, node.lemma, node.upos, node.pos,
                       node.features, gov, reln, node.enhanced, node.misc)
            print("\t".join(str(column) for column in columns), file=f)

        print(file=f)



class DependencyGraphNode(object):
    """One token of a sentence, holding the non-dependency CoNLL-U columns.

    Equality and hashing are based on index, form, lemma, upos, pos, features
    and misc; ``enhanced`` is not part of the comparison key. The attributes
    are mutable and the hash depends on them, so do not modify a node while it
    is stored in a set or used as a dict key.
    """

    def __init__(self, index, form, lemma=None, upos=None, pos=None, features=None, enhanced=None, misc=None):
        self.index = index
        self.form = form
        self.lemma = lemma
        self.upos = upos
        self.pos = pos
        self.features = features
        self.misc = misc
        self.enhanced = enhanced

    def _key(self):
        """Return the tuple of attributes that defines node identity."""
        return (self.index, self.form, self.lemma, self.upos, self.pos,
                self.features, self.misc)

    def __hash__(self):
        # Hashing the tuple is order-sensitive, unlike a plain sum of hashes.
        return hash(self._key())

    def __eq__(self, other):
        # Returning NotImplemented lets comparisons with foreign objects
        # (e.g. None) evaluate to False instead of raising AttributeError.
        if not isinstance(other, DependencyGraphNode):
            return NotImplemented
        return self._key() == other._key()

    def __repr__(self):
        return "DependencyGraphNode(%r, %r)" % (self.index, self.form)

    def __str__(self):
        return self.form + "-" + str(self.index)



class DependencyGraphEdge(object):
    """A labelled, directed dependency edge gov --relation--> dep.

    ``gov`` and ``dep`` are node indices. Two edges are equal when governor,
    dependent and relation are all equal.
    """

    def __init__(self, gov, dep, relation):
        self.gov = gov
        self.dep = dep
        self.relation = relation

    def _key(self):
        """Return the tuple of attributes that defines edge identity."""
        return (self.gov, self.dep, self.relation)

    def __hash__(self):
        # A tuple hash is direction-sensitive; summing the hashes made
        # (1, 2, r) and (2, 1, r) collide every time.
        return hash(self._key())

    def __eq__(self, other):
        # Returning NotImplemented lets comparisons with foreign objects
        # evaluate to False instead of raising AttributeError.
        if not isinstance(other, DependencyGraphEdge):
            return NotImplemented
        return self._key() == other._key()

    def __repr__(self):
        return "DependencyGraphEdge(%r, %r, %r)" % self._key()

35 changes: 35 additions & 0 deletions v2-conversion/processors_en.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env python3

import sys

from processors_universal import *

class NegationProcessor(UpdateProcessor):
    """Turn the deprecated "neg" relation into "det" or "advmod".

    The new relation depends on the syntactic function of the negating token,
    as determined by its UPOS tag: ADV and PART become "advmod", DET becomes
    "det". For any other tag the edge is left unchanged and a warning plus the
    whole tree is printed to stderr.
    """

    def process(self, graph):
        """Relabel all "neg" edges of ``graph`` in place."""
        # Stores changes in the form (old_gov, old_dep, old_reln, new_reln).
        # (new_gov = old_gov and new_dep = old_dep, so we don't have to store them.)
        # The changes are applied after the loop because graph.edges must not
        # be modified while it is being iterated.
        reln_changes = []

        for edge in graph.edges:
            if edge.relation != "neg":
                continue
            dep = graph.nodes[edge.dep]
            if dep.upos in ('ADV', 'PART'):
                reln_changes.append((edge.gov, edge.dep, "neg", "advmod"))
            elif dep.upos == "DET":
                reln_changes.append((edge.gov, edge.dep, "neg", "det"))
            else:
                # The two sentences were previously joined without a space.
                print("WARNING: Dependent of neg relation is neither ADV, PART, nor DET. "
                      "You'll have to manually update the relation.", file=sys.stderr)
                graph.print_conllu(f=sys.stderr)

        for (gov, dep, old_reln, new_reln) in reln_changes:
            graph.remove_edge(gov, dep, old_reln)
            graph.add_edge(gov, dep, new_reln)


Loading

0 comments on commit f69ffb2

Please sign in to comment.