Merge pull request #6 from System-T/issue-5

closes issue #5
UniversalPropositions · Feb 21, 2020 · 60e2fb8 · 60e2fb8
2 parents 41f3007 + 8043150
commit 60e2fb8
Show file tree

Hide file tree

Showing 9 changed files with 306,567 additions and 1 deletion.
diff --git a/README.md b/README.md
@@ -57,7 +57,13 @@ The German verb 'unterstützt' is labeled as evoking the '**support.01**' frame
 
 ### Format 
 
-The universal propbank (UP) for each language consists of three files in CoNLL-U format (one for training, dev and test data). In addition, each language has a folder with verb overview files in html format. These files can be viewed in a browser and give an overview of all English frames that each target language verb can evoke. 
+The universal propbank (UP) for each language consists of three files (training, dev and test data) with the extension `.conllu` but currently encoding an extension of the [CoNLL-U format](http://universaldependencies.org/format.html). The extension is based on the CoNLL format produced by the [Propbank conversion scripts](https://github.com/propbank/propbank-release/blob/master/docs/conll-conversion-notes.md), called `.gold_conll`. 
+
+Besides the original 10 columns from the CoNLL-U format, the roleset column (column 11) gives the actual sense used, and that sense provides roleset-specific meanings for each of the numbered arguments. Every column after the eleventh corresponds to one predicate, in the order in which the predicates appear in the sentence. Note that the Propbank `.gold_conll` files contain a "frame file" column (column 11) that lets you know which ".xml" [file](https://github.com/propbank/propbank-frames/) contains the actual semantic form for the predicate in question (which is not always the same as the predicate: one must reference "lighten.xml" for lighten_up.02), but since every predicate identifier is unique, we have not preserved this column.
+
+The English dataset was the only one obtained in a different manner. See the README.org file in that directory for information.
+
+In addition, each language has a folder with verb overview files (produced from the frame files) in html format. These files can be viewed in a browser and give an overview of all English frames that each target language verb can evoke.
 
 ### Scope
 

diff --git a/UP_English-EWT/README.org b/UP_English-EWT/README.org
@@ -0,0 +1,282 @@
+#+Title: The EN Universal Propbank
+
+The files in this directory were obtained by merging the following
+sources:
+
+1. https://github.com/universaldependencies/UD_English-EWT
+2. https://github.com/propbank/propbank-release
+3. https://catalog.ldc.upenn.edu/LDC2012T13
+
+The main idea is to project the rolesets and their arguments (2), which
+are annotated on top of the constituent analysis in the original treebank
+(3), onto the dependency trees from the UD analysis (1). 
+
+Consider the sentence and its original annotations obtained from
+(2,3):
+
+#+BEGIN_EXAMPLE
+sent_id = weblog-blogspot.com_thelameduck_20041119192207_ENG_20041119_192207-0008
+text = With the demand so high, the question arises on to who should be or has the right to be the Santa of nuclear weapons.
+─┮  
+ │   ╭─╼With SCONJ mark			*	*		*	(ARGM-ADV 
+ │   │ ╭─╼the DET det			*	(ARG1 
+ │   ├─┶demand NOUN nsubj	        (V*)	*)		 
+ │   ├─╼so ADV advmod			*	(ARGM-EXT*)	 
+ │ ╭─┶high ADJ advcl			*	(V*)		*	*)	 
+ │ ├─╼, PUNCT punct   
+ │ │ ╭─╼the DET det			*	*		*	(ARG1 
+ │ ├─┶question NOUN nsubj		*	*		(V*)	*)	 
+ ╰─┾arises VERB root			*	*		*	(V*)	 
+   │ ╭─╼on ADP mark			*	*		(ARG1*	(C-ARG1 
+   │ ├─╼to SCONJ mark   
+   │ ├─╼who PRON nsubj			*	*		*	*	(ARG1*)	(ARG0*)	 
+   │ ├─╼should AUX aux			*	*		*	*	(ARGM-MOD*)	 
+   ├─┾be AUX obl			*	*		*	*	(V*)		 
+   │ │ ╭─╼or CCONJ cc   
+   │ ╰─┾has VERB conj			*	*		*	*	*		(V*)		 
+   │   │ ╭─╼the DET det			*	*		*	*	*		(ARG1 
+   │   ╰─┾right NOUN obj		*	*		*	*	*		*		(V*)	 
+   │     │ ╭─╼to PART mark		*	*		*	*	*		*		(ARG2 
+   │     │ ├─╼be AUX cop		*	*		*	*	*		*		*	(V*) 
+   │     │ ├─╼the DET det		*	*		*	*	(ARG2*		*		*	(ARG2 
+   │     ╰─┾Santa PROPN acl   
+   │       │ ╭─╼of ADP case   
+   │       │ ├─╼nuclear ADJ amod   
+   │       ╰─┶weapons NOUN nmod	        *	*		*)	*)	*)		*)		*)	*) 
+   ╰─╼. PUNCT punct   
+#+END_EXAMPLE
+
+The same sentence in the final `.conllu` files will be annotated as
+below. Each argument mark is placed on the head of the subtree that
+corresponds to the original constituent.
+
+#+BEGIN_EXAMPLE
+─┮  
+ │   ╭─╼With SCONJ mark			_	_		_	_		_		_	_	_ 
+ │   │ ╭─╼the DET det			_	_		_	_		_		_	_	_ 
+ │   ├─┶demand NOUN nsubj		V	ARG1		_	_		_		_	_	_ 
+ │   ├─╼so ADV advmod			_	ARGM-EXT	_	_		_		_	_	_ 
+ │ ╭─┶high ADJ advcl			_	V		_	ARGM-ADV	_		_	_	_ 
+ │ ├─╼, PUNCT punct			_	_		_	_		_		_	_	_ 
+ │ │ ╭─╼the DET det			_	_		_	_		_		_	_	_ 
+ │ ├─┶question NOUN nsubj		_	_		V	ARG1		_		_	_	_ 
+ ╰─┾arises VERB root			_	_		_	V		_		_	_	_ 
+   │ ╭─╼on ADP mark			_	_		_	_		_		_	_	_ 
+   │ ├─╼to SCONJ mark			_	_		_	_		_		_	_	_ 
+   │ ├─╼who PRON nsubj			_	_		_	_		ARG1		ARG0	_	_ 
+   │ ├─╼should AUX aux			_	_		_	_		ARGM-MOD	_	_	_ 
+   ├─┾be AUX obl			_	_		ARG1	C-ARG1		V		_	_	_ 
+   │ │ ╭─╼or CCONJ cc			_	_		_	_		_		_	_	_ 
+   │ ╰─┾has VERB conj			_	_		_	_		_		V	_	_ 
+   │   │ ╭─╼the DET det			_	_		_	_		_		_	_	_ 
+   │   ╰─┾right NOUN obj		_	_		_	_		_		ARG1	V	_ 
+   │     │ ╭─╼to PART mark		_	_		_	_		_		_	_	_ 
+   │     │ ├─╼be AUX cop		_	_		_	_		_		_	_	V 
+   │     │ ├─╼the DET det		_	_		_	_		_		_	_	_ 
+   │     ╰─┾Santa PROPN acl		_	_		_	_		ARG2		_	ARG2	ARG2 
+   │       │ ╭─╼of ADP case		_	_		_	_		_		_	_	_ 
+   │       │ ├─╼nuclear ADJ amod	_	_		_	_		_		_	_	_ 
+   │       ╰─┶weapons NOUN nmod		_	_		_	_		_		_	_	_ 
+   ╰─╼. PUNCT punct			_	_		_	_		_		_	_	_ 
+#+END_EXAMPLE
+
+** How to execute the code
+
+The code is expected to work only on macOS or Linux. To re-run it you
+will need the following tools:
+
+- Common Lisp implementation. Tested in http://sbcl.org
+- The GNU Awk and Bash
+
+When will these files need to be regenerated? Whenever a new release of
+any of the datasets (1-3) above is made public.
+
+The dataset (3) was used to complete the stand-off files from the
+Propbank dataset. We followed the procedure described in the README of
+the propbank release data to produce the =.gold_conll= files using the
+script =map_all_to_conll.py= provided in the propbank release.
+
+Assume that you have cloned the propbank repository (2) above to the
+directory =$PBHOME= and that this directory is =$UPHOME=. Before
+executing the commands below, edit the =merge.lisp= file, changing the
+location of the UD files in the function =main=. Finally execute:
+
+#+BEGIN_SRC bash
+cd $PBHOME
+find data/google -name "*.gold_conll" -exec awk -f $UPHOME/conll-to-conllu.awk {} \; > propbank-all.conllu
+mv propbank-all.conllu $UPHOME/
+cd $UPHOME
+sh make.sh
+#+END_SRC
+
+The output format is described in the README file of this
+repository. 
+
+** Statistics
+
+Top 20 predicates in the dataset:
+
+#+BEGIN_EXAMPLE
+% awk '$0 ~ /^[0-9]/ {if (NF > 10) print $11; else print "NP";}' en_ewt-up-*.conllu | sort | uniq -c | sort -nr | head -20
+204270 _
+6076 be.01
+2949 be.03
+1583 have.01
+1176 do.01
+1070 have.03
+ 544 say.01
+ 511 do.02
+ 510 know.01
+ 472 be.02
+ 417 go.02
+ 412 want.01
+ 375 see.01
+ 374 thank.01
+ 368 need.01
+ 346 think.01
+ 344 use.01
+ 338 get.01
+ 301 NP
+ 289 work.01
+#+END_EXAMPLE   
+
+Verbs not tagged as predicate:
+
+#+BEGIN_EXAMPLE
+% awk '$4 ~ /^V/ && $10 !~ /\(V\*/ {print}' propbank-all.conllu | wc -l
+    1133
+#+END_EXAMPLE
+
+Words tagged as predicates by POS tags (in the original PTB tags):
+
+#+BEGIN_EXAMPLE
+% awk '$10 ~ /\(V\*/  {print $4}' propbank-all.conllu | sort | uniq -c
+   1 CD
+   3 GW
+2679 JJ
+ 158 JJR
+  13 JJS
+6724 NN
+2041 NNS
+   1 RB
+11663 VB
+6398 VBD
+3648 VBG
+4300 VBN
+6853 VBP
+5777 VBZ
+#+END_EXAMPLE
+
+The same but in the output files:
+
+#+BEGIN_EXAMPLE
+% awk '$0 ~ /^[0-9]/ && $11 != "_" {print $4}' en_ewt*.conllu | sort | uniq -c | sort -nr | head -20
+27349 VERB
+11326 AUX
+8848 NOUN
+2912 ADJ
+  40 PUNCT
+  23 ADP
+  21 DET
+  19 ADV
+  18 PROPN
+  11 CCONJ
+   6 PRON
+   4 X
+   3 NUM
+   3 INTJ
+   2 PART
+   1 SCONJ
+#+END_EXAMPLE
+
+Number of lines with discontinuous (C-) and reference (R-) arguments:
+
+#+BEGIN_EXAMPLE
+% awk '$0 ~ /C-V\*/ {print}' propbank-all.conllu | wc -l
+     187
+
+% awk '$0 ~ /C-ARG[0-9]\*/ {print}' propbank-all.conllu | wc -l
+     695
+
+% awk '$0 ~ /R-ARG[0-9]\*/ {print}' propbank-all.conllu | wc -l
+    1292
+#+END_EXAMPLE
+
+** Notes
+
+1. Constituents split into multiple subtrees
+
+In some sentences, arguments in a constituent span over multiple
+subtrees. See the case of 's' and 'call' as ARG1 of the verb
+'let'. Currently, our solution is to duplicate the argument in the
+column as shown below. Note also the case of 'call off' (discontinuous
+verb).
+
+#+BEGIN_EXAMPLE
+sent_id = answers-20070723111604AAzUvhb_ans-0006
+text = Lets call the whole thing off.
+─┮  
+ ╰─┮Let VERB root		V	_ 
+   ├─╼s PRON obj		ARG1	ARG0 
+   ├─┮call VERB xcomp		ARG1	V 
+   │ │ ╭─╼the DET det		_	_ 
+   │ │ ├─╼whole ADJ amod	_	_ 
+   │ ├─┶thing NOUN obj		_	ARG1 
+   │ ╰─╼off ADP compound:prt	_	C-V 
+   ╰─╼. PUNCT punct		_	_ 
+#+END_EXAMPLE
+
+
+2. Difference between PTB POS tag and UD XPOSTAG
+
+The documentation of the UD_English-EWT treebank says that the XPOSTAG
+field preserves the original value of the dataset (3), with manual
+annotation and corrections. We found 194 tokens with different values;
+they are marked in the MISC field with the key PTBPOS.
+
+#+BEGIN_EXAMPLE
+% grep PTBPOS en_ewt*.conllu | wc -l
+     194
+#+END_EXAMPLE
+
+The token below is one example of this case. In UD, the XPOSTAG
+of token 4 is 'VBG', but in the LDC dataset (3) the POS is 'NN'.
+
+#+BEGIN_EXAMPLE
+# sent_id = weblog-blogspot.com_rigorousintuition_20060511134300_ENG_20060511_134300-0085
+# text = It is the marrying off of young girls to older men often close relatives that enfurates me.
+1       It      it      PRON    PRP     Case=Nom|Gender=Neut|Number=Sing|Person=3|PronType=Prs  2       expl    2:expl  _       _       _       _       _
+2       is      be      VERB    VBZ     Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin   0       root    0:root  _       be.01   V       _       _
+3       the     the     DET     DT      Definite=Def|PronType=Art       4       det     4:det   _       _       _       _       _
+4       marrying        marrying        NOUN    VBG     Number=Sing     2       nsubj   2:nsubj|16:nsubj        PTBPOS=NN       marry_off.02    ARG2    V       ARG0
+5       off     off     NOUN    NN      Number=Sing     4       advmod  4:advmod        _       _       _       _       _
+6       of      of      ADP     IN      _       8       case    8:case  _       _       _       _       _
+7       young   young   ADJ     JJ      Degree=Pos      8       amod    8:amod  _       _       _       _       _
+8       girls   girl    NOUN    NNS     Number=Plur     4       nmod    4:nmod:of       _       _       _       ARG1    _
+9       to      to      ADP     IN      _       11      case    11:case _       _       _       _       _
+10      older   older   ADJ     JJR     Degree=Cmp      11      amod    11:amod _       _       _       _       _
+11      men     man     NOUN    NNS     Number=Plur     4       nmod    4:nmod:to       _       _       _       ARG2    _
+12      often   often   ADV     RB      _       14      advmod  14:advmod       _       _       _       _       _
+13      close   close   ADJ     JJ      Degree=Pos      14      amod    14:amod _       _       _       _       _
+14      relatives       relative        NOUN    NNS     Number=Plur     11      appos   11:appos        _       _       _       _       _
+15      that    that    PRON    WDT     PronType=Rel    16      nsubj   4:ref   _       _       _       _       R-ARG0
+16      enfurates       enfurate        VERB    VBZ     Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin   4       acl:relcl       4:acl:relcl     _       infuriate.01    ARG1    _       V
+17      me      I       PRON    PRP     Case=Acc|Number=Sing|Person=1|PronType=Prs      16      obj     16:obj  SpaceAfter=No   _       _       _       ARG1
+18      .       .       PUNCT   .       _       2       punct   2:punct _       _       _       _       _
+#+END_EXAMPLE
+
+3. Missing sentences or mismatch in the number of tokens
+
+In the final data, a few sentences were annotated with a =metadata=
+field. The value =no-up= means the sentence was not in the propbank release
+(reported [[https://github.com/propbank/propbank-release/issues/7][here]]). The only case where the sentence has a different number
+of tokens in the datasets (1,2/3) is marked with =diff-number-tokens=
+(reported [[https://github.com/propbank/propbank-release/issues/8][here]]).
+
+#+BEGIN_EXAMPLE
+% grep "# propbank" *.conllu | sort | uniq -c
+  28 en_ewt-up-dev.conllu:# propbank = no-up
+  15 en_ewt-up-test.conllu:# propbank = no-up
+   1 en_ewt-up-train.conllu:# propbank = diff-number-tokens
+#+END_EXAMPLE
+
diff --git a/UP_English-EWT/conll-to-conllu.awk b/UP_English-EWT/conll-to-conllu.awk
@@ -0,0 +1,19 @@
+
+# Rewrite whitespace-separated token lines as 10 tab-separated columns,
+# packing input fields 7, 8 and 9..NF into the last (misc) column as
+# "Framefile=...|Roleset=...|Args=...". Blank lines separate sentences.
+BEGIN {OFS = "\t"; new_sent = 1}
+
+# A blank line closes the current sentence: echo it and arm the header rule.
+NF == 0 { new_sent = 1; print ""; next }
+
+# First non-blank line of a sentence: print the comment header built from
+# the current input file name and input fields 1 and 2. There is no `next`
+# here, so the same line falls through to the token rule below.
+new_sent == 1 && NF > 0 {
+    new_sent = 0;
+    # doc and sent are assigned but not read anywhere in this script.
+    doc = $1;
+    sent = $2;
+    print "# filename = " FILENAME
+    print "# sent_id = " $2
+    print "# doc_id = " $1
+}
+
+# Token line (any line with more than one field inside a sentence).
+new_sent == 0 && NF > 1 {
+    # j stays at 9 while i walks 9..NF-1, so fields 10..NF are appended to
+    # field 9 one by one, separated by "/". With NF <= 9 the loop is a no-op.
+    for(i=j=9; i < NF; i+=1) {$j = $j"/"$(i+1)}
+    # ID form lemma upos xpos feats head deprel deps misc
+    # Input field 4 is printed twice (second and third output columns) and
+    # input field 6 lands in the ninth output column.
+    print $3, $4, $4, $5, "_", "_", "_", "_", $6, "Framefile=" $7 "|" "Roleset=" $8 "|" "Args=" $9
+} 
diff --git a/UP_English-EWT/conllu-to-conll.awk b/UP_English-EWT/conllu-to-conll.awk
@@ -0,0 +1,61 @@
+
+# Parse a delimited list of key/value pairs into an associative array.
+#   str - text such as "k1=v1|k2=v2"
+#   fs1 - separator between pairs (handed to split())
+#   fs2 - literal separator between a key and its value
+#   map - array filled in place; entries already present are kept
+# Returns the number of pairs found in str.
+# A pair without fs2 is stored as a key with an empty value. Only the first
+# fs2 in a pair splits it, so a value may itself contain fs2 ("k=a=b" gives
+# map["k"] = "a=b"). Pairs are processed left to right, so for a repeated
+# key the last occurrence wins.
+# The names after the extra spaces are locals (awk's only way to declare them).
+function str2map(str,fs1,fs2,map,    pairs,n,k,cut) {
+   n=split(str,pairs,fs1)
+   for (k = 1; k <= n; k++) {
+     cut = index(pairs[k], fs2)
+     if (cut == 0)
+       map[pairs[k]] = ""
+     else
+       map[substr(pairs[k], 1, cut - 1)] = substr(pairs[k], cut + length(fs2))
+   }
+   return n
+}
+
+# Concatenate the values of a split()-style array (indices 1, 2, 3, ...)
+# with sep between them, in index order.
+#   array - array indexed by consecutive integers starting at 1
+#   sep   - separator placed between consecutive values
+# Returns the joined string, or "" for an empty array.
+# The elements are walked by numeric index rather than with `for (i in
+# array)`, whose traversal order is unspecified in awk and could permute
+# the values. `result` and `k` are declared as locals so the caller's
+# globals are not touched.
+function join1(array, sep,    result, k) {
+    result = ""
+    for (k = 1; (k in array); k++)
+	if (k == 1)
+	    result = array[k]
+	else
+	    result = result sep array[k]
+    return result
+}
+
+# Serialise an associative array as "key<sep2>value" pairs joined by sep1.
+#   array - associative array to serialise
+#   sep1  - separator between pairs
+#   sep2  - separator between a key and its value
+# Returns the joined string, or "" for an empty array.
+# The order of the pairs follows awk's `for (key in array)` traversal,
+# which is unspecified. `result` and `key` are declared as locals so the
+# function does not overwrite globals of the same name.
+function join2(array, sep1, sep2,    result, key) {
+    result = ""
+    for(key in array)
+	if(result == "")
+	    result = key sep2 array[key]
+	else
+	    result = result sep1 key sep2 array[key]
+    return result
+}
+
+
+# Output fields are tab-separated.
+BEGIN {OFS = "\t";}
+
+# Lines starting with "#" are copied through unchanged.
+$0 ~ /^#/ {print; next}
+
+# Lines that start with a digit and have exactly 10 fields: pull the
+# Framefile, Roleset and Args entries out of field 10 and print the roleset
+# and the individual Args pieces as extra columns after it.
+$0 ~ /^[0-9]/ && NF == 10 {
+    pmisc = $10;
+    # Reset the scratch arrays left over from the previous line.
+    delete a;
+    delete b;
+
+    # a[key] = value for every "key=value" entry of field 10 ("|"-separated).
+    str2map(pmisc,"|","=",a)
+    # NOTE(review): frame is normalised here but never printed below.
+    frame = a["Framefile"]
+    delete a["Framefile"]
+    if ( frame == "-") frame = "_"
+    role = a["Roleset"]
+    delete a["Roleset"]
+    # A "-" roleset is rewritten as "_".
+    if ( role == "-" ) role = "_"
+
+    # Args is "/"-separated; its pieces are re-joined with tabs so each one
+    # becomes its own output column.
+    args=a["Args"]
+    delete a["Args"]
+    split(args,b,/\//)
+    margs = join1(b,"\t")
+
+    # Whatever keys remain in a are written back as the tenth column;
+    # "_" is used when nothing is left.
+    misc  = join2(a,"|","=")
+    if (misc == "") misc = "_"
+
+    print $1,$2,$3,$4,$5,$6,$7,$8,$9,misc,role,margs
+    next
+}
+
+# Every other line (blank lines, lines with a different field count) is
+# echoed as is.
+{print}