3.0draft1 - smoothing out the writeup

OpenTreeOfLife · Feb 19, 2017 · ff7cc9b · ff7cc9b
1 parent 3c887e1
commit ff7cc9b
Show file tree

Hide file tree

Showing 18 changed files with 194 additions and 110 deletions.
diff --git a/Makefile b/Makefile
@@ -95,7 +95,7 @@ bin/smasher:
 
 # The open tree taxonomy
 
-ott: tax/ott/log.tsv tax/ott/version.txt
+ott: tax/ott/log.tsv tax/ott/version.txt tax/ott/README.html
 tax/ott/log.tsv: $(CLASS) make-ott.py assemble_ott.py adjustments.py amendments.py \
                     tax/silva/taxonomy.tsv \
 		    tax/fung/taxonomy.tsv tax/713/taxonomy.tsv \
@@ -114,14 +114,17 @@ tax/ott/log.tsv: $(CLASS) make-ott.py assemble_ott.py adjustments.py amendments.
 	@date
 	@rm -f *py.class
 	@mkdir -p tax/ott
-	@echo Writing transcript to tax/ott/transcript.out.new
-	time bin/jython make-ott.py 2>&1 | tee tax/ott/transcript.out.new
+	@echo Writing transcript to tax/ott/transcript.out
+	time bin/jython make-ott.py $(WHICH) 2>&1 | tee tax/ott/transcript.out.new
 	mv tax/ott/transcript.out.new tax/ott/transcript.out
 	echo $(WHICH) >tax/ott/version.txt
 
 tax/ott/version.txt:
 	echo $(WHICH) >tax/ott/version.txt
 
+# Generate the README via a temporary file, so that a failed make_readme.py
+# run never leaves an empty README.html that looks up to date.
+tax/ott/README.html: tax/ott/about.json util/make_readme.py
+	python util/make_readme.py tax/ott/ >$@.new && mv $@.new $@
+
 # ----- Taxonomy inputs
 
 # Input: Index Fungorum
@@ -536,8 +539,11 @@ t/tax/aster/taxonomy.tsv: compile t/aster.py \
 	@mkdir -p `dirname $@`
 	bin/jython t/aster.py
 
+# Generate the README via a temporary file, so that a failed make_readme.py
+# run never leaves an empty README.html that looks up to date.
+t/tax/aster/README.html: t/tax/aster/about.json util/make_readme.py
+	python util/make_readme.py t/tax/aster/ >$@.new && mv $@.new $@
+
 test: aster
-aster: t/tax/aster/taxonomy.tsv
+aster: t/tax/aster/taxonomy.tsv t/tax/aster/README.html
 
 aster-tarball: t/tax/aster/taxonomy.tsv
 	(mkdir -p $(TARDIR) && \

diff --git a/amendments.py b/amendments.py
@@ -2,7 +2,7 @@
 
 from org.opentreeoflife.taxa import TsvEdits
 from claim import *
-from chromista_spreadsheet import fixChromista
+#from chromista_spreadsheet import fixChromista
 
 # ----- Final patches -----
 
@@ -101,8 +101,12 @@ def patch_ott(ott):
 
     # Patches from the Katz lab to give decent parents to taxa classified
     # as Chromista or Protozoa
-    print '-- Chromista/Protozoa spreadsheet from Katz lab --'
-    fixChromista(ott)
+    # DISABLED: as of 2017-02-19, all but one of the changes listed on the spreadsheet
+    # either were already there, or else the taxon was missing.  So it doesn't
+    # make much sense to continue using it.
+    #
+    # print '-- Chromista/Protozoa spreadsheet from Katz lab --'
+    # fixChromista(ott)
     # 2016-06-30 deleted from spreadsheet because ambiguous:
     #   Enigma,Protozoa,Polychaeta ,,,,, -
     #   Acantharia,Protozoa,Radiozoa,,,,,

diff --git a/assemble_ott.py b/assemble_ott.py
@@ -25,9 +25,10 @@
 additions_clone_path = 'feed/amendments/amendments-1'
 new_taxa_path = 'new_taxa'
 
-def create_ott():
+def create_ott(version):
 
     ott = UnionTaxonomy.newTaxonomy('ott')
+    ott.version = version;
 
     # Would be nice if there were tests for all of these...
     for name in names_of_interest:
@@ -245,6 +246,8 @@ def get_default_extinct_info_from_gbif(gbif, gbif_to_ott):
                     # It's OK if it's also in IRMNG
                     flagged += 1
                     taxon.extinct()
+                else:
+                    print '| PaleoDB taxon %s appears to be extant' % taxon
     infile.close()
     print '| Flagged %s of %s taxa from paleodb\n' % (flagged, paleos)
 

diff --git a/doc/method/method-details.md b/doc/method/method-details.md
@@ -236,8 +236,9 @@ follows:
       1. Let C' = those members of C that have score Z
       1. If Z > 0 and C' contains only one candidate, we are done (match is that candidate)
       1. Otherwise, replace C with C' and proceed to the next heuristic
- 4. If C is singleton, its member is taken to be the correct match.
- 5. Otherwise, the source node does not match unambiguously.
+ 4. If C is a singleton after all heuristics are exhausted, its
+    member is taken to be the correct match.
+ 5. Otherwise, the source node does not match unambiguously; alignment fails.
 
 ### Failure to choose
 
@@ -247,16 +248,18 @@ it is dropped, which is OK because it probably corresponds to one of
 the existing candidates and therefore would make no new contribution
 to the workspace.  If the ambiguous source node has children, it is
 treated as unaligned and therefore new, possibly turning an N-way
-homonym into an N+1-way homonym, which could easily be wrong.
+homonym into an N+1-way homonym.  This could easily be wrong because 
+it is so unlikely that the source node really represents a distinct taxon.
 Usually, the subsequent merge phase determines that the grouping is
 not needed because it is inconsistent or can be 'absorbed', and it is
 dropped.  If it is not dropped, then there is a troublesome situation
 that calls for manual review.
 
-For example, for GBIF _Katoella pulchra_, the candidates are NCBI
+As an example of an unaligned tip, consider GBIF _Katoella pulchra_.  
+The candidates are NCBI
 _Davallodes pulchra_ and _Davallodes yunnanensis_.  (There is no
-_Katoella pulchra_ in the workspace at the time of the alignment and
-the two candidates come from synonymies with _Katoella pulchra_
+_Katoella pulchra_ in the workspace at the time of alignment.
+The two candidates come from synonymies with _Katoella pulchra_
 declared by GBIF.)  
 Neither candidate is preferable to the other, so
 _Katoella pulchra_ is left unaligned and
@@ -338,6 +341,11 @@ and the new source, we retain the workspace.
 
    So that we have a term for this situation, say that x is _absorbed_ into z.
 
+[KC: I couldn't find an example that looked like case number 6.  We could replace
+what was there with a new tree showing conflict, but it would have to
+be very simple.  The only two cases I've found so far (Pisces and 
+Archaeognatha) have the form ((a,b)c) + (a,(b,c)).  Thoughts?]
+
 ## Finishing the assembly
 
 After all source taxonomies are aligned and merged, we apply general ad hoc

diff --git a/doc/method/method-sources.md b/doc/method/method-sources.md
@@ -17,7 +17,7 @@ linked from the OTT taxonomy files and user interfaces so that
 provenance is always available.
 
 **Separation taxa**  
-This is a small curated tree containing 27 major groups such
+This is a small curated tree containing 29 major groups such
 as animals, plants, and fungi.  Its purpose is to assist
 in separating homonyms.  If a node
 is found in one of these separation groups, then it will not match a
@@ -37,8 +37,8 @@ Metazoa, compared with over 500,000 taxa under Metazoa in NCBI Taxonomy.
 
 **Extinct / extant annotations**  
 Curators requested information about whether taxa were extinct
-vs. extant.  (See below for the reason this was so important.) This
-information was not explicitly present in any of our other sources, so we imported IRMNG,
+vs. extant.  With the exception of limited data from WoRMS and Index Fungorum, this
+information was not explicitly present in our other sources, so we imported IRMNG,
 which logs the extinct / extant status of taxa.
 
 As a secondary heuristic, records from GBIF that originate from
@@ -59,10 +59,11 @@ We suppress the following source taxonomy records:
   sequences', or any of about 15 similar designations
 
 The IPNI and IRMNG records are suppressed because they include many
-invalid names.  Although the original taxonomic sources indicate which
-names are known to be invalid, this information is not preserved when
-the records are exported by GBIF, since Darwin Core does not provide a
-standard way to express it.  Note that the GBIF taxonomy might import
-the same name from more than one source, but its export file only
-lists one of the sources.  We suppress the record if that source is
-IPNI or IRMNG, but not if it is some other source.
+invalid names.  We pick up most of the valid names from other sources,
+such as directly from IRMNG, so this is not a great loss.  Although
+the original taxonomic sources indicate which names are known to be
+invalid, this information is not preserved when the records are
+exported by the GBIF backbone.  Note that the GBIF backbone might
+import the same name from more than one source, but its provenance
+information only lists one of the sources.  We suppress the record if
+that source is IPNI or IRMNG, but not if it is some other source.
diff --git a/doc/method/sources_table.py b/doc/method/sources_table.py
@@ -1,4 +1,6 @@
 
+import sys, csv
+
 # Name
 # Release date / download date / version
 # Number of taxon records in source (terminal only?)
@@ -9,13 +11,13 @@
 # Reference number(s) - full reference in article's reference list
 # Maximum depth
 
-table = [
+properties = [
     {'name': 'separation taxa',
      'reference': 'see code',
      'version': '4b3ba1a',
      'priority': 1,
-     'focus': '',
-     'taxa': 28,
+     'focus': 'life',
+     'taxa': 29,
      'synonyms': 8,
      'goals': ''},
     {'name': 'ARB-SILVA',
@@ -88,7 +90,7 @@
      'taxa': 1706655,    # Boils down to 1685134
      'synonyms': 685983,
      'goals': 'T'}, # Boils down to 659851
-    {'name': 'OpenTree curation',
+    {'name': 'Open Tree curation',
      'reference': 'see code',
      'version': '4b3ba1a',
      'priority': 10,
@@ -98,32 +100,55 @@
      'goals': 'O'}
 ]
 
-def cell(val):
-    print '    <td>'
-    print '   ', val
-    print '    </td>'
+header = ['name', 'reference',
+          #'version',   #Removed at KC's request
+          'focus', 'taxa', 'synonyms', 'priority', 'reasons']
+
+table = []
+table.append(header)
 
+for plist in properties:
+    table.append([plist['name'], plist['reference'],
+                  #plist['version'],
+                  plist['focus'], plist['taxa'], plist['synonyms'], plist['priority'], plist['goals']])
+
+
+def show_table_csv(table):
+    print
+    print '```'
+    writer = csv.writer(sys.stdout, lineterminator='  \n')
+    for row in table:
+        writer.writerow(row)
+    print '```'
+
+def show_table_html(table):
+    print '<table>'
+    for row in table:
+        do_row(row)
+    print '</table>'
 
 def do_row(cells):
     print '  <tr>'
     for val in cells:
         cell(val)
     print '  </tr>'
 
-header = ['name', 'reference', 'version', 'focus', 'taxa', 'synonyms', 'priority', 'reasons']
+def cell(val):
+    print '    <td>'
+    print '   ', val
+    print '    </td>'
 
 
 print '<!--**** THIS FILE IS AUTOMATICALLY GENERATED - DO NOT EDIT ****-->'
 print '### (Table 1)'
 
-print '<table>'
-do_row(header)
-for row in table:
-    do_row([row['name'], row['reference'], row['version'], row['focus'], row['taxa'], row['synonyms'], row['priority'], row['goals']])
-print '</table>'
+show_table_html(table)
+show_table_csv(table)
+
+print
+print """[JAR: get final curation counts.  `cat amendments/*.json | grep original_label | wc` plus `grep "^add" feed/ott/edits/*.tsv | wc`]"""
 
 """
-The root clade could be a column in the table?  No.
 
 Maybe put number of binomials in the table?
 

diff --git a/feed/ott/edits/flag-test.tsv b/feed/ott/edits/flag-test.tsv
diff --git a/make-ott.py b/make-ott.py
@@ -1,8 +1,11 @@
 # Called from Makefile
 
+import sys
 import assemble_ott
 
-ott = assemble_ott.create_ott()
+version = sys.argv[1]
+
+ott = assemble_ott.create_ott(version)
 
 ott.dump('tax/ott/')
 assemble_ott.report(ott)

diff --git a/org/opentreeoflife/smasher/MergeMachine.java b/org/opentreeoflife/smasher/MergeMachine.java
@@ -133,6 +133,7 @@ void report(Taxonomy source, int startroots, int startcount) {
        that has one.
        */
 	void augment(Taxon node, Taxon sink) {
+        if (node.prunedp) return;
         Taxon unode = alignment.getTaxon(node);
 
 		if (node.children == null) {
@@ -147,7 +148,9 @@ else if (a.value <= Answer.HECK_NO)
                     // (weak no) or ambiguous (noinfo)
                     // YES > NOINFO > NO > HECK_NO  (sorry)
                     acceptNew(node, "new/polysemy");
-                }
+                else
+                    tick("ambiguous/redundant");
+            }
 		} else {
             if (unode != null) {
                 for (Taxon child: node.children)
@@ -159,7 +162,7 @@ else if (a.value <= Answer.HECK_NO)
                     augment(child, sink);
                 // Examine mapped parents of the children
                 boolean consistentp = true;
-                Taxon commonParent = null;    // should end up being targetMrca(node)
+                Taxon commonParent = null;
                 Taxon child1 = null, child2 = null; // for inconsistency reporting
                 int count = 0;
                 for (Taxon child : node.children) {
@@ -186,18 +189,12 @@ else if (a.value <= Answer.HECK_NO)
                 } else if (!consistentp) {
                     inconsistent(node, child1, child2, sink);
                 } else if (!commonParent.descendsFrom(sink)) {
-                    // This is the philosophically troublesome case.
-                    // Could be either an outlier/mistake, or something serious.
-                    if (node.markEvent("sibling-sink mismatch"))
-                        System.out.format("* Parent of %s's children's images, %s, is not a descendant of %s\n",
-                                          node, commonParent, sink);
-                    inconsistent(node, child1, child2, sink);
+                    overtake(node, commonParent, sink);
                 } else if (refinementp(node, sink)) {
                     Taxon newnode = acceptNew(node, "new/refinement");
                     takeOld(node, newnode);
                     takeOn(node, newnode, 0); // augmentation
                 } else {
-                    // 'trouble' = paraphyly risk - plain merge.
                     takeOn(node, commonParent, 0);
                     // should include a witness for debugging purposes - merged to/from what?
                     reject(node, "reject/merged", commonParent, Taxonomy.MERGED);
@@ -215,7 +212,8 @@ else if (a.value <= Answer.HECK_NO)
 
     void inconsistent(Taxon node, Taxon child1, Taxon child2, Taxon sink) {
         // Paraphyletic / conflicted.
-        // Put the new children unplaced under the mrca of the placed children.
+        // Put the new children unplaced under the sink, or the mrca of the
+        // placed children, whichever is smaller.
         reportConflict(node, child1, child2, sink);
         // Tighten it if possible... does this always make sense?
         Taxon unode = alignment.getTargetMrca(node);
@@ -225,6 +223,35 @@ void inconsistent(Taxon node, Taxon child1, Taxon child2, Taxon sink) {
         reject(node, "reject/inconsistent", sink, Taxonomy.INCONSISTENT);
     }
 
+    private final static boolean MORE_SENSIBLE_BUT_DOESNT_WORK = false;
+
+    // The symptom of getting this wrong is the creation of a cycle.
+
+    void overtake(Taxon node, Taxon commonParent, Taxon sink) {
+        // Troublesome case: the caller reaches this when !commonParent.descendsFrom(sink).
+        // Workspace says children are under sink, but source says they're not.
+        if (node.markEvent("sibling-sink mismatch"))
+            System.out.format("* Parent of %s's children's images, %s, is an ancestor of %s\n", // NOTE(review): call site only checks "does not descend from sink"; confirm "ancestor" wording
+                              node,
+                              commonParent,
+                              sink);
+
+        if (MORE_SENSIBLE_BUT_DOESNT_WORK) { // flag is a compile-time false constant, so this branch is currently dead
+            takeOn(node, commonParent, 0);
+            reject(node, "reject/overtaken", commonParent, Taxonomy.MERGED);
+        } else {
+            // Previously this case called inconsistent(node, child1, child2, sink).
+            Taxon point; // attachment point: the target MRCA if it descends from sink, otherwise sink itself
+            Taxon unode = alignment.getTargetMrca(node);
+            if (unode != null && unode.descendsFrom(sink))
+                point = unode;
+            else
+                point = sink;
+            takeOn(node, point, Taxonomy.UNPLACED); // children are taken on under point with the UNPLACED flag
+            reject(node, "reject/overtaken", point, Taxonomy.MERGED);
+        }
+    }
+
     /* Refinement: feature necessary for merging Silva into the
        skeleton and NCBI into Silva.  This lets an internal "new" node
        (in the "new" taxonomy) be inserted in between internal "old"