votable: fix/improve output of unicodeChar columns

When using votcopy to write a BINARY or BINARY2 serialized VOTable in which some (non-empty) columns have datatype="unicodeChar", there was a serious bug which meant the output was inconsistent with the FIELD declaration (FIELD said datatype="unicodeChar", but the output stream used 1-byte characters). This rendered the output VOTable unreadable (reported by Peter Draper). Fixed. Some other slight improvement to handling of unicodeChar columns too: columns that originally had datatype="unicodeChar" (originating from a VOTable) are now written out with the same datatype, rather than being squashed to type char. However, it's still a bit of a mess, partly because unicode handling is sloppy in STIL in general, and partly because VOTable's unicodeChar datatype value doesn't really make sense (VOTable 1.3 and earlier). The first issue may require more work somewhere down the line, e.g. STIL becoming aware of whether columns are Unicode or ASCII-alike, at least for output. The second has been acknowledged in recent VOTable discussions, and may be resolved or improved in a future version of the VOTable standard.
Starlink · Aug 12, 2014 · d40fc77 · d40fc77
1 parent 255fb30
commit d40fc77
Show file tree

Hide file tree

Showing 5 changed files with 104 additions and 25 deletions.
diff --git a/topcat/src/docs/sun253.xml b/topcat/src/docs/sun253.xml
@@ -20373,8 +20373,13 @@ introduced since the last version:
     <ul>
     <li>Support viewing tables up to 2^31 (2 billion) rows in the table
         viewer window.  The previous limit was 2^27 (134 million) rows.</li>
-        <li>Attempting to write FITS tables with &gt;999 columns now fails
-            with a more helpful error message.</li>
+    <li>Attempting to write FITS tables with &gt;999 columns now fails
+        with a more helpful error message.</li>
+    <li>Somewhat improved Unicode handling in VOTables.
+        If you load a VOTable with columns marked datatype="unicodeChar"
+        and save it again, the columns now remain unicodeChar
+        instead of getting squashed to type char.
+        Some lurking Unicode-related issues remain.</li>
     </ul>
     </p></dd>
 

diff --git a/ttools/src/docs/sun256.xml b/ttools/src/docs/sun256.xml
@@ -8701,6 +8701,14 @@ eds. C. Gabriel et al., ASP Conf. Ser. 351, p. 666 (2006)
             from Error to Warning.</li>
         <li>Attempting to write FITS tables with &gt;999 columns now fails
             with a more helpful error message.</li>
+        <li>Improved Unicode handling in VOTables.
+            Fixed a serious bug in <code>votcopy</code>
+            that generated unreadable output to BINARY or BINARY2 serialization
+            if any non-empty column had datatype="unicodeChar".
+            Also improved behaviour when copying between tables with
+            unicodeChar columns; these are usually preserved now, rather
+            than being squashed to datatype char.
+            Some lurking Unicode-related issues remain.</li>
         </ul>
         </p></dd>
 

diff --git a/ttools/src/main/uk/ac/starlink/ttools/copy/VotCopyHandler.java b/ttools/src/main/uk/ac/starlink/ttools/copy/VotCopyHandler.java
@@ -216,7 +216,7 @@ else if ( "FIELD".equals( localName ) ) {
 
             /* Fix up arraysize values here. */
             if ( ( "char".equals( datatype ) ||
-                   "unsignedChar".equals( datatype ) ) &&
+                   "unicodeChar".equals( datatype ) ) &&
                  atts.getValue( "arraysize" ) == null ) {
                 String arraysize;
                 if ( strict_ ) {

diff --git a/votable/src/main/uk/ac/starlink/votable/Encoder.java b/votable/src/main/uk/ac/starlink/votable/Encoder.java
@@ -239,10 +239,15 @@ public void setNullString( String nullString ) {
      *          be encoded
      * @param   magicNulls  if true, the returned encoder may attempt to use
      *          a magic value to signify null values; if false, it never will
+     * @param   useUnicodeChar  if true, character-type columns will be output
+     *          with datatype unicodeChar, else with datatype char
      * @return  an encoder object which can do it
      */
-    public static Encoder getEncoder( ValueInfo info, boolean magicNulls ) {
+    public static Encoder getEncoder( ValueInfo info, boolean magicNulls,
+                                      boolean useUnicodeChar ) {
 
+        final CharWriter cwrite = useUnicodeChar ? CharWriter.UCS2
+                                                 : CharWriter.ASCII;
         Class clazz = info.getContentClass();
         int[] dims = info.getShape();
         final boolean isNullable = info.isNullable() && magicNulls;
@@ -370,8 +375,8 @@ public void encodeToStream( Object val, DataOutput out )
         }
 
         else if ( clazz == Character.class ) {
-            final int badVal = (int) '\0';
-            return new ScalarEncoder( info, "char", null ) {
+            final char badVal = '\0';
+            return new ScalarEncoder( info, cwrite.getDatatype(), null ) {
                 /* For a single character take care to add the attribute
                  * arraysize="1" - although this is implicit according to
                  * the standard, it's often left off and assumed to be
@@ -383,8 +388,8 @@ else if ( clazz == Character.class ) {
                 public void encodeToStream( Object val, DataOutput out )
                         throws IOException {
                     Character value = (Character) val;
-                    out.write( value == null ? badVal 
-                                             : (int) value.charValue() );
+                    cwrite.writeChar( out, value == null ? badVal
+                                                         : value.charValue() );
                 }
             };
         }
@@ -517,8 +522,7 @@ else if ( clazz == String.class ) {
 
             /* Fixed length strings. */
             if ( nChar > 0 ) {
-                final byte[] padBuf = new byte[ nChar ];
-                return new Encoder( info, "char" ) {
+                return new Encoder( info, cwrite.getDatatype() ) {
                     /*anonymousConstructor*/ {
                         attMap.put( "arraysize", Integer.toString( nChar ) );
                     }
@@ -534,17 +538,19 @@ public void encodeToStream( Object val, DataOutput out )
                             String value = val.toString();
                             int limit = Math.min( value.length(), nChar );
                             for ( ; i < limit; i++ ) {
-                                out.write( value.charAt( i ) );
+                                cwrite.writeChar( out, value.charAt( i ) );
                             }
                         }
-                        out.write( padBuf, 0, nChar - i );
+                        for ( ; i < nChar; i++ ) {
+                            cwrite.writeChar( out, '\0' );
+                        }
                     }
                 };
             }
 
             /* Variable length strings. */
             else {
-                return new Encoder( info, "char" ) {
+                return new Encoder( info, cwrite.getDatatype() ) {
                     /*anonymousConstructor*/ {
                         attMap.put( "arraysize", "*" );
                     }
@@ -563,7 +569,7 @@ public void encodeToStream( Object val, DataOutput out )
                             int leng = value.length();
                             out.writeInt( leng );
                             for ( int i = 0 ; i < leng; i++ ) {
-                                out.write( value.charAt( i ) );
+                                cwrite.writeChar( out, value.charAt( i ) );
                             }
                         }
                     }
@@ -610,10 +616,8 @@ else if ( clazz == String[].class ) {
             final String arraysize = sbuf.toString();
             final int nString = ns;
 
-            return new Encoder( info, "char" ) {
+            return new Encoder( info, cwrite.getDatatype() ) {
                 char[] cbuf = new char[ nChar ];
-                byte[] bbuf = new byte[ nChar ];
-                byte[] padding = new byte[ nChar ];
 
                 /*anonymousConstructor*/ {
                     attMap.put( "arraysize", arraysize );
@@ -662,16 +666,16 @@ public void encodeToStream( Object val, DataOutput out )
                         int climit = Math.min( str.length(), nChar );
                         int ic = 0;
                         for ( ; ic < climit; ic++ ) {
-                            bbuf[ ic ] = (byte) str.charAt( ic );
+                            cwrite.writeChar( out, str.charAt( ic ) );
                         }
                         for ( ; ic < nChar; ic++ ) {
-                            bbuf[ ic ] = (byte) '\0';
+                            cwrite.writeChar( out, '\0' );
                         }
-                        out.write( bbuf );
                     }
                     if ( ! isVariable ) {
-                        for ( ; is < nString; is++ ) {
-                            out.write( padding );
+                        int nc = ( nString - is ) * nChar;
+                        for ( int ic = 0; ic < nc; ic++ ) {
+                            cwrite.writeChar( out, '\0' );
                         }
                     }
                 }
@@ -844,4 +848,61 @@ void encode1( Object array, int index, DataOutput out )
          */
         void pad1( DataOutput out ) throws IOException;
     }
+
+    /**
+     * Handles character output.
+     */
+    private static abstract class CharWriter {
+        private final String datatype_;
+
+        /**
+         * Implementation using 1-byte characters.
+         * Only the lower 7 bits of a char are used, anything else is ignored.
+         */
+        public static final CharWriter ASCII = new CharWriter( "char" ) {
+            public void writeChar( DataOutput out, char c ) throws IOException {
+                out.write( (int) c );
+            }
+        };
+
+        /**
+         * Implementation using 2-byte characters.
+         * The encoding is java's character storage, which is either identical
+         * or similar (not sure which) to the deprecated Unicode UCS-2 encoding.
+         */
+        public static final CharWriter UCS2 = new CharWriter( "unicodeChar" ) {
+            public void writeChar( DataOutput out, char c ) throws IOException {
+                out.writeChar( c );
+            }
+        };
+
+        /**
+         * Constructor.
+         *
+         * @return   value of VOTable datatype attribute corresponding
+         *           to this writer's output
+         */
+        public CharWriter( String datatype ) {
+            datatype_ = datatype;
+        }
+
+        /**
+         * Returns the value of the VOTable datatype attribute corresponding
+         * to this writer's output.
+         *
+         * @return   datatype attribute value
+         */
+        public String getDatatype() {
+            return datatype_;
+        }
+
+        /**
+         * Writes a character to an output stream.
+         *
+         * @param  out  destination stream
+         * @param  c   character to output
+         */
+        public abstract void writeChar( DataOutput out, char c )
+                throws IOException;
+    }
 }
diff --git a/votable/src/main/uk/ac/starlink/votable/VOSerializer.java b/votable/src/main/uk/ac/starlink/votable/VOSerializer.java
@@ -232,7 +232,7 @@ public void writeParams( BufferedWriter writer ) throws IOException {
             pinfo.setNullable( Tables.isBlank( pvalue ) );
 
             /* Try to write it as a typed PARAM element. */
-            Encoder encoder = Encoder.getEncoder( pinfo, false );
+            Encoder encoder = Encoder.getEncoder( pinfo, false, false );
             if ( encoder != null ) {
                 String valtext = encoder.encodeAsText( pvalue );
                 String content = encoder.getFieldContent();
@@ -660,7 +660,12 @@ private static Encoder[] getEncoders( StarTable table,
         Encoder[] encoders = new Encoder[ ncol ];
         for ( int icol = 0; icol < ncol; icol++ ) {
             ColumnInfo info = table.getColumnInfo( icol );
-            encoders[ icol ] = Encoder.getEncoder( info, magicNulls );
+            boolean isUnicode =
+                "unicodeChar"
+               .equals( info.getAuxDatumValue( VOStarTable.DATATYPE_INFO,
+                                               String.class ) );
+            encoders[ icol ] =
+                Encoder.getEncoder( info, magicNulls, isUnicode );
             if ( encoders[ icol ] == null ) {
                 logger.warning( "Can't serialize column " + info + " of type " +
                                 info.getContentClass().getName() );
@@ -964,7 +969,7 @@ public void writeFields( BufferedWriter writer ) throws IOException {
                     /* Get the basic information for this column. */
                     Encoder encoder =
                         Encoder.getEncoder( getTable().getColumnInfo( icol ),
-                                            true );
+                                            true, false );
                     String content = encoder.getFieldContent();
                     Map atts = encoder.getFieldAttributes();