From d40fc77649a35a4a99d1a3e9af80997483f7c744 Mon Sep 17 00:00:00 2001 From: Mark Taylor Date: Tue, 12 Aug 2014 11:40:20 +0100 Subject: [PATCH] votable: fix/improve output of unicodeChar columns When using votcopy to write a BINARY or BINARY2 serialized VOTable in which some (non-empty) columns have datatype="unicodeChar", there was a serious bug which meant the output was inconsistent with the FIELD declaration (FIELD said datatype="unicodeChar", but the output stream used 1-byte characters). This rendered the output VOTable unreadable (reported by Peter Draper). Fixed. Some other slight improvement to handling of unicodeChar columns too: columns that originally had datatype="unicodeChar" (originating from a VOTable) are now written out with the same datatype, rather than being squashed to type char. However, it's still a bit of a mess, partly because unicode handling is sloppy in STIL in general, and partly because VOTable's unicodeChar datatype value doesn't really make sense (VOTable 1.3 and earlier). The first issue may require more work somewhere down the line, e.g. STIL becoming aware of whether columns are Unicode or ASCII-alike, at least for output. The second has been acknowledged in recent VOTable discussions, and may be resolved or improved in a future version of the VOTable standard. --- topcat/src/docs/sun253.xml | 9 +- ttools/src/docs/sun256.xml | 8 ++ .../starlink/ttools/copy/VotCopyHandler.java | 2 +- .../main/uk/ac/starlink/votable/Encoder.java | 99 +++++++++++++++---- .../uk/ac/starlink/votable/VOSerializer.java | 11 ++- 5 files changed, 104 insertions(+), 25 deletions(-) diff --git a/topcat/src/docs/sun253.xml b/topcat/src/docs/sun253.xml index a3d286ae1c..16b101017e 100644 --- a/topcat/src/docs/sun253.xml +++ b/topcat/src/docs/sun253.xml @@ -20373,8 +20373,13 @@ introduced since the last version:

diff --git a/ttools/src/docs/sun256.xml b/ttools/src/docs/sun256.xml index b5602f8087..67d60d4d9c 100644 --- a/ttools/src/docs/sun256.xml +++ b/ttools/src/docs/sun256.xml @@ -8701,6 +8701,14 @@ eds. C. Gabriel et al., ASP Conf. Ser. 351, p. 666 (2006) from Error to Warning.
  • Attempting to write FITS tables with >999 columns now fails with a more helpful error message.
  • +
  • Improved Unicode handling in VOTables. + Fixed a serious bug in votcopy + that generated unreadable output to BINARY or BINARY2 serialization + if any non-empty column had datatype="unicodeChar". + Also improved behaviour when copying between tables with + unicodeChar columns; these are usually preserved now, rather + than being squashed to datatype char. + Some lurking Unicode-related issues remain.
  • diff --git a/ttools/src/main/uk/ac/starlink/ttools/copy/VotCopyHandler.java b/ttools/src/main/uk/ac/starlink/ttools/copy/VotCopyHandler.java index a5180535fc..ecdf47a04a 100644 --- a/ttools/src/main/uk/ac/starlink/ttools/copy/VotCopyHandler.java +++ b/ttools/src/main/uk/ac/starlink/ttools/copy/VotCopyHandler.java @@ -216,7 +216,7 @@ else if ( "FIELD".equals( localName ) ) { /* Fix up arraysize values here. */ if ( ( "char".equals( datatype ) || - "unsignedChar".equals( datatype ) ) && + "unicodeChar".equals( datatype ) ) && atts.getValue( "arraysize" ) == null ) { String arraysize; if ( strict_ ) { diff --git a/votable/src/main/uk/ac/starlink/votable/Encoder.java b/votable/src/main/uk/ac/starlink/votable/Encoder.java index 60d2b915c2..a512935f1a 100644 --- a/votable/src/main/uk/ac/starlink/votable/Encoder.java +++ b/votable/src/main/uk/ac/starlink/votable/Encoder.java @@ -239,10 +239,15 @@ public void setNullString( String nullString ) { * be encoded * @param magicNulls if true, the returned encoder may attempt to use * a magic value to signify null values; if false, it never will + * @param useUnicodeChar if true, character-type columns will be output + * with datatype unicodeChar, else with datatype char * @return an encoder object which can do it */ - public static Encoder getEncoder( ValueInfo info, boolean magicNulls ) { + public static Encoder getEncoder( ValueInfo info, boolean magicNulls, + boolean useUnicodeChar ) { + final CharWriter cwrite = useUnicodeChar ? CharWriter.UCS2 + : CharWriter.ASCII; Class clazz = info.getContentClass(); int[] dims = info.getShape(); final boolean isNullable = info.isNullable() && magicNulls; @@ -370,8 +375,8 @@ public void encodeToStream( Object val, DataOutput out ) } else if ( clazz == Character.class ) { - final int badVal = (int) '\0'; - return new ScalarEncoder( info, "char", null ) { + final char badVal = '\0'; + return new ScalarEncoder( info, cwrite.getDatatype(), null ) { /* For a single character take care to add the attribute * arraysize="1" - although this is implicit according to * the standard, it's often left off and assumed to be @@ -383,8 +388,8 @@ else if ( clazz == Character.class ) { public void encodeToStream( Object val, DataOutput out ) throws IOException { Character value = (Character) val; - out.write( value == null ? badVal - : (int) value.charValue() ); + cwrite.writeChar( out, value == null ? badVal + : value.charValue() ); } }; } @@ -517,8 +522,7 @@ else if ( clazz == String.class ) { /* Fixed length strings. */ if ( nChar > 0 ) { - final byte[] padBuf = new byte[ nChar ]; - return new Encoder( info, "char" ) { + return new Encoder( info, cwrite.getDatatype() ) { /*anonymousConstructor*/ { attMap.put( "arraysize", Integer.toString( nChar ) ); } @@ -534,17 +538,19 @@ public void encodeToStream( Object val, DataOutput out ) String value = val.toString(); int limit = Math.min( value.length(), nChar ); for ( ; i < limit; i++ ) { - out.write( value.charAt( i ) ); + cwrite.writeChar( out, value.charAt( i ) ); } } - out.write( padBuf, 0, nChar - i ); + for ( ; i < nChar; i++ ) { + cwrite.writeChar( out, '\0' ); + } } }; } /* Variable length strings. */ else { - return new Encoder( info, "char" ) { + return new Encoder( info, cwrite.getDatatype() ) { /*anonymousConstructor*/ { attMap.put( "arraysize", "*" ); } @@ -563,7 +569,7 @@ public void encodeToStream( Object val, DataOutput out ) int leng = value.length(); out.writeInt( leng ); for ( int i = 0 ; i < leng; i++ ) { - out.write( value.charAt( i ) ); + cwrite.writeChar( out, value.charAt( i ) ); } } } @@ -610,10 +616,8 @@ else if ( clazz == String[].class ) { final String arraysize = sbuf.toString(); final int nString = ns; - return new Encoder( info, "char" ) { + return new Encoder( info, cwrite.getDatatype() ) { char[] cbuf = new char[ nChar ]; - byte[] bbuf = new byte[ nChar ]; - byte[] padding = new byte[ nChar ]; /*anonymousConstructor*/ { attMap.put( "arraysize", arraysize ); @@ -662,16 +666,16 @@ public void encodeToStream( Object val, DataOutput out ) int climit = Math.min( str.length(), nChar ); int ic = 0; for ( ; ic < climit; ic++ ) { - bbuf[ ic ] = (byte) str.charAt( ic ); + cwrite.writeChar( out, str.charAt( ic ) ); } for ( ; ic < nChar; ic++ ) { - bbuf[ ic ] = (byte) '\0'; + cwrite.writeChar( out, '\0' ); } - out.write( bbuf ); } if ( ! isVariable ) { - for ( ; is < nString; is++ ) { - out.write( padding ); + int nc = ( nString - is ) * nChar; + for ( int ic = 0; ic < nc; ic++ ) { + cwrite.writeChar( out, '\0' ); } } } @@ -844,4 +848,61 @@ void encode1( Object array, int index, DataOutput out ) */ void pad1( DataOutput out ) throws IOException; } + + /** + * Handles character output. + */ + private static abstract class CharWriter { + private final String datatype_; + + /** + * Implementation using 1-byte characters. + * Only the lower 7 bits of a char are used, anything else is ignored. + */ + public static final CharWriter ASCII = new CharWriter( "char" ) { + public void writeChar( DataOutput out, char c ) throws IOException { + out.write( (int) c ); + } + }; + + /** + * Implementation using 2-byte characters. + * The encoding is java's character storage, which is either identical + * or similar (not sure which) to the deprecated Unicode UCS-2 encoding. + */ + public static final CharWriter UCS2 = new CharWriter( "unicodeChar" ) { + public void writeChar( DataOutput out, char c ) throws IOException { + out.writeChar( c ); + } + }; + + /** + * Constructor. + * + * @return value of VOTable datatype attribute corresponding + * to this writer's output + */ + public CharWriter( String datatype ) { + datatype_ = datatype; + } + + /** + * Returns the value of the VOTable datatype attribute corresponding + * to this writer's output. + * + * @return datatype attribute value + */ + public String getDatatype() { + return datatype_; + } + + /** + * Writes a character to an output stream. + * + * @param out destination stream + * @param c character to output + */ + public abstract void writeChar( DataOutput out, char c ) + throws IOException; + } } diff --git a/votable/src/main/uk/ac/starlink/votable/VOSerializer.java b/votable/src/main/uk/ac/starlink/votable/VOSerializer.java index d26746dfda..63d2bfca4c 100644 --- a/votable/src/main/uk/ac/starlink/votable/VOSerializer.java +++ b/votable/src/main/uk/ac/starlink/votable/VOSerializer.java @@ -232,7 +232,7 @@ public void writeParams( BufferedWriter writer ) throws IOException { pinfo.setNullable( Tables.isBlank( pvalue ) ); /* Try to write it as a typed PARAM element. */ - Encoder encoder = Encoder.getEncoder( pinfo, false ); + Encoder encoder = Encoder.getEncoder( pinfo, false, false ); if ( encoder != null ) { String valtext = encoder.encodeAsText( pvalue ); String content = encoder.getFieldContent(); @@ -660,7 +660,12 @@ private static Encoder[] getEncoders( StarTable table, Encoder[] encoders = new Encoder[ ncol ]; for ( int icol = 0; icol < ncol; icol++ ) { ColumnInfo info = table.getColumnInfo( icol ); - encoders[ icol ] = Encoder.getEncoder( info, magicNulls ); + boolean isUnicode = + "unicodeChar" + .equals( info.getAuxDatumValue( VOStarTable.DATATYPE_INFO, + String.class ) ); + encoders[ icol ] = + Encoder.getEncoder( info, magicNulls, isUnicode ); if ( encoders[ icol ] == null ) { logger.warning( "Can't serialize column " + info + " of type " + info.getContentClass().getName() ); @@ -964,7 +969,7 @@ public void writeFields( BufferedWriter writer ) throws IOException { /* Get the basic information for this column. */ Encoder encoder = Encoder.getEncoder( getTable().getColumnInfo( icol ), - true ); + true, false ); String content = encoder.getFieldContent(); Map atts = encoder.getFieldAttributes();