Skip to content

Commit

Permalink
votable: fix/improve output of unicodeChar columns
Browse files Browse the repository at this point in the history
When using votcopy to write a BINARY or BINARY2 serialized VOTable
in which some (non-empty) columns have datatype="unicodeChar",
there was a serious bug which meant the output was inconsistent
with the FIELD declaration (FIELD said datatype="unicodeChar",
but the output stream used 1-byte characters).  This rendered the
output VOTable unreadable (reported by Peter Draper).  Fixed.

Some other slight improvement to handling of unicodeChar columns too:
columns that originally had datatype="unicodeChar" (originating from
a VOTable) are now written out with the same datatype, rather than
being squashed to type char.

However, it's still a bit of a mess, partly because unicode handling
is sloppy in STIL in general, and partly because VOTable's unicodeChar
datatype value doesn't really make sense (VOTable 1.3 and earlier).
The first issue may require more work somewhere down the line,
e.g. STIL becoming aware of whether columns are Unicode or ASCII-alike,
at least for output.
The second has been acknowledged in recent VOTable discussions,
and may be resolved or improved in a future version of the VOTable
standard.
  • Loading branch information
mbtaylor committed Aug 12, 2014
1 parent 255fb30 commit d40fc77
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 25 deletions.
9 changes: 7 additions & 2 deletions topcat/src/docs/sun253.xml
Expand Up @@ -20373,8 +20373,13 @@ introduced since the last version:
<ul>
<li>Support viewing tables up to 2^31 (2 billion) rows in the table
viewer window. The previous limit was 2^27 (134 million) rows.</li>
<li>Attempting to write FITS tables with &gt;999 columns now fails
with a more helpful error message.</li>
<li>Attempting to write FITS tables with &gt;999 columns now fails
with a more helpful error message.</li>
<li>Somewhat improved Unicode handling in VOTables.
If you load a VOTable with columns marked datatype="unicodeChar"
and save it again, the columns now remain unicodeChar
instead of getting squashed to type char.
Some lurking Unicode-related issues remain.</li>
</ul>
</p></dd>

Expand Down
8 changes: 8 additions & 0 deletions ttools/src/docs/sun256.xml
Expand Up @@ -8701,6 +8701,14 @@ eds. C. Gabriel et al., ASP Conf. Ser. 351, p. 666 (2006)
from Error to Warning.</li>
<li>Attempting to write FITS tables with &gt;999 columns now fails
with a more helpful error message.</li>
<li>Improved Unicode handling in VOTables.
Fixed a serious bug in <code>votcopy</code>
that generated unreadable output to BINARY or BINARY2 serialization
if any non-empty column had datatype="unicodeChar".
Also improved behaviour when copying between tables with
unicodeChar columns; these are usually preserved now, rather
than being squashed to datatype char.
Some lurking Unicode-related issues remain.</li>
</ul>
</p></dd>

Expand Down
Expand Up @@ -216,7 +216,7 @@ else if ( "FIELD".equals( localName ) ) {

/* Fix up arraysize values here. */
if ( ( "char".equals( datatype ) ||
"unsignedChar".equals( datatype ) ) &&
"unicodeChar".equals( datatype ) ) &&
atts.getValue( "arraysize" ) == null ) {
String arraysize;
if ( strict_ ) {
Expand Down
99 changes: 80 additions & 19 deletions votable/src/main/uk/ac/starlink/votable/Encoder.java
Expand Up @@ -239,10 +239,15 @@ public void setNullString( String nullString ) {
* be encoded
* @param magicNulls if true, the returned encoder may attempt to use
* a magic value to signify null values; if false, it never will
* @param useUnicodeChar if true, character-type columns will be output
* with datatype unicodeChar, else with datatype char
* @return an encoder object which can do it
*/
public static Encoder getEncoder( ValueInfo info, boolean magicNulls ) {
public static Encoder getEncoder( ValueInfo info, boolean magicNulls,
boolean useUnicodeChar ) {

final CharWriter cwrite = useUnicodeChar ? CharWriter.UCS2
: CharWriter.ASCII;
Class clazz = info.getContentClass();
int[] dims = info.getShape();
final boolean isNullable = info.isNullable() && magicNulls;
Expand Down Expand Up @@ -370,8 +375,8 @@ public void encodeToStream( Object val, DataOutput out )
}

else if ( clazz == Character.class ) {
final int badVal = (int) '\0';
return new ScalarEncoder( info, "char", null ) {
final char badVal = '\0';
return new ScalarEncoder( info, cwrite.getDatatype(), null ) {
/* For a single character take care to add the attribute
* arraysize="1" - although this is implicit according to
* the standard, it's often left off and assumed to be
Expand All @@ -383,8 +388,8 @@ else if ( clazz == Character.class ) {
public void encodeToStream( Object val, DataOutput out )
throws IOException {
Character value = (Character) val;
out.write( value == null ? badVal
: (int) value.charValue() );
cwrite.writeChar( out, value == null ? badVal
: value.charValue() );
}
};
}
Expand Down Expand Up @@ -517,8 +522,7 @@ else if ( clazz == String.class ) {

/* Fixed length strings. */
if ( nChar > 0 ) {
final byte[] padBuf = new byte[ nChar ];
return new Encoder( info, "char" ) {
return new Encoder( info, cwrite.getDatatype() ) {
/*anonymousConstructor*/ {
attMap.put( "arraysize", Integer.toString( nChar ) );
}
Expand All @@ -534,17 +538,19 @@ public void encodeToStream( Object val, DataOutput out )
String value = val.toString();
int limit = Math.min( value.length(), nChar );
for ( ; i < limit; i++ ) {
out.write( value.charAt( i ) );
cwrite.writeChar( out, value.charAt( i ) );
}
}
out.write( padBuf, 0, nChar - i );
for ( ; i < nChar; i++ ) {
cwrite.writeChar( out, '\0' );
}
}
};
}

/* Variable length strings. */
else {
return new Encoder( info, "char" ) {
return new Encoder( info, cwrite.getDatatype() ) {
/*anonymousConstructor*/ {
attMap.put( "arraysize", "*" );
}
Expand All @@ -563,7 +569,7 @@ public void encodeToStream( Object val, DataOutput out )
int leng = value.length();
out.writeInt( leng );
for ( int i = 0 ; i < leng; i++ ) {
out.write( value.charAt( i ) );
cwrite.writeChar( out, value.charAt( i ) );
}
}
}
Expand Down Expand Up @@ -610,10 +616,8 @@ else if ( clazz == String[].class ) {
final String arraysize = sbuf.toString();
final int nString = ns;

return new Encoder( info, "char" ) {
return new Encoder( info, cwrite.getDatatype() ) {
char[] cbuf = new char[ nChar ];
byte[] bbuf = new byte[ nChar ];
byte[] padding = new byte[ nChar ];

/*anonymousConstructor*/ {
attMap.put( "arraysize", arraysize );
Expand Down Expand Up @@ -662,16 +666,16 @@ public void encodeToStream( Object val, DataOutput out )
int climit = Math.min( str.length(), nChar );
int ic = 0;
for ( ; ic < climit; ic++ ) {
bbuf[ ic ] = (byte) str.charAt( ic );
cwrite.writeChar( out, str.charAt( ic ) );
}
for ( ; ic < nChar; ic++ ) {
bbuf[ ic ] = (byte) '\0';
cwrite.writeChar( out, '\0' );
}
out.write( bbuf );
}
if ( ! isVariable ) {
for ( ; is < nString; is++ ) {
out.write( padding );
int nc = ( nString - is ) * nChar;
for ( int ic = 0; ic < nc; ic++ ) {
cwrite.writeChar( out, '\0' );
}
}
}
Expand Down Expand Up @@ -844,4 +848,61 @@ void encode1( Object array, int index, DataOutput out )
*/
void pad1( DataOutput out ) throws IOException;
}

/**
* Handles character output.
*/
private static abstract class CharWriter {
private final String datatype_;

/**
* Implementation using 1-byte characters.
* Only the lower 7 bits of a char are used, anything else is ignored.
*/
public static final CharWriter ASCII = new CharWriter( "char" ) {
public void writeChar( DataOutput out, char c ) throws IOException {
out.write( (int) c );
}
};

/**
* Implementation using 2-byte characters.
* The encoding is java's character storage, which is either identical
* or similar (not sure which) to the deprecated Unicode UCS-2 encoding.
*/
public static final CharWriter UCS2 = new CharWriter( "unicodeChar" ) {
public void writeChar( DataOutput out, char c ) throws IOException {
out.writeChar( c );
}
};

/**
* Constructor.
*
* @return value of VOTable datatype attribute corresponding
* to this writer's output
*/
public CharWriter( String datatype ) {
datatype_ = datatype;
}

/**
* Returns the value of the VOTable datatype attribute corresponding
* to this writer's output.
*
* @return datatype attribute value
*/
public String getDatatype() {
return datatype_;
}

/**
* Writes a character to an output stream.
*
* @param out destination stream
* @param c character to output
*/
public abstract void writeChar( DataOutput out, char c )
throws IOException;
}
}
11 changes: 8 additions & 3 deletions votable/src/main/uk/ac/starlink/votable/VOSerializer.java
Expand Up @@ -232,7 +232,7 @@ public void writeParams( BufferedWriter writer ) throws IOException {
pinfo.setNullable( Tables.isBlank( pvalue ) );

/* Try to write it as a typed PARAM element. */
Encoder encoder = Encoder.getEncoder( pinfo, false );
Encoder encoder = Encoder.getEncoder( pinfo, false, false );
if ( encoder != null ) {
String valtext = encoder.encodeAsText( pvalue );
String content = encoder.getFieldContent();
Expand Down Expand Up @@ -660,7 +660,12 @@ private static Encoder[] getEncoders( StarTable table,
Encoder[] encoders = new Encoder[ ncol ];
for ( int icol = 0; icol < ncol; icol++ ) {
ColumnInfo info = table.getColumnInfo( icol );
encoders[ icol ] = Encoder.getEncoder( info, magicNulls );
boolean isUnicode =
"unicodeChar"
.equals( info.getAuxDatumValue( VOStarTable.DATATYPE_INFO,
String.class ) );
encoders[ icol ] =
Encoder.getEncoder( info, magicNulls, isUnicode );
if ( encoders[ icol ] == null ) {
logger.warning( "Can't serialize column " + info + " of type " +
info.getContentClass().getName() );
Expand Down Expand Up @@ -964,7 +969,7 @@ public void writeFields( BufferedWriter writer ) throws IOException {
/* Get the basic information for this column. */
Encoder encoder =
Encoder.getEncoder( getTable().getColumnInfo( icol ),
true );
true, false );
String content = encoder.getFieldContent();
Map atts = encoder.getFieldAttributes();

Expand Down

0 comments on commit d40fc77

Please sign in to comment.