Skip to content

Commit

Permalink
Fix encoding guesser for non-UTF-8 BOM-based files. Fixes #6595 (#6596)
Browse files Browse the repository at this point in the history
* Refactor InputStreamReader handling

DRY up handling of null encoding and our special UTF-8-BOM encoding

* Fix user override of encoding guessing

* Fix encoding guess for UTF-16LE & UTF-16BE with BOM. Fixes #6595

- Don't skip BOM for non-UTF-8 cases so that it is still available for character encoding guesser
- Add tests for UTF-16LE and UTF-16BE with BOMs
- Refactor encoding guesser so that it can be used for a single file
- Enhanced format guessing tests to not assume UTF-8 and instead guess encoding
  using refactored encoding guesser

* Use UTF-8 instead of US-ASCII for test fixtures
  • Loading branch information
tfmorris committed May 13, 2024
1 parent 1233678 commit 2a38a88
Show file tree
Hide file tree
Showing 14 changed files with 70 additions and 58 deletions.
5 changes: 3 additions & 2 deletions main/src/com/google/refine/importers/FixedWidthImporter.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,12 @@

package com.google.refine.importers;

import static com.google.refine.importing.ImportingUtilities.getInputStreamReader;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
Expand Down Expand Up @@ -184,7 +185,7 @@ static private ArrayList<Object> getCells(String line, int[] widths) {
static public int[] guessColumnWidths(File file, String encoding) {
try {
InputStream is = new FileInputStream(file);
Reader reader = (encoding != null) ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
Reader reader = getInputStreamReader(is, encoding);
LineNumberReader lineNumberReader = new LineNumberReader(reader);

try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ public void parseOneFile(
if (useInputStream) {
parseOneFile(project, metadata, job, fileSource, inputStream, limit, options, exceptions);
} else {
// Although this is called "common" encoding, it may represent the user's override of the encoding
String commonEncoding = JSONUtilities.getString(options, "encoding", null);
if (commonEncoding != null && commonEncoding.isEmpty()) {
commonEncoding = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
Expand Down Expand Up @@ -242,7 +241,7 @@ static public class Separator {

static public CsvFormat guessFormat(File file, String encoding) {
try (InputStream is = new FileInputStream(file);
Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
Reader reader = ImportingUtilities.getInputStreamReader(is, encoding);
LineNumberReader lineNumberReader = new LineNumberReader(reader)) {
CsvParserSettings settings = new CsvParserSettings();
// We could provide a set of delimiters to consider below if we wanted to restrict this
Expand All @@ -265,7 +264,7 @@ static public Separator guessSeparator(File file, String encoding) {
static public Separator guessSeparator(File file, String encoding, boolean handleQuotes) {
try {
try (InputStream is = new FileInputStream(file);
Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
Reader reader = ImportingUtilities.getInputStreamReader(is, encoding);
LineNumberReader lineNumberReader = new LineNumberReader(reader)) {

List<Separator> separators = new ArrayList<>();
Expand Down
4 changes: 1 addition & 3 deletions main/src/com/google/refine/importers/TextFormatGuesser.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;

import com.google.common.base.CharMatcher;
Expand All @@ -56,8 +55,7 @@ public String guess(File file, String encoding, String seedFormat) {
}

InputStream bis = new BoundedInputStream(fis, 64 * 1024); // TODO: This seems like a lot
try (BufferedReader reader = new BufferedReader(
encoding != null ? new InputStreamReader(bis, encoding) : new InputStreamReader(bis))) {
try (BufferedReader reader = new BufferedReader(ImportingUtilities.getInputStreamReader(bis, encoding))) {
int totalChars = 0;
long openBraces = 0;
int closeBraces = 0;
Expand Down
48 changes: 28 additions & 20 deletions main/src/com/google/refine/importing/EncodingGuesser.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
*/
public final class EncodingGuesser {

public static final String UTF_8_BOM = "UTF-8-BOM";
public static final String UTF_8_BOM = "UTF-8-BOM"; // Fake encoding for weird Microsoft UFT-8 with BOM

public static void guess(final ImportingJob job)
throws IOException {
Expand All @@ -33,37 +33,45 @@ public static void guess(final ImportingJob job)
if (fileRecords != null) {
// TODO: If different files have different encodings, we're only able to present a single
// encoding to the user currently. Should we check for conflicts? Warn the user?
for (int i = 0; i < fileRecords.size(); i++) {
ObjectNode record = JSONUtilities.getObjectElement(fileRecords, i);
String encoding = ImportingUtilities.getEncoding(record);
if (StringUtils.isBlank(encoding)) {
String location = JSONUtilities.getString(record, "location", null);
if (location != null) {
try (UnicodeBOMInputStream is = new UnicodeBOMInputStream(
new FileInputStream(new File(job.getRawDataDir(), location)))) {
String detected = UniversalDetector.detectCharset(is);
UnicodeBOMInputStream.BOM bom = is.getBOM();
if (UnicodeBOMInputStream.BOM.UTF_8.equals(bom)) {
detected = UTF_8_BOM;
}
if (detected != null) {
JSONUtilities.safePut(record, "encoding", detected);
}
}
}
guessFilesEncodings(job, fileRecords);
}
}
}

private static void guessFilesEncodings(ImportingJob job, ArrayNode fileRecords) throws IOException {
for (int i = 0; i < fileRecords.size(); i++) {
ObjectNode record = JSONUtilities.getObjectElement(fileRecords, i);
String encoding = ImportingUtilities.getEncoding(record);
if (StringUtils.isBlank(encoding)) {
String location = JSONUtilities.getString(record, "location", null);
if (location != null) {
String detected = guessEncoding(job.getRawDataDir(), location);
if (detected != null) {
JSONUtilities.safePut(record, "encoding", detected);
}
}
}
}
}

public static String guessEncoding(File dir, String location) throws IOException {
try (UnicodeBOMInputStream is = new UnicodeBOMInputStream(
new FileInputStream(new File(dir, location)), false)) {
String detected = UniversalDetector.detectCharset(is);
if (UnicodeBOMInputStream.BOM.UTF_8.equals(is.getBOM())) {
detected = UTF_8_BOM;
}
return detected;
}
}

/**
* uses the first found encoding in the file records as initial encoding and put them into the options
*
* @param fileRecords
* @param options
*/
public final static void guessInitialEncoding(final List<ObjectNode> fileRecords, final ObjectNode options) {
public static void guessInitialEncoding(final List<ObjectNode> fileRecords, final ObjectNode options) {
if (fileRecords != null) {
for (ObjectNode record : fileRecords) {
String encoding = JSONUtilities.getString(record, "encoding", null);
Expand Down
41 changes: 21 additions & 20 deletions main/src/com/google/refine/importing/ImportingUtilities.java
Original file line number Diff line number Diff line change
Expand Up @@ -564,30 +564,22 @@ static public Reader getFileReader(File file, ObjectNode fileRecord, String comm
}

static public Reader getReaderFromStream(InputStream inputStream, ObjectNode fileRecord, String commonEncoding) {
// FIXME: commonEncoding may represent user's override of guessed encoding, so should be used in preference
// to the guessed encoding(s). But, what to do if we have multiple files with different encodings?
// (very unlikely, but still possible)
String encoding = getEncoding(fileRecord);
if (encoding == null) {
if (commonEncoding != null && !commonEncoding.equals(encoding)) {
logger.info("Overriding guessed encoding {} with user's choice: {}", encoding, commonEncoding);
encoding = commonEncoding;
}
if (encoding != null) {

// Special case for UTF-8 with BOM
if (EncodingGuesser.UTF_8_BOM.equals(encoding)) {
try {
return new InputStreamReader(new UnicodeBOMInputStream(inputStream, true), UTF_8);
} catch (IOException e) {
throw new RuntimeException("Exception from UnicodeBOMInputStream", e);
}
} else {
try {
return new InputStreamReader(inputStream, encoding);
} catch (UnsupportedEncodingException e) {
// This should never happen since they picked from a list of supported encodings
throw new RuntimeException("Unsupported encoding: " + encoding, e);
}
}

try {
return getInputStreamReader(inputStream, encoding);
} catch (UnsupportedEncodingException e) {
// This should never happen since they picked from a list of supported encodings
throw new RuntimeException("Unsupported encoding: " + encoding, e);
} catch (IOException e) {
throw new RuntimeException("Exception from UnicodeBOMInputStream", e);
}
return new InputStreamReader(inputStream);
}

static public File getFile(ImportingJob job, ObjectNode fileRecord) {
Expand Down Expand Up @@ -1208,4 +1200,13 @@ static public ProjectMetadata createProjectMetadata(ObjectNode optionObj) {
pm.setEncoding(encoding);
return pm;
}

public static InputStreamReader getInputStreamReader(InputStream is, String encoding) throws IOException {
if (encoding == null) {
return new InputStreamReader(is);
} else if (EncodingGuesser.UTF_8_BOM.equals(encoding)) { // Handle our fake UTF-8 with BOM encoding
return new InputStreamReader(new UnicodeBOMInputStream(is, true), UTF_8);
}
return new InputStreamReader(is, encoding);
}
}
2 changes: 1 addition & 1 deletion main/src/com/google/refine/io/FileHistoryEntryManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ protected void loadChange(HistoryEntry historyEntry, File file) throws Exception
Pool pool = new Pool();
ZipEntry poolEntry = zipFile.getEntry("pool.txt");
if (poolEntry != null) {
pool.load(new InputStreamReader(
pool.load(new InputStreamReader( // TODO: Missing encoding here
zipFile.getInputStream(poolEntry)));
} // else, it's a legacy project file

Expand Down
2 changes: 1 addition & 1 deletion main/tests/cypress/cypress/support/openrefine_api.js
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ Cypress.Commands.add('loadProject', (fixture, projectName, tagName) => {
// It's conflicting though, breaking up the CSV files
// It is a hack to parse out CSV files in the openrefine while creating a project with tags
const options = {
encoding: 'US-ASCII',
encoding: 'UTF-8',
separator: ',',
ignoreLines: -1,
headerLines: 1,
Expand Down
Binary file added main/tests/data/example-utf16be-bom.tsv
Binary file not shown.
Binary file added main/tests/data/example-utf16le-bom.tsv
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ protected void parseOneInputStreamAsReader(
ImportColumnGroup rootColumnGroup = new ImportColumnGroup();
List<Exception> exceptions = new ArrayList<Exception>();

Reader reader = new InputStreamReader(inputStream);
Reader reader = new InputStreamReader(inputStream); // FIXME: Why no encoding here
parser.parseOneFile(
project,
metadata,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;

import com.google.refine.importing.EncodingGuesser;
import com.google.refine.importing.FormatGuesser;

public class TextFormatGuesserTests extends ImporterTest {
Expand Down Expand Up @@ -91,31 +92,32 @@ public void xlsTextGuessTest() throws FileNotFoundException, IOException {
}

@Test
public void csvGuesserTest() {
public void csvGuesserTest() throws IOException {
extensionGuesserTests("csv", "text/line-based");
}

@Test
public void tsvGuesserTest() {
public void tsvGuesserTest() throws IOException {
extensionGuesserTests("tsv", "text/line-based");
}

@Test(enabled = false) // FIXME: Our JSON guesser doesn't work on small files
public void jsonGuesserTest() {
public void jsonGuesserTest() throws IOException {
extensionGuesserTests("json", "text/json");
}

@Test
public void xmlGuesserTest() {
public void xmlGuesserTest() throws IOException {
extensionGuesserTests("xml", "text/xml");
}

private void extensionGuesserTests(String extension, String expectedFormat) {
private void extensionGuesserTests(String extension, String expectedFormat) throws IOException {
String dir = ClassLoader.getSystemResource("food.csv").getPath();
dir = dir.substring(0, dir.lastIndexOf('/'));
File testDataDir = new File(dir);
for (String testFile : testDataDir.list(new PatternFilenameFilter(".+\\." + extension))) {
String format = guesser.guess(new File(dir, testFile), "UTF-8", "text");
String encoding = EncodingGuesser.guessEncoding(testDataDir, testFile);
String format = guesser.guess(new File(dir, testFile), encoding, "text");
assertEquals(format, expectedFormat, "Format guess failed for " + testFile);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ public void testEncodingGuesser() throws IOException {

checkEncoding("example-latin1.tsv", "windows-1252"); // close enough - these overlap a lot
checkEncoding("example-utf8.tsv", "utf-8");
checkEncoding("example-utf16le-bom.tsv", "utf-16le");
checkEncoding("example-utf16be-bom.tsv", "utf-16be");
checkEncoding("csv-with-bom.csv", "utf-8-bom");
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ public void testImportCompressedFiles() throws IOException, URISyntaxException {
InputStream is = ImportingUtilities.tryOpenAsCompressedFile(tmp, null, null);
Assert.assertNotNull(is, "Failed to open compressed file: " + filename);

reader = new InputStreamReader(is);
reader = new InputStreamReader(is); // TODO: This needs an encoding
Iterable<CSVRecord> records = CSVFormat.DEFAULT.parse(reader);

Assert.assertEquals(StreamSupport.stream(records.spliterator(), false).count(), LINES * 2,
Expand Down

0 comments on commit 2a38a88

Please sign in to comment.