Fix encoding guesser for non-UTF-8 BOM-based files. Fixes #6595 (#6596)

* Refactor InputStreamReader handling DRY up handling of null encoding and our special UTF-8-BOM encoding * Fix user override of encoding guessing * Fix encoding guess for UTF-16LE & UTF-16BE with BOM. Fixes #6595 - Don't skip BOM for non-UTF-8 cases so that it is still available for character encoding guesser - Add tests for UTF-16LE and UTF-16BE with BOMs - Refactor encoding guesser so that it can be used for a single file - Enhanced format guessing tests to not assume UTF-8 and instead guess encoding using refactored encoding guesser * Use UTF-8 instead of US-ASCII for test fixtures
OpenRefine · May 13, 2024 · 2a38a88 · 2a38a88
1 parent 1233678
commit 2a38a88
Show file tree

Hide file tree

Showing 14 changed files with 70 additions and 58 deletions.
diff --git a/main/src/com/google/refine/importers/FixedWidthImporter.java b/main/src/com/google/refine/importers/FixedWidthImporter.java
@@ -27,11 +27,12 @@
 
 package com.google.refine.importers;
 
+import static com.google.refine.importing.ImportingUtilities.getInputStreamReader;
+
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.io.Reader;
 import java.io.UnsupportedEncodingException;
@@ -184,7 +185,7 @@ static private ArrayList<Object> getCells(String line, int[] widths) {
     static public int[] guessColumnWidths(File file, String encoding) {
         try {
             InputStream is = new FileInputStream(file);
-            Reader reader = (encoding != null) ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
+            Reader reader = getInputStreamReader(is, encoding);
             LineNumberReader lineNumberReader = new LineNumberReader(reader);
 
             try {

diff --git a/main/src/com/google/refine/importers/ImportingParserBase.java b/main/src/com/google/refine/importers/ImportingParserBase.java
@@ -138,6 +138,7 @@ public void parseOneFile(
                 if (useInputStream) {
                     parseOneFile(project, metadata, job, fileSource, inputStream, limit, options, exceptions);
                 } else {
+                    // Although this is called "common" encoding, it may represent the user's override of the encoding
                     String commonEncoding = JSONUtilities.getString(options, "encoding", null);
                     if (commonEncoding != null && commonEncoding.isEmpty()) {
                         commonEncoding = null;

diff --git a/main/src/com/google/refine/importers/SeparatorBasedImporter.java b/main/src/com/google/refine/importers/SeparatorBasedImporter.java
@@ -37,7 +37,6 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.io.LineNumberReader;
 import java.io.Reader;
 import java.io.UnsupportedEncodingException;
@@ -242,7 +241,7 @@ static public class Separator {
 
     static public CsvFormat guessFormat(File file, String encoding) {
         try (InputStream is = new FileInputStream(file);
-                Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
+                Reader reader = ImportingUtilities.getInputStreamReader(is, encoding);
                 LineNumberReader lineNumberReader = new LineNumberReader(reader)) {
             CsvParserSettings settings = new CsvParserSettings();
             // We could provide a set of delimiters to consider below if we wanted to restrict this
@@ -265,7 +264,7 @@ static public Separator guessSeparator(File file, String encoding) {
     static public Separator guessSeparator(File file, String encoding, boolean handleQuotes) {
         try {
             try (InputStream is = new FileInputStream(file);
-                    Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
+                    Reader reader = ImportingUtilities.getInputStreamReader(is, encoding);
                     LineNumberReader lineNumberReader = new LineNumberReader(reader)) {
 
                 List<Separator> separators = new ArrayList<>();

diff --git a/main/src/com/google/refine/importers/TextFormatGuesser.java b/main/src/com/google/refine/importers/TextFormatGuesser.java
@@ -32,7 +32,6 @@
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.io.InputStreamReader;
 import java.io.UnsupportedEncodingException;
 
 import com.google.common.base.CharMatcher;
@@ -56,8 +55,7 @@ public String guess(File file, String encoding, String seedFormat) {
             }
 
             InputStream bis = new BoundedInputStream(fis, 64 * 1024); // TODO: This seems like a lot
-            try (BufferedReader reader = new BufferedReader(
-                    encoding != null ? new InputStreamReader(bis, encoding) : new InputStreamReader(bis))) {
+            try (BufferedReader reader = new BufferedReader(ImportingUtilities.getInputStreamReader(bis, encoding))) {
                 int totalChars = 0;
                 long openBraces = 0;
                 int closeBraces = 0;

diff --git a/main/src/com/google/refine/importing/EncodingGuesser.java b/main/src/com/google/refine/importing/EncodingGuesser.java
@@ -23,7 +23,7 @@
  */
 public final class EncodingGuesser {
 
-    public static final String UTF_8_BOM = "UTF-8-BOM";
+    public static final String UTF_8_BOM = "UTF-8-BOM"; // Fake encoding for weird Microsoft UFT-8 with BOM
 
     public static void guess(final ImportingJob job)
             throws IOException {
@@ -33,37 +33,45 @@ public static void guess(final ImportingJob job)
             if (fileRecords != null) {
                 // TODO: If different files have different encodings, we're only able to present a single
                 // encoding to the user currently. Should we check for conflicts? Warn the user?
-                for (int i = 0; i < fileRecords.size(); i++) {
-                    ObjectNode record = JSONUtilities.getObjectElement(fileRecords, i);
-                    String encoding = ImportingUtilities.getEncoding(record);
-                    if (StringUtils.isBlank(encoding)) {
-                        String location = JSONUtilities.getString(record, "location", null);
-                        if (location != null) {
-                            try (UnicodeBOMInputStream is = new UnicodeBOMInputStream(
-                                    new FileInputStream(new File(job.getRawDataDir(), location)))) {
-                                String detected = UniversalDetector.detectCharset(is);
-                                UnicodeBOMInputStream.BOM bom = is.getBOM();
-                                if (UnicodeBOMInputStream.BOM.UTF_8.equals(bom)) {
-                                    detected = UTF_8_BOM;
-                                }
-                                if (detected != null) {
-                                    JSONUtilities.safePut(record, "encoding", detected);
-                                }
-                            }
-                        }
+                guessFilesEncodings(job, fileRecords);
+            }
+        }
+    }
+
+    private static void guessFilesEncodings(ImportingJob job, ArrayNode fileRecords) throws IOException {
+        for (int i = 0; i < fileRecords.size(); i++) {
+            ObjectNode record = JSONUtilities.getObjectElement(fileRecords, i);
+            String encoding = ImportingUtilities.getEncoding(record);
+            if (StringUtils.isBlank(encoding)) {
+                String location = JSONUtilities.getString(record, "location", null);
+                if (location != null) {
+                    String detected = guessEncoding(job.getRawDataDir(), location);
+                    if (detected != null) {
+                        JSONUtilities.safePut(record, "encoding", detected);
                     }
                 }
             }
         }
     }
 
+    public static String guessEncoding(File dir, String location) throws IOException {
+        try (UnicodeBOMInputStream is = new UnicodeBOMInputStream(
+                new FileInputStream(new File(dir, location)), false)) {
+            String detected = UniversalDetector.detectCharset(is);
+            if (UnicodeBOMInputStream.BOM.UTF_8.equals(is.getBOM())) {
+                detected = UTF_8_BOM;
+            }
+            return detected;
+        }
+    }
+
     /**
      * uses the first found encoding in the file records as initial encoding and put them into the options
      * 
      * @param fileRecords
      * @param options
      */
-    public final static void guessInitialEncoding(final List<ObjectNode> fileRecords, final ObjectNode options) {
+    public static void guessInitialEncoding(final List<ObjectNode> fileRecords, final ObjectNode options) {
         if (fileRecords != null) {
             for (ObjectNode record : fileRecords) {
                 String encoding = JSONUtilities.getString(record, "encoding", null);

diff --git a/main/src/com/google/refine/importing/ImportingUtilities.java b/main/src/com/google/refine/importing/ImportingUtilities.java
@@ -564,30 +564,22 @@ static public Reader getFileReader(File file, ObjectNode fileRecord, String comm
     }
 
     static public Reader getReaderFromStream(InputStream inputStream, ObjectNode fileRecord, String commonEncoding) {
+        // FIXME: commonEncoding may represent user's override of guessed encoding, so should be used in preference
+        // to the guessed encoding(s). But, what to do if we have multiple files with different encodings?
+        // (very unlikely, but still possible)
         String encoding = getEncoding(fileRecord);
-        if (encoding == null) {
+        if (commonEncoding != null && !commonEncoding.equals(encoding)) {
+            logger.info("Overriding guessed encoding {} with user's choice: {}", encoding, commonEncoding);
             encoding = commonEncoding;
         }
-        if (encoding != null) {
-
-            // Special case for UTF-8 with BOM
-            if (EncodingGuesser.UTF_8_BOM.equals(encoding)) {
-                try {
-                    return new InputStreamReader(new UnicodeBOMInputStream(inputStream, true), UTF_8);
-                } catch (IOException e) {
-                    throw new RuntimeException("Exception from UnicodeBOMInputStream", e);
-                }
-            } else {
-                try {
-                    return new InputStreamReader(inputStream, encoding);
-                } catch (UnsupportedEncodingException e) {
-                    // This should never happen since they picked from a list of supported encodings
-                    throw new RuntimeException("Unsupported encoding: " + encoding, e);
-                }
-            }
-
+        try {
+            return getInputStreamReader(inputStream, encoding);
+        } catch (UnsupportedEncodingException e) {
+            // This should never happen since they picked from a list of supported encodings
+            throw new RuntimeException("Unsupported encoding: " + encoding, e);
+        } catch (IOException e) {
+            throw new RuntimeException("Exception from UnicodeBOMInputStream", e);
         }
-        return new InputStreamReader(inputStream);
     }
 
     static public File getFile(ImportingJob job, ObjectNode fileRecord) {
@@ -1208,4 +1200,13 @@ static public ProjectMetadata createProjectMetadata(ObjectNode optionObj) {
         pm.setEncoding(encoding);
         return pm;
     }
+
+    public static InputStreamReader getInputStreamReader(InputStream is, String encoding) throws IOException {
+        if (encoding == null) {
+            return new InputStreamReader(is);
+        } else if (EncodingGuesser.UTF_8_BOM.equals(encoding)) { // Handle our fake UTF-8 with BOM encoding
+            return new InputStreamReader(new UnicodeBOMInputStream(is, true), UTF_8);
+        }
+        return new InputStreamReader(is, encoding);
+    }
 }
diff --git a/main/src/com/google/refine/io/FileHistoryEntryManager.java b/main/src/com/google/refine/io/FileHistoryEntryManager.java
@@ -92,7 +92,7 @@ protected void loadChange(HistoryEntry historyEntry, File file) throws Exception
             Pool pool = new Pool();
             ZipEntry poolEntry = zipFile.getEntry("pool.txt");
             if (poolEntry != null) {
-                pool.load(new InputStreamReader(
+                pool.load(new InputStreamReader( // TODO: Missing encoding here
                         zipFile.getInputStream(poolEntry)));
             } // else, it's a legacy project file
 

diff --git a/main/tests/cypress/cypress/support/openrefine_api.js b/main/tests/cypress/cypress/support/openrefine_api.js
@@ -204,7 +204,7 @@ Cypress.Commands.add('loadProject', (fixture, projectName, tagName) => {
     // It's conflicting though, breaking up the CSV files
     // It is a hack to parse out CSV files in the openrefine while creating a project with tags
     const options = {
-      encoding: 'US-ASCII',
+      encoding: 'UTF-8',
       separator: ',',
       ignoreLines: -1,
       headerLines: 1,

diff --git a/main/tests/data/example-utf16be-bom.tsv b/main/tests/data/example-utf16be-bom.tsv
diff --git a/main/tests/data/example-utf16le-bom.tsv b/main/tests/data/example-utf16le-bom.tsv
diff --git a/main/tests/server/src/com/google/refine/importers/ImporterTest.java b/main/tests/server/src/com/google/refine/importers/ImporterTest.java
@@ -175,7 +175,7 @@ protected void parseOneInputStreamAsReader(
         ImportColumnGroup rootColumnGroup = new ImportColumnGroup();
         List<Exception> exceptions = new ArrayList<Exception>();
 
-        Reader reader = new InputStreamReader(inputStream);
+        Reader reader = new InputStreamReader(inputStream); // FIXME: Why no encoding here
         parser.parseOneFile(
                 project,
                 metadata,

diff --git a/main/tests/server/src/com/google/refine/importers/TextFormatGuesserTests.java b/main/tests/server/src/com/google/refine/importers/TextFormatGuesserTests.java
@@ -53,6 +53,7 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 import org.testng.annotations.BeforeTest;
 import org.testng.annotations.Test;
 
+import com.google.refine.importing.EncodingGuesser;
 import com.google.refine.importing.FormatGuesser;
 
 public class TextFormatGuesserTests extends ImporterTest {
@@ -91,31 +92,32 @@ public void xlsTextGuessTest() throws FileNotFoundException, IOException {
     }
 
     @Test
-    public void csvGuesserTest() {
+    public void csvGuesserTest() throws IOException {
         extensionGuesserTests("csv", "text/line-based");
     }
 
     @Test
-    public void tsvGuesserTest() {
+    public void tsvGuesserTest() throws IOException {
         extensionGuesserTests("tsv", "text/line-based");
     }
 
     @Test(enabled = false) // FIXME: Our JSON guesser doesn't work on small files
-    public void jsonGuesserTest() {
+    public void jsonGuesserTest() throws IOException {
         extensionGuesserTests("json", "text/json");
     }
 
     @Test
-    public void xmlGuesserTest() {
+    public void xmlGuesserTest() throws IOException {
         extensionGuesserTests("xml", "text/xml");
     }
 
-    private void extensionGuesserTests(String extension, String expectedFormat) {
+    private void extensionGuesserTests(String extension, String expectedFormat) throws IOException {
         String dir = ClassLoader.getSystemResource("food.csv").getPath();
         dir = dir.substring(0, dir.lastIndexOf('/'));
         File testDataDir = new File(dir);
         for (String testFile : testDataDir.list(new PatternFilenameFilter(".+\\." + extension))) {
-            String format = guesser.guess(new File(dir, testFile), "UTF-8", "text");
+            String encoding = EncodingGuesser.guessEncoding(testDataDir, testFile);
+            String format = guesser.guess(new File(dir, testFile), encoding, "text");
             assertEquals(format, expectedFormat, "Format guess failed for " + testFile);
         }
     }

diff --git a/main/tests/server/src/com/google/refine/importing/EncodingGuesserTests.java b/main/tests/server/src/com/google/refine/importing/EncodingGuesserTests.java
@@ -52,6 +52,8 @@ public void testEncodingGuesser() throws IOException {
 
         checkEncoding("example-latin1.tsv", "windows-1252"); // close enough - these overlap a lot
         checkEncoding("example-utf8.tsv", "utf-8");
+        checkEncoding("example-utf16le-bom.tsv", "utf-16le");
+        checkEncoding("example-utf16be-bom.tsv", "utf-16be");
         checkEncoding("csv-with-bom.csv", "utf-8-bom");
     }
 

diff --git a/main/tests/server/src/com/google/refine/importing/ImportingUtilitiesTests.java b/main/tests/server/src/com/google/refine/importing/ImportingUtilitiesTests.java
@@ -481,7 +481,7 @@ public void testImportCompressedFiles() throws IOException, URISyntaxException {
             InputStream is = ImportingUtilities.tryOpenAsCompressedFile(tmp, null, null);
             Assert.assertNotNull(is, "Failed to open compressed file: " + filename);
 
-            reader = new InputStreamReader(is);
+            reader = new InputStreamReader(is); // TODO: This needs an encoding
             Iterable<CSVRecord> records = CSVFormat.DEFAULT.parse(reader);
 
             Assert.assertEquals(StreamSupport.stream(records.spliterator(), false).count(), LINES * 2,