From 076f836b939d47b7fe434212c68a7f35b1c02882 Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Thu, 9 May 2024 22:27:05 -0400 Subject: [PATCH] Refactor InputStreamReader handling DRY up handling of null encoding and our special UTF-8-BOM encoding --- .../refine/importers/FixedWidthImporter.java | 5 +-- .../importers/SeparatorBasedImporter.java | 5 ++- .../refine/importers/TextFormatGuesser.java | 4 +-- .../refine/importing/ImportingUtilities.java | 35 +++++++++---------- .../refine/io/FileHistoryEntryManager.java | 2 +- .../google/refine/importers/ImporterTest.java | 2 +- .../importing/ImportingUtilitiesTests.java | 2 +- 7 files changed, 25 insertions(+), 30 deletions(-) diff --git a/main/src/com/google/refine/importers/FixedWidthImporter.java b/main/src/com/google/refine/importers/FixedWidthImporter.java index 9293add94dd4..21af4da32020 100644 --- a/main/src/com/google/refine/importers/FixedWidthImporter.java +++ b/main/src/com/google/refine/importers/FixedWidthImporter.java @@ -27,11 +27,12 @@ package com.google.refine.importers; +import static com.google.refine.importing.ImportingUtilities.getInputStreamReader; + import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import java.io.InputStreamReader; import java.io.LineNumberReader; import java.io.Reader; import java.io.UnsupportedEncodingException; @@ -184,7 +185,7 @@ static private ArrayList getCells(String line, int[] widths) { static public int[] guessColumnWidths(File file, String encoding) { try { InputStream is = new FileInputStream(file); - Reader reader = (encoding != null) ? new InputStreamReader(is, encoding) : new InputStreamReader(is); + Reader reader = getInputStreamReader(is, encoding); LineNumberReader lineNumberReader = new LineNumberReader(reader); try { diff --git a/main/src/com/google/refine/importers/SeparatorBasedImporter.java b/main/src/com/google/refine/importers/SeparatorBasedImporter.java index b2f6596f6b20..52f057f4d8f7 100644 --- a/main/src/com/google/refine/importers/SeparatorBasedImporter.java +++ b/main/src/com/google/refine/importers/SeparatorBasedImporter.java @@ -37,7 +37,6 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import java.io.InputStreamReader; import java.io.LineNumberReader; import java.io.Reader; import java.io.UnsupportedEncodingException; @@ -242,7 +241,7 @@ static public class Separator { static public CsvFormat guessFormat(File file, String encoding) { try (InputStream is = new FileInputStream(file); - Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is); + Reader reader = ImportingUtilities.getInputStreamReader(is, encoding); LineNumberReader lineNumberReader = new LineNumberReader(reader)) { CsvParserSettings settings = new CsvParserSettings(); // We could provide a set of delimiters to consider below if we wanted to restrict this @@ -265,7 +264,7 @@ static public Separator guessSeparator(File file, String encoding) { static public Separator guessSeparator(File file, String encoding, boolean handleQuotes) { try { try (InputStream is = new FileInputStream(file); - Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is); + Reader reader = ImportingUtilities.getInputStreamReader(is, encoding); LineNumberReader lineNumberReader = new LineNumberReader(reader)) { List separators = new ArrayList<>(); diff --git a/main/src/com/google/refine/importers/TextFormatGuesser.java b/main/src/com/google/refine/importers/TextFormatGuesser.java index 9d313ffaadfc..cfcdda54b651 100644 --- a/main/src/com/google/refine/importers/TextFormatGuesser.java +++ b/main/src/com/google/refine/importers/TextFormatGuesser.java @@ -32,7 +32,6 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; -import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import com.google.common.base.CharMatcher; @@ -56,8 +55,7 @@ public String guess(File file, String encoding, String seedFormat) { } InputStream bis = new BoundedInputStream(fis, 64 * 1024); // TODO: This seems like a lot - try (BufferedReader reader = new BufferedReader( - encoding != null ? new InputStreamReader(bis, encoding) : new InputStreamReader(bis))) { + try (BufferedReader reader = new BufferedReader(ImportingUtilities.getInputStreamReader(bis, encoding))) { int totalChars = 0; long openBraces = 0; int closeBraces = 0; diff --git a/main/src/com/google/refine/importing/ImportingUtilities.java b/main/src/com/google/refine/importing/ImportingUtilities.java index 23ecb0d5c136..1b84e83047aa 100644 --- a/main/src/com/google/refine/importing/ImportingUtilities.java +++ b/main/src/com/google/refine/importing/ImportingUtilities.java @@ -568,26 +568,14 @@ static public Reader getReaderFromStream(InputStream inputStream, ObjectNode fil if (encoding == null) { encoding = commonEncoding; } - if (encoding != null) { - - // Special case for UTF-8 with BOM - if (EncodingGuesser.UTF_8_BOM.equals(encoding)) { - try { - return new InputStreamReader(new UnicodeBOMInputStream(inputStream, true), UTF_8); - } catch (IOException e) { - throw new RuntimeException("Exception from UnicodeBOMInputStream", e); - } - } else { - try { - return new InputStreamReader(inputStream, encoding); - } catch (UnsupportedEncodingException e) { - // This should never happen since they picked from a list of supported encodings - throw new RuntimeException("Unsupported encoding: " + encoding, e); - } - } - + try { + return getInputStreamReader(inputStream, encoding); + } catch (UnsupportedEncodingException e) { + // This should never happen since they picked from a list of supported encodings + throw new RuntimeException("Unsupported encoding: " + encoding, e); + } catch (IOException e) { + throw new RuntimeException("Exception from UnicodeBOMInputStream", e); } - return new InputStreamReader(inputStream); } static public File getFile(ImportingJob job, ObjectNode fileRecord) { @@ -1208,4 +1196,13 @@ static public ProjectMetadata createProjectMetadata(ObjectNode optionObj) { pm.setEncoding(encoding); return pm; } + + public static InputStreamReader getInputStreamReader(InputStream is, String encoding) throws IOException { + if (encoding == null) { + return new InputStreamReader(is); + } else if (EncodingGuesser.UTF_8_BOM.equals(encoding)) { // Handle our fake UTF-8 with BOM encoding + return new InputStreamReader(new UnicodeBOMInputStream(is), UTF_8); + } + return new InputStreamReader(is, encoding); + } } diff --git a/main/src/com/google/refine/io/FileHistoryEntryManager.java b/main/src/com/google/refine/io/FileHistoryEntryManager.java index db1bef9751ba..4879a8420385 100644 --- a/main/src/com/google/refine/io/FileHistoryEntryManager.java +++ b/main/src/com/google/refine/io/FileHistoryEntryManager.java @@ -92,7 +92,7 @@ protected void loadChange(HistoryEntry historyEntry, File file) throws Exception Pool pool = new Pool(); ZipEntry poolEntry = zipFile.getEntry("pool.txt"); if (poolEntry != null) { - pool.load(new InputStreamReader( + pool.load(new InputStreamReader( // TODO: Missing encoding here zipFile.getInputStream(poolEntry))); } // else, it's a legacy project file diff --git a/main/tests/server/src/com/google/refine/importers/ImporterTest.java b/main/tests/server/src/com/google/refine/importers/ImporterTest.java index bb59c8f0e522..ec66497143df 100644 --- a/main/tests/server/src/com/google/refine/importers/ImporterTest.java +++ b/main/tests/server/src/com/google/refine/importers/ImporterTest.java @@ -175,7 +175,7 @@ protected void parseOneInputStreamAsReader( ImportColumnGroup rootColumnGroup = new ImportColumnGroup(); List exceptions = new ArrayList(); - Reader reader = new InputStreamReader(inputStream); + Reader reader = new InputStreamReader(inputStream); // FIXME: Why no encoding here parser.parseOneFile( project, metadata, diff --git a/main/tests/server/src/com/google/refine/importing/ImportingUtilitiesTests.java b/main/tests/server/src/com/google/refine/importing/ImportingUtilitiesTests.java index 9d1e1fc28e6d..8e6851236c0a 100644 --- a/main/tests/server/src/com/google/refine/importing/ImportingUtilitiesTests.java +++ b/main/tests/server/src/com/google/refine/importing/ImportingUtilitiesTests.java @@ -481,7 +481,7 @@ public void testImportCompressedFiles() throws IOException, URISyntaxException { InputStream is = ImportingUtilities.tryOpenAsCompressedFile(tmp, null, null); Assert.assertNotNull(is, "Failed to open compressed file: " + filename); - reader = new InputStreamReader(is); + reader = new InputStreamReader(is); // TODO: This needs an encoding Iterable records = CSVFormat.DEFAULT.parse(reader); Assert.assertEquals(StreamSupport.stream(records.spliterator(), false).count(), LINES * 2,