Skip to content

Commit

Permalink
Refactor InputStreamReader handling
Browse files Browse the repository at this point in the history
DRY up handling of null encoding and our special UTF-8-BOM encoding
  • Loading branch information
tfmorris committed May 10, 2024
1 parent a814235 commit 076f836
Show file tree
Hide file tree
Showing 7 changed files with 25 additions and 30 deletions.
5 changes: 3 additions & 2 deletions main/src/com/google/refine/importers/FixedWidthImporter.java
Expand Up @@ -27,11 +27,12 @@

package com.google.refine.importers;

import static com.google.refine.importing.ImportingUtilities.getInputStreamReader;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
Expand Down Expand Up @@ -184,7 +185,7 @@ static private ArrayList<Object> getCells(String line, int[] widths) {
static public int[] guessColumnWidths(File file, String encoding) {
try {
InputStream is = new FileInputStream(file);
Reader reader = (encoding != null) ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
Reader reader = getInputStreamReader(is, encoding);
LineNumberReader lineNumberReader = new LineNumberReader(reader);

try {
Expand Down
Expand Up @@ -37,7 +37,6 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
Expand Down Expand Up @@ -242,7 +241,7 @@ static public class Separator {

static public CsvFormat guessFormat(File file, String encoding) {
try (InputStream is = new FileInputStream(file);
Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
Reader reader = ImportingUtilities.getInputStreamReader(is, encoding);
LineNumberReader lineNumberReader = new LineNumberReader(reader)) {
CsvParserSettings settings = new CsvParserSettings();
// We could provide a set of delimiters to consider below if we wanted to restrict this
Expand All @@ -265,7 +264,7 @@ static public Separator guessSeparator(File file, String encoding) {
static public Separator guessSeparator(File file, String encoding, boolean handleQuotes) {
try {
try (InputStream is = new FileInputStream(file);
Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
Reader reader = ImportingUtilities.getInputStreamReader(is, encoding);
LineNumberReader lineNumberReader = new LineNumberReader(reader)) {

List<Separator> separators = new ArrayList<>();
Expand Down
4 changes: 1 addition & 3 deletions main/src/com/google/refine/importers/TextFormatGuesser.java
Expand Up @@ -32,7 +32,6 @@
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;

import com.google.common.base.CharMatcher;
Expand All @@ -56,8 +55,7 @@ public String guess(File file, String encoding, String seedFormat) {
}

InputStream bis = new BoundedInputStream(fis, 64 * 1024); // TODO: This seems like a lot
try (BufferedReader reader = new BufferedReader(
encoding != null ? new InputStreamReader(bis, encoding) : new InputStreamReader(bis))) {
try (BufferedReader reader = new BufferedReader(ImportingUtilities.getInputStreamReader(bis, encoding))) {
int totalChars = 0;
long openBraces = 0;
int closeBraces = 0;
Expand Down
35 changes: 16 additions & 19 deletions main/src/com/google/refine/importing/ImportingUtilities.java
Expand Up @@ -568,26 +568,14 @@ static public Reader getReaderFromStream(InputStream inputStream, ObjectNode fil
if (encoding == null) {
encoding = commonEncoding;
}
if (encoding != null) {

// Special case for UTF-8 with BOM
if (EncodingGuesser.UTF_8_BOM.equals(encoding)) {
try {
return new InputStreamReader(new UnicodeBOMInputStream(inputStream, true), UTF_8);
} catch (IOException e) {
throw new RuntimeException("Exception from UnicodeBOMInputStream", e);
}
} else {
try {
return new InputStreamReader(inputStream, encoding);
} catch (UnsupportedEncodingException e) {
// This should never happen since they picked from a list of supported encodings
throw new RuntimeException("Unsupported encoding: " + encoding, e);
}
}

try {
return getInputStreamReader(inputStream, encoding);
} catch (UnsupportedEncodingException e) {
// This should never happen since they picked from a list of supported encodings
throw new RuntimeException("Unsupported encoding: " + encoding, e);
} catch (IOException e) {
throw new RuntimeException("Exception from UnicodeBOMInputStream", e);
}
return new InputStreamReader(inputStream);
}

static public File getFile(ImportingJob job, ObjectNode fileRecord) {
Expand Down Expand Up @@ -1208,4 +1196,13 @@ static public ProjectMetadata createProjectMetadata(ObjectNode optionObj) {
pm.setEncoding(encoding);
return pm;
}

/**
 * Creates an {@link InputStreamReader} for the given stream, centralizing the handling of a missing encoding and
 * of our special "UTF-8 with BOM" pseudo-encoding.
 * <ul>
 * <li>If {@code encoding} is {@code null}, the reader is created without an explicit charset, so the JVM's default
 * charset is used.</li>
 * <li>If {@code encoding} equals {@link EncodingGuesser#UTF_8_BOM}, the stream is wrapped in a
 * {@link UnicodeBOMInputStream} and decoded as UTF-8.</li>
 * <li>Otherwise {@code encoding} is passed to {@link InputStreamReader} as a charset name.</li>
 * </ul>
 *
 * @param is
 *            the stream to read from
 * @param encoding
 *            charset name, {@link EncodingGuesser#UTF_8_BOM}, or {@code null} for the default charset
 * @return a reader decoding {@code is} with the requested encoding
 * @throws java.io.UnsupportedEncodingException
 *             if {@code encoding} names a charset that is not supported
 * @throws IOException
 *             if the {@link UnicodeBOMInputStream} wrapper cannot be created
 */
public static InputStreamReader getInputStreamReader(InputStream is, String encoding) throws IOException {
    if (encoding == null) {
        return new InputStreamReader(is);
    } else if (EncodingGuesser.UTF_8_BOM.equals(encoding)) { // Handle our fake UTF-8 with BOM encoding
        // Pass `true` as the second argument, exactly as the inlined code this helper replaces did, so the
        // BOM handling is unchanged by the refactoring (presumably this makes the wrapper skip a detected
        // BOM so it does not reach the decoded text - confirm against UnicodeBOMInputStream).
        return new InputStreamReader(new UnicodeBOMInputStream(is, true), UTF_8);
    }
    return new InputStreamReader(is, encoding);
}
}
2 changes: 1 addition & 1 deletion main/src/com/google/refine/io/FileHistoryEntryManager.java
Expand Up @@ -92,7 +92,7 @@ protected void loadChange(HistoryEntry historyEntry, File file) throws Exception
Pool pool = new Pool();
ZipEntry poolEntry = zipFile.getEntry("pool.txt");
if (poolEntry != null) {
pool.load(new InputStreamReader(
pool.load(new InputStreamReader( // TODO: Missing encoding here
zipFile.getInputStream(poolEntry)));
} // else, it's a legacy project file

Expand Down
Expand Up @@ -175,7 +175,7 @@ protected void parseOneInputStreamAsReader(
ImportColumnGroup rootColumnGroup = new ImportColumnGroup();
List<Exception> exceptions = new ArrayList<Exception>();

Reader reader = new InputStreamReader(inputStream);
Reader reader = new InputStreamReader(inputStream); // FIXME: Why no encoding here
parser.parseOneFile(
project,
metadata,
Expand Down
Expand Up @@ -481,7 +481,7 @@ public void testImportCompressedFiles() throws IOException, URISyntaxException {
InputStream is = ImportingUtilities.tryOpenAsCompressedFile(tmp, null, null);
Assert.assertNotNull(is, "Failed to open compressed file: " + filename);

reader = new InputStreamReader(is);
reader = new InputStreamReader(is); // TODO: This needs an encoding
Iterable<CSVRecord> records = CSVFormat.DEFAULT.parse(reader);

Assert.assertEquals(StreamSupport.stream(records.spliterator(), false).count(), LINES * 2,
Expand Down

0 comments on commit 076f836

Please sign in to comment.