Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix encoding guesser for non-UTF-8 BOM-based files. Fixes #6595 #6596

Merged
merged 5 commits into from
May 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 3 additions & 2 deletions main/src/com/google/refine/importers/FixedWidthImporter.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,12 @@

package com.google.refine.importers;

import static com.google.refine.importing.ImportingUtilities.getInputStreamReader;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
Expand Down Expand Up @@ -184,7 +185,7 @@ static private ArrayList<Object> getCells(String line, int[] widths) {
static public int[] guessColumnWidths(File file, String encoding) {
try {
InputStream is = new FileInputStream(file);
Reader reader = (encoding != null) ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
Reader reader = getInputStreamReader(is, encoding);
LineNumberReader lineNumberReader = new LineNumberReader(reader);

try {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ public void parseOneFile(
if (useInputStream) {
parseOneFile(project, metadata, job, fileSource, inputStream, limit, options, exceptions);
} else {
// Although this is called "common" encoding, it may represent the user's override of the encoding
String commonEncoding = JSONUtilities.getString(options, "encoding", null);
if (commonEncoding != null && commonEncoding.isEmpty()) {
commonEncoding = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
Expand Down Expand Up @@ -242,7 +241,7 @@ static public class Separator {

static public CsvFormat guessFormat(File file, String encoding) {
try (InputStream is = new FileInputStream(file);
Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
Reader reader = ImportingUtilities.getInputStreamReader(is, encoding);
LineNumberReader lineNumberReader = new LineNumberReader(reader)) {
CsvParserSettings settings = new CsvParserSettings();
// We could provide a set of delimiters to consider below if we wanted to restrict this
Expand All @@ -265,7 +264,7 @@ static public Separator guessSeparator(File file, String encoding) {
static public Separator guessSeparator(File file, String encoding, boolean handleQuotes) {
try {
try (InputStream is = new FileInputStream(file);
Reader reader = encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
Reader reader = ImportingUtilities.getInputStreamReader(is, encoding);
LineNumberReader lineNumberReader = new LineNumberReader(reader)) {

List<Separator> separators = new ArrayList<>();
Expand Down
4 changes: 1 addition & 3 deletions main/src/com/google/refine/importers/TextFormatGuesser.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;

import com.google.common.base.CharMatcher;
Expand All @@ -56,8 +55,7 @@ public String guess(File file, String encoding, String seedFormat) {
}

InputStream bis = new BoundedInputStream(fis, 64 * 1024); // TODO: This seems like a lot
try (BufferedReader reader = new BufferedReader(
encoding != null ? new InputStreamReader(bis, encoding) : new InputStreamReader(bis))) {
try (BufferedReader reader = new BufferedReader(ImportingUtilities.getInputStreamReader(bis, encoding))) {
int totalChars = 0;
long openBraces = 0;
int closeBraces = 0;
Expand Down
48 changes: 28 additions & 20 deletions main/src/com/google/refine/importing/EncodingGuesser.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
*/
public final class EncodingGuesser {

public static final String UTF_8_BOM = "UTF-8-BOM";
public static final String UTF_8_BOM = "UTF-8-BOM"; // Fake encoding for weird Microsoft UFT-8 with BOM

public static void guess(final ImportingJob job)
throws IOException {
Expand All @@ -33,37 +33,45 @@ public static void guess(final ImportingJob job)
if (fileRecords != null) {
// TODO: If different files have different encodings, we're only able to present a single
// encoding to the user currently. Should we check for conflicts? Warn the user?
for (int i = 0; i < fileRecords.size(); i++) {
ObjectNode record = JSONUtilities.getObjectElement(fileRecords, i);
String encoding = ImportingUtilities.getEncoding(record);
if (StringUtils.isBlank(encoding)) {
String location = JSONUtilities.getString(record, "location", null);
if (location != null) {
try (UnicodeBOMInputStream is = new UnicodeBOMInputStream(
new FileInputStream(new File(job.getRawDataDir(), location)))) {
String detected = UniversalDetector.detectCharset(is);
UnicodeBOMInputStream.BOM bom = is.getBOM();
if (UnicodeBOMInputStream.BOM.UTF_8.equals(bom)) {
detected = UTF_8_BOM;
}
if (detected != null) {
JSONUtilities.safePut(record, "encoding", detected);
}
}
}
guessFilesEncodings(job, fileRecords);
}
}
}

private static void guessFilesEncodings(ImportingJob job, ArrayNode fileRecords) throws IOException {
for (int i = 0; i < fileRecords.size(); i++) {
ObjectNode record = JSONUtilities.getObjectElement(fileRecords, i);
String encoding = ImportingUtilities.getEncoding(record);
if (StringUtils.isBlank(encoding)) {
String location = JSONUtilities.getString(record, "location", null);
if (location != null) {
String detected = guessEncoding(job.getRawDataDir(), location);
if (detected != null) {
JSONUtilities.safePut(record, "encoding", detected);
}
}
}
}
}

public static String guessEncoding(File dir, String location) throws IOException {
try (UnicodeBOMInputStream is = new UnicodeBOMInputStream(
new FileInputStream(new File(dir, location)), false)) {
String detected = UniversalDetector.detectCharset(is);
if (UnicodeBOMInputStream.BOM.UTF_8.equals(is.getBOM())) {
detected = UTF_8_BOM;
}
return detected;
}
}

/**
* uses the first found encoding in the file records as initial encoding and put them into the options
*
* @param fileRecords
* @param options
*/
public final static void guessInitialEncoding(final List<ObjectNode> fileRecords, final ObjectNode options) {
public static void guessInitialEncoding(final List<ObjectNode> fileRecords, final ObjectNode options) {
if (fileRecords != null) {
for (ObjectNode record : fileRecords) {
String encoding = JSONUtilities.getString(record, "encoding", null);
Expand Down
41 changes: 21 additions & 20 deletions main/src/com/google/refine/importing/ImportingUtilities.java
Original file line number Diff line number Diff line change
Expand Up @@ -564,30 +564,22 @@ static public Reader getFileReader(File file, ObjectNode fileRecord, String comm
}

static public Reader getReaderFromStream(InputStream inputStream, ObjectNode fileRecord, String commonEncoding) {
// FIXME: commonEncoding may represent user's override of guessed encoding, so should be used in preference
// to the guessed encoding(s). But, what to do if we have multiple files with different encodings?
// (very unlikely, but still possible)
String encoding = getEncoding(fileRecord);
if (encoding == null) {
if (commonEncoding != null && !commonEncoding.equals(encoding)) {
logger.info("Overriding guessed encoding {} with user's choice: {}", encoding, commonEncoding);
encoding = commonEncoding;
}
if (encoding != null) {

// Special case for UTF-8 with BOM
if (EncodingGuesser.UTF_8_BOM.equals(encoding)) {
try {
return new InputStreamReader(new UnicodeBOMInputStream(inputStream, true), UTF_8);
} catch (IOException e) {
throw new RuntimeException("Exception from UnicodeBOMInputStream", e);
}
} else {
try {
return new InputStreamReader(inputStream, encoding);
} catch (UnsupportedEncodingException e) {
// This should never happen since they picked from a list of supported encodings
throw new RuntimeException("Unsupported encoding: " + encoding, e);
}
}

try {
return getInputStreamReader(inputStream, encoding);
} catch (UnsupportedEncodingException e) {
// This should never happen since they picked from a list of supported encodings
throw new RuntimeException("Unsupported encoding: " + encoding, e);
} catch (IOException e) {
throw new RuntimeException("Exception from UnicodeBOMInputStream", e);
}
return new InputStreamReader(inputStream);
}

static public File getFile(ImportingJob job, ObjectNode fileRecord) {
Expand Down Expand Up @@ -1208,4 +1200,13 @@ static public ProjectMetadata createProjectMetadata(ObjectNode optionObj) {
pm.setEncoding(encoding);
return pm;
}

public static InputStreamReader getInputStreamReader(InputStream is, String encoding) throws IOException {
if (encoding == null) {
return new InputStreamReader(is);
} else if (EncodingGuesser.UTF_8_BOM.equals(encoding)) { // Handle our fake UTF-8 with BOM encoding
return new InputStreamReader(new UnicodeBOMInputStream(is, true), UTF_8);
}
return new InputStreamReader(is, encoding);
}
}
2 changes: 1 addition & 1 deletion main/src/com/google/refine/io/FileHistoryEntryManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ protected void loadChange(HistoryEntry historyEntry, File file) throws Exception
Pool pool = new Pool();
ZipEntry poolEntry = zipFile.getEntry("pool.txt");
if (poolEntry != null) {
pool.load(new InputStreamReader(
pool.load(new InputStreamReader( // TODO: Missing encoding here
zipFile.getInputStream(poolEntry)));
} // else, it's a legacy project file

Expand Down
2 changes: 1 addition & 1 deletion main/tests/cypress/cypress/support/openrefine_api.js
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ Cypress.Commands.add('loadProject', (fixture, projectName, tagName) => {
// It's conflicting though, breaking up the CSV files
// It is a hack to parse out CSV files in the openrefine while creating a project with tags
const options = {
encoding: 'US-ASCII',
encoding: 'UTF-8',
separator: ',',
ignoreLines: -1,
headerLines: 1,
Expand Down
Binary file added main/tests/data/example-utf16be-bom.tsv
Binary file not shown.
Binary file added main/tests/data/example-utf16le-bom.tsv
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ protected void parseOneInputStreamAsReader(
ImportColumnGroup rootColumnGroup = new ImportColumnGroup();
List<Exception> exceptions = new ArrayList<Exception>();

Reader reader = new InputStreamReader(inputStream);
Reader reader = new InputStreamReader(inputStream); // FIXME: Why no encoding here
parser.parseOneFile(
project,
metadata,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
import org.testng.annotations.BeforeTest;
import org.testng.annotations.Test;

import com.google.refine.importing.EncodingGuesser;
import com.google.refine.importing.FormatGuesser;

public class TextFormatGuesserTests extends ImporterTest {
Expand Down Expand Up @@ -91,31 +92,32 @@ public void xlsTextGuessTest() throws FileNotFoundException, IOException {
}

@Test
public void csvGuesserTest() {
public void csvGuesserTest() throws IOException {
extensionGuesserTests("csv", "text/line-based");
}

@Test
public void tsvGuesserTest() {
public void tsvGuesserTest() throws IOException {
extensionGuesserTests("tsv", "text/line-based");
}

@Test(enabled = false) // FIXME: Our JSON guesser doesn't work on small files
public void jsonGuesserTest() {
public void jsonGuesserTest() throws IOException {
extensionGuesserTests("json", "text/json");
}

@Test
public void xmlGuesserTest() {
public void xmlGuesserTest() throws IOException {
extensionGuesserTests("xml", "text/xml");
}

private void extensionGuesserTests(String extension, String expectedFormat) {
private void extensionGuesserTests(String extension, String expectedFormat) throws IOException {
String dir = ClassLoader.getSystemResource("food.csv").getPath();
dir = dir.substring(0, dir.lastIndexOf('/'));
File testDataDir = new File(dir);
for (String testFile : testDataDir.list(new PatternFilenameFilter(".+\\." + extension))) {
String format = guesser.guess(new File(dir, testFile), "UTF-8", "text");
String encoding = EncodingGuesser.guessEncoding(testDataDir, testFile);
String format = guesser.guess(new File(dir, testFile), encoding, "text");
assertEquals(format, expectedFormat, "Format guess failed for " + testFile);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ public void testEncodingGuesser() throws IOException {

checkEncoding("example-latin1.tsv", "windows-1252"); // close enough - these overlap a lot
checkEncoding("example-utf8.tsv", "utf-8");
checkEncoding("example-utf16le-bom.tsv", "utf-16le");
checkEncoding("example-utf16be-bom.tsv", "utf-16be");
checkEncoding("csv-with-bom.csv", "utf-8-bom");
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ public void testImportCompressedFiles() throws IOException, URISyntaxException {
InputStream is = ImportingUtilities.tryOpenAsCompressedFile(tmp, null, null);
Assert.assertNotNull(is, "Failed to open compressed file: " + filename);

reader = new InputStreamReader(is);
reader = new InputStreamReader(is); // TODO: This needs an encoding
Iterable<CSVRecord> records = CSVFormat.DEFAULT.parse(reader);

Assert.assertEquals(StreamSupport.stream(records.spliterator(), false).count(), LINES * 2,
Expand Down