Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enhance CSV parsing #1705

Merged
merged 1 commit into from May 29, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Expand Up @@ -11,6 +11,7 @@
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
Expand Down Expand Up @@ -329,16 +330,34 @@ public CsvRow row(int index) {
}

/**
 * Checks if the header is known.
 * <p>
 * Matching is case insensitive: the supplied header is lower-cased
 * using {@code Locale.ENGLISH} before it is looked up.
 *
 * @param header  the column header to match
 * @return true if the header is known
 */
public boolean containsHeader(String header) {
  // NOTE(review): presumably the keys of searchHeaders are lower-cased the same way - confirm where the map is built
  String searchKey = header.toLowerCase(Locale.ENGLISH);
  return searchHeaders.containsKey(searchKey);
}

/**
 * Checks if any header matches the specified pattern.
 * <p>
 * Each header is tested against the pattern using {@code Matcher.matches()},
 * so the pattern must match the whole header, not just part of it.
 * No case folding is applied here; matching is only case insensitive if the
 * pattern was compiled with {@code Pattern.CASE_INSENSITIVE}.
 *
 * @param headerPattern  the header pattern to match
 * @return true if at least one header matches the pattern
 */
public boolean containsHeader(Pattern headerPattern) {
  for (String header : headers) {
    if (headerPattern.matcher(header).matches()) {
      return true;
    }
  }
  return false;
}

//-------------------------------------------------------------------------
/**
* Checks if this CSV file equals another.
Expand Down
Expand Up @@ -15,6 +15,7 @@
import java.util.NoSuchElementException;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
Expand Down Expand Up @@ -359,6 +360,38 @@ public List<CsvRow> nextBatch(int count) {
return rows;
}

/**
 * Returns the next batch of rows from the CSV file using a predicate to determine the rows.
 * <p>
 * This is useful for CSV files where information is grouped with an identifier or key.
 * For example, a variable notional trade file might have one row for the trade followed by
 * multiple rows for the variable aspects, all grouped by a common trade identifier.
 * In general, callers should peek or read the first row and use information within it to
 * create the selector:
 * <pre>
 *  while (it.hasNext()) {
 *    CsvRow first = it.peek();
 *    String id = first.getValue("ID");
 *    List&lt;CsvRow&gt; batch = it.nextBatch(row -&gt; row.getValue("ID").equals(id));
 *    // process batch
 *  }
 * </pre>
 * This will return a batch of rows where the selector returns true for the row.
 * An empty list is returned if the selector returns false for the first row.
 *
 * @param selector  selects whether a row is part of the batch or part of the next batch
 * @return the next batch of rows, as determined by the selector
 * @throws UncheckedIOException if an IO exception occurs
 * @throws IllegalArgumentException if the file cannot be parsed
 */
public List<CsvRow> nextBatch(Predicate<CsvRow> selector) {
  List<CsvRow> rows = new ArrayList<>();
  // peek before consuming, so the first rejected row stays available for the next call
  while (hasNext() && selector.test(peek())) {
    rows.add(next());
  }
  return rows;
}

/**
* Throws an exception as remove is not supported.
*
Expand Down
Expand Up @@ -91,6 +91,7 @@ public void test_of_empty_no_header() {
assertEquals(csvFile.headers().size(), 0);
assertEquals(csvFile.rowCount(), 0);
assertEquals(csvFile.containsHeader("Foo"), false);
assertEquals(csvFile.containsHeader(Pattern.compile("Foo")), false);
}

public void test_of_empty_with_header() {
Expand All @@ -102,6 +103,7 @@ public void test_of_simple_no_header() {
assertEquals(csvFile.headers().size(), 0);
assertEquals(csvFile.rowCount(), 4);
assertEquals(csvFile.containsHeader("Foo"), false);
assertEquals(csvFile.containsHeader(Pattern.compile("Foo")), false);
assertEquals(csvFile.row(0).lineNumber(), 1);
assertEquals(csvFile.row(1).lineNumber(), 2);
assertEquals(csvFile.row(2).lineNumber(), 3);
Expand Down Expand Up @@ -129,6 +131,7 @@ public void test_of_simple_no_header_tabs() {
CsvFile csvFile = CsvFile.of(CharSource.wrap(CSV1T), false, '\t');
assertEquals(csvFile.headers().size(), 0);
assertEquals(csvFile.containsHeader("Foo"), false);
assertEquals(csvFile.containsHeader(Pattern.compile("Foo")), false);
assertEquals(csvFile.rowCount(), 3);
assertEquals(csvFile.row(0).lineNumber(), 1);
assertEquals(csvFile.row(1).lineNumber(), 2);
Expand All @@ -152,6 +155,8 @@ public void test_of_simple_with_header() {
CsvFile csvFile = CsvFile.of(CharSource.wrap(CSV1), true);
assertEquals(csvFile.containsHeader("Foo"), false);
assertEquals(csvFile.containsHeader("h1"), true);
assertEquals(csvFile.containsHeader(Pattern.compile("Foo")), false);
assertEquals(csvFile.containsHeader(Pattern.compile("h[0-9]")), true);
ImmutableList<String> headers = csvFile.headers();
assertEquals(headers.size(), 2);
assertEquals(headers.get(0), "h1");
Expand Down
Expand Up @@ -58,6 +58,15 @@ public class CsvIteratorTest {
"h1,h2\n" +
"r1,r2\n";

// CSV data with a header row ("id,value") followed by six rows grouped by id:
// three rows with id 1, one row with id 2 and two rows with id 3.
// The last row has no trailing newline.
private final String CSV5GROUPED = "" +
"id,value\n" +
"1,a\n" +
"1,b\n" +
"1,c\n" +
"2,mm\n" +
"3,yyy\n" +
"3,zzz";

//-------------------------------------------------------------------------
public void test_of_ioException() {
assertThrows(
Expand Down Expand Up @@ -302,6 +311,28 @@ public void test_nextBatch2() {
}
}

//-------------------------------------------------------------------------
// Reads the grouped CSV in batches keyed by the "id" column and checks that
// each batch holds a single id, that there are 3 batches and 6 rows in total.
public void nextBatch_predicate() {
  try (CsvIterator iterator = CsvIterator.of(CharSource.wrap(CSV5GROUPED), true)) {
    ImmutableList<String> headerNames = iterator.headers();
    assertEquals(headerNames.size(), 2);
    assertEquals(headerNames.get(0), "id");
    assertEquals(headerNames.get(1), "value");
    int batchCount = 0;
    int rowCount = 0;
    while (iterator.hasNext()) {
      // the id of the first unread row defines the group to pull
      String groupId = iterator.peek().getValue("id");
      List<CsvRow> group = iterator.nextBatch(row -> row.getValue("id").equals(groupId));
      long distinctIds = group.stream().map(row -> row.getValue("id")).distinct().count();
      assertEquals(distinctIds, 1);
      batchCount++;
      rowCount += group.size();
    }
    assertEquals(batchCount, 3);
    assertEquals(rowCount, 6);
  }
}

//-------------------------------------------------------------------------
public void test_asStream_empty_no_header() {
try (CsvIterator csvFile = CsvIterator.of(CharSource.wrap(""), false)) {
Expand Down