-
Notifications
You must be signed in to change notification settings - Fork 272
/
CsvIterator.java
450 lines (422 loc) · 14.8 KB
/
CsvIterator.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
/*
* Copyright (C) 2016 - present by OpenGamma Inc. and the OpenGamma group of companies
*
* Please see distribution for license.
*/
package com.opengamma.strata.collect.io;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.NoSuchElementException;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.function.Predicate;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.PeekingIterator;
import com.google.common.io.CharSource;
import com.opengamma.strata.collect.ArgChecker;
import com.opengamma.strata.collect.Unchecked;
/**
 * Iterator over the rows of a CSV file.
 * <p>
 * Provides the ability to iterate over a CSV file together with the ability to parse it from a {@link CharSource}.
 * The separator may be specified, allowing TSV files (tab-separated) and other similar formats to be parsed.
 * See {@link CsvFile} for more details of the CSV format.
 * <p>
 * This class processes the CSV file row-by-row.
 * To load the entire CSV file into memory, use {@link CsvFile}.
 * <p>
 * This class must be used in a try-with-resources block to ensure that the underlying CSV file is closed:
 * <pre>
 *  try (CsvIterator csvIterator = CsvIterator.of(source, true)) {
 *    // use the CsvIterator
 *  }
 * </pre>
 * One way to use the iterable is with the for-each loop, using {@link #asIterable()}:
 * <pre>
 *  try (CsvIterator csvIterator = CsvIterator.of(source, true)) {
 *    for (CsvRow row : csvIterator.asIterable()) {
 *      // process the row
 *    }
 *  }
 * </pre>
 * This class also allows the headers to be obtained without reading the whole CSV file:
 * <pre>
 *  try (CsvIterator csvIterator = CsvIterator.of(source, true)) {
 *    ImmutableList{@literal <String>} headers = csvIterator.headers();
 *  }
 * </pre>
 */
public final class CsvIterator implements AutoCloseable, PeekingIterator<CsvRow> {

  /**
   * The buffered reader.
   */
  private final BufferedReader reader;
  /**
   * The separator.
   */
  private final char separator;
  /**
   * The header row, ordered as the headers appear in the file.
   */
  private final ImmutableList<String> headers;
  /**
   * The header map, transformed for case-insensitive searching.
   */
  private final ImmutableMap<String, Integer> searchHeaders;
  /**
   * The next row, null if no row has been read ahead.
   */
  private CsvRow nextRow;
  /**
   * The current line number in the source file.
   */
  private int currentLineNumber;

  //------------------------------------------------------------------------
  /**
   * Parses the specified source as a CSV file, using a comma as the separator.
   *
   * @param source  the source to read as CSV
   * @param headerRow  whether the source has a header row, an empty source must still contain the header
   * @return the CSV iterator
   * @throws UncheckedIOException if an IO exception occurs
   * @throws IllegalArgumentException if the file cannot be parsed
   */
  public static CsvIterator of(CharSource source, boolean headerRow) {
    return of(source, headerRow, ',');
  }

  /**
   * Parses the specified source as a CSV file where the separator is specified and might not be a comma.
   * <p>
   * This overload allows the separator to be controlled.
   * For example, a tab-separated file is very similar to a CSV file, the only difference is the separator.
   *
   * @param source  the source to read as CSV
   * @param headerRow  whether the source has a header row, an empty source must still contain the header
   * @param separator  the separator used to separate each field, typically a comma, but a tab is sometimes used
   * @return the CSV iterator
   * @throws UncheckedIOException if an IO exception occurs
   * @throws IllegalArgumentException if the file cannot be parsed
   */
  public static CsvIterator of(CharSource source, boolean headerRow, char separator) {
    ArgChecker.notNull(source, "source");
    // the reader is owned by the returned iterator, which closes it in close()
    @SuppressWarnings("resource")
    BufferedReader reader = Unchecked.wrap(() -> source.openBufferedStream());
    return create(reader, headerRow, separator);
  }

  /**
   * Parses the specified reader as a CSV file, using a comma as the separator.
   * <p>
   * The caller is responsible for closing the reader, such as by calling {@link #close()}.
   *
   * @param reader  the file reader
   * @param headerRow  whether the source has a header row, an empty source must still contain the header
   * @return the CSV iterator
   * @throws UncheckedIOException if an IO exception occurs
   * @throws IllegalArgumentException if the file cannot be parsed
   */
  public static CsvIterator of(Reader reader, boolean headerRow) {
    return of(reader, headerRow, ',');
  }

  /**
   * Parses the specified reader as a CSV file where the separator is specified and might not be a comma.
   * <p>
   * This overload allows the separator to be controlled.
   * For example, a tab-separated file is very similar to a CSV file, the only difference is the separator.
   * <p>
   * The caller is responsible for closing the reader, such as by calling {@link #close()}.
   *
   * @param reader  the file reader
   * @param headerRow  whether the source has a header row, an empty source must still contain the header
   * @param separator  the separator used to separate each field, typically a comma, but a tab is sometimes used
   * @return the CSV iterator
   * @throws UncheckedIOException if an IO exception occurs
   * @throws IllegalArgumentException if the file cannot be parsed
   */
  public static CsvIterator of(Reader reader, boolean headerRow, char separator) {
    ArgChecker.notNull(reader, "reader");
    // avoid double-buffering if the caller already supplied a buffered reader
    @SuppressWarnings("resource")
    BufferedReader breader = reader instanceof BufferedReader ? (BufferedReader) reader : new BufferedReader(reader);
    return create(breader, headerRow, separator);
  }

  // create the iterator, reading lines until the first non-empty one if a header row is expected
  // if anything fails the reader is closed before the exception propagates
  private static CsvIterator create(BufferedReader breader, boolean headerRow, char separator) {
    try {
      if (!headerRow) {
        return new CsvIterator(breader, separator, ImmutableList.of(), ImmutableMap.of(), 0);
      }
      int lineNumber = 0;
      String line;
      while ((line = breader.readLine()) != null) {
        lineNumber++;
        ImmutableList<String> headers = CsvFile.parseLine(line, separator);
        // lines that parse to no fields are skipped when searching for the header
        if (!headers.isEmpty()) {
          return new CsvIterator(breader, separator, headers, CsvFile.buildSearchHeaders(headers), lineNumber);
        }
      }
      throw new IllegalArgumentException("Could not read header row from empty CSV file");

    } catch (RuntimeException ex) {
      closeAfterFailure(breader, ex);
      throw ex;
    } catch (IOException ex) {
      closeAfterFailure(breader, ex);
      throw new UncheckedIOException(ex);
    }
  }

  // closes the reader after a failure, attaching any failure to close as a suppressed exception
  private static void closeAfterFailure(BufferedReader breader, Exception failure) {
    try {
      breader.close();
    } catch (IOException closeEx) {
      failure.addSuppressed(closeEx);
    }
  }

  //------------------------------------------------------------------------
  /**
   * Restricted constructor.
   *
   * @param reader  the buffered reader
   * @param separator  the separator used to separate each field
   * @param headers  the header row
   * @param searchHeaders  the search headers
   * @param currentLineNumber  the number of lines already read from the reader
   */
  private CsvIterator(
      BufferedReader reader,
      char separator,
      ImmutableList<String> headers,
      ImmutableMap<String, Integer> searchHeaders,
      int currentLineNumber) {

    this.reader = reader;
    this.separator = separator;
    this.headers = headers;
    this.searchHeaders = searchHeaders;
    this.currentLineNumber = currentLineNumber;
  }

  //------------------------------------------------------------------------
  /**
   * Gets the header row.
   * <p>
   * If there is no header row, an empty list is returned.
   *
   * @return the header row
   */
  public ImmutableList<String> headers() {
    return headers;
  }

  /**
   * Checks if the header is present in the file.
   * <p>
   * Matching is case insensitive.
   *
   * @param header  the column header to match
   * @return true if the header is present
   */
  public boolean containsHeader(String header) {
    return searchHeaders.containsKey(header.toLowerCase(Locale.ENGLISH));
  }

  /**
   * Checks if the headers are present in the file.
   * <p>
   * Matching is case insensitive.
   *
   * @param headers  the column headers to match
   * @return true if all the headers are present
   */
  public boolean containsHeaders(Collection<String> headers) {
    return headers.stream().allMatch(this::containsHeader);
  }

  /**
   * Checks if the header pattern is present in the file.
   * <p>
   * The pattern is matched against each header exactly as it appears in the file,
   * and the whole header must match.
   * Case sensitivity is controlled by the pattern itself, for example by
   * compiling it with {@link Pattern#CASE_INSENSITIVE}.
   *
   * @param headerPattern  the header pattern to match
   * @return true if at least one header matches the pattern
   */
  public boolean containsHeader(Pattern headerPattern) {
    for (String header : headers) {
      if (headerPattern.matcher(header).matches()) {
        return true;
      }
    }
    return false;
  }

  /**
   * Returns an {@code Iterable} that wraps this iterator.
   * <p>
   * Unlike most {@code Iterable} implementations, the method {@link Iterable#iterator()}
   * can only be called once. This is intended for use with the for-each loop.
   * <pre>
   *  try (CsvIterator csvIterator = CsvIterator.of(source, true)) {
   *    for (CsvRow row : csvIterator.asIterable()) {
   *      // process the row
   *    }
   *  }
   * </pre>
   *
   * @return this iterator as an {@code Iterable}
   */
  public Iterable<CsvRow> asIterable() {
    return () -> this;
  }

  /**
   * Returns a stream that wraps this iterator.
   * <p>
   * The stream will process any remaining rows in the CSV file.
   * As such, it is recommended that callers should use this method or the iterator methods and not both.
   * <p>
   * Closing the returned stream does not close this iterator, thus the iterator
   * must still be closed by the caller.
   *
   * @return the stream wrapping this iterator
   */
  public Stream<CsvRow> asStream() {
    Spliterator<CsvRow> spliterator =
        Spliterators.spliteratorUnknownSize(this, Spliterator.ORDERED | Spliterator.NONNULL);
    return StreamSupport.stream(spliterator, false);
  }

  //-------------------------------------------------------------------------
  /**
   * Checks whether there is another row in the CSV file.
   * <p>
   * If no row has been read ahead, lines are read from the underlying reader
   * until one yields at least one field, which is then held as the next row.
   *
   * @return true if there is another row, false if not
   * @throws UncheckedIOException if an IO exception occurs
   * @throws IllegalArgumentException if the file cannot be parsed
   */
  @Override
  public boolean hasNext() {
    if (nextRow != null) {
      return true;
    }
    String line;
    while ((line = Unchecked.wrap(() -> reader.readLine())) != null) {
      // every line read is counted, including those that produce no row
      currentLineNumber++;
      ImmutableList<String> fields = CsvFile.parseLine(line, separator);
      if (!fields.isEmpty()) {
        nextRow = new CsvRow(headers, searchHeaders, currentLineNumber, fields);
        return true;
      }
    }
    return false;
  }

  /**
   * Peeks the next row from the CSV file without changing the iteration position.
   *
   * @return the peeked row
   * @throws UncheckedIOException if an IO exception occurs
   * @throws IllegalArgumentException if the file cannot be parsed
   * @throws NoSuchElementException if the end of file has been reached
   */
  @Override
  public CsvRow peek() {
    if (!hasNext()) {
      throw new NoSuchElementException("CsvIterator has reached the end of the file");
    }
    return nextRow;
  }

  /**
   * Returns the next row from the CSV file.
   *
   * @return the next row
   * @throws UncheckedIOException if an IO exception occurs
   * @throws IllegalArgumentException if the file cannot be parsed
   * @throws NoSuchElementException if the end of file has been reached
   */
  @Override
  public CsvRow next() {
    if (!hasNext()) {
      throw new NoSuchElementException("CsvIterator has reached the end of the file");
    }
    CsvRow row = nextRow;
    // clear the read-ahead row so the following call reads from the file again
    nextRow = null;
    return row;
  }

  /**
   * Returns the next batch of rows from the CSV file.
   * <p>
   * This will return up to the specified number of rows from the file at the current iteration point.
   * An empty list is returned if there are no more rows.
   *
   * @param count  the number of rows to try and get, zero or negative returns an empty list
   * @return the next batch of rows, up to the number requested
   * @throws UncheckedIOException if an IO exception occurs
   * @throws IllegalArgumentException if the file cannot be parsed
   */
  public List<CsvRow> nextBatch(int count) {
    List<CsvRow> rows = new ArrayList<>();
    // stop as soon as the file is exhausted rather than polling the reader for every remaining count
    while (rows.size() < count && hasNext()) {
      rows.add(next());
    }
    return rows;
  }

  /**
   * Returns the next batch of rows from the CSV file using a predicate to determine the rows.
   * <p>
   * This is useful for CSV files where information is grouped with an identifier or key.
   * For example, a variable notional trade file might have one row for the trade followed by
   * multiple rows for the variable aspects, all grouped by a common trade identifier.
   * In general, callers should peek or read the first row and use information within it to
   * create the selector:
   * <pre>
   *  while (it.hasNext()) {
   *    CsvRow first = it.peek();
   *    String id = first.getValue("ID");
   *    List{@literal <CsvRow>} batch = it.nextBatch(row {@literal ->} row.getValue("ID").equals(id));
   *    // process batch
   *  }
   * </pre>
   * This will return a batch of rows where the selector returns true for the row.
   * An empty list is returned if the selector returns false for the first row.
   * The first row that is rejected by the selector is not consumed.
   *
   * @param selector  selects whether a row is part of the batch or part of the next batch
   * @return the next batch of rows, as determined by the selector
   * @throws UncheckedIOException if an IO exception occurs
   * @throws IllegalArgumentException if the file cannot be parsed
   */
  public List<CsvRow> nextBatch(Predicate<CsvRow> selector) {
    List<CsvRow> rows = new ArrayList<>();
    // peek first so that a rejected row remains available to the next call
    while (hasNext() && selector.test(peek())) {
      rows.add(next());
    }
    return rows;
  }

  /**
   * Throws an exception as remove is not supported.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public void remove() {
    throw new UnsupportedOperationException("CsvIterator does not support remove()");
  }

  /**
   * Closes the underlying reader.
   *
   * @throws UncheckedIOException if an IO exception occurs
   */
  @Override
  public void close() {
    Unchecked.wrap(() -> reader.close());
  }

  //-------------------------------------------------------------------------
  /**
   * Returns a string describing the CSV iterator.
   *
   * @return the descriptive string
   */
  @Override
  public String toString() {
    return "CsvIterator" + headers.toString();
  }

}